optexity 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. optexity/examples/__init__.py +0 -0
  2. optexity/examples/add_example.py +88 -0
  3. optexity/examples/download_pdf_url.py +29 -0
  4. optexity/examples/extract_price_stockanalysis.py +44 -0
  5. optexity/examples/file_upload.py +59 -0
  6. optexity/examples/i94.py +126 -0
  7. optexity/examples/i94_travel_history.py +126 -0
  8. optexity/examples/peachstate_medicaid.py +201 -0
  9. optexity/examples/supabase_login.py +75 -0
  10. optexity/inference/__init__.py +0 -0
  11. optexity/inference/agents/__init__.py +0 -0
  12. optexity/inference/agents/error_handler/__init__.py +0 -0
  13. optexity/inference/agents/error_handler/error_handler.py +39 -0
  14. optexity/inference/agents/error_handler/prompt.py +60 -0
  15. optexity/inference/agents/index_prediction/__init__.py +0 -0
  16. optexity/inference/agents/index_prediction/action_prediction_locator_axtree.py +45 -0
  17. optexity/inference/agents/index_prediction/prompt.py +14 -0
  18. optexity/inference/agents/select_value_prediction/__init__.py +0 -0
  19. optexity/inference/agents/select_value_prediction/prompt.py +20 -0
  20. optexity/inference/agents/select_value_prediction/select_value_prediction.py +39 -0
  21. optexity/inference/agents/two_fa_extraction/__init__.py +0 -0
  22. optexity/inference/agents/two_fa_extraction/prompt.py +23 -0
  23. optexity/inference/agents/two_fa_extraction/two_fa_extraction.py +47 -0
  24. optexity/inference/child_process.py +251 -0
  25. optexity/inference/core/__init__.py +0 -0
  26. optexity/inference/core/interaction/__init__.py +0 -0
  27. optexity/inference/core/interaction/handle_agentic_task.py +79 -0
  28. optexity/inference/core/interaction/handle_check.py +57 -0
  29. optexity/inference/core/interaction/handle_click.py +79 -0
  30. optexity/inference/core/interaction/handle_command.py +261 -0
  31. optexity/inference/core/interaction/handle_input.py +76 -0
  32. optexity/inference/core/interaction/handle_keypress.py +16 -0
  33. optexity/inference/core/interaction/handle_select.py +109 -0
  34. optexity/inference/core/interaction/handle_select_utils.py +132 -0
  35. optexity/inference/core/interaction/handle_upload.py +59 -0
  36. optexity/inference/core/interaction/utils.py +81 -0
  37. optexity/inference/core/logging.py +406 -0
  38. optexity/inference/core/run_assertion.py +55 -0
  39. optexity/inference/core/run_automation.py +463 -0
  40. optexity/inference/core/run_extraction.py +240 -0
  41. optexity/inference/core/run_interaction.py +254 -0
  42. optexity/inference/core/run_python_script.py +20 -0
  43. optexity/inference/core/run_two_fa.py +120 -0
  44. optexity/inference/core/two_factor_auth/__init__.py +0 -0
  45. optexity/inference/infra/__init__.py +0 -0
  46. optexity/inference/infra/browser.py +455 -0
  47. optexity/inference/infra/browser_extension.py +20 -0
  48. optexity/inference/models/__init__.py +22 -0
  49. optexity/inference/models/gemini.py +113 -0
  50. optexity/inference/models/human.py +20 -0
  51. optexity/inference/models/llm_model.py +210 -0
  52. optexity/inference/run_local.py +200 -0
  53. optexity/schema/__init__.py +0 -0
  54. optexity/schema/actions/__init__.py +0 -0
  55. optexity/schema/actions/assertion_action.py +66 -0
  56. optexity/schema/actions/extraction_action.py +143 -0
  57. optexity/schema/actions/interaction_action.py +330 -0
  58. optexity/schema/actions/misc_action.py +18 -0
  59. optexity/schema/actions/prompts.py +27 -0
  60. optexity/schema/actions/two_fa_action.py +24 -0
  61. optexity/schema/automation.py +432 -0
  62. optexity/schema/callback.py +16 -0
  63. optexity/schema/inference.py +87 -0
  64. optexity/schema/memory.py +100 -0
  65. optexity/schema/task.py +212 -0
  66. optexity/schema/token_usage.py +48 -0
  67. optexity/utils/__init__.py +0 -0
  68. optexity/utils/settings.py +54 -0
  69. optexity/utils/utils.py +76 -0
  70. {optexity-0.1.2.dist-info → optexity-0.1.3.dist-info}/METADATA +1 -1
  71. optexity-0.1.3.dist-info/RECORD +80 -0
  72. optexity-0.1.2.dist-info/RECORD +0 -11
  73. {optexity-0.1.2.dist-info → optexity-0.1.3.dist-info}/WHEEL +0 -0
  74. {optexity-0.1.2.dist-info → optexity-0.1.3.dist-info}/entry_points.txt +0 -0
  75. {optexity-0.1.2.dist-info → optexity-0.1.3.dist-info}/licenses/LICENSE +0 -0
  76. {optexity-0.1.2.dist-info → optexity-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,455 @@
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import logging
5
+ import re
6
+ from typing import Literal
7
+ from uuid import uuid4
8
+
9
+ from browser_use import Agent, BrowserSession, ChatGoogle
10
+ from browser_use.browser.views import BrowserStateSummary
11
+ from patchright._impl._errors import TimeoutError as PatchrightTimeoutError
12
+ from playwright._impl._errors import TimeoutError as PlaywrightTimeoutError
13
+ from playwright.async_api import Download, Locator, Page, Request, Response
14
+
15
+ from optexity.schema.memory import Memory, NetworkRequest, NetworkResponse
16
+ from optexity.utils.settings import settings
17
+
18
logger = logging.getLogger(__name__)


class Browser:
    """Chromium session driven by Playwright/Patchright with a browser-use agent.

    ``start()`` launches Chromium with a remote-debugging port, opens one
    context and page, and connects a browser-use ``Agent`` to the same
    browser through ``cdp_url``. The instance also tracks open tabs, pending
    downloads and captured network traffic.
    """

    def __init__(
        self,
        memory: Memory,
        user_data_dir: str | None = None,
        headless: bool = False,
        proxy: str | None = None,
        stealth: bool = True,
        backend: Literal["browser-use", "browserbase"] = "browser-use",
        debug_port: int = 9222,
        channel: Literal["chromium", "chrome"] = "chromium",
        use_proxy: bool = False,
        proxy_session_id: str | None = None,
    ):
        """Store configuration; nothing is launched until ``start()``.

        Args:
            memory: Shared run memory; downloads are recorded on it.
            user_data_dir: Stored on the instance (not read by this class).
            headless: Passed to the Chromium launch call.
            proxy: Optional proxy address; normalised to an ``http://`` URL
                and stored as ``self.proxy``.
            stealth: Use ``patchright`` instead of ``playwright`` when true.
            backend: Stored on the instance (not read by this class).
            debug_port: Remote-debugging port Chromium is launched with.
            channel: Browser channel passed to the launch call.
            use_proxy: When true, ``start()`` builds the launch proxy from
                ``settings.PROXY_*``.
            proxy_session_id: Session id embedded in provider-specific proxy
                usernames.
        """
        # Always define the attribute so readers never hit AttributeError
        # when no proxy was supplied.
        self.proxy = None
        if proxy:
            proxy = proxy.removeprefix("http://").removeprefix("https://")
            self.proxy = "http://" + proxy

        self.headless = headless
        self.stealth = stealth
        self.user_data_dir = user_data_dir
        self.backend = backend
        self.debug_port = debug_port
        self.use_proxy = use_proxy
        self.proxy_session_id = proxy_session_id
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.cdp_url = f"http://localhost:{self.debug_port}"
        self.backend_agent = None
        self.channel = channel
        self.memory = memory
        # Target ids of known tabs, in the order they were discovered.
        self.page_to_target_id = []
        self.previous_total_pages = 0

        # The event is set whenever no download handler is in flight.
        self.active_downloads = 0
        self.all_active_downloads_done = asyncio.Event()
        self.all_active_downloads_done.set()

        self.network_calls: list[NetworkResponse | NetworkRequest] = []

    def _build_proxy_settings(self) -> dict | None:
        """Return the Playwright ``proxy`` launch option, or None if disabled.

        Raises:
            ValueError: If a required ``settings.PROXY_*`` value is missing.
        """
        if not self.use_proxy:
            return None
        if settings.PROXY_URL is None:
            raise ValueError("PROXY_URL is not set")

        proxy = {"server": settings.PROXY_URL}
        if settings.PROXY_USERNAME is not None:
            if settings.PROXY_PROVIDER == "oxylabs":
                # Explicit checks instead of ``assert`` so validation still
                # runs under ``python -O``.
                for required in ("PROXY_COUNTRY", "PROXY_USERNAME", "PROXY_PASSWORD"):
                    if not getattr(settings, required):
                        raise ValueError(f"{required} is not set")

                proxy["username"] = (
                    f"customer-{settings.PROXY_USERNAME}-cc-{settings.PROXY_COUNTRY}-sessid-{self.proxy_session_id}-sesstime-20"
                )
            elif settings.PROXY_PROVIDER == "brightdata":
                proxy["username"] = (
                    f"{settings.PROXY_USERNAME}-session-{self.proxy_session_id}"
                )
            else:
                proxy["username"] = settings.PROXY_USERNAME

        if settings.PROXY_PASSWORD is not None:
            proxy["password"] = settings.PROXY_PASSWORD
        return proxy

    async def _sync_target_ids(self):
        """Append target ids of tabs the agent session reports and we lack."""
        tabs = await self.backend_agent.browser_session.get_tabs()
        for tab in reversed(tabs):
            if tab.target_id not in self.page_to_target_id:
                self.page_to_target_id.append(tab.target_id)

    async def start(self):
        """Launch the browser, register listeners and attach the agent.

        Raises:
            ValueError: If proxying is enabled but misconfigured.
            Exception: Any launch/connection error is logged and re-raised.
        """
        logger.debug("Starting browser")
        try:
            if self.playwright is not None:
                await self.playwright.stop()

            if self.stealth:
                from patchright.async_api import async_playwright
            else:
                from playwright.async_api import async_playwright

            proxy = self._build_proxy_settings()

            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                channel=self.channel,
                headless=self.headless,
                proxy=proxy,
                args=[
                    "--start-fullscreen",
                    "--disable-popup-blocking",
                    "--window-size=1920,1080",
                    f"--remote-debugging-port={self.debug_port}",
                    "--disable-gpu",
                    "--disable-extensions",
                    "--disable-background-networking",
                ],
                chromium_sandbox=False,
            )

            self.context = await self.browser.new_context(
                no_viewport=True, ignore_https_errors=True
            )

            async def log_request(req: Request):
                await self.log_request(req)

            async def handle_random_download(download: Download):
                await self.handle_random_download(download)

            async def handle_random_url_downloads(resp: Response):
                await self.handle_random_url_downloads(resp)

            self.context.on("request", log_request)
            self.context.on("response", handle_random_url_downloads)

            # Download events are emitted per page, so hook every new page.
            self.context.on(
                "page", lambda p: (p.on("download", handle_random_download))
            )

            self.page = await self.context.new_page()

            browser_session = BrowserSession(cdp_url=self.cdp_url, keep_alive=True)

            self.backend_agent = Agent(
                task="",
                llm=ChatGoogle(model="gemini-flash-latest"),
                browser_session=browser_session,
                use_vision=False,
            )

            await self.backend_agent.browser_session.start()

            await self._sync_target_ids()
            self.previous_total_pages = len(self.context.pages)

            logger.debug("Browser started successfully")

        except Exception as e:
            logger.error(f"Error starting playwright: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    async def stop(self):
        """Tear down agent, context, browser and Playwright, in that order."""
        logger.debug("Stopping full system")
        if self.backend_agent is not None:
            logger.debug("Stopping backend agent")
            self.backend_agent.stop()
            if self.backend_agent.browser_session:
                logger.debug("Resetting browser session")
                await self.backend_agent.browser_session.stop()
                logger.debug("Browser session reset")
            self.backend_agent = None

        if self.context is not None:
            logger.debug("Stopping context")
            await self.context.close()
            self.context = None

        if self.browser is not None:
            logger.debug("Stopping browser")
            await self.browser.close()
            self.browser = None

        if self.playwright is not None:
            logger.debug("Stopping playwright")
            await self.playwright.stop()
            self.playwright = None
        logger.debug("Full system stopped")

    async def get_current_page(self) -> Page | None:
        """Return the last page of the context, creating one if none exist.

        Returns None when the browser has not been started.
        """
        if self.context is None:
            return None
        pages = self.context.pages
        if not pages:
            self.page = await self.context.new_page()
        else:
            self.page = pages[-1]

        return self.page

    async def handle_new_tabs(self, max_wait_time: float) -> tuple[bool, int]:
        """Wait for a new tab and switch the agent to it.

        Polls once per second until the context has more pages than at the
        last check or ``max_wait_time`` seconds have elapsed.

        Returns:
            ``(switched, waited_seconds)``; ``switched`` is False when the
            page count did not change.
        """
        total_time = 0
        while total_time < max_wait_time:
            if len(self.context.pages) > self.previous_total_pages:
                break
            await asyncio.sleep(1)
            total_time += 1

        pages = self.context.pages
        if len(pages) == self.previous_total_pages:
            return False, total_time

        await self._sync_target_ids()
        self.previous_total_pages = len(pages)

        # The switch action is addressed by the last 4 characters of the
        # target id.
        tab_id = self.page_to_target_id[-1][-4:]
        action_model = self.backend_agent.ActionModel(**{"switch": {"tab_id": tab_id}})
        await self.backend_agent.multi_act([action_model])
        return True, total_time

    async def close_current_tab(self):
        """Close the last tab after switching the agent to the previous one.

        Returns:
            None when not started or after closing; False when the close is
            skipped because at most one tab is open.
        """
        if self.context is None:
            return None

        pages = self.context.pages

        # ``<= 1`` also covers an empty page list, which would otherwise
        # raise IndexError on ``pages[-1]`` below.
        if len(pages) <= 1:
            logger.warning("Atleast one tab should be open, skipping close current tab")
            return False

        if len(self.page_to_target_id) > 1:
            tab_id_after_close = self.page_to_target_id[-2][-4:]
            action_model = self.backend_agent.ActionModel(
                **{"switch": {"tab_id": tab_id_after_close}}
            )
            await self.backend_agent.multi_act([action_model])
            self.page_to_target_id.pop()

        await pages[-1].close()

        # Refresh the baseline; otherwise a tab opened after this close
        # brings the count back to the stale value and handle_new_tabs()
        # never notices it.
        self.previous_total_pages = len(self.context.pages)

    async def switch_tab(self, tab_index: int):
        """Bring the tab at ``tab_index`` to the front and switch the agent.

        Returns:
            None when not started or after switching; False when only one
            tab is open.

        Raises:
            IndexError: If ``tab_index`` is out of range.
        """
        if self.context is None:
            return None

        pages = self.context.pages

        if len(pages) == 1:
            logger.warning("Only one tab is open, skipping switch tab")
            return False

        tab_id = self.page_to_target_id[tab_index][-4:]
        page = pages[tab_index]

        await page.bring_to_front()

        action_model = self.backend_agent.ActionModel(**{"switch": {"tab_id": tab_id}})
        await self.backend_agent.multi_act([action_model])

    async def get_locator_from_command(self, command: str) -> Locator | None:
        """Evaluate ``page.<command>`` and return the resulting locator.

        Returns None when there is no current page.
        """
        page = await self.get_current_page()
        if page is None:
            return None
        # SECURITY NOTE(review): ``eval`` executes arbitrary Python. This is
        # only safe if ``command`` comes from trusted automation definitions;
        # confirm against callers before exposing it to external input.
        locator: Locator = eval(f"page.{command}")
        return locator

    def get_xpath_from_index(self, index: int) -> str:
        """Not implemented."""
        raise NotImplementedError("Not implemented")

    async def go_to_url(self, url: str):
        """Navigate the current page to ``url`` with a 10 s timeout.

        ``about:blank`` is a no-op. Navigation timeouts are tolerated on
        purpose (best-effort), but are now logged rather than silently
        dropped.
        """
        try:
            if url == "about:blank":
                return
            page = await self.get_current_page()
            if page is None:
                return None
            await page.goto(url, timeout=10000)
        except (TimeoutError, PatchrightTimeoutError, PlaywrightTimeoutError) as e:
            logger.warning(f"Timed out navigating to {url}: {e}")

    async def get_browser_state_summary(self) -> BrowserStateSummary:
        """Return a fresh (uncached) state summary from the agent session."""
        browser_state_summary = await self.backend_agent.browser_session.get_browser_state_summary(
            include_screenshot=True,  # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway)
            include_recent_events=self.backend_agent.include_recent_events,
            cached=False,
        )

        return browser_state_summary

    async def get_current_page_url(self) -> str | None:
        """Return the current page URL, or None if unavailable."""
        try:
            page = await self.get_current_page()
            if page is None:
                return None
            return page.url
        except Exception as e:
            logger.error(f"Error getting current page URL: {e}")
            return None

    async def get_current_page_title(self) -> str | None:
        """Return the current page title, or None if unavailable."""
        try:
            page = await self.get_current_page()
            if page is None:
                return None
            return await page.title()
        except Exception as e:
            logger.error(f"Error getting current page title: {e}")
            return None

    async def handle_random_download(self, download: Download):
        """Record a browser download in ``memory.raw_downloads``.

        The in-flight counter is released in ``finally`` so a failed
        ``download.path()`` cannot leave ``all_active_downloads_done`` unset.
        """
        self.active_downloads += 1
        self.all_active_downloads_done.clear()
        try:
            temp_path = await download.path()
            async with self.memory.download_lock:
                if temp_path not in self.memory.raw_downloads:
                    self.memory.raw_downloads[temp_path] = (False, download)
        finally:
            self.active_downloads -= 1
            if self.active_downloads == 0:
                self.all_active_downloads_done.set()

    async def handle_random_url_downloads(self, resp: Response):
        """Queue PDF responses in ``memory.urls_to_downloads``.

        Non-PDF responses are ignored. Errors are logged, never raised.
        """
        try:
            if "application/pdf" in resp.headers.get("content-type", ""):
                self.active_downloads += 1
                self.all_active_downloads_done.clear()
                try:
                    # Default filename fallback
                    filename = f"{uuid4()}.pdf"

                    # Prefer the server-suggested filename when present.
                    # NOTE(review): this value is server-controlled; sanitise
                    # it wherever it is turned into a filesystem path.
                    content_disposition = resp.headers.get("content-disposition")
                    if content_disposition:
                        match = re.search(
                            r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?',
                            content_disposition,
                        )
                        if match:
                            filename = match.group(1)

                    self.memory.urls_to_downloads.append((resp.url, filename))
                    logger.info(f"Added URL to downloads: {resp.url}, {filename}")
                finally:
                    # Release the counter even if recording failed.
                    self.active_downloads -= 1
        except Exception as e:
            logger.error(f"Error handling random responses: {e}")

        if self.active_downloads == 0:
            self.all_active_downloads_done.set()

    async def log_request(self, req: Request):
        """Append the outgoing request, with a rebuilt cookie header, to ``network_calls``.

        Best-effort: failures are logged at debug level and otherwise ignored.
        """
        try:
            # Rebuild cookies exactly like curl -b
            cookies = await req.frame.page.context.cookies()
            cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookies)

            # Rebuild headers
            headers = dict(req.headers)
            headers["cookie"] = cookie_header

            # post_data is None for GET/HEAD
            body = req.post_data

            self.network_calls.append(
                NetworkRequest(
                    url=req.url, method=req.method, headers=headers, body=body
                )
            )

        except Exception as e:
            logger.debug(f"Could not log request: {e}")

    async def attach_network_listeners(self):
        """Register ``_on_response`` on the current page exactly once."""
        page = await self.get_current_page()
        if page is None:
            logger.warning("No page available, skipping network listener attach")
            return

        # remove old listeners first
        try:
            page.remove_listener("response", self._on_response)
        except Exception:
            # Nothing was attached yet; that is fine.
            pass

        page.on("response", self._on_response)

    async def detach_network_listeners(self):
        """Remove ``_on_response`` from the current page, if attached."""
        page = await self.get_current_page()
        if page is None:
            return
        try:
            page.remove_listener("response", self._on_response)
        except Exception:
            # Listener was not attached; nothing to undo.
            pass

    async def _on_response(self, response: Response):
        """Append a ``NetworkResponse`` for ``response`` to ``network_calls``.

        The body is decoded as JSON, falling back to text, then to None.
        """
        try:
            body = await response.json()
        except Exception:
            try:
                body = await response.text()
            except Exception:
                body = None

        # Try to enrich response with request method and content length
        method = None
        try:
            # Playwright provides request object for a response
            method = response.request.method
        except Exception:
            pass

        content_length = 0
        try:
            if isinstance(body, (str, bytes)):
                content_length = len(body)
            elif body is not None:
                # Any decoded JSON value (dict, list, number, ...), not only
                # dicts, is measured by its serialised length.
                content_length = len(json.dumps(body))
        except Exception:
            pass

        self.network_calls.append(
            NetworkResponse(
                url=response.url,
                method=method,
                status=response.status,
                headers=response.headers,
                body=body,
                content_length=content_length,
            )
        )

    async def clear_network_calls(self):
        """Drop all captured requests and responses."""
        self.network_calls.clear()

    async def get_screenshot(self, full_page: bool = False) -> str | None:
        """Return a base64-encoded screenshot of the current page, or None."""
        page = await self.get_current_page()
        if page is None:
            return None
        screenshot_bytes = await page.screenshot(full_page=full_page)
        screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")

        return screenshot_base64
@@ -0,0 +1,20 @@
1
+ from browser_use.browser.profile import BrowserProfile
2
+
3
+
4
class BrowserExtension:
    """Thin wrapper exposing the extension arguments of a ``BrowserProfile``."""

    def __init__(self, browser_profile: BrowserProfile = None):
        """Use ``browser_profile``, or a default ``BrowserProfile()`` if None."""
        if browser_profile is None:
            browser_profile = BrowserProfile()
        self.browser_profile = browser_profile

    def get_extension_paths(self):
        """Return whatever ``BrowserProfile._get_extension_args()`` yields."""
        profile = self.browser_profile
        return profile._get_extension_args()
12
+
13
+
14
if __name__ == "__main__":
    # Manual check: print the extension args of a headless default profile.
    default_profile = BrowserProfile(
        headless=True,
        user_data_dir="~/.config/browseruse/profiles/default",
    )
    print(default_profile._get_extension_args())
@@ -0,0 +1,22 @@
1
+ from .llm_model import GeminiModels, HumanModels, OpenAIModels
2
+
3
+
4
def get_llm_model(
    model_name: GeminiModels | HumanModels | OpenAIModels, use_structured_output: bool
):
    """Return the model wrapper for ``model_name``.

    Only ``GeminiModels`` members are dispatched; the OpenAI and Human
    branches are not wired in, so those values raise too.

    Raises:
        ValueError: If ``model_name`` is not a ``GeminiModels`` member.
    """
    if not isinstance(model_name, GeminiModels):
        raise ValueError(f"Invalid model type: {model_name}")

    # Imported lazily, only when a Gemini model is actually requested.
    from .gemini import Gemini

    return Gemini(model_name, use_structured_output)
@@ -0,0 +1,113 @@
1
+ import base64
2
+ import logging
3
+ import os
4
+ from typing import Optional
5
+
6
+ import httpx
7
+ from google import genai
8
+ from google.genai import types
9
+ from pydantic import BaseModel, ValidationError
10
+
11
+ from .llm_model import GeminiModels, LLMModel, TokenUsage
12
+
13
logger = logging.getLogger(__name__)


class Gemini(LLMModel):
    """``LLMModel`` implementation backed by the ``google.genai`` client."""

    def __init__(self, model_name: GeminiModels, use_structured_output: bool):
        """Create the client and verify the API key with a ``models.list()`` call.

        Raises:
            KeyError: If ``GOOGLE_API_KEY`` is not in the environment.
            ValueError: If the client cannot be created or the check fails.
        """
        super().__init__(model_name, use_structured_output)

        self.api_key = os.environ["GOOGLE_API_KEY"]
        try:
            self.client = genai.Client(api_key=self.api_key)
            self.client.models.list()
        except Exception as e:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError("Invalid GOOGLE_API_KEY") from e

    def _get_model_response_with_structured_output(
        self,
        prompt: str,
        response_schema: type[BaseModel],
        screenshot: Optional[str] = None,
        pdf_url: Optional[str] = None,
        system_instruction: Optional[str] = None,
    ) -> tuple[Optional[BaseModel], TokenUsage]:
        """Query the model and parse the reply into ``response_schema``.

        Args:
            prompt: Text prompt.
            response_schema: Pydantic model class the reply must validate as.
            screenshot: Optional base64-encoded PNG sent before the prompt.
            pdf_url: Optional URL of a PDF that is fetched and sent before
                the prompt.
            system_instruction: Optional system instruction.

        Returns:
            ``(parsed, token_usage)``. ``parsed`` is None when the reply
            fails validation.

        Raises:
            ValueError: If both ``screenshot`` and ``pdf_url`` are given.
        """
        if pdf_url is not None and screenshot is not None:
            raise ValueError("Cannot use both screenshot and pdf_url")

        # Build the request contents without rebinding the str parameter.
        contents = prompt
        if screenshot is not None:
            contents = [
                types.Part.from_bytes(
                    data=base64.b64decode(screenshot),
                    mime_type="image/png",
                ),
                prompt,
            ]
        if pdf_url is not None:
            doc_data = httpx.get(pdf_url).content
            contents = [
                types.Part.from_bytes(
                    data=doc_data,
                    mime_type="application/pdf",
                ),
                prompt,
            ]

        # Pre-bind so the names exist even if the request itself raises
        # ValidationError before ``response`` is assigned.
        response = None
        parsed_response = None
        try:
            if self.use_structured_output:
                response = self.client.models.generate_content(
                    model=self.model_name.value,
                    contents=contents,
                    config={
                        "response_mime_type": "application/json",
                        "system_instruction": system_instruction,
                        "response_json_schema": response_schema.model_json_schema(),
                    },
                )

                if isinstance(response.parsed, BaseModel):
                    parsed_response = response.parsed
                else:
                    parsed_response = response_schema.model_validate(response.parsed)
            else:
                response = self.client.models.generate_content(
                    model=self.model_name.value,
                    contents=contents,
                    config={"system_instruction": system_instruction},
                )

                parsed_response = self.parse_from_completion(
                    response.candidates[0].content.parts[0].text, response_schema
                )
        except ValidationError as e:
            logger.warning(f"Model response failed schema validation: {e}")
            parsed_response = None

        # Token usage is reported whenever a response was received, including
        # when its content failed validation: the request was still made.
        if response is not None:
            token_usage = self.get_token_usage(
                input_tokens=response.usage_metadata.prompt_token_count,
                output_tokens=response.usage_metadata.candidates_token_count,
                tool_use_tokens=response.usage_metadata.tool_use_prompt_token_count,
                thoughts_tokens=response.usage_metadata.thoughts_token_count,
                total_tokens=response.usage_metadata.total_token_count,
            )
        else:
            token_usage = TokenUsage()
        return parsed_response, token_usage

    def _get_model_response(
        self, prompt: str, system_instruction: Optional[str] = None
    ) -> tuple[str, TokenUsage]:
        """Return the text of the first candidate's first part, plus token usage."""
        response = self.client.models.generate_content(
            model=self.model_name.value,
            contents=prompt,
            config={"system_instruction": system_instruction},
        )
        token_usage = self.get_token_usage(
            input_tokens=response.usage_metadata.prompt_token_count,
            output_tokens=response.usage_metadata.candidates_token_count,
        )
        return response.candidates[0].content.parts[0].text, token_usage
@@ -0,0 +1,20 @@
1
+ import asyncio
2
+
3
+ import aiofiles
4
+
5
+
6
class Human:
    """Interactive stand-in where a person chooses the element index."""

    async def get_next_action(self, axtree: str):
        """Write ``axtree`` to ``/tmp/axtree.txt`` and prompt for an index.

        The blocking ``input`` call runs in a worker thread. Returns the
        typed value converted with ``int``.
        """
        async with aiofiles.open("/tmp/axtree.txt", "w", encoding="utf-8") as dump:
            await dump.write(axtree)

        answer = await asyncio.to_thread(
            input, "Input the index of the element to click: "
        )
        return int(answer)