versionhq 1.2.4.13__py3-none-any.whl → 1.2.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
versionhq/__init__.py CHANGED
@@ -35,7 +35,7 @@ from versionhq.agent_network.formation import form_agent_network
35
35
  from versionhq.task_graph.draft import workflow
36
36
 
37
37
 
38
- __version__ = "1.2.4.13"
38
+ __version__ = "1.2.4.14"
39
39
  __all__ = [
40
40
  "Agent",
41
41
 
@@ -5,3 +5,4 @@ from versionhq._utils.is_valid_url import is_valid_url
5
5
  from versionhq._utils.usage_metrics import UsageMetrics, ErrorType
6
6
  from versionhq._utils.convert_img_url import convert_img_url
7
7
  from versionhq._utils.is_valid_enum import is_valid_enum
8
+ from versionhq._utils.handle_directory import handle_directory
@@ -0,0 +1,15 @@
1
+ import os
2
+ import datetime
3
+ from pathlib import Path
4
+
5
+
6
def handle_directory(directory_name: str = None, filename: str = None, ext: str = 'png') -> Path:
    """Create the target directory if needed and return an absolute file path inside it.

    The file itself is NOT created; only the directory is. The file name is
    suffixed with the current day of the year (``%j``, e.g. ``cua_087.png``),
    so repeated calls on the same day return the same path.

    Args:
        directory_name: Directory to create. A relative name is resolved against the
            current working directory. When omitted or empty, the current working
            directory itself is used and nothing is created.
        filename: Base name of the file. When omitted, the day stamp alone is used.
        ext: File extension, with or without a leading dot.

    Returns:
        Absolute path ``<cwd>/<directory_name>/<filename>_<day>.<ext>``.
    """

    # Path.cwd() / "<absolute path>" evaluates to the absolute path, so absolute
    # directory names keep working exactly as with the original f-string join.
    base = Path.cwd() / directory_name if directory_name else Path.cwd()
    base.mkdir(parents=True, exist_ok=True)

    date = datetime.datetime.now().strftime('%j')
    stem = f'{filename}_{date}' if filename else date
    suffix = str(ext).lstrip('.') if ext else ''  # tolerate ".png" as well as "png"
    name = f'{stem}.{suffix}' if suffix else stem

    return base / name
versionhq/agent/model.py CHANGED
@@ -454,14 +454,14 @@ class Agent(BaseModel):
454
454
  return rag_tools, gpt_tools, tools
455
455
 
456
456
 
457
- def _handle_gpt_tools(self, gpt_tools: list[Any] = None) -> Any: # TaskOutput
457
+ def _handle_gpt_tools(self, gpt_tools: list[Any] = None) -> Any: # TaskOutput or None
458
458
  """Generates k, v pairs from multiple GPT tool results and stores them in TaskOutput class."""
459
459
 
460
460
  from versionhq.task.model import TaskOutput
461
461
  from versionhq._utils import UsageMetrics
462
462
 
463
463
  if not gpt_tools:
464
- return
464
+ return None
465
465
 
466
466
  tool_res = dict()
467
467
  annotation_set = dict()
@@ -470,7 +470,9 @@ class Agent(BaseModel):
470
470
  for i, item in enumerate(gpt_tools):
471
471
  raw, annotations, usage = item.run()
472
472
  tool_res.update({ str(i): raw })
473
- annotation_set.update({ str(i): annotations })
473
+
474
+ if annotations:
475
+ annotation_set.update({ str(i): annotations })
474
476
  total_usage.aggregate(metrics=usage)
475
477
 
476
478
  res = TaskOutput(raw=str(tool_res), tool_output=tool_res, usage=total_usage, annotations=annotation_set)
@@ -7,7 +7,7 @@ class GPTSizeEnum(str, Enum):
7
7
  HIGH = "high"
8
8
 
9
9
 
10
- class GPTCUAEnvironmentEnum(str, Enum):
10
+ class GPTCUABrowserEnum(str, Enum):
11
11
  BROWSER = "browser"
12
12
  MAC = "mac"
13
13
  WINDOWS = "windows"
versionhq/tool/gpt/cua.py CHANGED
@@ -1,28 +1,29 @@
1
+ import base64
1
2
  import datetime
2
3
  import time
4
+ import platform
3
5
  from typing import List, Dict, Any, Tuple
4
6
 
5
7
  from versionhq._utils import convert_img_url
6
8
  from versionhq.tool.gpt import openai_client
7
- from versionhq.tool.gpt._enum import GPTCUAEnvironmentEnum, GPTCUATypeEnum, GPTSizeEnum
8
- from versionhq._utils import is_valid_enum, UsageMetrics, ErrorType, Logger, is_valid_url
9
+ from versionhq.tool.gpt._enum import GPTCUABrowserEnum, GPTCUATypeEnum, GPTSizeEnum
10
+ from versionhq._utils import is_valid_enum, UsageMetrics, ErrorType, Logger, is_valid_url, handle_directory
9
11
 
10
-
11
- allowed_browsers = ['webkit', 'chromium', 'firefox']
12
+ allowed_browsers = ['chromium', 'firefox']
12
13
 
13
14
 
14
15
  class CUAToolSchema:
15
16
  type: str = GPTCUATypeEnum.COMPUTER_USE_PREVIEW.value
16
17
  display_width: int = 1024
17
18
  display_height: int = 768
18
- environment: str = GPTCUAEnvironmentEnum.BROWSER.value
19
+ environment: str = GPTCUABrowserEnum.BROWSER.value
19
20
 
20
21
  def __init__(
21
22
  self,
22
23
  type: str | GPTCUATypeEnum = None,
23
24
  display_width: int = None,
24
25
  display_height: int = None,
25
- environment: str | GPTCUAEnvironmentEnum = None
26
+ environment: str | GPTCUABrowserEnum = None
26
27
  ):
27
28
  self.display_height = display_height if display_height else self.display_height
28
29
  self.display_width = display_width if display_width else self.display_width
@@ -30,11 +31,8 @@ class CUAToolSchema:
30
31
  if type and is_valid_enum(enum=GPTCUATypeEnum, val=type):
31
32
  self.type = type.value if isinstance(type, GPTCUATypeEnum) else type
32
33
 
33
- if environment and is_valid_enum(enum=GPTCUAEnvironmentEnum, val=environment):
34
- self.environment = environment.value if isinstance(environment, GPTCUAEnvironmentEnum) else environment
35
-
36
- self.environment = environment if environment else self.environment
37
-
34
+ if environment and is_valid_enum(enum=GPTCUABrowserEnum, val=environment):
35
+ self.environment = environment.value if isinstance(environment, GPTCUABrowserEnum) else environment
38
36
 
39
37
  @property
40
38
  def schema(self) -> Dict[str, Any]:
@@ -56,8 +54,10 @@ class GPTToolCUA:
56
54
  reasoning_effort: str = GPTSizeEnum.MEDIUM.value
57
55
  truncation: str = "auto"
58
56
 
57
+ _schema: Dict[str, Any] = dict()
59
58
  _response_ids: List[str] = list()
60
59
  _call_ids: List[str] = list()
60
+ _calls: Dict[str, Dict[str, Any]] = dict() # stores response_id and raw output object.
61
61
  _usage: UsageMetrics = UsageMetrics()
62
62
  _logger: Logger = Logger(info_file_save=True, filename="cua-task-{}".format(str(datetime.datetime.now().timestamp())) + ".png")
63
63
 
@@ -74,7 +74,7 @@ class GPTToolCUA:
74
74
  _usage: UsageMetrics = UsageMetrics()
75
75
  ):
76
76
  self.user_prompt = user_prompt
77
- self.web_url = web_url if is_valid_url(web_url) else "https://www.google.com"
77
+ self.web_url = web_url if is_valid_url(web_url) else None
78
78
  self.browser = browser if browser in allowed_browsers else 'chromium'
79
79
  self.truncation = truncation if truncation else self.truncation
80
80
  self._usage = _usage
@@ -104,104 +104,93 @@ class GPTToolCUA:
104
104
  pass
105
105
 
106
106
 
107
- def _take_screenshot(self, page: Any = None, path: str = None) -> Tuple[str | None, str | None]:
108
- import base64
109
- if not page:
110
- return None, None
111
-
112
- path = path if path else "screenshot.png"
113
- screenshot_bytes = page.screenshot()
114
- screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
115
- self._logger.log(message=f"Action: screenshot", level="info", color="blue")
116
- return screenshot_bytes, screenshot_base64
117
-
118
-
119
- def _handle_model_action(self, page: Any, action: Any, action_type: str = None) -> bool:
120
- """Creates a page object and performs actions."""
107
+ def _structure_schema(self, screenshot: str = None) -> None:
108
+ """Formats args schema for CUA calling."""
121
109
 
122
- action_type = action_type if action_type else action.type
123
- start_dt = datetime.datetime.now()
110
+ tool_schema = [item.schema for item in self.tools]
111
+ schema = dict()
112
+ inputs = list()
113
+ previous_response_id = self._response_ids[-1] if self._response_ids else None
114
+ # (self._response_ids[-1].startswith("rs") or self._response_ids[-1].startswith("resp")) else None
124
115
 
125
- try:
126
- match action_type:
127
- case "click":
128
- x, y = action.x, action.y
129
- button = action.button
130
- self._logger.log(message=f"Action: click at ({x}, {y}) with button '{button}'", level="info", color="blue")
131
- if button != "left" and button != "right":
132
- button = "left"
133
- page.mouse.click(x, y, button=button)
134
-
135
- case "scroll":
136
- x, y = action.x, action.y
137
- scroll_x, scroll_y = action.scroll_x, action.scroll_y
138
- self._logger.log(message=f"Action: scroll at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})", level="info", color="blue")
139
- page.mouse.move(x, y)
140
- page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
141
-
142
- case "keypress":
143
- keys = action.keys
144
- for k in keys:
145
- self._logger.log(message=f"Action: keypress '{k}'", level="info", color="blue")
146
- if k.lower() == "enter":
147
- page.keyboard.press("Enter")
148
- elif k.lower() == "space":
149
- page.keyboard.press(" ")
150
- else:
151
- page.keyboard.press(k)
152
-
153
- case "type":
154
- text = action.text
155
- self._logger.log(message=f"Action: type text: {text}", level="info", color="blue")
156
- page.keyboard.type(text)
157
-
158
- case "wait":
159
- self._logger.log(message=f"Action: wait", level="info", color="blue")
160
- time.sleep(2)
161
-
162
- case "screenshot":
163
- pass
116
+ if self._call_ids:
117
+ inputs = [
118
+ {
119
+ "call_id": self._call_ids[-1],
120
+ "type": "computer_call_output",
121
+ }
122
+ ]
123
+ if screenshot:
124
+ inputs[0].update({ "output": { "type": "computer_screenshot", "image_url": f"data:image/png;base64,{str(screenshot)}"}})
125
+
126
+ # if self._calls:
127
+ # call = self._calls[self._call_ids[-1]]
128
+ # if call and call.call_id not in inputs[0]:
129
+ # inputs.append(call)
130
+
131
+ if previous_response_id:
132
+ schema = dict(
133
+ model=self.model,
134
+ previous_response_id=previous_response_id,
135
+ tools=tool_schema,
136
+ input=inputs,
137
+ truncation=self.truncation
138
+ )
139
+ else:
140
+ schema = dict(
141
+ model=self.model,
142
+ tools=tool_schema,
143
+ input=inputs,
144
+ truncation=self.truncation
145
+ )
164
146
 
165
- case _:
166
- self._logger.log(message=f"Unrecognized action: {action}", level="warning", color="yellow")
147
+ else:
148
+ input = [{ "role": "user", "content": self.user_prompt } ]
149
+ img_url = convert_img_url(self.img_url) if self.img_url else None
150
+ if img_url:
151
+ input.append({"type": "input_image", "image_url": f"data:image/png;base64,{img_url}"})
167
152
 
168
- except Exception as e:
169
- self._usage.record_errors(type=ErrorType.API)
170
- self._logger.log(message=f"Error handling action {action}: {e}", level="error", color="red")
153
+ schema = dict(
154
+ model=self.model,
155
+ tools=tool_schema,
156
+ input=input,
157
+ reasoning={ "effort": self.reasoning_effort},
158
+ truncation=self.truncation
159
+ )
171
160
 
172
- end_dt = datetime.datetime.now()
173
- self._usage.record_latency(start_dt=start_dt, end_dt=end_dt)
174
- return bool(self._usage.total_errors)
161
+ self._schema = schema
162
+ # return self._schema
175
163
 
176
164
 
177
- def run(self, screenshot: str = None) -> Tuple[Dict[str, Any], None, UsageMetrics]:
165
+ def _run(self, screenshot: str = None) -> Tuple[Dict[str, Any], None, UsageMetrics]:
178
166
  raw_res = dict()
179
167
  usage = self._usage if self._usage else UsageMetrics()
180
168
  start_dt = datetime.datetime.now()
181
169
 
182
170
  try:
183
- schema = self.schema
184
- if screenshot and "output" in schema["input"][0]:
185
- output_image_url = schema["input"][0]["output"]["image_url"].replace("SCREENSHOT", str(screenshot))
186
- schema["input"][0]["output"]["image_url"] = output_image_url
187
-
188
- res = openai_client.responses.create(**schema)
171
+ self._structure_schema(screenshot=screenshot)
172
+ res = openai_client.responses.create(**self._schema)
189
173
  if not res:
190
174
  usage.record_errors(ErrorType.TOOL)
191
175
  else:
176
+ self._response_ids.append(res.id)
192
177
  for item in res.output:
178
+
193
179
  match item.type:
194
180
  case "reasoning":
195
- raw_res.update(dict(reasoning=item.summary[0].text))
196
- if item.id and item.id.startwith('rs'):
197
- self._response_ids.append(item.id)
181
+ reasoning = item.summary[0].text if item.summary and isinstance(item.summary, list) else str(item.summary) if item.summary else ""
182
+ raw_res.update(dict(reasoning=reasoning))
183
+ # self._response_ids.append(item.id)
184
+
198
185
  case "computer_call":
199
186
  raw_res.update(dict(action=item.action))
200
187
  # self._response_ids.append(item.id)
201
- self._call_ids.append(item.call_id)
188
+ call_id = item.call_id
189
+ self._call_ids.append(call_id)
190
+ self._calls.update({ call_id: item })
202
191
  case _:
203
192
  pass
204
- usage.record_token_usage(**res.usage.__dict__)
193
+ usage.record_token_usage(**res.usage.__dict__)
205
194
 
206
195
  except Exception as e:
207
196
  self._logger.log(message=f"Failed to run: {str(e)}", color="red", level="error")
@@ -212,84 +201,256 @@ class GPTToolCUA:
212
201
  return raw_res, None, usage
213
202
 
214
203
 
215
- def invoke_playwright(self) -> Tuple[Dict[str, Any], None, UsageMetrics]:
204
+ def invoke_playwright(self) -> Dict[str, Any]:
216
205
  """Handles computer use loop. Ref. OpenAI official website."""
206
+ try:
207
+ from playwright.sync_api import sync_playwright
208
+ except Exception as e:
209
+ self._logger.log(level="error", message=f"Install Playwright by adding `versionhq[tools]` to requirements.txt or run `uv add playwright`. {str(e)}", color="red")
210
+ raise e
217
211
 
218
- from playwright.sync_api import sync_playwright
219
-
220
- self._logger.log(message="Start the operation.", level="info", color="blue")
212
+ import os
213
+ os.environ["DEBUG"] = "pw:browser"
214
+ self._logger.log(message="Start computer use.", level="info", color="blue")
215
+ start_dt = datetime.datetime.now()
216
+ res = None
217
+
218
+ # try:
219
+ p = sync_playwright().start()
220
+ b = p.firefox if self.browser == "firefox" else p.chromium
221
+ browser = b.launch(headless=True)
222
+ page = browser.new_page()
223
+ if not browser or not page:
224
+ return None, None, None
225
+
226
+ if self.web_url:
227
+ page.goto(self.web_url, timeout=3000000, wait_until="load", referer=None)
228
+ time.sleep(3)
229
+
230
+ res, _, usage = self._run()
231
+ self._usage.aggregate(metrics=usage)
232
+ actions = [v for k, v in res.items() if k =="action"] if res else []
233
+ action = actions[0] if actions else None
234
+
235
+ if action:
236
+ while True:
237
+ x = action.x if hasattr(action, 'x') else 0
238
+ y = action.y if hasattr(action, 'y') else 0
239
+ scroll_x = action.scroll_x if hasattr(action, 'scroll_x') else 0
240
+ scroll_y = action.scroll_y if hasattr(action, 'scroll_y') else 0
241
+ text = action.text if hasattr(action, 'text') else ''
242
+ screenshot_base64 = None
243
+ path = handle_directory(directory_name='_screenshots', filename=f'cua_playwright', ext='png')
244
+
245
+ match action.type:
246
+ case "click":
247
+ self._logger.log(message="Action: click", color="blue", level="info")
248
+ button = action.button if hasattr(action, 'button') and (action.button == 'left' or action.button == 'right') else 'left'
249
+ page.mouse.move(x, y)
250
+ page.mouse.click(x, y, button=button)
251
+ time.sleep(1)
252
+
253
+ case "scroll":
254
+ self._logger.log(message="Action: scroll", color="blue", level="info")
255
+ page.mouse.move(x, y)
256
+ page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
257
+ time.sleep(1)
258
+
259
+ case "move":
260
+ self._logger.log(message="Action: move", color="blue", level="info")
261
+ page.mouse.move(x, y)
262
+ page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
263
+ time.sleep(1)
264
+
265
+ case "keypress":
266
+ self._logger.log(message="Action: keypress", color="blue", level="info")
267
+ keys = action.keys
268
+ for k in keys:
269
+ match k.lower():
270
+ case "enter": page.keyboard.press("Enter")
271
+ case "space": page.keyboard.press(" ")
272
+ case _: page.keyboard.press(k)
273
+ time.sleep(1)
274
+
275
+ case "type":
276
+ self._logger.log(message="Action: type", color="blue", level="info")
277
+ page.keyboard.type(text)
278
+ time.sleep(1)
279
+
280
+ case "wait":
281
+ self._logger.log(message="Action: wait", color="blue", level="info")
282
+ time.sleep(3)
283
+
284
+ case "screenshot":
285
+ self._logger.log(message="Action: screenshot", color="blue", level="info")
286
+ screenshot_bytes = page.screenshot(path=path)
287
+ screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
288
+ time.sleep(1)
289
+
290
+ case _:
291
+ self._logger.log(message=f"Unrecognized action: {action}", level="warning", color="yellow")
292
+ return False
293
+
294
+ if not screenshot_base64:
295
+ screenshot_bytes = page.screenshot(path=path)
296
+ screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
297
+ time.sleep(1)
298
+
299
+ res, _, usage = self._run(screenshot=screenshot_base64)
300
+ self._usage.aggregate(metrics=usage)
301
+ if not res:
302
+ usage.record_errors(type=ErrorType.API)
303
+ break
221
304
 
222
- try:
223
- with sync_playwright() as p:
224
- b = p.firefox if self.browser == "firefox" else p.webkit if self.browser == "webkit" else p.chromium
225
- browser = b.launch(headless=True)
226
- page = browser.new_page()
227
- if not browser or not page:
228
- return None, None, None
229
-
230
- page.goto(self.web_url)
231
- res, _, usage = self.run()
232
- self._usage = usage
233
305
  actions = [v for k, v in res.items() if k =="action"] if res else []
234
306
  action = actions[0] if actions else None
235
- start_dt = datetime.datetime.now()
236
-
237
- if action:
238
- while True:
239
- self._handle_model_action(page=page, action=action)
240
- _, screenshot_base64 = self._take_screenshot(page=page)
241
- res, _, usage = self.run(screenshot=screenshot_base64)
242
- self._usage.agggregate(metrics=usage)
243
- if not res:
244
- usage.record_errors(type=ErrorType.API)
245
- break
246
-
247
- actions = [v for k, v in res.items() if k =="action"] if res else []
248
- action = actions[0] if actions else None
249
- if not action:
250
- break
251
- else:
252
- self._usage.record_errors(type=ErrorType.TOOL)
307
+ if not action:
308
+ break
309
+ else:
310
+ self._usage.record_errors(type=ErrorType.TOOL)
253
311
 
254
- except Exception as e:
255
- self._logger.log(message=f"Failed to execute. {str(e)}", color="red", level="error")
312
+ # except Exception as e:
313
+ # self._logger.log(message=f"Failed to execute. {str(e)}", color="red", level="error")
314
+ # browser.close()
256
315
 
257
316
  end_dt = datetime.datetime.now()
258
317
  self._usage.record_latency(start_dt=start_dt, end_dt=end_dt)
259
- # browser.close()
260
- return res, _, self._usage
318
+ return res
261
319
 
262
320
 
263
- @property
264
- def schema(self) -> Dict[str, Any]:
265
- """Formats args schema for CUA calling."""
321
+ def invoke_selenium(self, **kwargs) -> Dict[str, Any]:
322
+ try:
323
+ from selenium import webdriver
324
+ from selenium.webdriver.common.keys import Keys
325
+ from selenium.webdriver.common.action_chains import ActionChains
326
+ from selenium.webdriver.common.actions.action_builder import ActionBuilder
327
+ except Exception as e:
328
+ self._logger.log(level="error", message=f"Install Selenium by `uv pip install versionhq[tools]` or `uv add selenium`. {str(e)}", color="red")
329
+ raise e
266
330
 
267
- tool_schema = [item.schema for item in self.tools]
268
- schema = dict()
269
- inputs = list()
270
- previous_response_id = self._response_ids[-1] if self._response_ids and self._response_ids[-1].startswith("rs") else None
331
+ self._logger.log(message="Start computer use", level="info", color="blue")
271
332
 
272
- if self._call_ids:
273
- inputs = [
274
- {
275
- "call_id": self._call_ids[-1],
276
- "type": "computer_call_output",
277
- "output": { "type": "input_image", "image_url": f"data:image/png;base64,SCREENSHOT"}
278
- }
279
- ]
280
- schema = dict(
281
- model=self.model,
282
- previous_response_id=previous_response_id,
283
- tools=tool_schema,
284
- input=inputs,
285
- truncation=self.truncation
286
- )
333
+ start_dt = datetime.datetime.now()
334
+
335
+ driver = webdriver.Chrome(options=kwargs) if kwargs else webdriver.Chrome()
336
+ if self.tools:
337
+ driver.set_window_size(height=self.tools[0].display_height, width=self.tools[0].display_width)
338
+
339
+ if self.web_url:
340
+ driver.get(self.web_url)
341
+ time.sleep(3)
342
+
343
+ res, _, usage = self._run()
344
+ self._logger.log(message=f"Initial response: {res}", color="blue", level="info")
345
+ self._usage.aggregate(metrics=usage)
346
+ actions = [v for k, v in res.items() if k =="action"] if res else []
347
+ action = actions[0] if actions else None
348
+ action_chains = ActionChains(driver=driver)
349
+ action_builder = ActionBuilder(driver=driver)
350
+
351
+ if action:
352
+ while True:
353
+ x = action.x if hasattr(action, 'x') else 0
354
+ y = action.y if hasattr(action, 'y') else 0
355
+ scroll_x = action.scroll_x if hasattr(action, 'scroll_x') else 0
356
+ scroll_y = action.scroll_y if hasattr(action, 'scroll_y') else 0
357
+ text = action.text if hasattr(action, 'text') else ''
358
+ path = handle_directory(directory_name='_screenshots', filename=f'cua_selenium', ext='png')
359
+
360
+ match action.type:
361
+ case 'click':
362
+ self._logger.log(message="Action: click", color="blue", level="info")
363
+ driver.execute_script(f'window.scrollBy({x}, {y})')
364
+ action_chains.move_by_offset(xoffset=x, yoffset=y)
365
+ action_chains.perform()
366
+
367
+ if hasattr(action, 'button'):
368
+ match action.button:
369
+ case 'left':
370
+ action_chains.click()
371
+ case 'right':
372
+ action_chains.context_click()
373
+ action_chains.perform()
374
+ time.sleep(1)
375
+
376
+ case "scroll" | "move":
377
+ self._logger.log(message="Action: scroll", color="blue", level="info")
378
+ driver.execute_script(f'window.scrollBy({scroll_x}, {scroll_y})')
379
+ time.sleep(1)
380
+
381
+ case "keypress":
382
+ self._logger.log(message="Action: keypress", color="blue", level="info")
383
+ keys = action.keys
384
+ if keys:
385
+ for k in keys:
386
+ match k.lower():
387
+ case "enter": action_chains.key_down(Keys.ENTER).perform()
388
+ case "space": action_chains.key_down(Keys.SPACE).perform()
389
+ case "select_all":
390
+ if platform.system() == 'Darwin':
391
+ action_chains.send_keys(Keys.COMMAND + "a").perform()
392
+ else:
393
+ action_chains.send_keys(Keys.CONTROL + "a").perform()
394
+ case _:
395
+ action_chains.key_down(Keys.SHIFT).send_keys(k).key_up(Keys.SHIFT).perform()
396
+ time.sleep(1)
397
+
398
+ case "type":
399
+ self._logger.log(message="Action: type", color="blue", level="info")
400
+ action_chains.send_keys(text).perform()
401
+ time.sleep(1)
402
+
403
+ case "wait":
404
+ self._logger.log(message="Action: wait", color="blue", level="info")
405
+ action_chains.pause(3)
406
+
407
+ case "screenshot":
408
+ self._logger.log(message="Action: screenshot", color="blue", level="info")
409
+ driver.save_screenshot(path)
410
+ time.sleep(1)
411
+
412
+ case _:
413
+ self._logger.log(message=f"Unrecognized action: {action}", level="warning", color="yellow")
414
+ return False
415
+
416
+ with open(path, "rb") as image_file:
417
+ res, usage = None, None
418
+ if image_file:
419
+ screenshot_base64 = base64.b64encode(image_file.read()).decode("utf-8")
420
+ res, _, usage = self._run(screenshot=screenshot_base64)
421
+ else:
422
+ res, _, usage = self._run()
423
+
424
+ print("res", res)
425
+
426
+ self._usage.aggregate(metrics=usage)
427
+ if not res:
428
+ usage.record_errors(type=ErrorType.API)
429
+ break
287
430
 
431
+ actions = [v for k, v in res.items() if k =="action"] if res else []
432
+ action = actions[0] if actions else None
433
+ if not action:
434
+ self._logger.log(message="No action found.", color="yellow", level="warning")
435
+ break
288
436
  else:
289
- img_url = convert_img_url(self.img_url) if self.img_url else None
290
- input = [{ "role": "user", "content": self.user_prompt } ]
291
- if img_url:
292
- input.append({"type": "input_image", "image_url": f"data:image/png;base64,{img_url}"})
293
- schema = dict(model=self.model, tools=tool_schema, input=input, reasoning={ "effort": self.reasoning_effort}, truncation=self.truncation)
437
+ self._usage.record_errors(type=ErrorType.TOOL)
438
+
439
+ end_dt = datetime.datetime.now()
440
+ self._usage.record_latency(start_dt=start_dt, end_dt=end_dt)
441
+ return res
442
+
294
443
 
295
- return schema
444
def run(self) -> Tuple[Dict[str, Any], None, UsageMetrics]:
    """Core function to execute the tool.

    Runs the computer-use loop with Playwright first. If that raises, the call and
    response bookkeeping collected so far is cleared and the loop is retried once
    with Selenium.

    Returns:
        A ``(result, None, usage)`` tuple: whatever the browser loop returned, a
        ``None`` placeholder in the annotations slot, and ``self._usage``.

    Raises:
        Whatever ``invoke_selenium`` raises when the fallback fails as well.
    """

    try:
        res = self.invoke_playwright()
    except Exception as e:
        # Catch only Exception: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and start a second browser session
        # when the user is trying to stop the process.
        self._logger.log(message=f"Playwright run failed. Falling back to Selenium. {str(e)}", level="warning", color="yellow")

        # Drop ids recorded by the failed attempt so the fallback starts a new
        # conversation instead of answering a stale computer call.
        self._call_ids = []
        self._calls = dict()
        self._response_ids = []
        res = self.invoke_selenium()

    return res, None, self._usage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: versionhq
3
- Version: 1.2.4.13
3
+ Version: 1.2.4.14
4
4
  Summary: Autonomous agent networks for task automation with multi-step reasoning.
5
5
  Author-email: Kuriko Iwai <kuriko@versi0n.io>
6
6
  License: MIT License
@@ -77,6 +77,7 @@ Provides-Extra: tools
77
77
  Requires-Dist: html2text>=2024.2.26; extra == "tools"
78
78
  Requires-Dist: sec-api>=1.0.28; extra == "tools"
79
79
  Requires-Dist: pytest-playwright>=0.7.0; extra == "tools"
80
+ Requires-Dist: selenium>=4.29.0; extra == "tools"
80
81
  Provides-Extra: torch
81
82
  Requires-Dist: torch>=2.6.0; extra == "torch"
82
83
  Requires-Dist: torchvision>=0.21.0; extra == "torch"
@@ -1,9 +1,10 @@
1
- versionhq/__init__.py,sha256=YQ3V-FOICFD8-rGvToBJu4vTGaOywnolUk4SPec-66k,3356
1
+ versionhq/__init__.py,sha256=7yJXhEnXuIcMKUqz042HK99oD79bvLl2hiajGP9J7OM,3356
2
2
  versionhq/_prompt/auto_feedback.py,sha256=bbj37yTa11lRHpx-sV_Wmpb4dVnDBB7_v8ageUobHXY,3780
3
3
  versionhq/_prompt/constants.py,sha256=DOwUFnVVObEFqgnaMCDnW8fnw1oPMgS8JAqOiTuqleI,932
4
4
  versionhq/_prompt/model.py,sha256=wJlDM9yzrqlXWxyw4HkYQzPii2MPfqkgTF3qhXoJN2M,8038
5
- versionhq/_utils/__init__.py,sha256=TOd3U_VCjvLzt0w-KV9cM1_ozEjzffhjyKX3F_JaqZg,418
5
+ versionhq/_utils/__init__.py,sha256=S3GvJKOTHM43JzPdaDqT6Zkan9eQJpc4biqQBXiVq6o,481
6
6
  versionhq/_utils/convert_img_url.py,sha256=BlINw4RQ632m9P4FJbqzqYlzTLESBTRkhkstAopnNNY,408
7
+ versionhq/_utils/handle_directory.py,sha256=n5y2ClC4A3f6rkv8XDfzoCqJcw-8sCJ0Q5q_ZiQ5uxw,417
7
8
  versionhq/_utils/i18n.py,sha256=TwA_PnYfDLA6VqlUDPuybdV9lgi3Frh_ASsb_X8jJo8,1483
8
9
  versionhq/_utils/is_valid_enum.py,sha256=vGGIuvhDnFU2fUyyFxJyjw-NfByK0vfFAu1ShaHBeZE,720
9
10
  versionhq/_utils/is_valid_url.py,sha256=m8Mswvb-90FJtx1Heq6hPFDbwGgrv_R3wSbZQmEPM9Q,379
@@ -14,7 +15,7 @@ versionhq/_utils/usage_metrics.py,sha256=gDK6fZgT1njX4iPIPFapWxfxIiz-zZYv72p0u6M
14
15
  versionhq/_utils/vars.py,sha256=bZ5Dx_bFKlt3hi4-NNGXqdk7B23If_WaTIju2fiTyPQ,57
15
16
  versionhq/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
17
  versionhq/agent/inhouse_agents.py,sha256=D2WAiXCYsnQK3_Fe7CbbtvXsHWOaN6vde6m_QoW7fH4,2629
17
- versionhq/agent/model.py,sha256=Cw9BdkDq45Ubzayq62A-nFqREBEIxMY0wfm_Xy8yP_w,26942
18
+ versionhq/agent/model.py,sha256=n4yU1f7-74piTJXEK-IahJOWzSpuwViaj7RJEMJW_Y0,26988
18
19
  versionhq/agent/parser.py,sha256=riG0dkdQCxH7uJ0AbdVdg7WvL0BXhUgJht0VtQvxJBc,4082
19
20
  versionhq/agent/rpm_controller.py,sha256=grezIxyBci_lDlwAlgWFRyR5KOocXeOhYkgN02dNFNE,2360
20
21
  versionhq/agent/TEMPLATES/Backstory.py,sha256=dkfuATUQ2g2WoUKkmgAIch-RB--bektGoQaUlsDOn0g,529
@@ -70,12 +71,12 @@ versionhq/tool/composio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
70
71
  versionhq/tool/composio/model.py,sha256=GIFKso_e_4a3BdaulqU_i6Y9JFAExNBjzHUHR_zZeSI,8577
71
72
  versionhq/tool/composio/params.py,sha256=FvBuEXsOQUYnN7RTFxT20kAkiEYkxWKkiVtgpqOzKZQ,1843
72
73
  versionhq/tool/gpt/__init__.py,sha256=A6xCuf_GUBs7wfx904J_Vd2t1GJCcf0lMKOL7MbZce4,160
73
- versionhq/tool/gpt/_enum.py,sha256=VaONDFZJNVe30Wf3Pl9s0XvxP_Xxqv3RNFcnqyigGFk,500
74
- versionhq/tool/gpt/cua.py,sha256=5yrgz_fc3IH_uB70J51wmRBWkfH53Qx-a29nmwWyOcs,12078
74
+ versionhq/tool/gpt/_enum.py,sha256=iBtH964dyv6d326VXSJsthB7EKxFXLcZVQPfvaCtbdk,496
75
+ versionhq/tool/gpt/cua.py,sha256=KFDueZiu7idDn9l_XrOfi_1PyllID9jFHG1S6sFaBbc,19360
75
76
  versionhq/tool/gpt/file_search.py,sha256=r5JVlf-epKB8DDXyrzlkezguHUMir0JW-77LUHoy-w8,5813
76
77
  versionhq/tool/gpt/web_search.py,sha256=bpqEQopbq9KtqQ_0W7QAAJ5TyoKGiVM94-SMp5oqNFE,3483
77
- versionhq-1.2.4.13.dist-info/licenses/LICENSE,sha256=cRoGGdM73IiDs6nDWKqPlgSv7aR4n-qBXYnJlCMHCeE,1082
78
- versionhq-1.2.4.13.dist-info/METADATA,sha256=rvtqxOduTKrgS2alvluKuYC9NxUchTOGFcspTDs2VlM,21349
79
- versionhq-1.2.4.13.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
80
- versionhq-1.2.4.13.dist-info/top_level.txt,sha256=DClQwxDWqIUGeRJkA8vBlgeNsYZs4_nJWMonzFt5Wj0,10
81
- versionhq-1.2.4.13.dist-info/RECORD,,
78
+ versionhq-1.2.4.14.dist-info/licenses/LICENSE,sha256=cRoGGdM73IiDs6nDWKqPlgSv7aR4n-qBXYnJlCMHCeE,1082
79
+ versionhq-1.2.4.14.dist-info/METADATA,sha256=wPMQGhx1Xxyh-oScOhbqXrsUxJMGMdoYmoiNfZFApN8,21399
80
+ versionhq-1.2.4.14.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
81
+ versionhq-1.2.4.14.dist-info/top_level.txt,sha256=DClQwxDWqIUGeRJkA8vBlgeNsYZs4_nJWMonzFt5Wj0,10
82
+ versionhq-1.2.4.14.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5