khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-996513ae80f8720c.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-2320231573aa9a49.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-6257055246cdebd5.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-d9a2e44bbcf49f82.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-31452bbda0e0a56f.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-fdb72b15ca908b43.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-5b7cb35d835af900.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-e091508620cb8aef.js} +1 -1
  18. khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
  19. khoj/interface/compiled/_next/static/css/55d4a822f8d94b67.css +1 -0
  20. khoj/interface/compiled/agents/index.html +2 -2
  21. khoj/interface/compiled/agents/index.txt +2 -2
  22. khoj/interface/compiled/automations/index.html +2 -2
  23. khoj/interface/compiled/automations/index.txt +3 -3
  24. khoj/interface/compiled/chat/index.html +2 -2
  25. khoj/interface/compiled/chat/index.txt +2 -2
  26. khoj/interface/compiled/index.html +2 -2
  27. khoj/interface/compiled/index.txt +2 -2
  28. khoj/interface/compiled/search/index.html +2 -2
  29. khoj/interface/compiled/search/index.txt +2 -2
  30. khoj/interface/compiled/settings/index.html +2 -2
  31. khoj/interface/compiled/settings/index.txt +4 -4
  32. khoj/interface/compiled/share/chat/index.html +2 -2
  33. khoj/interface/compiled/share/chat/index.txt +2 -2
  34. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  35. khoj/processor/conversation/google/gemini_chat.py +5 -0
  36. khoj/processor/conversation/openai/gpt.py +5 -0
  37. khoj/processor/conversation/prompts.py +12 -1
  38. khoj/processor/conversation/utils.py +12 -0
  39. khoj/processor/operator/grounding_agent.py +345 -0
  40. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  41. khoj/processor/operator/operate_browser.py +152 -0
  42. khoj/processor/operator/operator_actions.py +149 -0
  43. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  44. khoj/processor/operator/operator_agent_base.py +80 -0
  45. khoj/processor/operator/operator_agent_binary.py +336 -0
  46. khoj/processor/operator/operator_agent_openai.py +349 -0
  47. khoj/processor/operator/operator_environment_base.py +37 -0
  48. khoj/processor/operator/operator_environment_browser.py +395 -0
  49. khoj/routers/api_chat.py +42 -3
  50. khoj/routers/helpers.py +14 -3
  51. khoj/routers/research.py +48 -1
  52. khoj/utils/helpers.py +17 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +3 -1
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +65 -55
  55. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  56. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  57. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  58. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  59. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  60. khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
  61. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
  66. /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
  67. /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
  68. /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
  69. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  70. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  71. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
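The bulk of this release is the new `khoj/processor/operator/` package, in particular the UI-TARS grounding agent whose full source is reproduced below. As rough orientation, here is a minimal usage sketch; the endpoint URL, API key, and `EnvState` construction are illustrative assumptions, and only the `GroundingAgentUitars` constructor and `act()` signatures come from the diffed source (which awaits the completion call, so an async-capable OpenAI client is assumed here).

```python
# Hypothetical usage sketch -- endpoint, model name, and EnvState construction are assumptions.
import asyncio
import base64

from openai import AsyncOpenAI

from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars
from khoj.processor.operator.operator_environment_base import EnvState


async def main():
    # An OpenAI-compatible server hosting a UI-TARS checkpoint (assumed deployment).
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    agent = GroundingAgentUitars(
        model_name="ui-tars",
        client=client,
        environment_type="web",
        tracer={"usage": None},  # usage is filled in by the agent via get_chat_usage_metrics
    )

    # env_state.screenshot is expected to be a base64-encoded PNG of the browser tab.
    screenshot_b64 = base64.b64encode(open("screenshot.png", "rb").read()).decode("utf-8")  # placeholder path
    env_state = EnvState(screenshot=screenshot_b64)  # constructor shape is an assumption

    thought, actions = await agent.act("Open the pricing page", env_state)
    print(thought, actions)


asyncio.run(main())
```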
khoj/processor/operator/grounding_agent_uitars.py (new file)
@@ -0,0 +1,973 @@
1
+ # Source:
2
+ # https://github.com/xlang-ai/OSWorld/blob/main/run_uitars.py
3
+ # https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/uitars_agent.py
4
+ # https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/prompts.py#L1222
5
+ # https://github.com/xlang-ai/OSWorld/blob/main/lib_run_single.py
6
+
7
+ import ast
8
+ import base64
9
+ import logging
10
+ import math
11
+ import re
12
+ from io import BytesIO
13
+ from typing import Any, List, Literal
14
+
15
+ import numpy as np
16
+ from openai import AzureOpenAI, OpenAI
17
+ from openai.types.chat import ChatCompletion
18
+ from PIL import Image
19
+
20
+ from khoj.processor.operator.operator_actions import *
21
+ from khoj.processor.operator.operator_environment_base import EnvState
22
+ from khoj.utils.helpers import get_chat_usage_metrics
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class GroundingAgentUitars:
28
+ FINISH_WORD = "finished"
29
+ WAIT_WORD = "wait"
30
+ ENV_FAIL_WORD = "error_env"
31
+ CALL_USER = "call_user"
32
+
33
+ IMAGE_FACTOR = 28
34
+ MIN_PIXELS = 100 * 28 * 28
35
+ MAX_PIXELS = 16384 * 28 * 28
36
+ MAX_RATIO = 200
37
+
38
+ UITARS_USR_PROMPT_THOUGHT = """
39
+ You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
40
+ You control a single tab in a Chromium browser. You cannot access the OS, the filesystem, the application window, or the address bar.
41
+ Try to fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
42
+
43
+ ## Output Format
44
+ ```
45
+ Thought: ...
46
+ Action: ...
47
+ ```
48
+
49
+ ## Action Space
50
+ {action_space}
51
+
52
+ ## Note
53
+ - Use {language} in `Thought` part.
54
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
55
+
56
+ ## User Instruction
57
+ {instruction}
58
+ """
59
+
60
+ UITARS_NORMAL_ACTION_SPACE = """
61
+ click(start_box='<|box_start|>(x1,y1)<|box_end|>')
62
+ left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
63
+ right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
64
+ drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
65
+ hotkey(key='')
66
+ type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
67
+ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
68
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
69
+ finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
70
+ """.lstrip()
71
+
72
+ def __init__(
73
+ self,
74
+ model_name: str,
75
+ client: OpenAI | AzureOpenAI,
76
+ max_iterations=50,
77
+ environment_type: Literal["computer", "web"] = "computer",
78
+ runtime_conf: dict = {
79
+ "infer_mode": "qwen25vl_normal",
80
+ "prompt_style": "qwen25vl_normal",
81
+ "input_swap": True,
82
+ "language": "English",
83
+ "history_n": 5,
84
+ "max_pixels": 16384 * 28 * 28,
85
+ "min_pixels": 100 * 28 * 28,
86
+ "callusr_tolerance": 3,
87
+ "temperature": 0.0,
88
+ "top_k": -1,
89
+ "top_p": 0.9,
90
+ "max_tokens": 500,
91
+ },
92
+ tracer: dict | None = None,
93
+ ):
94
+ self.model_name = model_name
95
+ self.client = client
96
+ self.tracer = tracer
97
+ self.environment_type = environment_type
98
+
99
+ self.max_iterations = max_iterations
100
+ self.runtime_conf = runtime_conf
101
+ self.temperature = self.runtime_conf["temperature"]
102
+ self.top_k = self.runtime_conf["top_k"]
103
+ self.top_p = self.runtime_conf["top_p"]
104
+ self.max_tokens = self.runtime_conf["max_tokens"]
105
+ self.infer_mode = self.runtime_conf["infer_mode"]
106
+ self.prompt_style = self.runtime_conf["prompt_style"]
107
+ self.input_swap = self.runtime_conf["input_swap"]
108
+ self.language = self.runtime_conf["language"]
109
+ self.max_pixels = self.runtime_conf["max_pixels"]
110
+ self.min_pixels = self.runtime_conf["min_pixels"]
111
+ self.callusr_tolerance = self.runtime_conf["callusr_tolerance"]
112
+
113
+ self.thoughts: list[str] = []
114
+ self.actions: list[list[OperatorAction]] = []
115
+ self.observations: list[dict] = []
116
+ self.history_images: list[bytes] = []
117
+ self.history_responses: list[str] = []
118
+
119
+ self.prompt_template = self.UITARS_USR_PROMPT_THOUGHT
120
+ self.prompt_action_space = self.UITARS_NORMAL_ACTION_SPACE
121
+
122
+ if "history_n" in self.runtime_conf:
123
+ self.history_n = self.runtime_conf["history_n"]
124
+ else:
125
+ self.history_n = 5
126
+
127
+ self.cur_callusr_count = 0
128
+
129
+ async def act(self, instruction: str, env_state: EnvState) -> tuple[str, list[OperatorAction]]:
130
+ """
131
+ Suggest the next action(s) based on the instruction and current environment.
132
+ """
133
+ messages = self._format_messages_for_api(instruction, env_state)
134
+
135
+ recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
136
+ origin_resized_height = recent_screenshot.height
137
+ origin_resized_width = recent_screenshot.width
138
+
139
+ prediction, parsed_responses = self.parse_instruction_to_action(
140
+ instruction, origin_resized_height, origin_resized_width
141
+ )
142
+
143
+ temperature = self.temperature
144
+ top_k = self.top_k
145
+ try_times = 3
146
+ while not parsed_responses:
147
+ if try_times <= 0:
148
+ print(f"Reach max retry times to fetch response from client, as error flag.")
149
+ return "client error\nFAIL", []
150
+ try:
151
+ response: ChatCompletion = await self.client.chat.completions.create(
152
+ model="ui-tars",
153
+ messages=messages,
154
+ frequency_penalty=1,
155
+ max_tokens=self.max_tokens,
156
+ temperature=temperature,
157
+ # top_k=top_k,
158
+ top_p=self.top_p,
159
+ )
160
+ prediction = response.choices[0].message.content.strip()
161
+ self.tracer["usage"] = get_chat_usage_metrics(
162
+ self.model_name,
163
+ input_tokens=response.usage.prompt_tokens,
164
+ output_tokens=response.usage.completion_tokens,
165
+ usage=self.tracer["usage"],
166
+ )
167
+ except Exception as e:
168
+ logger.debug(f"Error when fetching response from client, with error: {e}")
169
+ prediction = None
170
+ try_times -= 1
171
+
172
+ try:
173
+ parsed_responses = self.parse_action_to_structure_output(
174
+ prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
175
+ )
176
+ break
177
+ except Exception as e:
178
+ logger.debug(f"Error when parsing response from client, with error: {e}")
179
+ # If fail to parse the model response, we use sampling parameters to avoid it
180
+ prediction = None
181
+ try_times -= 1
182
+ temperature = 1
183
+ top_k = -1
184
+
185
+ if prediction is None:
186
+ return "client error\nFAIL", []
187
+
188
+ self.history_responses.append(prediction)
189
+ self.thoughts.append(prediction)
190
+
191
+ try:
192
+ parsed_responses = self.parse_action_to_structure_output(
193
+ prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
194
+ )
195
+ except Exception as e:
196
+ print(f"Parsing action error: {prediction}, with error:\n{e}")
197
+ return f"Parsing action error: {prediction}, with error:\n{e}\nFAIL", []
198
+
199
+ return self._parse_action(parsed_responses, prediction)
200
+
201
+ def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]:
202
+ """
203
+ Parse the model's prediction into actions and return the result.
204
+ """
205
+ actions: List[OperatorAction] = []
206
+ last_image = Image.open(BytesIO(self.history_images[-1]))
207
+ obs_image_height = last_image.height
208
+ obs_image_width = last_image.width
209
+ for parsed_response in parsed_responses:
210
+ if parsed_response["action_type"] == self.FINISH_WORD:
211
+ self.actions.append(actions)
212
+ return f"{prediction}\nDONE", []
213
+
214
+ elif parsed_response["action_type"] == self.WAIT_WORD:
215
+ self.actions.append(actions)
216
+ return prediction, [WaitAction(duration=3)]
217
+
218
+ elif parsed_response["action_type"] == self.ENV_FAIL_WORD:
219
+ self.actions.append(actions)
220
+ return f"{prediction}\nFAIL", []
221
+
222
+ elif parsed_response["action_type"] == self.CALL_USER:
223
+ if self.callusr_tolerance > self.cur_callusr_count:
224
+ self.actions.append(actions)
225
+ self.cur_callusr_count += 1
226
+ return prediction, [RequestUserAction(request=parsed_response["text"])]
227
+ else:
228
+ self.actions.append(actions)
229
+ return f"{prediction}\nFAIL", []
230
+
231
+ if self.environment_type == "web":
232
+ actions.extend(
233
+ self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
234
+ )
235
+ else:
236
+ pass
237
+ # TODO: Add PyautoguiAction when enable computer environment
238
+ # actions.append(
239
+ # PyautoguiAction(code=
240
+ # self.parsing_response_to_pyautogui_code(
241
+ # parsed_response, obs_image_height, obs_image_width, self.input_swap
242
+ # )
243
+ # )
244
+ # )
245
+
246
+ self.actions.append(actions)
247
+
248
+ if len(self.history_responses) >= self.max_iterations:
249
+ # Default to FAIL if exceed max steps
250
+ actions = []
251
+ prediction = f"{prediction}\nFAIL"
252
+
253
+ return prediction or "", actions
254
+
255
+ def _format_messages_for_api(self, instruction: str, env_state: EnvState):
256
+ assert len(self.observations) == len(self.actions) and len(self.actions) == len(
257
+ self.thoughts
258
+ ), "The number of observations and actions should be the same."
259
+
260
+ self.history_images.append(base64.b64decode(env_state.screenshot))
261
+ self.observations.append({"screenshot": env_state.screenshot, "accessibility_tree": None})
262
+
263
+ user_prompt = self.prompt_template.format(
264
+ instruction=instruction, action_space=self.prompt_action_space, language=self.language
265
+ )
266
+
267
+ if len(self.history_images) > self.history_n:
268
+ self.history_images = self.history_images[-self.history_n :]
269
+
270
+ messages: list[dict] = []
271
+ images: list[Any] = []
272
+ if isinstance(self.history_images, bytes):
273
+ self.history_images = [self.history_images]
274
+ elif isinstance(self.history_images, np.ndarray):
275
+ self.history_images = list(self.history_images)
276
+ elif isinstance(self.history_images, list):
277
+ pass
278
+ else:
279
+ raise TypeError(f"Unidentified images type: {type(self.history_images)}")
280
+
281
+ for _, image in enumerate(self.history_images):
282
+ if len(images) >= self.history_n:
283
+ break
284
+ try:
285
+ image = Image.open(BytesIO(image))
286
+ except Exception as e:
287
+ raise RuntimeError(f"Error opening image: {e}")
288
+
289
+ if image.width * image.height > self.max_pixels:
290
+ """
291
+ If the image exceeds max_pixels, scale it down by the square root of the pixel ratio
292
+ so the total pixel count falls below the limit while the aspect ratio stays unchanged,
293
+ meaning the original relative coordinates can be reused directly without conversion.
294
+ """
295
+ resize_factor = math.sqrt(self.max_pixels / (image.width * image.height))
296
+ width, height = int(image.width * resize_factor), int(image.height * resize_factor)
297
+ image = image.resize((width, height))
298
+ if image.width * image.height < self.min_pixels:
299
+ resize_factor = math.sqrt(self.min_pixels / (image.width * image.height))
300
+ width, height = math.ceil(image.width * resize_factor), math.ceil(image.height * resize_factor)
301
+ image = image.resize((width, height))
302
+
303
+ if image.mode != "RGB":
304
+ image = image.convert("RGB")
305
+
306
+ images.append(image)
307
+
308
+ messages = [
309
+ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
310
+ {"role": "user", "content": [{"type": "text", "text": user_prompt}]},
311
+ ]
312
+
313
+ image_num = 0
314
+ if len(self.history_responses) > 0:
315
+ for history_idx, history_response in enumerate(self.history_responses):
316
+ # send at most history_n images to the model
317
+ if history_idx + self.history_n > len(self.history_responses):
318
+ cur_image = images[image_num]
319
+ encoded_string = self.pil_to_base64(cur_image)
320
+ messages.append(
321
+ {
322
+ "role": "user",
323
+ "content": [
324
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_string}"}}
325
+ ],
326
+ }
327
+ )
328
+ image_num += 1
329
+
330
+ messages.append(
331
+ {"role": "assistant", "content": [{"type": "text", "text": self.add_box_token(history_response)}]}
332
+ )
333
+
334
+ cur_image = images[image_num]
335
+ encoded_string = self.pil_to_base64(cur_image)
336
+ messages.append(
337
+ {
338
+ "role": "user",
339
+ "content": [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_string}"}}],
340
+ }
341
+ )
342
+ image_num += 1
343
+
344
+ else:
345
+ cur_image = images[image_num]
346
+ encoded_string = self.pil_to_base64(cur_image)
347
+ messages.append(
348
+ {
349
+ "role": "user",
350
+ "content": [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_string}"}}],
351
+ }
352
+ )
353
+ image_num += 1
354
+
355
+ return messages
356
+
357
+ def reset(self):
358
+ self.thoughts = []
359
+ self.actions = []
360
+ self.observations = []
361
+ self.history_images = []
362
+ self.history_responses = []
363
+
364
+ # Define function to parse each action
365
+ def parse_action_string(self, action_str):
366
+ try:
367
+ # Parse the string into an AST node
368
+ node = ast.parse(action_str, mode="eval")
369
+
370
+ # Ensure the node is an expression
371
+ if not isinstance(node, ast.Expression):
372
+ raise ValueError("Not an expression")
373
+
374
+ # Get the body of the expression
375
+ call = node.body
376
+
377
+ # Ensure the body is a function call
378
+ if not isinstance(call, ast.Call):
379
+ raise ValueError("Not a function call")
380
+
381
+ # Get the function name
382
+ if isinstance(call.func, ast.Name):
383
+ func_name = call.func.id
384
+ elif isinstance(call.func, ast.Attribute):
385
+ func_name = call.func.attr
386
+ else:
387
+ func_name = None
388
+
389
+ # Get the keyword arguments
390
+ kwargs = {}
391
+ for kw in call.keywords:
392
+ key = kw.arg
393
+ # Handle different types of values, assuming they are all constants
394
+ if isinstance(kw.value, ast.Constant):
395
+ value = kw.value.value
396
+ elif isinstance(kw.value, ast.Str): # Older Python compatibility
397
+ value = kw.value.s
398
+ else:
399
+ value = None
400
+ kwargs[key] = value
401
+
402
+ return {"function": func_name, "args": kwargs}
403
+
404
+ except Exception as e:
405
+ print(f"Failed to parse action '{action_str}': {e}")
406
+ return None
407
+
408
+ def escape_single_quotes(self, text):
409
+ # Match unescaped single quotes (not matching \')
410
+ pattern = r"(?<!\\)'"
411
+ return re.sub(pattern, r"\\'", text)
412
+
413
+ def round_by_factor(self, number: int, factor: int) -> int:
414
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
415
+ return round(number / factor) * factor
416
+
417
+ def ceil_by_factor(self, number: float, factor: int) -> int:
418
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
419
+ return math.ceil(number / factor) * factor
420
+
421
+ def floor_by_factor(self, number: float, factor: int) -> int:
422
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
423
+ return math.floor(number / factor) * factor
424
+
425
+ def smart_resize(
426
+ self,
427
+ height: int,
428
+ width: int,
429
+ factor: int = IMAGE_FACTOR,
430
+ min_pixels: int = MIN_PIXELS,
431
+ max_pixels: int = MAX_PIXELS,
432
+ ) -> tuple[int, int]:
433
+ """
434
+ Rescales the image so that the following conditions are met:
435
+
436
+ 1. Both dimensions (height and width) are divisible by 'factor'.
437
+
438
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
439
+
440
+ 3. The aspect ratio of the image is maintained as closely as possible.
441
+ """
442
+ if max(height, width) / min(height, width) > self.MAX_RATIO:
443
+ raise ValueError(
444
+ f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
445
+ )
446
+ h_bar = max(factor, self.round_by_factor(height, factor))
447
+ w_bar = max(factor, self.round_by_factor(width, factor))
448
+ if h_bar * w_bar > max_pixels:
449
+ beta = math.sqrt((height * width) / max_pixels)
450
+ h_bar = self.floor_by_factor(height / beta, factor)
451
+ w_bar = self.floor_by_factor(width / beta, factor)
452
+ elif h_bar * w_bar < min_pixels:
453
+ beta = math.sqrt(min_pixels / (height * width))
454
+ h_bar = self.ceil_by_factor(height * beta, factor)
455
+ w_bar = self.ceil_by_factor(width * beta, factor)
456
+ return h_bar, w_bar
457
+
458
+ def parse_action_to_structure_output(
459
+ self,
460
+ text: str,
461
+ origin_resized_height,
462
+ origin_resized_width,
463
+ max_pixels=16384 * 28 * 28,
464
+ min_pixels=100 * 28 * 28,
465
+ ):
466
+ text = text.strip()
467
+ smart_resize_height, smart_resize_width = self.smart_resize(
468
+ origin_resized_height,
469
+ origin_resized_width,
470
+ factor=self.IMAGE_FACTOR,
471
+ min_pixels=min_pixels,
472
+ max_pixels=max_pixels,
473
+ )
474
+
475
+ # Regular expression to match Action string
476
+ if text.startswith("Thought:"):
477
+ thought_pattern = r"Thought: (.+?)(?=\s*Action:|$)"
478
+ elif text.startswith("Reflection:"):
479
+ thought_pattern = r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)"
480
+ elif text.startswith("Action_Summary:"):
481
+ thought_pattern = r"Action_Summary: (.+?)(?=\s*Action:|$)"
482
+ else:
483
+ thought_pattern = r"Thought: (.+?)(?=\s*Action:|$)"
484
+ reflection, thought = None, None
485
+ thought_match = re.search(thought_pattern, text, re.DOTALL)
486
+ if thought_match:
487
+ if len(thought_match.groups()) == 1:
488
+ thought = thought_match.group(1).strip()
489
+ elif len(thought_match.groups()) == 2:
490
+ thought = thought_match.group(2).strip()
491
+ reflection = thought_match.group(1).strip()
492
+ assert "Action:" in text
493
+ action_str = text.split("Action:")[-1]
494
+
495
+ tmp_all_action = action_str.split("\n\n")
496
+ all_action = []
497
+ for action_str in tmp_all_action:
498
+ if "type(content" in action_str:
499
+ # Extract the inner string from type(content='...') so its quotes can be escaped
500
+ def escape_quotes(match):
501
+ content = match.group(1)
502
+ return content
503
+
504
+ # Use regex to replace
505
+ pattern = r"type\(content='(.*?)'\)" # Match type(content='...')
506
+ content = re.sub(pattern, escape_quotes, action_str)
507
+
508
+ # Process the string
509
+ action_str = self.escape_single_quotes(content)
510
+ action_str = "type(content='" + action_str + "')"
511
+ all_action.append(action_str)
512
+
513
+ parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action]
514
+ actions: list[dict] = []
515
+ for action_instance, raw_str in zip(parsed_actions, all_action):
516
+ if action_instance is None:
517
+ logger.error(f"Cannot parse action: {raw_str}")
518
+ raise ValueError(f"Cannot parse action: {raw_str}")
519
+ action_type = action_instance["function"]
520
+ params = action_instance["args"]
521
+
522
+ action_inputs = {}
523
+ for param_name, param in params.items():
524
+ if param == "":
525
+ continue
526
+ param = param.strip()  # Trim surrounding whitespace
527
+ # Process start_box, end_box parameter format '<bbox>x1 y1 x2 y2</bbox>'
528
+ action_inputs[param_name.strip()] = param
529
+
530
+ if "start_box" in param_name or "end_box" in param_name:
531
+ ori_box = param
532
+ # Remove parentheses and split the string by commas
533
+ numbers = ori_box.replace("(", "").replace(")", "").split(",")
534
+
535
+ # Convert to float and normalize the model's absolute coordinates by the smart-resized dimensions
536
+ float_numbers = []
537
+ for num_idx, num in enumerate(numbers):
538
+ num = float(num)
539
+ if (num_idx + 1) % 2 == 0:
540
+ float_numbers.append(float(num / smart_resize_height))
541
+ else:
542
+ float_numbers.append(float(num / smart_resize_width))
543
+
544
+ if len(float_numbers) == 2:
545
+ float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
546
+ action_inputs[param_name.strip()] = str(float_numbers)
547
+
548
+ actions.append(
549
+ {
550
+ "reflection": reflection,
551
+ "thought": thought,
552
+ "action_type": action_type,
553
+ "action_inputs": action_inputs,
554
+ "text": text,
555
+ }
556
+ )
557
+ return actions
558
+
559
+ def parsing_response_to_action(
560
+ self, responses, image_height: int, image_width: int, input_swap: bool = True
561
+ ) -> List[OperatorAction]:
562
+ """
563
+ Parses the model's structured output into operator actions for the browser environment.
564
+ Parameters:
565
+ responses: A dictionary (or list of dictionaries) containing the model's output, structured like:
566
+ {
567
+ "action_type": "hotkey",
568
+ "action_inputs": {
569
+ "hotkey": "v ctrl",
570
+ "start_box": None,
571
+ "end_box": None
572
+ }
573
+ }
574
+ Returns:
575
+ The generated operator actions list
576
+ """
577
+
578
+ actions: List[OperatorAction] = []
579
+ if isinstance(responses, dict):
580
+ responses = [responses]
581
+ for response_id, response in enumerate(responses):
582
+ if "observation" in response:
583
+ observation = response["observation"]
584
+ else:
585
+ observation = ""
586
+
587
+ if "thought" in response:
588
+ thought = response["thought"]
589
+ else:
590
+ thought = ""
591
+
592
+ if response_id == 0:
593
+ logger.debug(f"UITars Grounder:\nObservation:{observation}\nThought:\n{thought}")
594
+ else:
595
+ actions.append(WaitAction(duration=1))
596
+
597
+ action_dict = response
598
+ action_type = action_dict.get("action_type")
599
+ action_inputs = action_dict.get("action_inputs", {})
600
+
601
+ if action_type in ["hotkey", "press"]:
602
+ # Parsing hotkey action
603
+ if "key" in action_inputs:
604
+ hotkey = action_inputs.get("key", "")
605
+ else:
606
+ hotkey = action_inputs.get("hotkey", "")
607
+
608
+ if hotkey == "arrowleft":
609
+ hotkey = "left"
610
+
611
+ elif hotkey == "arrowright":
612
+ hotkey = "right"
613
+
614
+ elif hotkey == "arrowup":
615
+ hotkey = "up"
616
+
617
+ elif hotkey == "arrowdown":
618
+ hotkey = "down"
619
+
620
+ if hotkey:
621
+ # Handle other hotkeys
622
+ keys = hotkey.split() # Split the keys by space
623
+ key_combination = []
624
+ for key in keys:
625
+ if key == "space":
626
+ key = " "
627
+ key_combination.append(key)
628
+ actions.append(KeypressAction(keys=key_combination))
629
+
630
+ elif action_type == "keyup":
631
+ key_to_up = action_inputs.get("key", "")
632
+ actions.append(KeyUpAction(key=key_to_up))
633
+
634
+ elif action_type == "keydown":
635
+ key_to_down = action_inputs.get("key", "")
636
+ actions.append(KeyDownAction(key=key_to_down))
637
+
638
+ elif action_type == "type":
639
+ # Parsing typing action using clipboard
640
+ content = action_inputs.get("content", "")
641
+ content = self.escape_single_quotes(content)
642
+ stripped_content = content
643
+ if content.endswith("\n") or content.endswith("\\n"):
644
+ stripped_content = stripped_content.rstrip("\\n").rstrip("\n")
645
+ if content:
646
+ if input_swap:
647
+ # ignore copying text to clipboard for now
648
+ pass
649
+ actions.append(TypeAction(text=stripped_content))
650
+ if content.endswith("\n") or content.endswith("\\n"):
651
+ actions.append(KeypressAction(keys=["enter"]))
652
+
653
+ elif action_type in ["drag", "select"]:
654
+ # Parsing drag or select action based on start and end_boxes
655
+ start_box = action_inputs.get("start_box")
656
+ end_box = action_inputs.get("end_box")
657
+ if start_box and end_box:
658
+ x1, y1, x2, y2 = eval(start_box) # Assuming box is in [x1, y1, x2, y2]
659
+ sx = round(float((x1 + x2) / 2) * image_width, 3)
660
+ sy = round(float((y1 + y2) / 2) * image_height, 3)
661
+ x1, y1, x2, y2 = eval(end_box) # Assuming box is in [x1, y1, x2, y2]
662
+ ex = round(float((x1 + x2) / 2) * image_width, 3)
663
+ ey = round(float((y1 + y2) / 2) * image_height, 3)
664
+ actions.append(MoveAction(x=sx, y=sy))
665
+ actions.append(DragAction(path=[(sx, sy), (ex, ey)]))
666
+
667
+ elif action_type == "scroll":
668
+ # Parsing scroll action
669
+ start_box = action_inputs.get("start_box")
670
+ if start_box:
671
+ x1, y1, x2, y2 = eval(start_box) # Assuming box is in [x1, y1, x2, y2]
672
+ x = round(float((x1 + x2) / 2) * image_width, 3)
673
+ y = round(float((y1 + y2) / 2) * image_height, 3)
674
+
675
+ # First click the element, then scroll
676
+ # actions.append(ClickAction(x=x, y=y, button='left'))
677
+ else:
678
+ x = None
679
+ y = None
680
+ direction = action_inputs.get("direction", "down")
681
+
682
+ if "up" in direction.lower():
683
+ actions.append(ScrollAction(amount=5, scroll_direction="up", x=x, y=y))
684
+ elif "left" in direction.lower():
685
+ actions.append(ScrollAction(amount=5, scroll_direction="left", x=x, y=y))
686
+ elif "right" in direction.lower():
687
+ actions.append(ScrollAction(amount=5, scroll_direction="right", x=x, y=y))
688
+ else:
689
+ actions.append(ScrollAction(amount=5, scroll_direction="down", x=x, y=y))
690
+
691
+ elif action_type in ["click", "left_single", "left_double", "right_single", "hover"]:
692
+ # Parsing mouse click actions
693
+ start_box = action_inputs.get("start_box")
694
+ start_box = str(start_box)
695
+ if start_box:
696
+ start_box = eval(start_box)
697
+ if len(start_box) == 4:
698
+ x1, y1, x2, y2 = start_box # Assuming box is in [x1, y1, x2, y2]
699
+ elif len(start_box) == 2:
700
+ x1, y1 = start_box
701
+ x2 = x1
702
+ y2 = y1
703
+ x = round(float((x1 + x2) / 2) * image_width, 3)
704
+ y = round(float((y1 + y2) / 2) * image_height, 3)
705
+ if action_type == "left_single" or action_type == "click":
706
+ actions.append(ClickAction(x=x, y=y, button="left"))
707
+ elif action_type == "left_double":
708
+ actions.append(DoubleClickAction(x=x, y=y, button="left"))
709
+ elif action_type == "right_single":
710
+ actions.append(ClickAction(x=x, y=y, button="right"))
711
+ elif action_type == "hover":
712
+ actions.append(MoveAction(x=x, y=y))
713
+
714
+ elif action_type == "goto":
715
+ url = action_inputs.get("url", "")
716
+ actions.append(GotoAction(url=url))
717
+
718
+ elif action_type == "back":
719
+ actions.append(BackAction())
720
+
721
+ elif action_type in ["finished"]:
722
+ actions = []
723
+
724
+ else:
725
+ logger.error(f"\n# Unrecognized action type: {action_type}")
726
+
727
+ return actions
728
+
729
+ def parsing_response_to_pyautogui_code(
730
+ self, responses, image_height: int, image_width: int, input_swap: bool = True
731
+ ) -> str:
732
+ """
733
+ Parses model suggested actions for the GUI environment and generates the pyautogui code string to run.
734
+ Parameters:
735
+ response: A dictionary containing the model's output, structured like:
736
+ {
737
+ "action_type": "hotkey",
738
+ "action_inputs": {
739
+ "hotkey": "v ctrl",
740
+ "start_box": None,
741
+ "end_box": None
742
+ }
743
+ }
744
+ Returns:
745
+ The pyautogui code string
746
+ """
747
+
748
+ pyautogui_code = f"import pyautogui\nimport time\n"
749
+ actions = []
750
+ if isinstance(responses, dict):
751
+ responses = [responses]
752
+ for response_id, response in enumerate(responses):
753
+ if "observation" in response:
754
+ observation = response["observation"]
755
+ else:
756
+ observation = ""
757
+
758
+ if "thought" in response:
759
+ thought = response["thought"]
760
+ else:
761
+ thought = ""
762
+
763
+ if response_id == 0:
764
+ pyautogui_code += f"'''\nObservation:\n{observation}\n\nThought:\n{thought}\n'''\n"
765
+ else:
766
+ pyautogui_code += f"\ntime.sleep(1)\n"
767
+
768
+ action_dict = response
769
+ action_type = action_dict.get("action_type")
770
+ action_inputs = action_dict.get("action_inputs", {})
771
+
772
+ if action_type == "hotkey":
773
+ # Parsing hotkey action
774
+ if "key" in action_inputs:
775
+ hotkey = action_inputs.get("key", "")
776
+ else:
777
+ hotkey = action_inputs.get("hotkey", "")
778
+
779
+ if hotkey == "arrowleft":
780
+ hotkey = "left"
781
+
782
+ elif hotkey == "arrowright":
783
+ hotkey = "right"
784
+
785
+ elif hotkey == "arrowup":
786
+ hotkey = "up"
787
+
788
+ elif hotkey == "arrowdown":
789
+ hotkey = "down"
790
+
791
+ if hotkey:
792
+ actions.append(KeypressAction(keys=[hotkey]))
793
+
794
+ elif action_type == "press":
795
+ # Parsing press action
796
+ if "key" in action_inputs:
797
+ key_to_press = action_inputs.get("key", "")
798
+ else:
799
+ key_to_press = action_inputs.get("press", "")
800
+
801
+ if hotkey == "arrowleft":
802
+ hotkey = "left"
803
+
804
+ elif hotkey == "arrowright":
805
+ hotkey = "right"
806
+
807
+ elif hotkey == "arrowup":
808
+ hotkey = "up"
809
+
810
+ elif hotkey == "arrowdown":
811
+ hotkey = "down"
812
+
813
+ elif hotkey == "space":
814
+ hotkey = " "
815
+
816
+ if key_to_press:
817
+ # Simulate pressing a single key
818
+ pyautogui_code += f"\npyautogui.press({repr(key_to_press)})"
819
+
820
+ elif action_type == "keyup":
821
+ key_to_up = action_inputs.get("key", "")
822
+ pyautogui_code += f"\npyautogui.keyUp({repr(key_to_up)})"
823
+
824
+ elif action_type == "keydown":
825
+ key_to_down = action_inputs.get("key", "")
826
+ pyautogui_code += f"\npyautogui.keyDown({repr(key_to_down)})"
827
+
828
+ elif action_type == "type":
829
+ # Parsing typing action using clipboard
830
+ content = action_inputs.get("content", "")
831
+ content = self.escape_single_quotes(content)
832
+ stripped_content = content
833
+ if content.endswith("\n") or content.endswith("\\n"):
834
+ stripped_content = stripped_content.rstrip("\\n").rstrip("\n")
835
+ if content:
836
+ if input_swap:
837
+ actions.append(TypeAction(text=stripped_content))
838
+ pyautogui_code += f"\nimport pyperclip"
839
+ pyautogui_code += f"\npyperclip.copy('{stripped_content}')"
840
+ pyautogui_code += f"\npyautogui.hotkey('ctrl', 'v')"
841
+ pyautogui_code += f"\ntime.sleep(0.5)\n"
842
+ if content.endswith("\n") or content.endswith("\\n"):
843
+ pyautogui_code += f"\npyautogui.press('enter')"
844
+ else:
845
+ pyautogui_code += f"\npyautogui.write('{stripped_content}', interval=0.1)"
846
+ pyautogui_code += f"\ntime.sleep(0.5)\n"
847
+ if content.endswith("\n") or content.endswith("\\n"):
848
+ pyautogui_code += f"\npyautogui.press('enter')"
849
+
850
+ elif action_type in ["drag", "select"]:
851
+ # Parsing drag or select action based on start and end_boxes
852
+ start_box = action_inputs.get("start_box")
853
+ end_box = action_inputs.get("end_box")
854
+ if start_box and end_box:
855
+ x1, y1, x2, y2 = eval(start_box) # Assuming box is in [x1, y1, x2, y2]
856
+ sx = round(float((x1 + x2) / 2) * image_width, 3)
857
+ sy = round(float((y1 + y2) / 2) * image_height, 3)
858
+ x1, y1, x2, y2 = eval(end_box) # Assuming box is in [x1, y1, x2, y2]
859
+ ex = round(float((x1 + x2) / 2) * image_width, 3)
860
+ ey = round(float((y1 + y2) / 2) * image_height, 3)
861
+ pyautogui_code += (
862
+ f"\npyautogui.moveTo({sx}, {sy})\n" f"\npyautogui.dragTo({ex}, {ey}, duration=1.0)\n"
863
+ )
864
+
865
+ elif action_type == "scroll":
866
+ # Parsing scroll action
867
+ start_box = action_inputs.get("start_box")
868
+ if start_box:
869
+ x1, y1, x2, y2 = eval(start_box) # Assuming box is in [x1, y1, x2, y2]
870
+ x = round(float((x1 + x2) / 2) * image_width, 3)
871
+ y = round(float((y1 + y2) / 2) * image_height, 3)
872
+
873
+ # First click the element, then scroll
874
+ # pyautogui_code += f"\npyautogui.click({x}, {y}, button='left')"
875
+ else:
876
+ x = None
877
+ y = None
878
+ direction = action_inputs.get("direction", "")
879
+
880
+ if x is None:
881
+ if "up" in direction.lower():
882
+ pyautogui_code += f"\npyautogui.scroll(5)"
883
+ elif "down" in direction.lower():
884
+ pyautogui_code += f"\npyautogui.scroll(-5)"
885
+ else:
886
+ if "up" in direction.lower():
887
+ pyautogui_code += f"\npyautogui.scroll(5, x={x}, y={y})"
888
+ elif "down" in direction.lower():
889
+ pyautogui_code += f"\npyautogui.scroll(-5, x={x}, y={y})"
890
+
891
+ elif action_type in ["click", "left_single", "left_double", "right_single", "hover"]:
892
+ # Parsing mouse click actions
893
+ start_box = action_inputs.get("start_box")
894
+ start_box = str(start_box)
895
+ if start_box:
896
+ start_box = eval(start_box)
897
+ if len(start_box) == 4:
898
+ x1, y1, x2, y2 = start_box # Assuming box is in [x1, y1, x2, y2]
899
+ elif len(start_box) == 2:
900
+ x1, y1 = start_box
901
+ x2 = x1
902
+ y2 = y1
903
+ x = round(float((x1 + x2) / 2) * image_width, 3)
904
+ y = round(float((y1 + y2) / 2) * image_height, 3)
905
+ if action_type == "left_single" or action_type == "click":
906
+ pyautogui_code += f"\npyautogui.click({x}, {y}, button='left')"
907
+ elif action_type == "left_double":
908
+ pyautogui_code += f"\npyautogui.doubleClick({x}, {y}, button='left')"
909
+ elif action_type == "right_single":
910
+ pyautogui_code += f"\npyautogui.click({x}, {y}, button='right')"
911
+ elif action_type == "hover":
912
+ pyautogui_code += f"\npyautogui.moveTo({x}, {y})"
913
+
914
+ elif action_type in ["finished"]:
915
+ pyautogui_code = f"DONE"
916
+
917
+ else:
918
+ pyautogui_code += f"\n# Unrecognized action type: {action_type}"
919
+
920
+ return pyautogui_code
921
+
922
+ def parse_instruction_to_action(
923
+ self, instruction: str, origin_resized_height: int, origin_resized_width: int
924
+ ) -> tuple[str, list[dict]]:
925
+ """
926
+ Parse instruction into action with simple string match for GOTO and BACK actions.
927
+
928
+ Useful for actions that do not need to invoke the visual grounding model.
929
+ """
930
+ prediction, parsed_responses = None, []
931
+ # Handle GOTO <URL> and BACK directives on the last line of the instruction.
932
+ if instruction.strip().splitlines()[-1].strip().startswith("GOTO"):
933
+ url = instruction.split("GOTO")[-1].strip()
934
+ prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
935
+ parsed_responses = self.parse_action_to_structure_output(
936
+ prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
937
+ )
938
+ elif instruction.strip().endswith("BACK"):
939
+ prediction = "Thought: Let me go back to the previous page.\nAction: back()"
940
+ parsed_responses = self.parse_action_to_structure_output(
941
+ prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
942
+ )
943
+ return prediction, parsed_responses
944
+
945
+ def add_box_token(self, input_string):
946
+ # Step 1: Split the string into individual actions
947
+ if "Action: " in input_string and "start_box=" in input_string:
948
+ suffix = input_string.split("Action: ")[0] + "Action: "
949
+ actions = input_string.split("Action: ")[1:]
950
+ processed_actions = []
951
+ for action in actions:
952
+ action = action.strip()
953
+ # Step 2: Extract coordinates (start_box or end_box) using regex
954
+ coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
955
+
956
+ updated_action = action # Start with the original action
957
+ for coord_type, x, y in coordinates:
958
+ # Convert x and y to integers
959
+ updated_action = updated_action.replace(
960
+ f"{coord_type}='({x},{y})'", f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'"
961
+ )
962
+ processed_actions.append(updated_action)
963
+
964
+ # Step 3: Reconstruct the final string
965
+ final_string = prefix + "\n\n".join(processed_actions)
966
+ else:
967
+ final_string = input_string
968
+ return final_string
969
+
970
+ def pil_to_base64(self, image):
971
+ buffer = BytesIO()
972
+ image.save(buffer, format="PNG")
973
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
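For reference, this is roughly how a response in the prompt's expected format flows through the parsing helpers above; the coordinate values and screenshot size are made up, and the client is stubbed since only the parsers are exercised here.

```python
# Illustrative round trip through the parsing helpers (values are assumptions).
from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars

agent = GroundingAgentUitars(model_name="ui-tars", client=None, tracer={"usage": None})

model_response = (
    "Thought: The search box sits near the top of the page; I will click it.\n"
    "Action: click(start_box='(412,103)')"
)

# Pixel coordinates from the model are normalized against the smart-resized screenshot size.
parsed = agent.parse_action_to_structure_output(
    model_response, origin_resized_height=720, origin_resized_width=1280
)
# parsed ~= [{"thought": "...", "action_type": "click",
#             "action_inputs": {"start_box": "[0.32, 0.14, 0.32, 0.14]"}, ...}]

# The structured dict is then mapped to operator actions (a ClickAction here),
# rescaled to the observed screenshot's width and height.
actions = agent.parsing_response_to_action(parsed, image_height=720, image_width=1280)
print(actions)
```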