khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +1 -1
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-2cce449fd2454abf.js} +9 -9
  5. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-e18e67cff45758c8.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-768a0903c4b5b06d.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-1153981cb9c4907f.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-a4b97dd0c2a70cfb.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-44072d929427ee56.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-4e8fdd30a3238357.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-b3f7ae1ef8871d30.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-6a4a9050c8bddae9.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-34ac812e4e4e9a50.js} +1 -1
  16. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  17. khoj/interface/compiled/agents/index.html +2 -2
  18. khoj/interface/compiled/agents/index.txt +2 -2
  19. khoj/interface/compiled/automations/index.html +2 -2
  20. khoj/interface/compiled/automations/index.txt +2 -2
  21. khoj/interface/compiled/chat/index.html +2 -2
  22. khoj/interface/compiled/chat/index.txt +2 -2
  23. khoj/interface/compiled/index.html +2 -2
  24. khoj/interface/compiled/index.txt +2 -2
  25. khoj/interface/compiled/search/index.html +2 -2
  26. khoj/interface/compiled/search/index.txt +2 -2
  27. khoj/interface/compiled/settings/index.html +2 -2
  28. khoj/interface/compiled/settings/index.txt +2 -2
  29. khoj/interface/compiled/share/chat/index.html +2 -2
  30. khoj/interface/compiled/share/chat/index.txt +2 -2
  31. khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
  32. khoj/processor/conversation/anthropic/utils.py +30 -7
  33. khoj/processor/conversation/google/gemini_chat.py +10 -10
  34. khoj/processor/conversation/google/utils.py +20 -12
  35. khoj/processor/conversation/offline/chat_model.py +2 -7
  36. khoj/processor/conversation/openai/gpt.py +8 -9
  37. khoj/processor/conversation/utils.py +132 -21
  38. khoj/processor/operator/README.md +59 -0
  39. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  40. khoj/processor/operator/grounding_agent.py +229 -175
  41. khoj/processor/operator/grounding_agent_uitars.py +59 -48
  42. khoj/processor/operator/operator_actions.py +48 -0
  43. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  44. khoj/processor/operator/operator_agent_base.py +45 -14
  45. khoj/processor/operator/operator_agent_binary.py +125 -57
  46. khoj/processor/operator/operator_agent_openai.py +183 -75
  47. khoj/processor/operator/operator_environment_base.py +11 -1
  48. khoj/processor/operator/operator_environment_browser.py +5 -3
  49. khoj/processor/operator/operator_environment_computer.py +658 -0
  50. khoj/routers/api_chat.py +36 -25
  51. khoj/routers/helpers.py +8 -17
  52. khoj/routers/research.py +43 -20
  53. khoj/utils/constants.py +4 -4
  54. khoj/utils/helpers.py +12 -15
  55. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/METADATA +3 -1
  56. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/RECORD +61 -59
  57. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  58. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  59. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
  60. khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
  61. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_buildManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_ssgManifest.js +0 -0
  63. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/WHEEL +0 -0
  64. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/entry_points.txt +0 -0
  65. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,658 @@
1
+ import ast
2
+ import asyncio
3
+ import base64
4
+ import io
5
+ import logging
6
+ import platform
7
+ import subprocess
8
+ from typing import Literal, Optional, Union
9
+
10
+ from PIL import Image, ImageDraw
11
+
12
+ from khoj.processor.operator.operator_actions import DragAction, OperatorAction, Point
13
+ from khoj.processor.operator.operator_environment_base import (
14
+ Environment,
15
+ EnvState,
16
+ EnvStepResult,
17
+ )
18
+ from khoj.utils.helpers import convert_image_to_webp
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # --- Concrete Computer Environment ---
24
+ class ComputerEnvironment(Environment):
25
def __init__(
    self,
    provider: Literal["local", "docker"] = "local",
    docker_display: str = ":99",
    docker_container_name: str = "khoj-computer",
):
    """
    Configure where desktop automation commands are run.

    Args:
        provider: "local" to run commands on this machine, "docker" to run them in a container.
        docker_display: Display exported to commands run in the container.
        docker_container_name: Name of the container targeted by `docker exec`.
    """
    # Screen geometry and cursor location start as placeholders
    self.width: int = 0
    self.height: int = 0
    self.mouse_pos: Point = Point(x=0, y=0)

    # Execution target settings
    self.docker_container_name = docker_container_name
    self.docker_display = docker_display
    self.provider = provider
38
+
39
async def _execute(self, func_name, *args, **kwargs):
    """
    Run a single pyautogui call in a separate Python process and return its parsed result.

    The call is rendered to a one-line script by generate_pyautogui_command. The script is
    run in the Docker container when provider is "docker", otherwise with the local `python3`.

    Args:
        func_name: Name of the function to call, e.g. "click" or "position".
        *args: Positional arguments rendered into the call.
        **kwargs: Keyword arguments rendered into the call.

    Returns:
        None when the script printed nothing (or "None"). The raw base64 string for
        "screenshot". Otherwise the printed output parsed as a Python literal, or the
        raw output string when it is not a literal.

    Raises:
        KeyboardInterrupt: If the script failed with a pyautogui FailSafeException.
        RuntimeError: If the script failed for any other reason.
    """
    python_command_str = self.generate_pyautogui_command(func_name, *args, **kwargs)
    # Docker execution
    if self.provider == "docker":
        try:
            output_str = await self.docker_execute(python_command_str)
        except RuntimeError as e:  # Catch other Docker execution errors
            logger.error(f"Error during Docker execution of {func_name}: {e}")
            raise  # Re-raise as a general error for the caller to handle
    # Local execution
    else:
        process = await asyncio.to_thread(
            subprocess.run,
            ["python3", "-c", python_command_str],
            capture_output=True,
            text=True,
            check=False,  # We check returncode manually
        )
        output_str = process.stdout.strip()
        if process.returncode != 0:
            if "FailSafeException" in process.stderr or "FailSafeException" in process.stdout:
                # Extract the message if possible, otherwise use generic
                fs_msg = process.stderr or process.stdout
                raise KeyboardInterrupt(fs_msg)
            else:
                error_msg = (
                    f'Local script execution failed:\nCmd: python3 -c "{python_command_str[:200]}...{python_command_str[-200:]}\n'
                    f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                )
                logger.error(error_msg)
                raise RuntimeError(f"Local script execution error: {process.stderr or process.stdout}")

    if not output_str or output_str == "None":
        return None

    # Screenshot output is a large base64 string, never a Python literal.
    # Return it as is instead of feeding megabytes of text to the parser.
    if func_name == "screenshot":
        return output_str

    try:
        return ast.literal_eval(output_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # If not a literal (e.g., some other string output), return as is
        return output_str
82
+
83
async def start(self, width: int, height: int) -> None:
    """
    Prepare the environment by reading the actual screen size and cursor location.

    The requested width and height only appear in the log message. The dimensions
    reported by the "size" command are the ones stored on the environment.
    """
    self.width, self.height = await self._execute("size")

    # Start from the current cursor location. Assume the screen center if it cannot be read.
    try:
        cursor_x, cursor_y = await self._execute("position")
        initial_pos = Point(x=cursor_x, y=cursor_y)
    except Exception:
        initial_pos = Point(x=self.width / 2, y=self.height / 2)
    self.mouse_pos = initial_pos

    logger.info(
        f"Computer environment started. Screen size: {self.width}x{self.height}. "
        f"Input width/height ({width}x{height}) are noted but screen dimensioning uses actual screen size. "
        f"Initial mouse position: ({self.mouse_pos.x},{self.mouse_pos.y})"
    )
104
+
105
async def _get_screenshot(self) -> Optional[str]:
    """
    Capture the screen, mark the cursor on it and return it as a base64 encoded WebP image.

    Returns None if any step fails. Keyboard interrupts are propagated to the caller.
    """
    try:
        png_bytes = base64.b64decode(await self._execute("screenshot"))

        # Mark where the cursor currently is on the captured image
        cursor_x, cursor_y = await self._execute("position")
        annotated_png = await self._draw_mouse_position(png_bytes, Point(x=cursor_x, y=cursor_y))

        webp_bytes = convert_image_to_webp(annotated_png)
        return base64.b64encode(webp_bytes).decode("utf-8")
    except KeyboardInterrupt:  # Propagate keyboard interrupts
        raise
    except Exception as e:
        logger.error(f"Failed to get screenshot: {e}", exc_info=True)
        return None
124
+
125
async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
    """
    Overlay a cursor marker on the screenshot at the given position.

    Returns the re-encoded PNG bytes, or the untouched input bytes if drawing is not possible.
    """
    pillow_missing = Image is None or ImageDraw is None
    if pillow_missing:
        return screenshot_bytes

    try:
        canvas = Image.open(io.BytesIO(screenshot_bytes))
        marker_radius = 8
        left, top = mouse_pos.x - marker_radius, mouse_pos.y - marker_radius
        right, bottom = mouse_pos.x + marker_radius, mouse_pos.y + marker_radius
        # Red dot with a black outline so the marker stands out on any background
        ImageDraw.Draw(canvas).ellipse((left, top, right, bottom), outline="black", fill="red", width=2)

        png_buffer = io.BytesIO()
        canvas.save(png_buffer, format="PNG")
        return png_buffer.getvalue()
    except Exception as e:
        logger.error(f"Failed to draw mouse position: {e}")
        return screenshot_bytes
145
+
146
async def get_state(self) -> EnvState:
    """Return the current state: a fresh screenshot together with the stored screen dimensions."""
    return EnvState(screenshot=await self._get_screenshot(), height=self.height, width=self.width)
149
+
150
+ async def step(self, action: OperatorAction) -> EnvStepResult:
151
+ output: Optional[Union[str, dict]] = None
152
+ error: Optional[str] = None
153
+ step_type: str = "text"
154
+
155
+ try:
156
+ match action.type:
157
+ case "click":
158
+ x, y, button_name = action.x, action.y, action.button
159
+ modifiers_to_press = self.parse_key_combination(action.modifiers) if action.modifiers else []
160
+ for mod_key in modifiers_to_press:
161
+ await self._execute("keyDown", mod_key)
162
+
163
+ if button_name == "wheel":
164
+ # Perform a small scroll action at this position (e.g., one "tick" down)
165
+ # Pyautogui scroll: positive up, negative down.
166
+ await self._execute("scroll", -1, x=x, y=y)
167
+ output = f"Scrolled wheel at ({x}, {y})"
168
+ else:
169
+ pyautogui_button = button_name.lower() if button_name else "left"
170
+ await self._execute("click", x=x, y=y, button=pyautogui_button)
171
+ output = f"{button_name.capitalize() if button_name else 'Left'} clicked at ({x}, {y})"
172
+
173
+ for mod_key in reversed(modifiers_to_press):
174
+ await self._execute("keyUp", mod_key)
175
+
176
+ self.mouse_pos = Point(x=x, y=y)
177
+ logger.debug(f"Action: {action.type} {button_name} at ({x},{y}) with modifiers {action.modifiers}")
178
+
179
+ case "double_click":
180
+ x, y = action.x, action.y
181
+ await self._execute("doubleClick", x=x, y=y)
182
+ self.mouse_pos = Point(x=x, y=y)
183
+ output = f"Double clicked at ({x}, {y})"
184
+ logger.debug(f"Action: {action.type} at ({x},{y})")
185
+
186
+ case "triple_click":
187
+ x, y = action.x, action.y
188
+ await self._execute("click", x=x, y=y, clicks=3)
189
+ self.mouse_pos = Point(x=x, y=y)
190
+ output = f"Triple clicked at ({x}, {y})"
191
+ logger.debug(f"Action: {action.type} at ({x},{y})")
192
+
193
+ case "scroll":
194
+ current_x_pos, current_y_pos = await self._execute("position")
195
+ target_x = action.x if action.x is not None else current_x_pos
196
+ target_y = action.y if action.y is not None else current_y_pos
197
+
198
+ if target_x != current_x_pos or target_y != current_y_pos:
199
+ await self._execute("moveTo", target_x, target_y)
200
+
201
+ self.mouse_pos = Point(x=target_x, y=target_y) # Update mouse pos to scroll location
202
+
203
+ if action.scroll_x is not None or action.scroll_y is not None:
204
+ scroll_x_amount = action.scroll_x or 0
205
+ scroll_y_amount = action.scroll_y or 0
206
+
207
+ if scroll_x_amount != 0:
208
+ await self._execute("hscroll", scroll_x_amount)
209
+ if scroll_y_amount != 0:
210
+ # pyautogui scroll: positive up, so negate for typical "scroll down" meaning positive y
211
+ await self._execute("scroll", -scroll_y_amount)
212
+ output = f"Scrolled by (x:{scroll_x_amount}, y:{scroll_y_amount}) at ({target_x}, {target_y})"
213
+ elif action.scroll_direction:
214
+ # Define scroll unit (number of pyautogui scroll 'clicks')
215
+ # This might need tuning based on desired sensitivity.
216
+ pyautogui_scroll_clicks_per_unit = 1
217
+ amount = action.scroll_amount or 1
218
+ total_scroll_clicks = pyautogui_scroll_clicks_per_unit * amount
219
+
220
+ if action.scroll_direction == "up":
221
+ await self._execute("scroll", total_scroll_clicks)
222
+ elif action.scroll_direction == "down":
223
+ await self._execute("scroll", -total_scroll_clicks)
224
+ elif action.scroll_direction == "left":
225
+ await self._execute("hscroll", -total_scroll_clicks)
226
+ elif action.scroll_direction == "right":
227
+ await self._execute("hscroll", total_scroll_clicks)
228
+ output = f"Scrolled {action.scroll_direction} by {amount} units at ({target_x}, {target_y})"
229
+ else:
230
+ error = "Scroll action requires either scroll_x/y or scroll_direction"
231
+ logger.debug(f"Action: {action.type} details: {output or error}")
232
+
233
+ case "keypress":
234
+ mapped_keys = [self.CUA_KEY_TO_PYAUTOGUI_KEY.get(k.lower(), k.lower()) for k in action.keys]
235
+ key_string = "N/A"
236
+ if not mapped_keys:
237
+ error = "Keypress action requires at least one key"
238
+ elif len(mapped_keys) > 1:
239
+ await self._execute("hotkey", *mapped_keys)
240
+ key_string = "+".join(mapped_keys)
241
+ else:
242
+ await self._execute("press", mapped_keys[0])
243
+ key_string = mapped_keys[0]
244
+ if not error:
245
+ output = f"Pressed key(s): {key_string}"
246
+ logger.debug(f"Action: {action.type} '{key_string}'")
247
+
248
+ case "type":
249
+ text_to_type = action.text
250
+ await self._execute("typewrite", text_to_type, interval=0.02) # Small interval
251
+ output = f"Typed text: {text_to_type}"
252
+ logger.debug(f"Action: {action.type} '{text_to_type}'")
253
+
254
+ case "wait":
255
+ duration = action.duration
256
+ await asyncio.sleep(duration)
257
+ output = f"Waited for {duration} seconds"
258
+ logger.debug(f"Action: {action.type} for {duration}s")
259
+
260
+ case "screenshot":
261
+ step_type = "image"
262
+ # The actual screenshot data is added from after_state later
263
+ output = {"message": "Screenshot captured", "url": "desktop"}
264
+ logger.debug(f"Action: {action.type}")
265
+
266
+ case "move":
267
+ x, y = action.x, action.y
268
+ await self._execute("moveTo", x, y, duration=0.2) # Small duration for smooth move
269
+ self.mouse_pos = Point(x=x, y=y)
270
+ output = f"Moved mouse to ({x}, {y})"
271
+ logger.debug(f"Action: {action.type} to ({x},{y})")
272
+
273
+ case "drag":
274
+ if not isinstance(action, DragAction):
275
+ raise TypeError("Invalid action type for drag")
276
+ drag_path = action.path
277
+ if not drag_path:
278
+ error = "Missing path for drag action"
279
+ else:
280
+ start_x, start_y = drag_path[0].x, drag_path[0].y
281
+ await self._execute("moveTo", start_x, start_y, duration=0.1)
282
+ await self._execute("mouseDown")
283
+ for point in drag_path[1:]:
284
+ await self._execute("moveTo", point.x, point.y, duration=0.05)
285
+ await self._execute("mouseUp")
286
+ self.mouse_pos = Point(x=drag_path[-1].x, y=drag_path[-1].y)
287
+ output = f"Drag along path starting at ({start_x},{start_y})"
288
+ logger.debug(f"Action: {action.type} with {len(drag_path)} points")
289
+
290
+ case "mouse_down":
291
+ pyautogui_button = action.button.lower() if action.button else "left"
292
+ await self._execute("mouseDown", button=pyautogui_button)
293
+ output = f"{action.button.capitalize() if action.button else 'Left'} mouse button down"
294
+ logger.debug(f"Action: {action.type} {action.button}")
295
+
296
+ case "mouse_up":
297
+ pyautogui_button = action.button.lower() if action.button else "left"
298
+ await self._execute("mouseUp", button=pyautogui_button)
299
+ output = f"{action.button.capitalize() if action.button else 'Left'} mouse button up"
300
+ logger.debug(f"Action: {action.type} {action.button}")
301
+
302
+ case "hold_key":
303
+ keys_to_hold_str = action.text
304
+ duration = action.duration
305
+ parsed_keys = self.parse_key_combination(keys_to_hold_str)
306
+ if not parsed_keys:
307
+ error = f"No valid keys found in '{keys_to_hold_str}' for hold_key"
308
+ else:
309
+ for key_to_hold in parsed_keys:
310
+ await self._execute("keyDown", key_to_hold)
311
+ await asyncio.sleep(duration) # Non-pyautogui, direct sleep
312
+ for key_to_hold in reversed(parsed_keys): # Release in reverse order
313
+ await self._execute("keyUp", key_to_hold)
314
+ output = (
315
+ f"Held key{'s' if len(parsed_keys) > 1 else ''} {keys_to_hold_str} for {duration} seconds"
316
+ )
317
+ logger.debug(f"Action: {action.type} '{keys_to_hold_str}' for {duration}s")
318
+
319
+ case "key_down":
320
+ key_to_press = self.CUA_KEY_TO_PYAUTOGUI_KEY.get(action.key.lower(), action.key)
321
+ await self._execute("keyDown", key_to_press)
322
+ output = f"Key down: {key_to_press}"
323
+ logger.debug(f"Action: {action.type} {key_to_press}")
324
+
325
+ case "key_up":
326
+ key_to_release = self.CUA_KEY_TO_PYAUTOGUI_KEY.get(action.key.lower(), action.key)
327
+ await self._execute("keyUp", key_to_release)
328
+ output = f"Key up: {key_to_release}"
329
+ logger.debug(f"Action: {action.type} {key_to_release}")
330
+
331
+ case "cursor_position":
332
+ pos_x, pos_y = await self._execute("position")
333
+ self.mouse_pos = Point(x=pos_x, y=pos_y)
334
+ output = f"Cursor position is ({pos_x}, {pos_y})"
335
+ logger.debug(f"Action: {action.type}, position: ({pos_x},{pos_y})")
336
+
337
+ case "goto":
338
+ output = f"Goto action (URL: {action.url}) is not applicable for ComputerEnvironment."
339
+ logger.warning(f"Unsupported action: {action.type} for ComputerEnvironment.")
340
+
341
+ case "back":
342
+ output = "Back action is not applicable for ComputerEnvironment."
343
+ logger.warning(f"Unsupported action: {action.type} for ComputerEnvironment.")
344
+
345
+ case "terminal":
346
+ # Execute terminal command
347
+ result = await self._execute_shell_command(action.command)
348
+ if result["success"]:
349
+ output = f"Command executed successfully:\n{result['output']}"
350
+ else:
351
+ error = f"Command execution failed: {result['error']}"
352
+ logger.debug(f"Action: {action.type} with command '{action.command}'")
353
+
354
+ case "text_editor_view":
355
+ # View file contents
356
+ file_path = action.path
357
+ view_range = action.view_range
358
+ # Type guard: path should be str for text editor actions
359
+ if not isinstance(file_path, str):
360
+ raise TypeError("Invalid path type for text editor view action")
361
+ escaped_path = file_path.replace("'", "'\"'\"'")
362
+ is_dir = await self._execute("os.path.isdir", escaped_path)
363
+ if is_dir:
364
+ cmd = rf"find {escaped_path} -maxdepth 2 -not -path '*/\.*'"
365
+ elif view_range:
366
+ # Use head/tail to view specific line range
367
+ start_line, end_line = view_range
368
+ lines_to_show = end_line - start_line + 1
369
+ cmd = f"head -n {end_line} '{escaped_path}' | tail -n {lines_to_show}"
370
+ else:
371
+ # View entire file
372
+ cmd = f"cat '{escaped_path}'"
373
+
374
+ result = await self._execute_shell_command(cmd)
375
+ MAX_OUTPUT_LENGTH = 15000 # Limit output length to avoid excessive data
376
+ if len(result["output"]) > MAX_OUTPUT_LENGTH:
377
+ result["output"] = f"{result['output'][:MAX_OUTPUT_LENGTH]}..."
378
+ if result["success"]:
379
+ if is_dir:
380
+ output = f"Here's the files and directories up to 2 levels deep in {file_path}, excluding hidden items:\n{result['output']}"
381
+ else:
382
+ output = f"File contents of {file_path}:\n{result['output']}"
383
+ else:
384
+ error = f"Failed to view file {file_path}: {result['error']}"
385
+ logger.debug(f"Action: {action.type} for file {file_path}")
386
+
387
+ case "text_editor_create":
388
+ # Create new file with contents
389
+ file_path = action.path
390
+ file_text = action.file_text
391
+ # Type guard: path should be str for text editor actions
392
+ if not isinstance(file_path, str):
393
+ raise TypeError("Invalid path type for text editor create action")
394
+ escaped_path = file_path.replace("'", "'\"'\"'")
395
+ escaped_content = file_text.replace("\t", " ").replace(
396
+ "'", "'\"'\"'"
397
+ ) # Escape single quotes for shell
398
+ cmd = f"echo '{escaped_content}' > '{escaped_path}'"
399
+
400
+ result = await self._execute_shell_command(cmd)
401
+ if result["success"]:
402
+ output = f"Created file {file_path} with {len(file_text)} characters"
403
+ else:
404
+ error = f"Failed to create file {file_path}: {result['error']}"
405
+ logger.debug(f"Action: {action.type} created file {file_path}")
406
+
407
+ case "text_editor_str_replace":
408
+ # Execute string replacement
409
+ file_path = action.path
410
+ old_str = action.old_str
411
+ new_str = action.new_str
412
+
413
+ # Type guard: path should be str for text editor actions
414
+ if not isinstance(file_path, str):
415
+ raise TypeError("Invalid path type for text editor str_replace action")
416
+ # Use sed for string replacement, escaping special characters
417
+ escaped_path = file_path.replace("'", "'\"'\"'")
418
+ escaped_old = (
419
+ old_str.replace("\t", " ")
420
+ .replace("\\", "\\\\")
421
+ .replace("\n", "\\n")
422
+ .replace("/", "\\/")
423
+ .replace("'", "'\"'\"'")
424
+ )
425
+ escaped_new = (
426
+ new_str.replace("\t", " ")
427
+ .replace("\\", "\\\\")
428
+ .replace("\n", "\\n")
429
+ .replace("&", "\\&")
430
+ .replace("/", "\\/")
431
+ .replace("'", "'\"'\"'")
432
+ )
433
+ cmd = f"sed -i.bak 's/{escaped_old}/{escaped_new}/g' '{escaped_path}'"
434
+
435
+ result = await self._execute_shell_command(cmd)
436
+ if result["success"]:
437
+ output = f"Replaced '{old_str[:50]}...' with '{new_str[:50]}...' in {file_path}"
438
+ else:
439
+ error = f"Failed to replace text in {file_path}: {result['error']}"
440
+ logger.debug(f"Action: {action.type} in file {file_path}")
441
+
442
+ case "text_editor_insert":
443
+ # Insert text after specified line
444
+ file_path = action.path
445
+ insert_line = action.insert_line
446
+ new_str = action.new_str
447
+
448
+ # Type guard: path should be str for text editor actions
449
+ if not isinstance(file_path, str):
450
+ error = "Invalid path type for text editor insert action.\n"
451
+ error += f"Failed to insert text in {file_path}: {result['error']}"
452
+ raise TypeError(error)
453
+ escaped_path = file_path.replace("'", "'\"'\"'")
454
+ escaped_content = (
455
+ new_str.replace("\t", " ")
456
+ .replace("\\", "\\\\")
457
+ .replace("'", "'\"'\"'")
458
+ .replace("\n", "\\\n")
459
+ )
460
+ cmd = f"sed -i.bak '{insert_line}a\\{escaped_content}' '{escaped_path}'"
461
+
462
+ result = await self._execute_shell_command(cmd)
463
+ if result["success"]:
464
+ output = f"Inserted text after line {insert_line} in {file_path}"
465
+ else:
466
+ error = f"Failed to insert text in {file_path}: {result['error']}"
467
+ logger.debug(f"Action: {action.type} at line {insert_line} in file {file_path}")
468
+
469
+ case _:
470
+ error = f"Unrecognized action type: {action.type}"
471
+ logger.warning(error)
472
+ except KeyboardInterrupt:
473
+ error = "User interrupt. Operation aborted."
474
+ logger.error(error)
475
+ except TypeError as e:
476
+ logger.error(f"Error executing action {action.type}: {e}")
477
+ except Exception as e:
478
+ error = f"Unexpected error executing action {action.type}: {str(e)}"
479
+ logger.exception(
480
+ f"Unexpected error during step execution for action: {action.model_dump_json(exclude_none=True)}"
481
+ )
482
+
483
+ after_state = await self.get_state()
484
+
485
+ if action.type == "screenshot" and step_type == "image":
486
+ output = {"image": after_state.screenshot, "url": after_state.url}
487
+
488
+ return EnvStepResult(
489
+ type=step_type,
490
+ output=output,
491
+ error=error,
492
+ current_url=after_state.url,
493
+ screenshot_base64=after_state.screenshot,
494
+ )
495
+
496
+ async def _execute_shell_command(self, command: str, new: bool = True) -> dict:
497
+ """Execute a shell command and return the result."""
498
+ try:
499
+ if self.provider == "docker":
500
+ # Execute command in Docker container
501
+ docker_args = [
502
+ "docker",
503
+ "exec",
504
+ self.docker_container_name,
505
+ "bash",
506
+ "-c",
507
+ command, # The command string is passed as a single argument to bash -c
508
+ ]
509
+ process = await asyncio.to_thread(
510
+ subprocess.run,
511
+ docker_args,
512
+ capture_output=True,
513
+ text=True,
514
+ check=False,
515
+ timeout=120,
516
+ )
517
+ else:
518
+ # Execute command locally
519
+ process = await asyncio.to_thread(
520
+ subprocess.run,
521
+ command,
522
+ shell=True,
523
+ capture_output=True,
524
+ text=True,
525
+ check=False,
526
+ start_new_session=new,
527
+ timeout=120,
528
+ )
529
+
530
+ if process.returncode == 0:
531
+ return {"success": True, "output": process.stdout, "error": None}
532
+ else:
533
+ return {"success": False, "output": process.stdout, "error": process.stderr}
534
+ except asyncio.TimeoutError:
535
+ return {"success": False, "output": "", "error": f"Command timed out after 120 seconds."}
536
+ except Exception as e:
537
+ return {"success": False, "output": "", "error": str(e)}
538
+
539
async def close(self) -> None:
    """Close the environment. Only logs a message as there is nothing to release."""
    closing_note = "Computer environment closed. No specific resources to release for PyAutoGUI."
    logger.debug(closing_note)
541
+
542
# Lookup from lower-cased key names received in actions to the key names passed to pyautogui.
# Names without an entry are passed through unchanged by the code that reads this table.
CUA_KEY_TO_PYAUTOGUI_KEY = {
    # Modifier keys
    "option": "alt",
    "control": "ctrl",
    "cmd": "command",
    "super": "win",
    # "meta" resolves to the platform's own key: command on macOS, win elsewhere
    "meta": "command" if platform.system() == "Darwin" else "win",
    # Navigation and editing keys
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    "arrowup": "up",
    "caps_lock": "capslock",
    "del": "delete",
    "return": "enter",
    "esc": "escape",
    "pgdn": "pagedown",
    "pgup": "pageup",
    " ": "space",
    # Numpad keys. Only the 0 key is mapped here.
    "numpad0": "num0",
    "numpad_0": "num0",
}
565
+
566
+ @staticmethod
567
+ def parse_key_combination(text: str) -> list[str]:
568
+ if not text:
569
+ return []
570
+
571
+ keys_str_list = text.lower().split("+")
572
+ mapped_keys = []
573
+ for k_str in keys_str_list:
574
+ # Use the mapped key if found, otherwise use the string itself (e.g. 'a', '1')
575
+ mapped_keys.append(ComputerEnvironment.CUA_KEY_TO_PYAUTOGUI_KEY.get(k_str.strip(), k_str.strip()))
576
+ return mapped_keys
577
+
578
+ def generate_pyautogui_command(self, func_name: str, *args, **kwargs) -> str:
579
+ args_repr = [repr(arg) for arg in args]
580
+ kwargs_repr = [f"{k}={repr(v)}" for k, v in kwargs.items()]
581
+ all_params_repr = ", ".join(args_repr + kwargs_repr)
582
+
583
+ # Base script setup
584
+ script_lines = [
585
+ "import os",
586
+ "import pyautogui",
587
+ ]
588
+
589
+ if self.provider == "docker":
590
+ script_lines.extend(
591
+ [
592
+ # Display export for Docker.
593
+ f"os.environ['DISPLAY']='{self.docker_display}'",
594
+ # Disable failsafe in Docker to avoid accidental exits
595
+ "pyautogui.FAILSAFE = False",
596
+ ]
597
+ )
598
+
599
+ # Function-specific logic
600
+ if func_name == "screenshot":
601
+ script_lines.extend(
602
+ [
603
+ "import io",
604
+ "import base64",
605
+ "img = pyautogui.screenshot()",
606
+ "buf = io.BytesIO()",
607
+ "img.save(buf, format='PNG')",
608
+ "print(base64.b64encode(buf.getvalue()).decode('utf-8'))",
609
+ ]
610
+ )
611
+ elif func_name == "size":
612
+ script_lines.extend(["size = pyautogui.size()", "print(f'({size.width}, {size.height})')"])
613
+ elif func_name == "position":
614
+ script_lines.extend(["pos = pyautogui.position()", "print(f'({pos.x}, {pos.y})')"])
615
+ else: # General command structure
616
+ script_lines.extend(
617
+ [f"result = pyautogui.{func_name}({all_params_repr})", "print(result if result is not None else '')"]
618
+ )
619
+
620
+ return "; ".join(script_lines)
621
+
622
async def docker_execute(self, python_command_str: str) -> Optional[str]:
    """
    Run a Python one-liner with `python3 -c` inside the Docker container.

    The command is passed to `docker exec` as an argument list, without a shell, so
    quotes, dollar signs, backticks and backslashes in the script reach Python unchanged.

    Args:
        python_command_str: The Python script to run.

    Returns:
        The script's stripped standard output, or None if the container name or
        display is not set.

    Raises:
        KeyboardInterrupt: If the script failed with a pyautogui FailSafeException.
        RuntimeError: If the script exited with an error or docker could not be run.
    """
    import shlex  # Local import: only used to render the command in log messages

    if not self.docker_container_name or not self.docker_display:
        logger.error("Container name or Docker display not set for Docker execution.")
        return None

    docker_args = [
        "docker",
        "exec",
        "-e",
        f"DISPLAY={self.docker_display}",
        self.docker_container_name,
        "python3",
        "-c",
        python_command_str,
    ]
    docker_full_cmd = shlex.join(docker_args)

    try:
        process = await asyncio.to_thread(
            subprocess.run,
            docker_args,
            capture_output=True,
            text=True,
            check=False,  # We check returncode manually
        )
    except Exception as e:
        logger.error(f"Unexpected error running command in Docker '{docker_full_cmd}': {e}")
        # Encapsulate as RuntimeError to avoid leaking subprocess errors directly
        raise RuntimeError(f"Unexpected Docker error: {e}") from e

    # Check the exit status outside the try block, so a script failure is not
    # caught above and reported as an unexpected Docker error.
    if process.returncode != 0:
        if "FailSafeException" in process.stderr or "FailSafeException" in process.stdout:
            raise KeyboardInterrupt(process.stderr or process.stdout)
        error_msg = (
            f"Docker command failed:\nCmd: {docker_full_cmd}\n"
            f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
        )
        logger.error(error_msg)
        raise RuntimeError(f"Docker exec error: {process.stderr or process.stdout}")
    return process.stdout.strip()