khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +20 -0
- khoj/database/models/__init__.py +1 -1
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
- khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-774c78ff0f55a228.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-4454891c5007b870.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-5a2559825b4d5def.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-f7a0286dfc31ad6b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-f1a7f278c89e09b6.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-5d9134d4a97f8834.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-32cd0ceb9ffbd777.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-952bc0d41769db77.js} +1 -1
- khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
- khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
- khoj/processor/conversation/anthropic/utils.py +30 -7
- khoj/processor/conversation/google/gemini_chat.py +10 -10
- khoj/processor/conversation/google/utils.py +20 -12
- khoj/processor/conversation/offline/chat_model.py +2 -7
- khoj/processor/conversation/openai/gpt.py +8 -9
- khoj/processor/conversation/utils.py +132 -21
- khoj/processor/operator/README.md +59 -0
- khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
- khoj/processor/operator/grounding_agent.py +229 -175
- khoj/processor/operator/grounding_agent_uitars.py +59 -48
- khoj/processor/operator/operator_actions.py +48 -0
- khoj/processor/operator/operator_agent_anthropic.py +298 -90
- khoj/processor/operator/operator_agent_base.py +45 -14
- khoj/processor/operator/operator_agent_binary.py +125 -57
- khoj/processor/operator/operator_agent_openai.py +183 -75
- khoj/processor/operator/operator_environment_base.py +11 -1
- khoj/processor/operator/operator_environment_browser.py +5 -3
- khoj/processor/operator/operator_environment_computer.py +658 -0
- khoj/routers/api_chat.py +36 -25
- khoj/routers/helpers.py +8 -17
- khoj/routers/research.py +43 -20
- khoj/utils/constants.py +4 -4
- khoj/utils/helpers.py +12 -15
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +70 -68
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-d5ae861e1ade9d08.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
- khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
- khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,658 @@
|
|
1
|
+
import ast
|
2
|
+
import asyncio
|
3
|
+
import base64
|
4
|
+
import io
|
5
|
+
import logging
|
6
|
+
import platform
|
7
|
+
import subprocess
|
8
|
+
from typing import Literal, Optional, Union
|
9
|
+
|
10
|
+
from PIL import Image, ImageDraw
|
11
|
+
|
12
|
+
from khoj.processor.operator.operator_actions import DragAction, OperatorAction, Point
|
13
|
+
from khoj.processor.operator.operator_environment_base import (
|
14
|
+
Environment,
|
15
|
+
EnvState,
|
16
|
+
EnvStepResult,
|
17
|
+
)
|
18
|
+
from khoj.utils.helpers import convert_image_to_webp
|
19
|
+
|
20
|
+
logger = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
# --- Concrete Computer Environment ---
|
24
|
+
class ComputerEnvironment(Environment):
|
25
|
+
def __init__(
    self,
    provider: Literal["local", "docker"] = "local",
    docker_display: str = ":99",
    docker_container_name: str = "khoj-computer",
):
    """
    Configure where pyautogui commands are run.

    Args:
        provider: "local" to drive this machine, "docker" to drive a container.
        docker_display: X display used when the provider is "docker".
        docker_container_name: Name of the container used when the provider is "docker".
    """
    # Screen size and cursor position are placeholders until start() queries the real values
    self.mouse_pos: Point = Point(x=0, y=0)
    self.height: int = 0
    self.width: int = 0

    self.docker_container_name = docker_container_name
    self.docker_display = docker_display
    self.provider = provider
|
38
|
+
|
39
|
+
async def _execute(self, func_name, *args, **kwargs):
|
40
|
+
"""
|
41
|
+
Executes a pyautogui function, abstracting the execution context.
|
42
|
+
Currently runs locally using asyncio.to_thread.
|
43
|
+
"""
|
44
|
+
python_command_str = self.generate_pyautogui_command(func_name, *args, **kwargs)
|
45
|
+
# Docker execution
|
46
|
+
if self.provider == "docker":
|
47
|
+
try:
|
48
|
+
output_str = await self.docker_execute(python_command_str)
|
49
|
+
except RuntimeError as e: # Catch other Docker execution errors
|
50
|
+
logger.error(f"Error during Docker execution of {func_name}: {e}")
|
51
|
+
raise # Re-raise as a general error for the caller to handle
|
52
|
+
# Local execution
|
53
|
+
else:
|
54
|
+
process = await asyncio.to_thread(
|
55
|
+
subprocess.run,
|
56
|
+
["python3", "-c", python_command_str],
|
57
|
+
capture_output=True,
|
58
|
+
text=True,
|
59
|
+
check=False, # We check returncode manually
|
60
|
+
)
|
61
|
+
output_str = process.stdout.strip()
|
62
|
+
if process.returncode != 0:
|
63
|
+
if "FailSafeException" in process.stderr or "FailSafeException" in process.stdout:
|
64
|
+
# Extract the message if possible, otherwise use generic
|
65
|
+
fs_msg = process.stderr or process.stdout
|
66
|
+
raise KeyboardInterrupt(fs_msg)
|
67
|
+
else:
|
68
|
+
error_msg = (
|
69
|
+
f'Local script execution failed:\nCmd: python3 -c "{python_command_str[:200]}...{python_command_str[-200:]}\n'
|
70
|
+
f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
|
71
|
+
)
|
72
|
+
logger.error(error_msg)
|
73
|
+
raise RuntimeError(f"Local script execution error: {process.stderr or process.stdout}")
|
74
|
+
if not output_str or output_str == "None":
|
75
|
+
return None
|
76
|
+
|
77
|
+
try:
|
78
|
+
return ast.literal_eval(output_str)
|
79
|
+
except (ValueError, SyntaxError):
|
80
|
+
# If not a literal (e.g., some other string output), return as is
|
81
|
+
return output_str
|
82
|
+
|
83
|
+
async def start(self, width: int, height: int) -> None:
    """
    Prepare the environment by reading the real screen size and cursor position.

    The requested width and height are only logged; the dimensions reported by
    pyautogui are what get stored on the environment.
    """
    self.width, self.height = await self._execute("size")

    # Prefer the live cursor position; fall back to the screen centre if it cannot be read
    try:
        cursor_x, cursor_y = await self._execute("position")
        self.mouse_pos = Point(x=cursor_x, y=cursor_y)
    except Exception:
        self.mouse_pos = Point(x=self.width / 2, y=self.height / 2)

    startup_summary = (
        f"Computer environment started. Screen size: {self.width}x{self.height}. "
        f"Input width/height ({width}x{height}) are noted but screen dimensioning uses actual screen size. "
        f"Initial mouse position: ({self.mouse_pos.x},{self.mouse_pos.y})"
    )
    logger.info(startup_summary)
|
104
|
+
|
105
|
+
async def _get_screenshot(self) -> Optional[str]:
    """
    Capture the screen with the cursor marked on it.

    Returns:
        The screenshot as a base64 encoded WebP string, or None if capturing failed.
    """
    try:
        # Raw PNG bytes of the current screen
        png_bytes = base64.b64decode(await self._execute("screenshot"))

        # Where the cursor is right now
        cursor_x, cursor_y = await self._execute("position")
        cursor = Point(x=cursor_x, y=cursor_y)

        # Overlay the cursor marker, then re-encode as WebP
        marked_png_bytes = await self._draw_mouse_position(png_bytes, cursor)
        webp_bytes = convert_image_to_webp(marked_png_bytes)
        return base64.b64encode(webp_bytes).decode("utf-8")
    except KeyboardInterrupt:
        # Fail-safe interrupts must reach the caller
        raise
    except Exception as e:
        logger.error(f"Failed to get screenshot: {e}", exc_info=True)
        return None
|
124
|
+
|
125
|
+
async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
    """
    Paint a cursor marker onto a screenshot.

    Args:
        screenshot_bytes: Encoded image to annotate.
        mouse_pos: Position at which to centre the marker.

    Returns:
        The annotated image as PNG bytes, or the input bytes unchanged if drawing failed.
    """
    if Image is None or ImageDraw is None:
        return screenshot_bytes

    try:
        canvas = Image.open(io.BytesIO(screenshot_bytes))
        pen = ImageDraw.Draw(canvas)

        # Red disc with a black rim so the marker stands out on any background
        marker_radius = 8
        bounding_box = (
            mouse_pos.x - marker_radius,
            mouse_pos.y - marker_radius,
            mouse_pos.x + marker_radius,
            mouse_pos.y + marker_radius,
        )
        pen.ellipse(bounding_box, outline="black", fill="red", width=2)

        png_buffer = io.BytesIO()
        canvas.save(png_buffer, format="PNG")
        return png_buffer.getvalue()
    except Exception as e:
        logger.error(f"Failed to draw mouse position: {e}")
        return screenshot_bytes
|
145
|
+
|
146
|
+
async def get_state(self) -> EnvState:
    """Return the current screenshot together with the stored screen dimensions."""
    return EnvState(
        screenshot=await self._get_screenshot(),
        height=self.height,
        width=self.width,
    )
|
149
|
+
|
150
|
+
async def step(self, action: OperatorAction) -> EnvStepResult:
|
151
|
+
output: Optional[Union[str, dict]] = None
|
152
|
+
error: Optional[str] = None
|
153
|
+
step_type: str = "text"
|
154
|
+
|
155
|
+
try:
|
156
|
+
match action.type:
|
157
|
+
case "click":
|
158
|
+
x, y, button_name = action.x, action.y, action.button
|
159
|
+
modifiers_to_press = self.parse_key_combination(action.modifiers) if action.modifiers else []
|
160
|
+
for mod_key in modifiers_to_press:
|
161
|
+
await self._execute("keyDown", mod_key)
|
162
|
+
|
163
|
+
if button_name == "wheel":
|
164
|
+
# Perform a small scroll action at this position (e.g., one "tick" down)
|
165
|
+
# Pyautogui scroll: positive up, negative down.
|
166
|
+
await self._execute("scroll", -1, x=x, y=y)
|
167
|
+
output = f"Scrolled wheel at ({x}, {y})"
|
168
|
+
else:
|
169
|
+
pyautogui_button = button_name.lower() if button_name else "left"
|
170
|
+
await self._execute("click", x=x, y=y, button=pyautogui_button)
|
171
|
+
output = f"{button_name.capitalize() if button_name else 'Left'} clicked at ({x}, {y})"
|
172
|
+
|
173
|
+
for mod_key in reversed(modifiers_to_press):
|
174
|
+
await self._execute("keyUp", mod_key)
|
175
|
+
|
176
|
+
self.mouse_pos = Point(x=x, y=y)
|
177
|
+
logger.debug(f"Action: {action.type} {button_name} at ({x},{y}) with modifiers {action.modifiers}")
|
178
|
+
|
179
|
+
case "double_click":
|
180
|
+
x, y = action.x, action.y
|
181
|
+
await self._execute("doubleClick", x=x, y=y)
|
182
|
+
self.mouse_pos = Point(x=x, y=y)
|
183
|
+
output = f"Double clicked at ({x}, {y})"
|
184
|
+
logger.debug(f"Action: {action.type} at ({x},{y})")
|
185
|
+
|
186
|
+
case "triple_click":
|
187
|
+
x, y = action.x, action.y
|
188
|
+
await self._execute("click", x=x, y=y, clicks=3)
|
189
|
+
self.mouse_pos = Point(x=x, y=y)
|
190
|
+
output = f"Triple clicked at ({x}, {y})"
|
191
|
+
logger.debug(f"Action: {action.type} at ({x},{y})")
|
192
|
+
|
193
|
+
case "scroll":
|
194
|
+
current_x_pos, current_y_pos = await self._execute("position")
|
195
|
+
target_x = action.x if action.x is not None else current_x_pos
|
196
|
+
target_y = action.y if action.y is not None else current_y_pos
|
197
|
+
|
198
|
+
if target_x != current_x_pos or target_y != current_y_pos:
|
199
|
+
await self._execute("moveTo", target_x, target_y)
|
200
|
+
|
201
|
+
self.mouse_pos = Point(x=target_x, y=target_y) # Update mouse pos to scroll location
|
202
|
+
|
203
|
+
if action.scroll_x is not None or action.scroll_y is not None:
|
204
|
+
scroll_x_amount = action.scroll_x or 0
|
205
|
+
scroll_y_amount = action.scroll_y or 0
|
206
|
+
|
207
|
+
if scroll_x_amount != 0:
|
208
|
+
await self._execute("hscroll", scroll_x_amount)
|
209
|
+
if scroll_y_amount != 0:
|
210
|
+
# pyautogui scroll: positive up, so negate for typical "scroll down" meaning positive y
|
211
|
+
await self._execute("scroll", -scroll_y_amount)
|
212
|
+
output = f"Scrolled by (x:{scroll_x_amount}, y:{scroll_y_amount}) at ({target_x}, {target_y})"
|
213
|
+
elif action.scroll_direction:
|
214
|
+
# Define scroll unit (number of pyautogui scroll 'clicks')
|
215
|
+
# This might need tuning based on desired sensitivity.
|
216
|
+
pyautogui_scroll_clicks_per_unit = 1
|
217
|
+
amount = action.scroll_amount or 1
|
218
|
+
total_scroll_clicks = pyautogui_scroll_clicks_per_unit * amount
|
219
|
+
|
220
|
+
if action.scroll_direction == "up":
|
221
|
+
await self._execute("scroll", total_scroll_clicks)
|
222
|
+
elif action.scroll_direction == "down":
|
223
|
+
await self._execute("scroll", -total_scroll_clicks)
|
224
|
+
elif action.scroll_direction == "left":
|
225
|
+
await self._execute("hscroll", -total_scroll_clicks)
|
226
|
+
elif action.scroll_direction == "right":
|
227
|
+
await self._execute("hscroll", total_scroll_clicks)
|
228
|
+
output = f"Scrolled {action.scroll_direction} by {amount} units at ({target_x}, {target_y})"
|
229
|
+
else:
|
230
|
+
error = "Scroll action requires either scroll_x/y or scroll_direction"
|
231
|
+
logger.debug(f"Action: {action.type} details: {output or error}")
|
232
|
+
|
233
|
+
case "keypress":
|
234
|
+
mapped_keys = [self.CUA_KEY_TO_PYAUTOGUI_KEY.get(k.lower(), k.lower()) for k in action.keys]
|
235
|
+
key_string = "N/A"
|
236
|
+
if not mapped_keys:
|
237
|
+
error = "Keypress action requires at least one key"
|
238
|
+
elif len(mapped_keys) > 1:
|
239
|
+
await self._execute("hotkey", *mapped_keys)
|
240
|
+
key_string = "+".join(mapped_keys)
|
241
|
+
else:
|
242
|
+
await self._execute("press", mapped_keys[0])
|
243
|
+
key_string = mapped_keys[0]
|
244
|
+
if not error:
|
245
|
+
output = f"Pressed key(s): {key_string}"
|
246
|
+
logger.debug(f"Action: {action.type} '{key_string}'")
|
247
|
+
|
248
|
+
case "type":
|
249
|
+
text_to_type = action.text
|
250
|
+
await self._execute("typewrite", text_to_type, interval=0.02) # Small interval
|
251
|
+
output = f"Typed text: {text_to_type}"
|
252
|
+
logger.debug(f"Action: {action.type} '{text_to_type}'")
|
253
|
+
|
254
|
+
case "wait":
|
255
|
+
duration = action.duration
|
256
|
+
await asyncio.sleep(duration)
|
257
|
+
output = f"Waited for {duration} seconds"
|
258
|
+
logger.debug(f"Action: {action.type} for {duration}s")
|
259
|
+
|
260
|
+
case "screenshot":
|
261
|
+
step_type = "image"
|
262
|
+
# The actual screenshot data is added from after_state later
|
263
|
+
output = {"message": "Screenshot captured", "url": "desktop"}
|
264
|
+
logger.debug(f"Action: {action.type}")
|
265
|
+
|
266
|
+
case "move":
|
267
|
+
x, y = action.x, action.y
|
268
|
+
await self._execute("moveTo", x, y, duration=0.2) # Small duration for smooth move
|
269
|
+
self.mouse_pos = Point(x=x, y=y)
|
270
|
+
output = f"Moved mouse to ({x}, {y})"
|
271
|
+
logger.debug(f"Action: {action.type} to ({x},{y})")
|
272
|
+
|
273
|
+
case "drag":
|
274
|
+
if not isinstance(action, DragAction):
|
275
|
+
raise TypeError("Invalid action type for drag")
|
276
|
+
drag_path = action.path
|
277
|
+
if not drag_path:
|
278
|
+
error = "Missing path for drag action"
|
279
|
+
else:
|
280
|
+
start_x, start_y = drag_path[0].x, drag_path[0].y
|
281
|
+
await self._execute("moveTo", start_x, start_y, duration=0.1)
|
282
|
+
await self._execute("mouseDown")
|
283
|
+
for point in drag_path[1:]:
|
284
|
+
await self._execute("moveTo", point.x, point.y, duration=0.05)
|
285
|
+
await self._execute("mouseUp")
|
286
|
+
self.mouse_pos = Point(x=drag_path[-1].x, y=drag_path[-1].y)
|
287
|
+
output = f"Drag along path starting at ({start_x},{start_y})"
|
288
|
+
logger.debug(f"Action: {action.type} with {len(drag_path)} points")
|
289
|
+
|
290
|
+
case "mouse_down":
|
291
|
+
pyautogui_button = action.button.lower() if action.button else "left"
|
292
|
+
await self._execute("mouseDown", button=pyautogui_button)
|
293
|
+
output = f"{action.button.capitalize() if action.button else 'Left'} mouse button down"
|
294
|
+
logger.debug(f"Action: {action.type} {action.button}")
|
295
|
+
|
296
|
+
case "mouse_up":
|
297
|
+
pyautogui_button = action.button.lower() if action.button else "left"
|
298
|
+
await self._execute("mouseUp", button=pyautogui_button)
|
299
|
+
output = f"{action.button.capitalize() if action.button else 'Left'} mouse button up"
|
300
|
+
logger.debug(f"Action: {action.type} {action.button}")
|
301
|
+
|
302
|
+
case "hold_key":
|
303
|
+
keys_to_hold_str = action.text
|
304
|
+
duration = action.duration
|
305
|
+
parsed_keys = self.parse_key_combination(keys_to_hold_str)
|
306
|
+
if not parsed_keys:
|
307
|
+
error = f"No valid keys found in '{keys_to_hold_str}' for hold_key"
|
308
|
+
else:
|
309
|
+
for key_to_hold in parsed_keys:
|
310
|
+
await self._execute("keyDown", key_to_hold)
|
311
|
+
await asyncio.sleep(duration) # Non-pyautogui, direct sleep
|
312
|
+
for key_to_hold in reversed(parsed_keys): # Release in reverse order
|
313
|
+
await self._execute("keyUp", key_to_hold)
|
314
|
+
output = (
|
315
|
+
f"Held key{'s' if len(parsed_keys) > 1 else ''} {keys_to_hold_str} for {duration} seconds"
|
316
|
+
)
|
317
|
+
logger.debug(f"Action: {action.type} '{keys_to_hold_str}' for {duration}s")
|
318
|
+
|
319
|
+
case "key_down":
|
320
|
+
key_to_press = self.CUA_KEY_TO_PYAUTOGUI_KEY.get(action.key.lower(), action.key)
|
321
|
+
await self._execute("keyDown", key_to_press)
|
322
|
+
output = f"Key down: {key_to_press}"
|
323
|
+
logger.debug(f"Action: {action.type} {key_to_press}")
|
324
|
+
|
325
|
+
case "key_up":
|
326
|
+
key_to_release = self.CUA_KEY_TO_PYAUTOGUI_KEY.get(action.key.lower(), action.key)
|
327
|
+
await self._execute("keyUp", key_to_release)
|
328
|
+
output = f"Key up: {key_to_release}"
|
329
|
+
logger.debug(f"Action: {action.type} {key_to_release}")
|
330
|
+
|
331
|
+
case "cursor_position":
|
332
|
+
pos_x, pos_y = await self._execute("position")
|
333
|
+
self.mouse_pos = Point(x=pos_x, y=pos_y)
|
334
|
+
output = f"Cursor position is ({pos_x}, {pos_y})"
|
335
|
+
logger.debug(f"Action: {action.type}, position: ({pos_x},{pos_y})")
|
336
|
+
|
337
|
+
case "goto":
|
338
|
+
output = f"Goto action (URL: {action.url}) is not applicable for ComputerEnvironment."
|
339
|
+
logger.warning(f"Unsupported action: {action.type} for ComputerEnvironment.")
|
340
|
+
|
341
|
+
case "back":
|
342
|
+
output = "Back action is not applicable for ComputerEnvironment."
|
343
|
+
logger.warning(f"Unsupported action: {action.type} for ComputerEnvironment.")
|
344
|
+
|
345
|
+
case "terminal":
|
346
|
+
# Execute terminal command
|
347
|
+
result = await self._execute_shell_command(action.command)
|
348
|
+
if result["success"]:
|
349
|
+
output = f"Command executed successfully:\n{result['output']}"
|
350
|
+
else:
|
351
|
+
error = f"Command execution failed: {result['error']}"
|
352
|
+
logger.debug(f"Action: {action.type} with command '{action.command}'")
|
353
|
+
|
354
|
+
case "text_editor_view":
|
355
|
+
# View file contents
|
356
|
+
file_path = action.path
|
357
|
+
view_range = action.view_range
|
358
|
+
# Type guard: path should be str for text editor actions
|
359
|
+
if not isinstance(file_path, str):
|
360
|
+
raise TypeError("Invalid path type for text editor view action")
|
361
|
+
escaped_path = file_path.replace("'", "'\"'\"'")
|
362
|
+
is_dir = await self._execute("os.path.isdir", escaped_path)
|
363
|
+
if is_dir:
|
364
|
+
cmd = rf"find {escaped_path} -maxdepth 2 -not -path '*/\.*'"
|
365
|
+
elif view_range:
|
366
|
+
# Use head/tail to view specific line range
|
367
|
+
start_line, end_line = view_range
|
368
|
+
lines_to_show = end_line - start_line + 1
|
369
|
+
cmd = f"head -n {end_line} '{escaped_path}' | tail -n {lines_to_show}"
|
370
|
+
else:
|
371
|
+
# View entire file
|
372
|
+
cmd = f"cat '{escaped_path}'"
|
373
|
+
|
374
|
+
result = await self._execute_shell_command(cmd)
|
375
|
+
MAX_OUTPUT_LENGTH = 15000 # Limit output length to avoid excessive data
|
376
|
+
if len(result["output"]) > MAX_OUTPUT_LENGTH:
|
377
|
+
result["output"] = f"{result['output'][:MAX_OUTPUT_LENGTH]}..."
|
378
|
+
if result["success"]:
|
379
|
+
if is_dir:
|
380
|
+
output = f"Here's the files and directories up to 2 levels deep in {file_path}, excluding hidden items:\n{result['output']}"
|
381
|
+
else:
|
382
|
+
output = f"File contents of {file_path}:\n{result['output']}"
|
383
|
+
else:
|
384
|
+
error = f"Failed to view file {file_path}: {result['error']}"
|
385
|
+
logger.debug(f"Action: {action.type} for file {file_path}")
|
386
|
+
|
387
|
+
case "text_editor_create":
|
388
|
+
# Create new file with contents
|
389
|
+
file_path = action.path
|
390
|
+
file_text = action.file_text
|
391
|
+
# Type guard: path should be str for text editor actions
|
392
|
+
if not isinstance(file_path, str):
|
393
|
+
raise TypeError("Invalid path type for text editor create action")
|
394
|
+
escaped_path = file_path.replace("'", "'\"'\"'")
|
395
|
+
escaped_content = file_text.replace("\t", " ").replace(
|
396
|
+
"'", "'\"'\"'"
|
397
|
+
) # Escape single quotes for shell
|
398
|
+
cmd = f"echo '{escaped_content}' > '{escaped_path}'"
|
399
|
+
|
400
|
+
result = await self._execute_shell_command(cmd)
|
401
|
+
if result["success"]:
|
402
|
+
output = f"Created file {file_path} with {len(file_text)} characters"
|
403
|
+
else:
|
404
|
+
error = f"Failed to create file {file_path}: {result['error']}"
|
405
|
+
logger.debug(f"Action: {action.type} created file {file_path}")
|
406
|
+
|
407
|
+
case "text_editor_str_replace":
|
408
|
+
# Execute string replacement
|
409
|
+
file_path = action.path
|
410
|
+
old_str = action.old_str
|
411
|
+
new_str = action.new_str
|
412
|
+
|
413
|
+
# Type guard: path should be str for text editor actions
|
414
|
+
if not isinstance(file_path, str):
|
415
|
+
raise TypeError("Invalid path type for text editor str_replace action")
|
416
|
+
# Use sed for string replacement, escaping special characters
|
417
|
+
escaped_path = file_path.replace("'", "'\"'\"'")
|
418
|
+
escaped_old = (
|
419
|
+
old_str.replace("\t", " ")
|
420
|
+
.replace("\\", "\\\\")
|
421
|
+
.replace("\n", "\\n")
|
422
|
+
.replace("/", "\\/")
|
423
|
+
.replace("'", "'\"'\"'")
|
424
|
+
)
|
425
|
+
escaped_new = (
|
426
|
+
new_str.replace("\t", " ")
|
427
|
+
.replace("\\", "\\\\")
|
428
|
+
.replace("\n", "\\n")
|
429
|
+
.replace("&", "\\&")
|
430
|
+
.replace("/", "\\/")
|
431
|
+
.replace("'", "'\"'\"'")
|
432
|
+
)
|
433
|
+
cmd = f"sed -i.bak 's/{escaped_old}/{escaped_new}/g' '{escaped_path}'"
|
434
|
+
|
435
|
+
result = await self._execute_shell_command(cmd)
|
436
|
+
if result["success"]:
|
437
|
+
output = f"Replaced '{old_str[:50]}...' with '{new_str[:50]}...' in {file_path}"
|
438
|
+
else:
|
439
|
+
error = f"Failed to replace text in {file_path}: {result['error']}"
|
440
|
+
logger.debug(f"Action: {action.type} in file {file_path}")
|
441
|
+
|
442
|
+
case "text_editor_insert":
|
443
|
+
# Insert text after specified line
|
444
|
+
file_path = action.path
|
445
|
+
insert_line = action.insert_line
|
446
|
+
new_str = action.new_str
|
447
|
+
|
448
|
+
# Type guard: path should be str for text editor actions
|
449
|
+
if not isinstance(file_path, str):
|
450
|
+
error = "Invalid path type for text editor insert action.\n"
|
451
|
+
error += f"Failed to insert text in {file_path}: {result['error']}"
|
452
|
+
raise TypeError(error)
|
453
|
+
escaped_path = file_path.replace("'", "'\"'\"'")
|
454
|
+
escaped_content = (
|
455
|
+
new_str.replace("\t", " ")
|
456
|
+
.replace("\\", "\\\\")
|
457
|
+
.replace("'", "'\"'\"'")
|
458
|
+
.replace("\n", "\\\n")
|
459
|
+
)
|
460
|
+
cmd = f"sed -i.bak '{insert_line}a\\{escaped_content}' '{escaped_path}'"
|
461
|
+
|
462
|
+
result = await self._execute_shell_command(cmd)
|
463
|
+
if result["success"]:
|
464
|
+
output = f"Inserted text after line {insert_line} in {file_path}"
|
465
|
+
else:
|
466
|
+
error = f"Failed to insert text in {file_path}: {result['error']}"
|
467
|
+
logger.debug(f"Action: {action.type} at line {insert_line} in file {file_path}")
|
468
|
+
|
469
|
+
case _:
|
470
|
+
error = f"Unrecognized action type: {action.type}"
|
471
|
+
logger.warning(error)
|
472
|
+
except KeyboardInterrupt:
|
473
|
+
error = "User interrupt. Operation aborted."
|
474
|
+
logger.error(error)
|
475
|
+
except TypeError as e:
|
476
|
+
logger.error(f"Error executing action {action.type}: {e}")
|
477
|
+
except Exception as e:
|
478
|
+
error = f"Unexpected error executing action {action.type}: {str(e)}"
|
479
|
+
logger.exception(
|
480
|
+
f"Unexpected error during step execution for action: {action.model_dump_json(exclude_none=True)}"
|
481
|
+
)
|
482
|
+
|
483
|
+
after_state = await self.get_state()
|
484
|
+
|
485
|
+
if action.type == "screenshot" and step_type == "image":
|
486
|
+
output = {"image": after_state.screenshot, "url": after_state.url}
|
487
|
+
|
488
|
+
return EnvStepResult(
|
489
|
+
type=step_type,
|
490
|
+
output=output,
|
491
|
+
error=error,
|
492
|
+
current_url=after_state.url,
|
493
|
+
screenshot_base64=after_state.screenshot,
|
494
|
+
)
|
495
|
+
|
496
|
+
async def _execute_shell_command(self, command: str, new: bool = True) -> dict:
|
497
|
+
"""Execute a shell command and return the result."""
|
498
|
+
try:
|
499
|
+
if self.provider == "docker":
|
500
|
+
# Execute command in Docker container
|
501
|
+
docker_args = [
|
502
|
+
"docker",
|
503
|
+
"exec",
|
504
|
+
self.docker_container_name,
|
505
|
+
"bash",
|
506
|
+
"-c",
|
507
|
+
command, # The command string is passed as a single argument to bash -c
|
508
|
+
]
|
509
|
+
process = await asyncio.to_thread(
|
510
|
+
subprocess.run,
|
511
|
+
docker_args,
|
512
|
+
capture_output=True,
|
513
|
+
text=True,
|
514
|
+
check=False,
|
515
|
+
timeout=120,
|
516
|
+
)
|
517
|
+
else:
|
518
|
+
# Execute command locally
|
519
|
+
process = await asyncio.to_thread(
|
520
|
+
subprocess.run,
|
521
|
+
command,
|
522
|
+
shell=True,
|
523
|
+
capture_output=True,
|
524
|
+
text=True,
|
525
|
+
check=False,
|
526
|
+
start_new_session=new,
|
527
|
+
timeout=120,
|
528
|
+
)
|
529
|
+
|
530
|
+
if process.returncode == 0:
|
531
|
+
return {"success": True, "output": process.stdout, "error": None}
|
532
|
+
else:
|
533
|
+
return {"success": False, "output": process.stdout, "error": process.stderr}
|
534
|
+
except asyncio.TimeoutError:
|
535
|
+
return {"success": False, "output": "", "error": f"Command timed out after 120 seconds."}
|
536
|
+
except Exception as e:
|
537
|
+
return {"success": False, "output": "", "error": str(e)}
|
538
|
+
|
539
|
+
async def close(self) -> None:
    """Shut the environment down. Nothing is held open, so this only logs."""
    shutdown_note = "Computer environment closed. No specific resources to release for PyAutoGUI."
    logger.debug(shutdown_note)
|
541
|
+
|
542
|
+
# Translates key names used in operator actions to the names pyautogui expects.
# Names that are missing from this table are passed to pyautogui unchanged.
CUA_KEY_TO_PYAUTOGUI_KEY = {
    # Modifier keys
    "cmd": "command",
    "control": "ctrl",
    "meta": "win" if platform.system() != "Darwin" else "command",
    "option": "alt",
    "super": "win",
    # Arrow keys
    "arrowup": "up",
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    # Editing and paging keys
    " ": "space",
    "caps_lock": "capslock",
    "del": "delete",
    "esc": "escape",
    "pgdn": "pagedown",
    "pgup": "pageup",
    "return": "enter",
    # Numpad keys (example, pyautogui uses 'num0', 'add', 'subtract', etc.)
    "numpad0": "num0",
    "numpad_0": "num0",
}
|
565
|
+
|
566
|
+
@staticmethod
|
567
|
+
def parse_key_combination(text: str) -> list[str]:
|
568
|
+
if not text:
|
569
|
+
return []
|
570
|
+
|
571
|
+
keys_str_list = text.lower().split("+")
|
572
|
+
mapped_keys = []
|
573
|
+
for k_str in keys_str_list:
|
574
|
+
# Use the mapped key if found, otherwise use the string itself (e.g. 'a', '1')
|
575
|
+
mapped_keys.append(ComputerEnvironment.CUA_KEY_TO_PYAUTOGUI_KEY.get(k_str.strip(), k_str.strip()))
|
576
|
+
return mapped_keys
|
577
|
+
|
578
|
+
def generate_pyautogui_command(self, func_name: str, *args, **kwargs) -> str:
|
579
|
+
args_repr = [repr(arg) for arg in args]
|
580
|
+
kwargs_repr = [f"{k}={repr(v)}" for k, v in kwargs.items()]
|
581
|
+
all_params_repr = ", ".join(args_repr + kwargs_repr)
|
582
|
+
|
583
|
+
# Base script setup
|
584
|
+
script_lines = [
|
585
|
+
"import os",
|
586
|
+
"import pyautogui",
|
587
|
+
]
|
588
|
+
|
589
|
+
if self.provider == "docker":
|
590
|
+
script_lines.extend(
|
591
|
+
[
|
592
|
+
# Display export for Docker.
|
593
|
+
f"os.environ['DISPLAY']='{self.docker_display}'",
|
594
|
+
# Disable failsafe in Docker to avoid accidental exits
|
595
|
+
"pyautogui.FAILSAFE = False",
|
596
|
+
]
|
597
|
+
)
|
598
|
+
|
599
|
+
# Function-specific logic
|
600
|
+
if func_name == "screenshot":
|
601
|
+
script_lines.extend(
|
602
|
+
[
|
603
|
+
"import io",
|
604
|
+
"import base64",
|
605
|
+
"img = pyautogui.screenshot()",
|
606
|
+
"buf = io.BytesIO()",
|
607
|
+
"img.save(buf, format='PNG')",
|
608
|
+
"print(base64.b64encode(buf.getvalue()).decode('utf-8'))",
|
609
|
+
]
|
610
|
+
)
|
611
|
+
elif func_name == "size":
|
612
|
+
script_lines.extend(["size = pyautogui.size()", "print(f'({size.width}, {size.height})')"])
|
613
|
+
elif func_name == "position":
|
614
|
+
script_lines.extend(["pos = pyautogui.position()", "print(f'({pos.x}, {pos.y})')"])
|
615
|
+
else: # General command structure
|
616
|
+
script_lines.extend(
|
617
|
+
[f"result = pyautogui.{func_name}({all_params_repr})", "print(result if result is not None else '')"]
|
618
|
+
)
|
619
|
+
|
620
|
+
return "; ".join(script_lines)
|
621
|
+
|
622
|
+
async def docker_execute(self, python_command_str: str) -> Optional[str]:
    """
    Run a Python script inside the Docker container and return its stdout.

    The script is handed to ``docker exec ... python3 -c`` as a single argument without
    going through a shell, so quotes, ``$``, backticks and backslashes in it reach Python
    exactly as written.

    Args:
        python_command_str: Python source to run with ``python3 -c`` in the container.

    Returns:
        The script's stdout with surrounding whitespace stripped, or None if the
        container name or display is not configured.

    Raises:
        KeyboardInterrupt: If pyautogui's fail-safe was triggered in the container.
        RuntimeError: If docker could not be run or the script exited non-zero.
    """
    if not self.docker_container_name or not self.docker_display:
        logger.error("Container name or Docker display not set for Docker execution.")
        return None

    docker_args = [
        "docker",
        "exec",
        "-e",
        f"DISPLAY={self.docker_display}",
        self.docker_container_name,
        "python3",
        "-c",
        python_command_str,
    ]
    # Human-readable rendering, used for log messages only
    docker_full_cmd = " ".join(docker_args)

    # Only the launch is guarded here, so the deliberate errors raised below for a failed
    # script are not caught again and re-labelled as "unexpected".
    try:
        process = await asyncio.to_thread(
            subprocess.run,
            docker_args,
            capture_output=True,
            text=True,
            check=False,  # We check returncode manually
        )
    except Exception as e:
        logger.error(f"Unexpected error running command in Docker '{docker_full_cmd}': {e}")
        # Encapsulate as RuntimeError to avoid leaking subprocess errors directly
        raise RuntimeError(f"Unexpected Docker error: {e}") from e

    if process.returncode != 0:
        if "FailSafeException" in process.stderr or "FailSafeException" in process.stdout:
            raise KeyboardInterrupt(process.stderr or process.stdout)
        error_msg = (
            f"Docker command failed:\nCmd: {docker_full_cmd}\n"
            f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
        )
        logger.error(error_msg)
        raise RuntimeError(f"Docker exec error: {process.stderr or process.stdout}")

    return process.stdout.strip()
|