lm-deluge 0.0.67__py3-none-any.whl → 0.0.88__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lm-deluge might be problematic. Click here for more details.

Files changed (92) hide show
  1. lm_deluge/__init__.py +25 -2
  2. lm_deluge/api_requests/anthropic.py +92 -17
  3. lm_deluge/api_requests/base.py +47 -11
  4. lm_deluge/api_requests/bedrock.py +7 -4
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +138 -18
  7. lm_deluge/api_requests/openai.py +114 -21
  8. lm_deluge/client.py +282 -49
  9. lm_deluge/config.py +15 -3
  10. lm_deluge/mock_openai.py +643 -0
  11. lm_deluge/models/__init__.py +12 -1
  12. lm_deluge/models/anthropic.py +17 -2
  13. lm_deluge/models/arcee.py +16 -0
  14. lm_deluge/models/deepseek.py +36 -4
  15. lm_deluge/models/google.py +29 -0
  16. lm_deluge/models/grok.py +24 -0
  17. lm_deluge/models/kimi.py +36 -0
  18. lm_deluge/models/minimax.py +10 -0
  19. lm_deluge/models/openai.py +100 -0
  20. lm_deluge/models/openrouter.py +86 -8
  21. lm_deluge/models/together.py +11 -0
  22. lm_deluge/models/zai.py +1 -0
  23. lm_deluge/pipelines/gepa/__init__.py +95 -0
  24. lm_deluge/pipelines/gepa/core.py +354 -0
  25. lm_deluge/pipelines/gepa/docs/samples.py +696 -0
  26. lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
  27. lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
  28. lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
  29. lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
  30. lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
  31. lm_deluge/pipelines/gepa/optimizer.py +435 -0
  32. lm_deluge/pipelines/gepa/proposer.py +235 -0
  33. lm_deluge/pipelines/gepa/util.py +165 -0
  34. lm_deluge/{llm_tools → pipelines}/score.py +2 -2
  35. lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
  36. lm_deluge/prompt.py +224 -40
  37. lm_deluge/request_context.py +7 -2
  38. lm_deluge/tool/__init__.py +1118 -0
  39. lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
  40. lm_deluge/tool/builtin/gemini.py +59 -0
  41. lm_deluge/tool/builtin/openai.py +74 -0
  42. lm_deluge/tool/cua/__init__.py +173 -0
  43. lm_deluge/tool/cua/actions.py +148 -0
  44. lm_deluge/tool/cua/base.py +27 -0
  45. lm_deluge/tool/cua/batch.py +215 -0
  46. lm_deluge/tool/cua/converters.py +466 -0
  47. lm_deluge/tool/cua/kernel.py +702 -0
  48. lm_deluge/tool/cua/trycua.py +989 -0
  49. lm_deluge/tool/prefab/__init__.py +45 -0
  50. lm_deluge/tool/prefab/batch_tool.py +156 -0
  51. lm_deluge/tool/prefab/docs.py +1119 -0
  52. lm_deluge/tool/prefab/email.py +294 -0
  53. lm_deluge/tool/prefab/filesystem.py +1711 -0
  54. lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
  55. lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
  56. lm_deluge/tool/prefab/memory.py +458 -0
  57. lm_deluge/tool/prefab/otc/__init__.py +165 -0
  58. lm_deluge/tool/prefab/otc/executor.py +281 -0
  59. lm_deluge/tool/prefab/otc/parse.py +188 -0
  60. lm_deluge/tool/prefab/random.py +212 -0
  61. lm_deluge/tool/prefab/rlm/__init__.py +296 -0
  62. lm_deluge/tool/prefab/rlm/executor.py +349 -0
  63. lm_deluge/tool/prefab/rlm/parse.py +144 -0
  64. lm_deluge/tool/prefab/sandbox.py +1621 -0
  65. lm_deluge/tool/prefab/sheets.py +385 -0
  66. lm_deluge/tool/prefab/subagents.py +233 -0
  67. lm_deluge/tool/prefab/todos.py +342 -0
  68. lm_deluge/tool/prefab/tool_search.py +169 -0
  69. lm_deluge/tool/prefab/web_search.py +199 -0
  70. lm_deluge/tracker.py +16 -13
  71. lm_deluge/util/schema.py +412 -0
  72. lm_deluge/warnings.py +8 -0
  73. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/METADATA +22 -9
  74. lm_deluge-0.0.88.dist-info/RECORD +117 -0
  75. lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
  76. lm_deluge/built_in_tools/openai.py +0 -28
  77. lm_deluge/presets/cerebras.py +0 -17
  78. lm_deluge/presets/meta.py +0 -13
  79. lm_deluge/tool.py +0 -849
  80. lm_deluge-0.0.67.dist-info/RECORD +0 -72
  81. lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
  82. /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
  83. /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
  84. /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
  85. /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
  86. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/bash.py +0 -0
  87. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/computer_use.py +0 -0
  88. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
  89. /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
  90. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/WHEEL +0 -0
  91. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/licenses/LICENSE +0 -0
  92. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,702 @@
1
+ """
2
+ Kernel (onkernel.com) implementation of ComputerExecutor.
3
+
4
+ This module provides a ComputerExecutor that connects to Kernel's browser-as-a-service
5
+ platform to execute computer use actions in a sandboxed cloud browser environment.
6
+
7
+ Requires: pip install kernel
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+
13
+ from .actions import (
14
+ Click,
15
+ CUAction,
16
+ DoubleClick,
17
+ Drag,
18
+ GoBack,
19
+ GoForward,
20
+ Keypress,
21
+ Move,
22
+ Navigate,
23
+ Scroll,
24
+ Search,
25
+ TripleClick,
26
+ Type,
27
+ Wait,
28
+ )
29
+ from .base import ComputerExecutor, CUActionResult, Screenshot as ScreenshotResult
30
+
31
# The Kernel SDK is imported lazily (inside _get_kernel_client) so that merely
# importing this module does not fail when the 'kernel' package is missing.
# This global caches the single client instance once it has been created.
_kernel_client = None


def _get_kernel_client():
    """Get or create the Kernel client singleton.

    The first successful call imports ``kernel.Kernel``, instantiates it with
    no arguments and stores the instance in the module-level
    ``_kernel_client``; later calls return that same instance.

    Returns:
        The shared ``Kernel`` client instance.

    Raises:
        ImportError: If the ``kernel`` package cannot be imported. The
            original import failure is attached as ``__cause__``.
    """
    global _kernel_client
    if _kernel_client is None:
        try:
            from kernel import Kernel
        except ImportError as exc:
            # Chain the original error so the real import failure (missing
            # package vs. a broken transitive dependency) stays visible.
            raise ImportError(
                "The 'kernel' package is required for KernelExecutor. "
                "Install it with: pip install kernel"
            ) from exc
        _kernel_client = Kernel()
    return _kernel_client
48
+
49
+
50
class KernelBrowser:
    """
    Manages a Kernel browser session lifecycle.

    This is a synchronous context manager: entering it creates the browser
    session and leaving it deletes the session again.

    Usage:
        with KernelBrowser() as browser:
            executor = KernelExecutor(browser.session_id)
            result = executor.execute(Screenshot(kind="screenshot"))
    """

    def __init__(
        self,
        *,
        headless: bool = True,
        viewport_width: int = 1024,
        viewport_height: int = 768,
        timeout_seconds: int = 300,
        persistence_id: str | None = None,
    ):
        """
        Initialize a Kernel browser session configuration.

        No session is created here; call create() or use the instance as a
        context manager.

        Args:
            headless: Whether to run in headless mode (default True)
            viewport_width: Browser viewport width in pixels
            viewport_height: Browser viewport height in pixels
            timeout_seconds: Auto-terminate after this many seconds of inactivity
            persistence_id: Optional ID for session persistence (reuse cookies, etc.)
        """
        self.headless = headless
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.timeout_seconds = timeout_seconds
        self.persistence_id = persistence_id
        # Set by create(), cleared by delete().
        self.session_id: str | None = None
        self._client = None

    def create(self) -> "KernelBrowser":
        """Create the browser session synchronously.

        Returns:
            This instance, with ``session_id`` set from the created browser.
        """
        self._client = _get_kernel_client()

        create_params = {
            "headless": self.headless,
            "viewport": {
                "width": self.viewport_width,
                "height": self.viewport_height,
            },
            "timeout_seconds": self.timeout_seconds,
        }

        # Only send the persistence block when an ID was actually supplied.
        if self.persistence_id:
            create_params["persistence"] = {"id": self.persistence_id}

        browser = self._client.browsers.create(**create_params)
        self.session_id = browser.session_id
        return self

    def delete(self) -> None:
        """Delete the browser session (best effort).

        Does nothing when no session was created. Errors raised by the delete
        call are ignored, mirroring AsyncKernelBrowser.delete(), so that
        leaving a ``with`` block never masks the exception that ended it.
        """
        if self._client and self.session_id:
            try:
                self._client.browsers.delete_by_id(self.session_id)
            except Exception:
                # Session may have already been deleted (timeout, etc.)
                pass
            self.session_id = None

    def __enter__(self) -> "KernelBrowser":
        return self.create()

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.delete()
118
+
119
+
120
class AsyncKernelBrowser:
    """
    Async version of KernelBrowser for use with asyncio.

    This is an asynchronous context manager: entering it creates the browser
    session and leaving it deletes the session again.

    Usage:
        async with AsyncKernelBrowser() as browser:
            executor = AsyncKernelExecutor(browser.session_id)
            result = await executor.execute(Screenshot(kind="screenshot"))
    """

    def __init__(
        self,
        *,
        headless: bool = True,
        viewport_width: int = 1024,
        viewport_height: int = 768,
        timeout_seconds: int = 300,
        persistence_id: str | None = None,
    ):
        """
        Store the session configuration; no session is created here.

        Args:
            headless: Sent as "headless" when the session is created
            viewport_width: Sent as the viewport "width"
            viewport_height: Sent as the viewport "height"
            timeout_seconds: Sent as "timeout_seconds"
            persistence_id: When set, sent as the "persistence" ID
        """
        self.headless = headless
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.timeout_seconds = timeout_seconds
        self.persistence_id = persistence_id
        # Set by create(), cleared by delete().
        self.session_id: str | None = None
        self._client = None

    async def create(self) -> "AsyncKernelBrowser":
        """Create the browser session asynchronously.

        A new ``AsyncKernel`` client is instantiated on every call.

        Returns:
            This instance, with ``session_id`` set from the created browser.

        Raises:
            ImportError: If the ``kernel`` package cannot be imported. The
                original import failure is attached as ``__cause__``.
        """
        try:
            from kernel import AsyncKernel
        except ImportError as exc:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(
                "The 'kernel' package is required for AsyncKernelBrowser. "
                "Install it with: pip install kernel"
            ) from exc

        self._client = AsyncKernel()

        create_params = {
            "headless": self.headless,
            "viewport": {
                "width": self.viewport_width,
                "height": self.viewport_height,
            },
            "timeout_seconds": self.timeout_seconds,
        }

        # Only send the persistence block when an ID was actually supplied.
        if self.persistence_id:
            create_params["persistence"] = {"id": self.persistence_id}

        browser = await self._client.browsers.create(**create_params)
        self.session_id = browser.session_id
        return self

    async def delete(self) -> None:
        """Delete the browser session (best effort).

        Does nothing when no session was created. Errors raised by the delete
        call are deliberately ignored; ``session_id`` is cleared either way.
        """
        if self._client and self.session_id:
            try:
                await self._client.browsers.delete_by_id(self.session_id)
            except Exception:
                # Session may have already been deleted (timeout, etc.)
                pass
            self.session_id = None

    async def __aenter__(self) -> "AsyncKernelBrowser":
        return await self.create()

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.delete()
190
+
191
+
192
class KernelExecutor(ComputerExecutor):
    """
    Run computer use actions against a Kernel browser session.

    Every supported CUAction kind is translated into one or more calls on
    the Kernel client's ``browsers.computer`` API for the session ID given
    at construction time.

    Example:
        with KernelBrowser() as browser:
            executor = KernelExecutor(browser.session_id)

            # Take a screenshot
            result = executor.execute(Screenshot(kind="screenshot"))
            print(f"Got {len(result['screenshot']['content'])} bytes")

            # Click at coordinates
            executor.execute(Click(kind="click", x=100, y=200, button="left"))

            # Type text
            executor.execute(Type(kind="type", text="Hello, world!"))
    """

    # (action kind, handler method name) pairs for every action whose handler
    # receives the action itself. "screenshot" is dispatched separately in
    # execute() because its handler takes no argument.
    _ACTION_HANDLERS = (
        ("click", "_click"),
        ("double_click", "_double_click"),
        ("triple_click", "_triple_click"),
        ("move", "_move"),
        ("scroll", "_scroll"),
        ("type", "_type"),
        ("keypress", "_keypress"),
        ("drag", "_drag"),
        ("wait", "_wait"),
        ("navigate", "_navigate"),
        ("go_back", "_go_back"),
        ("go_forward", "_go_forward"),
        ("search", "_search"),
    )

    def __init__(self, session_id: str):
        """
        Bind the executor to an existing Kernel browser session.

        Args:
            session_id: The session ID from KernelBrowser.create()
        """
        self.session_id = session_id
        self._client = _get_kernel_client()

    def execute(self, action: CUAction) -> CUActionResult:
        """
        Dispatch a computer use action to its handler.

        Args:
            action: The action to execute (Click, Type, Screenshot, etc.)

        Returns:
            CUActionResult with screenshot (if applicable) and metadata

        Raises:
            ValueError: If the action's "kind" is not supported.
        """
        kind = action["kind"]

        if kind == "screenshot":
            return self._screenshot()

        for known_kind, handler_name in self._ACTION_HANDLERS:
            if kind == known_kind:
                return getattr(self, handler_name)(action)

        raise ValueError(f"Unsupported action kind: {kind}")

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _grab_png(self) -> ScreenshotResult:
        """Capture the browser and wrap the bytes as an image/png screenshot."""
        response = self._client.browsers.computer.capture_screenshot(self.session_id)
        # The response object exposes the image bytes through read().
        return ScreenshotResult(media_type="image/png", content=response.read())

    def _submit_in_address_bar(self, text: str) -> None:
        """Focus the address bar, type ``text``, press Return and wait."""
        import time

        computer = self._client.browsers.computer
        computer.press_key(self.session_id, keys=["ctrl+l"])
        time.sleep(0.2)
        computer.type_text(self.session_id, text=text)
        time.sleep(0.1)
        computer.press_key(self.session_id, keys=["Return"])
        time.sleep(1.5)  # Wait for page load

    def _history_step(self, shortcut: str, label: str) -> CUActionResult:
        """Press a history shortcut, pause briefly, and return a screenshot."""
        import time

        self._client.browsers.computer.press_key(self.session_id, keys=[shortcut])
        time.sleep(0.5)  # Wait for page load
        return CUActionResult(screenshot=self._grab_png(), data={"action": label})

    def _left_click_times(self, action, clicks: int, label: str) -> CUActionResult:
        """Left-click ``clicks`` times at the action's optional x/y."""
        target_x = action.get("x")
        target_y = action.get("y")
        self._client.browsers.computer.click_mouse(
            self.session_id,
            x=target_x,
            y=target_y,
            button="left",
            num_clicks=clicks,
        )
        return CUActionResult(screenshot=None, data={"action": label})

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    def _screenshot(self) -> CUActionResult:
        """Return a screenshot of the browser with empty metadata."""
        return CUActionResult(screenshot=self._grab_png(), data={})

    def _click(self, action: Click) -> CUActionResult:
        """Single-click at the action's coordinates (button defaults to left).

        Raises:
            ValueError: If either coordinate is None.
        """
        x = action["x"]
        y = action["y"]
        if x is None or y is None:
            raise ValueError("Click action requires x and y coordinates")

        self._client.browsers.computer.click_mouse(
            self.session_id,
            x=x,
            y=y,
            button=action.get("button", "left"),
            num_clicks=1,
        )
        return CUActionResult(screenshot=None, data={"action": "click"})

    def _double_click(self, action: DoubleClick) -> CUActionResult:
        """Left double-click at the action's coordinates."""
        return self._left_click_times(action, 2, "double_click")

    def _triple_click(self, action: TripleClick) -> CUActionResult:
        """Left triple-click at the action's coordinates."""
        return self._left_click_times(action, 3, "triple_click")

    def _move(self, action: Move) -> CUActionResult:
        """Move the mouse cursor to the action's coordinates."""
        computer = self._client.browsers.computer
        computer.move_mouse(self.session_id, x=action["x"], y=action["y"])
        return CUActionResult(screenshot=None, data={"action": "move"})

    def _scroll(self, action: Scroll) -> CUActionResult:
        """Scroll by (dx, dy); a missing or falsy x/y falls back to 0."""
        computer = self._client.browsers.computer
        anchor_x = action.get("x") or 0
        anchor_y = action.get("y") or 0
        computer.scroll(
            self.session_id,
            x=anchor_x,
            y=anchor_y,
            delta_x=action["dx"],
            delta_y=action["dy"],
        )
        return CUActionResult(screenshot=None, data={"action": "scroll"})

    def _type(self, action: Type) -> CUActionResult:
        """Type the action's text."""
        computer = self._client.browsers.computer
        computer.type_text(self.session_id, text=action["text"])
        return CUActionResult(screenshot=None, data={"action": "type"})

    def _keypress(self, action: Keypress) -> CUActionResult:
        """Press the action's key(s).

        Kernel expects keys as a list of key combinations,
        e.g., ["Ctrl+a", "Enter"]; the list is forwarded unchanged.
        """
        computer = self._client.browsers.computer
        computer.press_key(self.session_id, keys=action["keys"])
        return CUActionResult(screenshot=None, data={"action": "keypress"})

    def _drag(self, action: Drag) -> CUActionResult:
        """Drag with the left button along the action's path.

        When both start_x and start_y are present, that point is placed in
        front of the path.
        """
        start_x = action.get("start_x")
        if start_x is not None and action.get("start_y") is not None:
            points = [[action["start_x"], action["start_y"]]]
        else:
            points = []
        points.extend(action["path"])

        self._client.browsers.computer.drag_mouse(
            self.session_id,
            path=points,
            button="left",
        )
        return CUActionResult(screenshot=None, data={"action": "drag"})

    def _wait(self, action: Wait) -> CUActionResult:
        """Block the calling thread for the action's "ms" milliseconds."""
        import time

        time.sleep(action["ms"] / 1000.0)
        return CUActionResult(
            screenshot=None, data={"action": "wait", "ms": action["ms"]}
        )

    def _navigate(self, action: Navigate) -> CUActionResult:
        """Open the action's URL via the address bar, then screenshot."""
        self._submit_in_address_bar(action["url"])
        # Take screenshot after navigation
        return CUActionResult(
            screenshot=self._grab_png(),
            data={"action": "navigate", "url": action["url"]},
        )

    def _go_back(self, action: GoBack) -> CUActionResult:
        """Go back in browser history using keyboard shortcut."""
        return self._history_step("Alt+Left", "go_back")

    def _go_forward(self, action: GoForward) -> CUActionResult:
        """Go forward in browser history using keyboard shortcut."""
        return self._history_step("Alt+Right", "go_forward")

    def _search(self, action: Search) -> CUActionResult:
        """Open a Google search for the action's query, then screenshot."""
        from urllib.parse import quote

        search_url = f"https://www.google.com/search?q={quote(action['query'])}"
        self._submit_in_address_bar(search_url)
        return CUActionResult(
            screenshot=self._grab_png(),
            data={"action": "search", "query": action["query"]},
        )
444
+
445
+
446
class AsyncKernelExecutor:
    """
    Async version of KernelExecutor for use with asyncio.

    Each supported CUAction kind is translated into awaited calls on the
    async Kernel client's ``browsers.computer`` API for the session ID given
    at construction time. The client is created lazily on first use.

    Example:
        async with AsyncKernelBrowser() as browser:
            executor = AsyncKernelExecutor(browser.session_id)
            result = await executor.execute(Screenshot(kind="screenshot"))
    """

    def __init__(self, session_id: str):
        """
        Initialize the executor with an active Kernel browser session.

        Args:
            session_id: The session ID from AsyncKernelBrowser.create()
        """
        self.session_id = session_id
        # Created on demand by _get_client().
        self._client = None

    def _get_client(self):
        """Lazy load the async client.

        Returns:
            The cached ``AsyncKernel`` instance, created on the first call.

        Raises:
            ImportError: If the ``kernel`` package cannot be imported. The
                original import failure is attached as ``__cause__``.
        """
        if self._client is None:
            try:
                from kernel import AsyncKernel
            except ImportError as exc:
                # Chain the original error so the underlying cause stays visible.
                raise ImportError(
                    "The 'kernel' package is required for AsyncKernelExecutor. "
                    "Install it with: pip install kernel"
                ) from exc
            self._client = AsyncKernel()
        return self._client

    async def execute(self, action: CUAction) -> CUActionResult:
        """
        Execute a computer use action on the Kernel browser asynchronously.

        Args:
            action: The action to execute (Click, Type, Screenshot, etc.)

        Returns:
            CUActionResult with screenshot (if applicable) and metadata

        Raises:
            ValueError: If the action's "kind" is not supported, or a click
                action has a None coordinate.
        """
        kind = action["kind"]

        if kind == "screenshot":
            return await self._screenshot()
        elif kind == "click":
            return await self._click(action)  # type: ignore
        elif kind == "double_click":
            return await self._double_click(action)  # type: ignore
        elif kind == "triple_click":
            return await self._triple_click(action)  # type: ignore
        elif kind == "move":
            return await self._move(action)  # type: ignore
        elif kind == "scroll":
            return await self._scroll(action)  # type: ignore
        elif kind == "type":
            return await self._type(action)  # type: ignore
        elif kind == "keypress":
            return await self._keypress(action)  # type: ignore
        elif kind == "drag":
            return await self._drag(action)  # type: ignore
        elif kind == "wait":
            return await self._wait(action)  # type: ignore
        elif kind == "navigate":
            return await self._navigate(action)  # type: ignore
        elif kind == "go_back":
            return await self._go_back(action)  # type: ignore
        elif kind == "go_forward":
            return await self._go_forward(action)  # type: ignore
        elif kind == "search":
            return await self._search(action)  # type: ignore
        else:
            raise ValueError(f"Unsupported action kind: {kind}")

    async def _screenshot(self) -> CUActionResult:
        """Capture a screenshot of the browser."""
        client = self._get_client()
        response = await client.browsers.computer.capture_screenshot(self.session_id)
        # AsyncBinaryAPIResponse requires await on .read()
        content = await response.read()
        return CUActionResult(
            screenshot=ScreenshotResult(media_type="image/png", content=content),
            data={},
        )

    async def _click(self, action: Click) -> CUActionResult:
        """Execute a click action.

        Raises:
            ValueError: If either coordinate is None (same check as the
                synchronous KernelExecutor._click).
        """
        x = action["x"]
        y = action["y"]
        # Validate before touching the client so bad input fails fast with
        # the same error the synchronous executor raises.
        if x is None or y is None:
            raise ValueError("Click action requires x and y coordinates")

        client = self._get_client()
        await client.browsers.computer.click_mouse(
            self.session_id,
            x=x,
            y=y,
            button=action.get("button", "left"),
            num_clicks=1,
        )
        return CUActionResult(screenshot=None, data={"action": "click"})

    async def _double_click(self, action: DoubleClick) -> CUActionResult:
        """Execute a double click action."""
        client = self._get_client()
        params = {
            "x": action.get("x"),
            "y": action.get("y"),
            "button": "left",
            "num_clicks": 2,
        }

        await client.browsers.computer.click_mouse(self.session_id, **params)
        return CUActionResult(screenshot=None, data={"action": "double_click"})

    async def _triple_click(self, action: TripleClick) -> CUActionResult:
        """Execute a triple click action."""
        client = self._get_client()
        params = {
            "x": action.get("x"),
            "y": action.get("y"),
            "button": "left",
            "num_clicks": 3,
        }

        await client.browsers.computer.click_mouse(self.session_id, **params)
        return CUActionResult(screenshot=None, data={"action": "triple_click"})

    async def _move(self, action: Move) -> CUActionResult:
        """Move the mouse cursor."""
        client = self._get_client()
        await client.browsers.computer.move_mouse(
            self.session_id,
            x=action["x"],
            y=action["y"],
        )
        return CUActionResult(screenshot=None, data={"action": "move"})

    async def _scroll(self, action: Scroll) -> CUActionResult:
        """Execute a scroll action; a missing or falsy x/y falls back to 0."""
        client = self._get_client()
        await client.browsers.computer.scroll(
            self.session_id,
            x=action.get("x") or 0,
            y=action.get("y") or 0,
            delta_x=action["dx"],
            delta_y=action["dy"],
        )
        return CUActionResult(screenshot=None, data={"action": "scroll"})

    async def _type(self, action: Type) -> CUActionResult:
        """Type text."""
        client = self._get_client()
        await client.browsers.computer.type_text(
            self.session_id,
            text=action["text"],
        )
        return CUActionResult(screenshot=None, data={"action": "type"})

    async def _keypress(self, action: Keypress) -> CUActionResult:
        """Press key(s); the action's key list is forwarded unchanged."""
        client = self._get_client()
        await client.browsers.computer.press_key(
            self.session_id,
            keys=action["keys"],
        )
        return CUActionResult(screenshot=None, data={"action": "keypress"})

    async def _drag(self, action: Drag) -> CUActionResult:
        """Execute a drag action with the left button.

        When both start_x and start_y are present, that point is placed in
        front of the action's path.
        """
        client = self._get_client()
        path = []
        if action.get("start_x") is not None and action.get("start_y") is not None:
            path.append([action["start_x"], action["start_y"]])
        path.extend(action["path"])

        await client.browsers.computer.drag_mouse(
            self.session_id,
            path=path,
            button="left",
        )
        return CUActionResult(screenshot=None, data={"action": "drag"})

    async def _wait(self, action: Wait) -> CUActionResult:
        """Wait for the action's "ms" milliseconds without blocking the loop."""
        import asyncio

        await asyncio.sleep(action["ms"] / 1000.0)
        return CUActionResult(
            screenshot=None, data={"action": "wait", "ms": action["ms"]}
        )

    async def _navigate(self, action: Navigate) -> CUActionResult:
        """Navigate to a URL using keyboard shortcuts, then screenshot."""
        import asyncio

        client = self._get_client()
        # Ctrl+L to focus address bar, type URL, press Enter
        await client.browsers.computer.press_key(self.session_id, keys=["ctrl+l"])
        await asyncio.sleep(0.2)
        await client.browsers.computer.type_text(self.session_id, text=action["url"])
        await asyncio.sleep(0.1)
        await client.browsers.computer.press_key(self.session_id, keys=["Return"])
        await asyncio.sleep(1.5)  # Wait for page load
        response = await client.browsers.computer.capture_screenshot(self.session_id)
        content = await response.read()
        return CUActionResult(
            screenshot=ScreenshotResult(media_type="image/png", content=content),
            data={"action": "navigate", "url": action["url"]},
        )

    async def _go_back(self, action: GoBack) -> CUActionResult:
        """Go back in browser history using keyboard shortcut."""
        import asyncio

        client = self._get_client()
        await client.browsers.computer.press_key(self.session_id, keys=["Alt+Left"])
        await asyncio.sleep(0.5)  # Wait for page load
        response = await client.browsers.computer.capture_screenshot(self.session_id)
        content = await response.read()
        return CUActionResult(
            screenshot=ScreenshotResult(media_type="image/png", content=content),
            data={"action": "go_back"},
        )

    async def _go_forward(self, action: GoForward) -> CUActionResult:
        """Go forward in browser history using keyboard shortcut."""
        import asyncio

        client = self._get_client()
        await client.browsers.computer.press_key(self.session_id, keys=["Alt+Right"])
        await asyncio.sleep(0.5)  # Wait for page load
        response = await client.browsers.computer.capture_screenshot(self.session_id)
        content = await response.read()
        return CUActionResult(
            screenshot=ScreenshotResult(media_type="image/png", content=content),
            data={"action": "go_forward"},
        )

    async def _search(self, action: Search) -> CUActionResult:
        """Navigate to Google search using keyboard shortcuts, then screenshot."""
        import asyncio
        from urllib.parse import quote

        client = self._get_client()
        search_url = f"https://www.google.com/search?q={quote(action['query'])}"
        # Ctrl+L to focus address bar, type search URL, press Enter
        await client.browsers.computer.press_key(self.session_id, keys=["ctrl+l"])
        await asyncio.sleep(0.2)
        await client.browsers.computer.type_text(self.session_id, text=search_url)
        await asyncio.sleep(0.1)
        await client.browsers.computer.press_key(self.session_id, keys=["Return"])
        await asyncio.sleep(1.5)  # Wait for page load
        response = await client.browsers.computer.capture_screenshot(self.session_id)
        content = await response.read()
        return CUActionResult(
            screenshot=ScreenshotResult(media_type="image/png", content=content),
            data={"action": "search", "query": action["query"]},
        )