pygpt-net 2.7.6__py3-none-any.whl → 2.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. pygpt_net/CHANGELOG.txt +13 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +5 -1
  4. pygpt_net/controller/assistant/batch.py +2 -2
  5. pygpt_net/controller/assistant/files.py +7 -6
  6. pygpt_net/controller/assistant/threads.py +0 -0
  7. pygpt_net/controller/chat/command.py +0 -0
  8. pygpt_net/controller/chat/remote_tools.py +3 -9
  9. pygpt_net/controller/chat/stream.py +2 -2
  10. pygpt_net/controller/chat/{handler/worker.py → stream_worker.py} +13 -35
  11. pygpt_net/controller/dialogs/confirm.py +35 -58
  12. pygpt_net/controller/lang/mapping.py +9 -9
  13. pygpt_net/controller/remote_store/{google/batch.py → batch.py} +209 -252
  14. pygpt_net/controller/remote_store/remote_store.py +982 -13
  15. pygpt_net/core/command/command.py +0 -0
  16. pygpt_net/core/db/viewer.py +1 -1
  17. pygpt_net/core/debug/models.py +2 -2
  18. pygpt_net/core/realtime/worker.py +3 -1
  19. pygpt_net/{controller/remote_store/google → core/remote_store/anthropic}/__init__.py +0 -1
  20. pygpt_net/core/remote_store/anthropic/files.py +211 -0
  21. pygpt_net/core/remote_store/anthropic/store.py +208 -0
  22. pygpt_net/core/remote_store/openai/store.py +5 -4
  23. pygpt_net/core/remote_store/remote_store.py +5 -1
  24. pygpt_net/{controller/remote_store/openai → core/remote_store/xai}/__init__.py +0 -1
  25. pygpt_net/core/remote_store/xai/files.py +225 -0
  26. pygpt_net/core/remote_store/xai/store.py +219 -0
  27. pygpt_net/data/config/config.json +18 -5
  28. pygpt_net/data/config/models.json +193 -4
  29. pygpt_net/data/config/settings.json +179 -36
  30. pygpt_net/data/icons/folder_eye.svg +1 -0
  31. pygpt_net/data/icons/folder_eye_filled.svg +1 -0
  32. pygpt_net/data/icons/folder_open.svg +1 -0
  33. pygpt_net/data/icons/folder_open_filled.svg +1 -0
  34. pygpt_net/data/locale/locale.de.ini +6 -3
  35. pygpt_net/data/locale/locale.en.ini +46 -12
  36. pygpt_net/data/locale/locale.es.ini +6 -3
  37. pygpt_net/data/locale/locale.fr.ini +6 -3
  38. pygpt_net/data/locale/locale.it.ini +6 -3
  39. pygpt_net/data/locale/locale.pl.ini +7 -4
  40. pygpt_net/data/locale/locale.uk.ini +6 -3
  41. pygpt_net/data/locale/locale.zh.ini +6 -3
  42. pygpt_net/icons.qrc +4 -0
  43. pygpt_net/icons_rc.py +282 -138
  44. pygpt_net/plugin/cmd_mouse_control/worker.py +2 -1
  45. pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +2 -1
  46. pygpt_net/provider/api/anthropic/__init__.py +10 -3
  47. pygpt_net/provider/api/anthropic/chat.py +342 -11
  48. pygpt_net/provider/api/anthropic/computer.py +844 -0
  49. pygpt_net/provider/api/anthropic/remote_tools.py +172 -0
  50. pygpt_net/provider/api/anthropic/store.py +307 -0
  51. pygpt_net/{controller/chat/handler/anthropic_stream.py → provider/api/anthropic/stream.py} +99 -10
  52. pygpt_net/provider/api/anthropic/tools.py +32 -77
  53. pygpt_net/provider/api/anthropic/utils.py +30 -0
  54. pygpt_net/{controller/chat/handler → provider/api/anthropic/worker}/__init__.py +0 -0
  55. pygpt_net/provider/api/anthropic/worker/importer.py +278 -0
  56. pygpt_net/provider/api/google/chat.py +62 -9
  57. pygpt_net/provider/api/google/store.py +124 -3
  58. pygpt_net/{controller/chat/handler/google_stream.py → provider/api/google/stream.py} +92 -25
  59. pygpt_net/provider/api/google/utils.py +185 -0
  60. pygpt_net/provider/api/google/worker/importer.py +16 -28
  61. pygpt_net/provider/api/langchain/__init__.py +0 -0
  62. pygpt_net/{controller/chat/handler/langchain_stream.py → provider/api/langchain/stream.py} +1 -1
  63. pygpt_net/provider/api/llama_index/__init__.py +0 -0
  64. pygpt_net/{controller/chat/handler/llamaindex_stream.py → provider/api/llama_index/stream.py} +1 -1
  65. pygpt_net/provider/api/openai/assistants.py +2 -2
  66. pygpt_net/provider/api/openai/image.py +2 -2
  67. pygpt_net/provider/api/openai/store.py +4 -1
  68. pygpt_net/{controller/chat/handler/openai_stream.py → provider/api/openai/stream.py} +1 -1
  69. pygpt_net/provider/api/openai/utils.py +69 -3
  70. pygpt_net/provider/api/openai/worker/importer.py +19 -61
  71. pygpt_net/provider/api/openai/worker/importer_assistants.py +230 -0
  72. pygpt_net/provider/api/x_ai/__init__.py +138 -15
  73. pygpt_net/provider/api/x_ai/audio.py +43 -11
  74. pygpt_net/provider/api/x_ai/chat.py +92 -4
  75. pygpt_net/provider/api/x_ai/image.py +149 -47
  76. pygpt_net/provider/api/x_ai/realtime/__init__.py +12 -0
  77. pygpt_net/provider/api/x_ai/realtime/client.py +1825 -0
  78. pygpt_net/provider/api/x_ai/realtime/realtime.py +198 -0
  79. pygpt_net/provider/api/x_ai/{remote.py → remote_tools.py} +183 -70
  80. pygpt_net/provider/api/x_ai/responses.py +507 -0
  81. pygpt_net/provider/api/x_ai/store.py +610 -0
  82. pygpt_net/{controller/chat/handler/xai_stream.py → provider/api/x_ai/stream.py} +42 -10
  83. pygpt_net/provider/api/x_ai/tools.py +59 -8
  84. pygpt_net/{controller/chat/handler → provider/api/x_ai}/utils.py +1 -2
  85. pygpt_net/provider/api/x_ai/vision.py +1 -4
  86. pygpt_net/provider/api/x_ai/worker/importer.py +308 -0
  87. pygpt_net/provider/audio_input/xai_grok_voice.py +390 -0
  88. pygpt_net/provider/audio_output/xai_tts.py +325 -0
  89. pygpt_net/provider/core/config/patch.py +39 -3
  90. pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +2 -2
  91. pygpt_net/provider/core/model/patch.py +39 -1
  92. pygpt_net/tools/image_viewer/tool.py +334 -34
  93. pygpt_net/tools/image_viewer/ui/dialogs.py +319 -22
  94. pygpt_net/tools/text_editor/ui/dialogs.py +3 -2
  95. pygpt_net/tools/text_editor/ui/widgets.py +0 -0
  96. pygpt_net/ui/dialog/assistant.py +1 -1
  97. pygpt_net/ui/dialog/plugins.py +13 -5
  98. pygpt_net/ui/dialog/remote_store.py +552 -0
  99. pygpt_net/ui/dialogs.py +3 -5
  100. pygpt_net/ui/layout/ctx/ctx_list.py +58 -7
  101. pygpt_net/ui/menu/tools.py +6 -13
  102. pygpt_net/ui/widget/dialog/base.py +16 -5
  103. pygpt_net/ui/widget/dialog/{remote_store_google.py → remote_store.py} +10 -10
  104. pygpt_net/ui/widget/element/button.py +4 -4
  105. pygpt_net/ui/widget/image/display.py +2 -2
  106. pygpt_net/ui/widget/lists/context.py +2 -2
  107. pygpt_net/ui/widget/textarea/editor.py +0 -0
  108. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/METADATA +15 -2
  109. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/RECORD +107 -89
  110. pygpt_net/controller/remote_store/google/store.py +0 -615
  111. pygpt_net/controller/remote_store/openai/batch.py +0 -524
  112. pygpt_net/controller/remote_store/openai/store.py +0 -699
  113. pygpt_net/ui/dialog/remote_store_google.py +0 -539
  114. pygpt_net/ui/dialog/remote_store_openai.py +0 -539
  115. pygpt_net/ui/widget/dialog/remote_store_openai.py +0 -56
  116. pygpt_net/ui/widget/lists/remote_store_google.py +0 -248
  117. pygpt_net/ui/widget/lists/remote_store_openai.py +0 -317
  118. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/LICENSE +0 -0
  119. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/WHEEL +0 -0
  120. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,844 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2026.01.05 20:00:00 #
10
+ # ================================================== #
11
+
12
+ import json
13
+ import time
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
+ from pygpt_net.item.ctx import CtxItem
17
+
18
+
19
+ class Computer:
20
+ """
21
+ Anthropic Computer Use adapter.
22
+
23
+ Responsibilities:
24
+ - Provide Anthropic Computer Use tool spec (messages.create tools[]).
25
+ - Rewrite Anthropic computer tool_use payloads into app-compatible tool calls
26
+ for the Mouse & Keyboard plugin (same final shape as OpenAI/Google adapters).
27
+ - Handle both content_block_delta.input_json_delta and top-level input_json_delta
28
+ variants produced by Anthropic Beta streaming.
29
+ """
30
+
31
+ COMPUTER_TOOL_NAMES = {
32
+ "computer",
33
+ "computer.use",
34
+ "anthropic/computer",
35
+ "computer_use",
36
+ "computer-use",
37
+ }
38
+
39
+ # Known plugin command names (Worker supports these).
40
+ # Used to normalize arguments when upstream sends Anthropic-shaped payloads as direct function calls.
41
+ PLUGIN_COMMAND_NAMES = {
42
+ "open_web_browser",
43
+ "get_mouse_position",
44
+ "mouse_move",
45
+ "mouse_drag",
46
+ "mouse_click",
47
+ "mouse_scroll",
48
+ "get_screenshot",
49
+ "keyboard_key",
50
+ "keyboard_keys",
51
+ "keyboard_type",
52
+ "wait",
53
+ # host-native extras
54
+ "wait_5_seconds",
55
+ "go_back",
56
+ "go_forward",
57
+ "search",
58
+ "navigate",
59
+ "click_at",
60
+ "hover_at",
61
+ "type_text_at",
62
+ "key_combination",
63
+ "scroll_document",
64
+ "scroll_at",
65
+ "drag_and_drop",
66
+ # action-style
67
+ "click",
68
+ "double_click",
69
+ "move",
70
+ "type",
71
+ "keypress",
72
+ "scroll",
73
+ "drag",
74
+ }
75
+
76
+ # Action name synonyms that may appear in Anthropic payloads -> plugin command names
77
+ ACTION_SYNONYMS = {
78
+ "hover": "mouse_move",
79
+ "move": "mouse_move",
80
+ "mouse_move": "mouse_move",
81
+
82
+ "click": "mouse_click",
83
+ "left_click": "mouse_click",
84
+ "right_click": "mouse_click",
85
+ "double_click": "mouse_click",
86
+
87
+ "scroll": "mouse_scroll",
88
+ "mouse_scroll": "mouse_scroll",
89
+
90
+ "drag": "mouse_drag",
91
+ "drag_and_drop": "mouse_drag",
92
+ "mouse_drag": "mouse_drag",
93
+
94
+ "type": "keyboard_type",
95
+ "input": "keyboard_type",
96
+ "keyboard_type": "keyboard_type",
97
+
98
+ "keypress": "keyboard_keys",
99
+ "key": "keyboard_keys",
100
+ "keys": "keyboard_keys",
101
+ "key_combination": "key_combination",
102
+
103
+ "screenshot": "get_screenshot",
104
+ "get_screenshot": "get_screenshot",
105
+
106
+ "wait": "wait",
107
+ "sleep": "wait",
108
+ }
109
+
110
+ KEY_MODIFIERS = {"ctrl", "control", "alt", "shift", "cmd", "super", "start"}
111
+
112
+ def __init__(self, window=None):
113
+ """
114
+ :param window: Window instance
115
+ """
116
+ self.window = window
117
+
118
+ # --------------- Tool spec --------------- #
119
+
120
def get_current_env(self) -> Dict[str, Any]:
    """
    Return the data attached to the currently selected entry of the
    "computer_env" UI node.

    :return: item data stored for the current index of the node
    """
    env_node = self.window.ui.nodes["computer_env"]
    return env_node.itemData(env_node.currentIndex())
123
+
124
def get_tool(self) -> dict:
    """
    Build the Anthropic computer tool spec for the tools[] list.

    The tool type is read from the "remote_tools.anthropic.computer.type"
    config key; a missing or blank value falls back to "computer_20250124".
    The display size comes from _resolve_display_size().

    :return: dict with name, type, display_width_px and display_height_px
    """
    fallback_type = "computer_20250124"
    config = self.window.core.config
    sandboxed = bool(config.get("remote_tools.computer_use.sandbox", False))
    width, height = self._resolve_display_size(is_sandbox=sandboxed)
    configured_type = str(config.get("remote_tools.anthropic.computer.type", fallback_type)).strip()
    return {
        "name": "computer",
        "type": configured_type or fallback_type,
        "display_width_px": int(width),
        "display_height_px": int(height),
    }
134
+
135
+ def _resolve_display_size(self, is_sandbox: bool) -> Tuple[int, int]:
136
+ screen_w = screen_h = 0
137
+ try:
138
+ screen = self.window.app.primaryScreen()
139
+ size = screen.size()
140
+ screen_w = int(size.width())
141
+ screen_h = int(size.height())
142
+ except Exception:
143
+ screen_w, screen_h = 1440, 900
144
+
145
+ if is_sandbox:
146
+ try:
147
+ vw = int(self.window.core.plugins.get_option("cmd_mouse_control", "sandbox_viewport_w"))
148
+ vh = int(self.window.core.plugins.get_option("cmd_mouse_control", "sandbox_viewport_h"))
149
+ if vw > 0 and vh > 0:
150
+ screen_w, screen_h = vw, vh
151
+ except Exception:
152
+ pass
153
+
154
+ return screen_w, screen_h
155
+
156
+ # --------------- Streaming handling --------------- #
157
+
158
def handle_stream_chunk(self, ctx: CtxItem, chunk, tool_calls: list) -> Tuple[List, bool]:
    """
    Convert Computer Use 'tool_use' streaming events into plugin tool calls.
    Supports:
    - content_block_delta/input_json_delta
    - top-level input_json_delta

    State is kept in the dict returned by self._ensure_ctx_memory(ctx), which this
    method reads and writes under three keys:
    - "index_to_id": stream block index (as str) -> tool_use id
    - "buffers": tool_use id -> accumulated partial JSON text
    - "active_ids": list of tool_use ids that have started but not yet been flushed

    :param ctx: context item; completed tool uses are appended to ctx.extra["anthropic_tool_uses"]
    :param chunk: streaming event; only its attributes are read (type, index, content_block, delta, partial_json)
    :param tool_calls: list extended IN-PLACE with the mapped plugin calls
    :return: (the same tool_calls list, True if at least one call was added by this chunk)
    """
    has_calls = False
    etype = str(getattr(chunk, "type", "") or "")

    cmem = self._ensure_ctx_memory(ctx)

    if etype == "content_block_start":
        # Start tracking only tool_use blocks whose name is one of the computer tool aliases.
        cb = getattr(chunk, "content_block", None)
        if cb and getattr(cb, "type", "") == "tool_use":
            name = str(getattr(cb, "name", "") or "")
            if name in self.COMPUTER_TOOL_NAMES:
                idx = str(getattr(chunk, "index", 0) or 0)
                # Fall back to a generated id when the block carries none.
                tid = str(getattr(cb, "id", "") or self._gen_id(prefix="ac"))
                cmem["index_to_id"][idx] = tid
                cmem["buffers"].setdefault(tid, "")
                cmem["active_ids"].append(tid)

    elif etype == "input_json_delta":
        # Top-level variant: no block index available, so the fragment is
        # attributed to the most recently started tool use.
        pj = getattr(chunk, "partial_json", "") or ""
        if cmem["active_ids"]:
            tid = cmem["active_ids"][-1]
            cmem["buffers"][tid] = cmem["buffers"].get(tid, "") + pj

    elif etype == "content_block_delta":
        # Nested variant: route the fragment by block index; deltas for
        # untracked indexes are ignored.
        delta = getattr(chunk, "delta", None)
        if delta and getattr(delta, "type", "") == "input_json_delta":
            idx = str(getattr(chunk, "index", 0) or 0)
            tid = cmem["index_to_id"].get(idx)
            if tid:
                pj = getattr(delta, "partial_json", "") or ""
                cmem["buffers"][tid] = cmem["buffers"].get(tid, "") + pj

    elif etype == "content_block_stop":
        idx = str(getattr(chunk, "index", 0) or 0)
        tid = cmem["index_to_id"].pop(idx, None)
        if not tid and cmem["active_ids"]:
            # NOTE(review): an unmapped index closes the most recent active tool use;
            # confirm this cannot fire for a non-tool block that stops while a tool use is open.
            tid = cmem["active_ids"].pop()
        elif tid and cmem["active_ids"]:
            # Remove the finished id from the active list wherever it sits.
            if cmem["active_ids"] and cmem["active_ids"][-1] == tid:
                cmem["active_ids"].pop()
            else:
                try:
                    cmem["active_ids"].remove(tid)
                except ValueError:
                    pass

        if tid:
            # presumably _safe_json_loads returns None for unparsable/empty input - verify against its definition
            payload = self._safe_json_loads(cmem["buffers"].pop(tid, ""))
            if payload is not None:
                # Best-effort: record the raw tool_use on the context; any failure here is ignored.
                try:
                    if not isinstance(ctx.extra, dict):
                        ctx.extra = {}
                    tu_list = ctx.extra.get("anthropic_tool_uses")
                    if not isinstance(tu_list, list):
                        tu_list = []
                    tu_list.append({"id": tid, "name": "computer", "input": payload})
                    ctx.extra["anthropic_tool_uses"] = tu_list
                    self.window.core.ctx.update_item(ctx)
                except Exception:
                    pass

                # The tool_use id is used both as "id" and "call_id" of every mapped call.
                # This call is not guarded: an exception raised while mapping propagates to the caller.
                mapped = self._payload_to_tool_calls(tid, tid, payload)
                if mapped:
                    tool_calls.extend(mapped)
                    has_calls = True

    elif etype == "message_stop":
        # Flush every tool use still marked active (newest first).
        # NOTE(review): "index_to_id" entries are not cleared in this branch.
        while cmem["active_ids"]:
            tid = cmem["active_ids"].pop()
            payload = self._safe_json_loads(cmem["buffers"].pop(tid, ""))
            if payload is None:
                continue
            # Same best-effort persistence as in the content_block_stop branch.
            try:
                if not isinstance(ctx.extra, dict):
                    ctx.extra = {}
                tu_list = ctx.extra.get("anthropic_tool_uses")
                if not isinstance(tu_list, list):
                    tu_list = []
                tu_list.append({"id": tid, "name": "computer", "input": payload})
                ctx.extra["anthropic_tool_uses"] = tu_list
                self.window.core.ctx.update_item(ctx)
            except Exception:
                pass
            mapped = self._payload_to_tool_calls(tid, tid, payload)
            if mapped:
                tool_calls.extend(mapped)
                has_calls = True

    return tool_calls, has_calls
253
+
254
+ # --------------- Public normalization for function tools --------------- #
255
+
256
def normalize_function_args_json(self, name: str, args_json: Optional[str]) -> Optional[str]:
    """
    Bring the JSON-encoded arguments of a function call into plugin shape.

    The text is decoded, the function name is retargeted to its canonical plugin
    command, the arguments are normalized and filtered for that command, and the
    result is encoded back to JSON. Decoded values that are not a dict are
    treated as an empty argument set.

    :param name: function name as sent by the model
    :param args_json: JSON text with the arguments, or None
    :return: JSON text with normalized arguments, or None when args_json is None
             or decoding/encoding fails
    """
    if args_json is None:
        return None

    try:
        decoded = json.loads(args_json)
    except Exception:
        return None

    if not isinstance(decoded, dict):
        decoded = {}

    command, params = self._retarget_function_name_and_args(name, decoded)
    params = self._normalize_params_for_plugin(command, params)
    params = self._filter_args_for_plugin(command, params)

    try:
        encoded = json.dumps(params, ensure_ascii=False)
    except Exception:
        return None
    return encoded
275
+
276
+ # --------------- Non-stream helpers --------------- #
277
+
278
def rewrite_tool_calls(self, tool_calls: List[dict]) -> List[dict]:
    """
    Rewrite a list of (non-streamed) tool calls into plugin-ready calls.

    Per item:
    - a call to one of the computer tool aliases with dict/list arguments is
      expanded into the plugin calls produced from its payload (the original
      item is kept when the expansion yields nothing)
    - any other call with dict arguments has its name retargeted and its
      arguments normalized, filtered and pruned; the item is mutated IN-PLACE,
      so callers holding the original list/reference see the rewritten
      name and arguments
    - everything else, and any item whose processing raises, is passed through

    :param tool_calls: list of tool call dicts (may be None or empty)
    :return: new list with the rewritten calls
    """
    rewritten: List[dict] = []
    for position, call in enumerate(tool_calls or []):
        try:
            func = call.get("function") or {}
            func_name = str(func.get("name", "") or "")
            raw_args = func.get("arguments", {})
            if isinstance(raw_args, str):
                parsed = self._safe_json_loads(raw_args)
            else:
                parsed = raw_args

            if func_name in self.COMPUTER_TOOL_NAMES and isinstance(parsed, (dict, list)):
                # Anthropic "computer" payload -> sequence of plugin calls
                first_id = call.get("id") or self._gen_id()
                second_id = call.get("call_id") or call.get("id") or self._gen_id()
                expanded = self._payload_to_tool_calls(first_id, second_id, parsed)
                rewritten.extend(expanded if expanded else [call])
            elif isinstance(parsed, dict):
                # Direct function call -> normalize, filter, then mutate in place
                command, params = self._retarget_function_name_and_args(func_name, parsed)
                params = self._normalize_params_for_plugin(command, params)
                params = self._filter_args_for_plugin(command, params)
                params = self._prune_none(params)

                func["name"] = command
                func["arguments"] = json.dumps(params, ensure_ascii=False)
                call["function"] = func

                # Write back into the incoming list as well
                tool_calls[position] = call
                rewritten.append(call)
            else:
                # Nothing to rewrite
                rewritten.append(call)
        except Exception:
            rewritten.append(call)
    return rewritten
333
+
334
+ # --------------- Parsers / mappers --------------- #
335
+
336
def _payload_to_tool_calls(self, id_: str, call_id: str, payload: Any) -> List[dict]:
    """
    Turn one computer tool payload into a list of plugin tool calls.

    :param id_: value used as "id" of every produced call
    :param call_id: value used as "call_id" of every produced call
    :param payload: decoded tool_use input (dict, list or None)
    :return: mapped calls, in the order the actions appear in the payload
    """
    candidates = (
        self._map_single_action(step, id_, call_id)
        for step in self._extract_actions(payload)
    )
    return [call for call in candidates if call]
344
+
345
+ def _extract_actions(self, payload: Any) -> List[dict]:
346
+ if payload is None:
347
+ return []
348
+
349
+ def _coerce_action(obj: Any) -> Optional[dict]:
350
+ if obj is None:
351
+ return None
352
+ if isinstance(obj, dict):
353
+ # If Anthropic sends {"type": "...", ...}
354
+ if "type" in obj:
355
+ return obj
356
+ # If Anthropic sends {"action": "left_click", "coordinate": [...]}
357
+ if "action" in obj and isinstance(obj["action"], str) and obj["action"]:
358
+ act = {"type": str(obj["action"]).strip().lower()}
359
+ for k in ("x", "y", "button", "dx", "dy", "scroll_x", "scroll_y", "keys", "key",
360
+ "text", "value", "path", "from", "to", "coordinate", "destination", "offset", "delta",
361
+ "seconds", "sec", "count", "num_clicks", "unit"):
362
+ if k in obj:
363
+ act[k] = obj[k]
364
+ return act
365
+ # If Anthropic sends {"action": {...}}
366
+ if "action" in obj and isinstance(obj["action"], dict):
367
+ return obj["action"]
368
+ return None
369
+ if isinstance(obj, str):
370
+ s = obj.strip().lower()
371
+ if s:
372
+ return {"type": s}
373
+ return None
374
+
375
+ if isinstance(payload, dict):
376
+ if "actions" in payload and isinstance(payload["actions"], list):
377
+ out = []
378
+ for it in payload["actions"]:
379
+ coerced = _coerce_action(it)
380
+ if coerced:
381
+ out.append(coerced)
382
+ return out
383
+ coerced = _coerce_action(payload)
384
+ return [coerced] if coerced else []
385
+
386
+ if isinstance(payload, list):
387
+ out = []
388
+ for it in payload:
389
+ coerced = _coerce_action(it)
390
+ if coerced:
391
+ out.append(coerced)
392
+ return out
393
+
394
+ return []
395
+
396
+ def _extract_xy(self, action: dict) -> Tuple[Optional[int], Optional[int]]:
397
+ try:
398
+ if "x" in action and "y" in action:
399
+ return int(action["x"]), int(action["y"])
400
+ except Exception:
401
+ pass
402
+ for key in ("coordinate", "coordinates", "position", "point", "center", "location", "loc", "pos"):
403
+ val = action.get(key)
404
+ if isinstance(val, (list, tuple)) and len(val) >= 2:
405
+ try:
406
+ return int(val[0]), int(val[1])
407
+ except Exception:
408
+ pass
409
+ if isinstance(val, dict) and "x" in val and "y" in val:
410
+ try:
411
+ return int(val["x"]), int(val["y"])
412
+ except Exception:
413
+ pass
414
+ return None, None
415
+
416
+ def _extract_dxdy(self, action: dict) -> Tuple[int, int]:
417
+ def as_int(v, default=0):
418
+ try:
419
+ return int(v)
420
+ except Exception:
421
+ return default
422
+ if "dx" in action or "dy" in action:
423
+ return as_int(action.get("dx", 0)), as_int(action.get("dy", 0))
424
+ if "scroll_x" in action or "scroll_y" in action:
425
+ return as_int(action.get("scroll_x", 0)), as_int(action.get("scroll_y", 0))
426
+ for key in ("offset", "delta", "scroll", "wheel"):
427
+ val = action.get(key)
428
+ if isinstance(val, (list, tuple)) and len(val) >= 2:
429
+ return as_int(val[0]), as_int(val[1])
430
+ return 0, 0
431
+
432
+ def _parse_keys_list(self, keys_val: Any) -> List[str]:
433
+ """
434
+ Parse keys that can be string like 'ctrl+shift+p' or list of tokens.
435
+ """
436
+ if isinstance(keys_val, str):
437
+ return [p.strip() for p in keys_val.replace("+", " ").split() if p.strip()]
438
+ if isinstance(keys_val, list):
439
+ out = []
440
+ for k in keys_val:
441
+ if isinstance(k, str):
442
+ out.extend([p.strip() for p in k.replace("+", " ").split() if p.strip()])
443
+ else:
444
+ out.append(k)
445
+ return out
446
+ if keys_val is None:
447
+ return []
448
+ return [keys_val]
449
+
450
def _map_single_action(self, action: dict, id_: str, call_id: str) -> Optional[dict]:
    """
    Map one action dict to a single plugin tool call.

    The action kind is read from "type" (or "action"), lower-cased and stripped,
    and dispatched to one of: mouse_click, mouse_move, mouse_scroll, keyboard_type,
    keyboard_keys / key_combination, mouse_drag, get_screenshot, wait.
    An unrecognized kind is mapped to a 1-second wait so the flow is not broken.

    Numeric fields supplied by the model ("count", "num_clicks", "seconds", "sec")
    are coerced tolerantly: a value that cannot be read as a number falls back to
    the default for that field instead of raising.

    :param action: action dict (as produced by _extract_actions)
    :param id_: value used as "id" of the produced call
    :param call_id: value used as "call_id" of the produced call
    :return: tool call dict built by _build_call
    """
    def to_int(value: Any, fallback: int) -> int:
        # Accept ints, floats and numeric strings ("2", "1.5"); anything else -> fallback.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return fallback

    atype = str(action.get("type", "") or action.get("action", "") or "").lower().strip()

    # Clicks
    if atype in {"click", "double_click", "dblclick", "dbl_click", "left_click", "right_click"}:
        x, y = self._extract_xy(action)
        button = action.get("button", "left")
        default_clicks = 2 if ("double" in atype or "dbl" in atype) else 1
        # "count" takes precedence over "num_clicks"
        num_clicks = to_int(action.get("count", action.get("num_clicks", default_clicks)), default_clicks)
        if atype == "right_click":
            button = "right"
        if atype == "left_click":
            button = "left"
            # left_click is a single click unless "num_clicks" is given explicitly
            num_clicks = 1 if "num_clicks" not in action else num_clicks
        args = {"button": button, "num_clicks": num_clicks}
        if x is not None and y is not None:
            args["x"] = x
            args["y"] = y
        return self._build_call(id_, call_id, "mouse_click", args)

    # Move / Hover
    if atype in {"move", "mouse_move", "hover"}:
        x, y = self._extract_xy(action)
        args = {}
        if x is not None and y is not None:
            args["x"] = x
            args["y"] = y
        return self._build_call(id_, call_id, "mouse_move", args)

    # Scroll
    if atype in {"scroll", "mouse_scroll"}:
        x, y = self._extract_xy(action)
        dx, dy = self._extract_dxdy(action)
        args = {"dx": dx, "dy": dy, "unit": "px"}
        if x is not None and y is not None:
            args["x"] = x
            args["y"] = y
        return self._build_call(id_, call_id, "mouse_scroll", args)

    # Type text
    if atype in {"type", "keyboard_type", "input"}:
        text = str(action.get("text", "") or action.get("value", "") or "")
        return self._build_call(id_, call_id, "keyboard_type", {"text": text})

    # Keys / key-combos
    if atype in {"keypress", "key", "keys", "key_combination"}:
        keys = self._parse_keys_list(action.get("keys", action.get("key")))
        mods = [k for k in keys if isinstance(k, str) and k.lower() in self.KEY_MODIFIERS]
        # More than one modifier is sent as a combination
        if atype == "key_combination" or len(mods) > 1:
            return self._build_call(id_, call_id, "key_combination", {"keys": keys or []})
        return self._build_call(id_, call_id, "keyboard_keys", {"keys": keys or []})

    # Drag and drop
    if atype in {"drag", "drag_and_drop", "mouse_drag"}:
        path = action.get("path")
        if isinstance(path, list) and len(path) >= 2 and isinstance(path[0], dict) and isinstance(path[1], dict):
            # Path form: first point is the start, second point is passed as dx/dy
            try:
                x0 = int(path[0].get("x"))
                y0 = int(path[0].get("y"))
                x1 = int(path[1].get("x"))
                y1 = int(path[1].get("y"))
                args = {"x": x0, "y": y0, "dx": x1, "dy": y1}
            except Exception:
                x0, y0 = self._extract_xy(path[0])
                x1, y1 = self._extract_xy(path[1])
                args = {"x": int(x0 or 0), "y": int(y0 or 0), "dx": int(x1 or 0), "dy": int(y1 or 0)}
            return self._build_call(id_, call_id, "mouse_drag", args)
        fx, fy = None, None
        tx, ty = None, None
        f = action.get("from")
        t = action.get("to")
        if isinstance(f, dict) or isinstance(f, (list, tuple)):
            fx, fy = self._extract_xy({"coordinate": f} if not isinstance(f, dict) else f)
        if isinstance(t, dict) or isinstance(t, (list, tuple)):
            tx, ty = self._extract_xy({"coordinate": t} if not isinstance(t, dict) else t)
        if fx is None or fy is None:
            # No usable "from": take the start point from the action itself
            fx, fy = self._extract_xy(action)
        if tx is None or ty is None:
            # No usable "to": use the first of destination/target/end that is present
            for key in ("destination", "target", "end"):
                val = action.get(key)
                if val is not None:
                    tx, ty = self._extract_xy({"coordinate": val} if not isinstance(val, dict) else val)
                    break
        if tx is None or ty is None:
            tx, ty = self._extract_dxdy(action)
        args = {"x": int(fx or 0), "y": int(fy or 0), "dx": int(tx or 0), "dy": int(ty or 0)}
        return self._build_call(id_, call_id, "mouse_drag", args)

    # Screenshot
    if atype in {"screenshot", "get_screenshot"}:
        return self._build_call(id_, call_id, "get_screenshot", {})

    # Wait
    if atype in {"wait", "sleep"}:
        secs = to_int(action.get("seconds", action.get("sec", 2)), 2)
        return self._build_call(id_, call_id, "wait", {"seconds": secs})

    # Fallback: short wait to avoid breaking flow
    return self._build_call(id_, call_id, "wait", {"seconds": 1})
546
+
547
+ # --------------- Build / normalize calls --------------- #
548
+
549
+ def _normalize_params_for_plugin(self, name: str, args: dict) -> dict:
550
+ """
551
+ Normalize arguments dict to match plugin Worker expectations.
552
+ Conservative and side-effect-free; only maps/renames, does not invent values.
553
+ """
554
+ if not isinstance(args, dict):
555
+ return {}
556
+ out = dict(args)
557
+
558
+ # coordinate -> x,y
559
+ if "coordinate" in out and ("x" not in out or "y" not in out):
560
+ coord = out.get("coordinate")
561
+ if isinstance(coord, (list, tuple)) and len(coord) >= 2:
562
+ try:
563
+ out["x"] = int(coord[0])
564
+ out["y"] = int(coord[1])
565
+ except Exception:
566
+ pass
567
+ elif isinstance(coord, dict) and "x" in coord and "y" in coord:
568
+ out["x"] = coord.get("x")
569
+ out["y"] = coord.get("y")
570
+ out.pop("coordinate", None)
571
+
572
+ # scroll_x/scroll_y -> dx,dy for scroll commands
573
+ if name in ("mouse_scroll", "scroll"):
574
+ if "scroll_x" in out and "dx" not in out:
575
+ try:
576
+ out["dx"] = int(out.get("scroll_x", 0))
577
+ except Exception:
578
+ pass
579
+ if "scroll_y" in out and "dy" not in out:
580
+ try:
581
+ out["dy"] = int(out.get("scroll_y", 0))
582
+ except Exception:
583
+ pass
584
+ if "unit" in out:
585
+ unit = str(out.get("unit")).lower().strip()
586
+ if unit in ("px", "pixel", "pixels"):
587
+ out["unit"] = "px"
588
+ elif unit in ("step", "steps", "notch", "notches", "line", "lines"):
589
+ out["unit"] = "step"
590
+
591
+ # offset / delta -> dx,dy
592
+ if "dx" not in out or "dy" not in out:
593
+ for k in ("offset", "delta"):
594
+ val = out.get(k)
595
+ if isinstance(val, (list, tuple)) and len(val) >= 2:
596
+ try:
597
+ out.setdefault("dx", int(val[0]))
598
+ out.setdefault("dy", int(val[1]))
599
+ except Exception:
600
+ pass
601
+
602
+ # action -> button/num_clicks mapping
603
+ act = str(out.get("action", "") or "").lower()
604
+ if act:
605
+ is_double = ("double" in act) or ("dbl" in act)
606
+ if "right" in act:
607
+ out["button"] = "right"
608
+ else:
609
+ out["button"] = out.get("button", "left")
610
+ out["num_clicks"] = 2 if is_double else int(out.get("num_clicks", 1))
611
+ out.pop("action", None)
612
+
613
+ # click -> button (defensive)
614
+ if name in ("mouse_click", "click") and "click" in out and "button" not in out:
615
+ out["button"] = out.pop("click")
616
+
617
+ # destination -> dx,dy for drag
618
+ if name in ("mouse_drag", "drag"):
619
+ dest = out.get("destination") or out.get("target") or out.get("end")
620
+ if dest is not None and ("dx" not in out or "dy" not in out):
621
+ if isinstance(dest, (list, tuple)) and len(dest) >= 2:
622
+ try:
623
+ out["dx"] = int(dest[0])
624
+ out["dy"] = int(dest[1])
625
+ except Exception:
626
+ pass
627
+ elif isinstance(dest, dict) and "x" in dest and "y" in dest:
628
+ out["dx"] = dest.get("x")
629
+ out["dy"] = dest.get("y")
630
+ out.pop("destination", None)
631
+ out.pop("target", None)
632
+ out.pop("end", None)
633
+
634
+ # keys normalization
635
+ if name in ("keyboard_keys", "key_combination"):
636
+ if "keys" not in out and "key" in out:
637
+ out["keys"] = [out.get("key")]
638
+ out.pop("key", None)
639
+ if isinstance(out.get("keys"), str):
640
+ out["keys"] = [p.strip() for p in out["keys"].replace("+", " ").split() if p.strip()]
641
+
642
+ # text normalization for type
643
+ if name == "keyboard_type":
644
+ if "text" not in out and "value" in out:
645
+ out["text"] = out.get("value")
646
+ out.pop("value", None)
647
+
648
+ # ensure unit for scroll
649
+ if name in ("mouse_scroll", "scroll"):
650
+ if "unit" not in out:
651
+ out["unit"] = "px"
652
+
653
+ return out
654
+
655
def _append_call(self, tool_calls: list, id_: str, call_id: str, name: str, args: dict) -> None:
    """
    Build a plugin tool call and add it to the end of the given list.

    :param tool_calls: list to append to (modified in place)
    :param id_: value used as "id" of the call
    :param call_id: value used as "call_id" of the call
    :param name: plugin command name
    :param args: command arguments
    """
    built = self._build_call(id_, call_id, name, args)
    tool_calls.append(built)
657
+
658
def _build_call(self, id_: str, call_id: str, name: str, args: dict) -> dict:
    """
    Assemble one plugin tool call of type "computer_call".

    The arguments are normalized, filtered for the command and pruned, then
    JSON-encoded. Every command except "get_screenshot" additionally gets
    "no_screenshot": true.

    :param id_: value used as "id"
    :param call_id: value used as "call_id"
    :param name: plugin command name
    :param args: command arguments (None/empty is treated as {})
    :return: tool call dict with "id", "call_id", "type" and "function"
    """
    params = self._normalize_params_for_plugin(name, args or {})
    params = self._filter_args_for_plugin(name, params)
    params = self._prune_none(params)
    wants_flag = name != "get_screenshot"
    if wants_flag:
        params["no_screenshot"] = True

    function_spec = {
        "name": name,
        "arguments": json.dumps(params, ensure_ascii=False),
    }
    return {
        "id": id_,
        "call_id": call_id,
        "type": "computer_call",
        "function": function_spec,
    }
673
+
674
+ # --------------- Utils --------------- #
675
+
676
+ def _retarget_function_name_and_args(self, name: str, args: dict) -> Tuple[str, dict]:
677
+ """
678
+ Convert action-style/synonym function names to canonical plugin command names and
679
+ adjust defaults (e.g., left_click/right_click/double_click).
680
+ """
681
+ src = (name or "").strip().lower()
682
+ target = self.ACTION_SYNONYMS.get(src, src)
683
+ out = dict(args or {})
684
+
685
+ # Click synonyms defaulting button/click count
686
+ if src in ("left_click", "right_click"):
687
+ out.setdefault("button", "left" if src == "left_click" else "right")
688
+ out.setdefault("num_clicks", 1)
689
+ elif src == "double_click":
690
+ out.setdefault("button", "left")
691
+ out.setdefault("num_clicks", 2)
692
+
693
+ # coordinate -> x,y (defensive retarget)
694
+ if "coordinate" in out and ("x" not in out or "y" not in out):
695
+ coord = out.get("coordinate")
696
+ if isinstance(coord, (list, tuple)) and len(coord) >= 2:
697
+ try:
698
+ out["x"] = int(coord[0])
699
+ out["y"] = int(coord[1])
700
+ except Exception:
701
+ pass
702
+ elif isinstance(coord, dict) and "x" in coord and "y" in coord:
703
+ out["x"] = coord.get("x")
704
+ out["y"] = coord.get("y")
705
+
706
+ return target, out
707
+
708
+ def _filter_args_for_plugin(self, name: str, args: Dict[str, Any]) -> Dict[str, Any]:
709
+ """
710
+ Strict allow-list per command to ensure no unsupported keys like "action" or "coordinate"
711
+ are passed to Worker. Also performs a few defensive conversions (click->button, etc.).
712
+ """
713
+ allow: Dict[str, set] = {
714
+ "mouse_move": {"x", "y", "click", "num_clicks"},
715
+ "mouse_click": {"x", "y", "button", "num_clicks"},
716
+ "mouse_scroll": {"x", "y", "dx", "dy", "unit"},
717
+ "mouse_drag": {"x", "y", "dx", "dy"},
718
+ "keyboard_key": {"key", "modifier"},
719
+ "keyboard_keys": {"keys"},
720
+ "keyboard_type": {"text", "modifier"},
721
+ "open_web_browser": {"url", "no_screenshot"},
722
+ "get_mouse_position": {"no_screenshot"},
723
+ "get_screenshot": {"no_screenshot"},
724
+ "wait": {"seconds", "no_screenshot"},
725
+ # native extras
726
+ "wait_5_seconds": set(),
727
+ "go_back": set(),
728
+ "go_forward": set(),
729
+ "search": set(),
730
+ "navigate": {"url"},
731
+ "click_at": {"x", "y"},
732
+ "hover_at": {"x", "y"},
733
+ "type_text_at": {"x", "y", "text", "press_enter", "clear_before_typing"},
734
+ "key_combination": {"keys"},
735
+ "scroll_document": {"direction", "magnitude"},
736
+ "scroll_at": {"direction", "magnitude", "x", "y"},
737
+ "drag_and_drop": {"x", "y", "destination_x", "destination_y"},
738
+ # action-style
739
+ "click": {"x", "y", "button", "num_clicks"},
740
+ "double_click": {"x", "y", "button", "num_clicks"},
741
+ "move": {"x", "y"},
742
+ "type": {"text"},
743
+ "keypress": {"keys"},
744
+ "scroll": {"x", "y", "dx", "dy", "unit"},
745
+ "drag": {"x", "y", "dx", "dy", "path"},
746
+ }
747
+ res: Dict[str, Any] = {}
748
+
749
+ # coordinate -> x,y (final defensive conversion)
750
+ if "coordinate" in args and ("x" not in args or "y" not in args):
751
+ coord = args.get("coordinate")
752
+ if isinstance(coord, (list, tuple)) and len(coord) >= 2:
753
+ try:
754
+ args["x"] = int(coord[0])
755
+ args["y"] = int(coord[1])
756
+ except Exception:
757
+ pass
758
+ elif isinstance(coord, dict) and "x" in coord and "y" in coord:
759
+ args["x"] = coord.get("x")
760
+ args["y"] = coord.get("y")
761
+
762
+ # action -> button/num_clicks (final defensive conversion)
763
+ if "action" in args and (name in ("mouse_click", "click", "mouse_move")):
764
+ act = str(args.get("action") or "").lower()
765
+ if "right" in act:
766
+ args["button"] = "right"
767
+ else:
768
+ args["button"] = args.get("button", "left")
769
+ args["num_clicks"] = 2 if ("double" in act or "dbl" in act) else int(args.get("num_clicks", 1))
770
+
771
+ # click -> button for mouse_click
772
+ if name in ("mouse_click", "click") and "button" not in args and "click" in args:
773
+ args["button"] = args.get("click")
774
+
775
+ allowed = allow.get(name)
776
+ if allowed is None:
777
+ tmp = dict(args)
778
+ tmp.pop("action", None)
779
+ tmp.pop("coordinate", None)
780
+ return tmp
781
+
782
+ for k in allowed:
783
+ if k in args and args[k] is not None:
784
+ res[k] = args[k]
785
+
786
+ # Normalize unit for scrolling
787
+ if name in ("mouse_scroll", "scroll"):
788
+ unit = str(res.get("unit", "px")).lower()
789
+ res["unit"] = "px" if unit in ("px", "pixel", "pixels") else "step"
790
+
791
+ return res
792
+
793
+ def _ensure_ctx_memory(self, ctx: CtxItem) -> Dict[str, Dict[str, str]]:
794
+ if not isinstance(ctx.extra, dict):
795
+ ctx.extra = {}
796
+ if "anthropic_computer" not in ctx.extra or not isinstance(ctx.extra["anthropic_computer"], dict):
797
+ ctx.extra["anthropic_computer"] = {
798
+ "buffers": {},
799
+ "index_to_id": {},
800
+ "active_ids": [],
801
+ }
802
+ else:
803
+ mem = ctx.extra["anthropic_computer"]
804
+ if "buffers" not in mem:
805
+ mem["buffers"] = {}
806
+ if "index_to_id" not in mem:
807
+ mem["index_to_id"] = {}
808
+ if "active_ids" not in mem or not isinstance(mem["active_ids"], list):
809
+ mem["active_ids"] = []
810
+ return ctx.extra["anthropic_computer"]
811
+
812
+ @staticmethod
813
+ def _safe_json_loads(s: str) -> Optional[Any]:
814
+ if not isinstance(s, str):
815
+ return None
816
+ s = s.strip()
817
+ if not s:
818
+ return None
819
+ try:
820
+ return json.loads(s)
821
+ except Exception:
822
+ try:
823
+ fixed = s
824
+ if fixed.count("{") > fixed.count("}"):
825
+ fixed += "}" * (fixed.count("{") - fixed.count("}"))
826
+ if fixed.count("[") > fixed.count("]"):
827
+ fixed += "]" * (fixed.count("[") - fixed.count("]"))
828
+ return json.loads(fixed)
829
+ except Exception:
830
+ return None
831
+
832
+ @staticmethod
833
+ def _gen_id(prefix: str = "ac") -> str:
834
+ return f"{prefix}-{int(time.time() * 1000)}"
835
+
836
+ @staticmethod
837
+ def _prune_none(d: Dict[str, Any]) -> Dict[str, Any]:
838
+ """
839
+ Remove keys with None values to avoid passing None to Worker.
840
+ """
841
+ try:
842
+ return {k: v for k, v in d.items() if v is not None}
843
+ except Exception:
844
+ return d