pygpt-net 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. pygpt_net/CHANGELOG.txt +7 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app_core.py +4 -2
  4. pygpt_net/controller/__init__.py +5 -1
  5. pygpt_net/controller/assistant/assistant.py +1 -4
  6. pygpt_net/controller/assistant/batch.py +5 -504
  7. pygpt_net/controller/assistant/editor.py +5 -5
  8. pygpt_net/controller/assistant/files.py +16 -16
  9. pygpt_net/controller/chat/handler/google_stream.py +307 -1
  10. pygpt_net/controller/chat/handler/worker.py +8 -1
  11. pygpt_net/controller/chat/image.py +2 -2
  12. pygpt_net/controller/dialogs/confirm.py +73 -101
  13. pygpt_net/controller/lang/mapping.py +9 -9
  14. pygpt_net/controller/painter/capture.py +50 -1
  15. pygpt_net/controller/presets/presets.py +2 -1
  16. pygpt_net/controller/remote_store/__init__.py +12 -0
  17. pygpt_net/{provider/core/assistant_file/db_sqlite → controller/remote_store/google}/__init__.py +2 -2
  18. pygpt_net/controller/remote_store/google/batch.py +402 -0
  19. pygpt_net/controller/remote_store/google/store.py +615 -0
  20. pygpt_net/controller/remote_store/openai/__init__.py +12 -0
  21. pygpt_net/controller/remote_store/openai/batch.py +524 -0
  22. pygpt_net/controller/{assistant → remote_store/openai}/store.py +63 -60
  23. pygpt_net/controller/remote_store/remote_store.py +35 -0
  24. pygpt_net/controller/ui/ui.py +20 -1
  25. pygpt_net/core/assistants/assistants.py +3 -15
  26. pygpt_net/core/db/database.py +5 -3
  27. pygpt_net/core/locale/placeholder.py +35 -0
  28. pygpt_net/core/remote_store/__init__.py +12 -0
  29. pygpt_net/core/remote_store/google/__init__.py +11 -0
  30. pygpt_net/core/remote_store/google/files.py +224 -0
  31. pygpt_net/core/remote_store/google/store.py +248 -0
  32. pygpt_net/core/remote_store/openai/__init__.py +11 -0
  33. pygpt_net/core/{assistants → remote_store/openai}/files.py +26 -19
  34. pygpt_net/core/{assistants → remote_store/openai}/store.py +32 -15
  35. pygpt_net/core/remote_store/remote_store.py +24 -0
  36. pygpt_net/data/config/config.json +8 -4
  37. pygpt_net/data/config/models.json +77 -3
  38. pygpt_net/data/config/settings.json +45 -0
  39. pygpt_net/data/locale/locale.de.ini +41 -41
  40. pygpt_net/data/locale/locale.en.ini +53 -43
  41. pygpt_net/data/locale/locale.es.ini +41 -41
  42. pygpt_net/data/locale/locale.fr.ini +41 -41
  43. pygpt_net/data/locale/locale.it.ini +41 -41
  44. pygpt_net/data/locale/locale.pl.ini +42 -42
  45. pygpt_net/data/locale/locale.uk.ini +41 -41
  46. pygpt_net/data/locale/locale.zh.ini +41 -41
  47. pygpt_net/data/locale/plugin.cmd_history.de.ini +1 -1
  48. pygpt_net/data/locale/plugin.cmd_history.en.ini +1 -1
  49. pygpt_net/data/locale/plugin.cmd_history.es.ini +1 -1
  50. pygpt_net/data/locale/plugin.cmd_history.fr.ini +1 -1
  51. pygpt_net/data/locale/plugin.cmd_history.it.ini +1 -1
  52. pygpt_net/data/locale/plugin.cmd_history.pl.ini +1 -1
  53. pygpt_net/data/locale/plugin.cmd_history.uk.ini +1 -1
  54. pygpt_net/data/locale/plugin.cmd_history.zh.ini +1 -1
  55. pygpt_net/data/locale/plugin.cmd_mouse_control.en.ini +14 -0
  56. pygpt_net/data/locale/plugin.cmd_web.de.ini +1 -1
  57. pygpt_net/data/locale/plugin.cmd_web.en.ini +1 -1
  58. pygpt_net/data/locale/plugin.cmd_web.es.ini +1 -1
  59. pygpt_net/data/locale/plugin.cmd_web.fr.ini +1 -1
  60. pygpt_net/data/locale/plugin.cmd_web.it.ini +1 -1
  61. pygpt_net/data/locale/plugin.cmd_web.pl.ini +1 -1
  62. pygpt_net/data/locale/plugin.cmd_web.uk.ini +1 -1
  63. pygpt_net/data/locale/plugin.cmd_web.zh.ini +1 -1
  64. pygpt_net/data/locale/plugin.idx_llama_index.de.ini +2 -2
  65. pygpt_net/data/locale/plugin.idx_llama_index.en.ini +2 -2
  66. pygpt_net/data/locale/plugin.idx_llama_index.es.ini +2 -2
  67. pygpt_net/data/locale/plugin.idx_llama_index.fr.ini +2 -2
  68. pygpt_net/data/locale/plugin.idx_llama_index.it.ini +2 -2
  69. pygpt_net/data/locale/plugin.idx_llama_index.pl.ini +2 -2
  70. pygpt_net/data/locale/plugin.idx_llama_index.uk.ini +2 -2
  71. pygpt_net/data/locale/plugin.idx_llama_index.zh.ini +2 -2
  72. pygpt_net/item/assistant.py +1 -211
  73. pygpt_net/item/ctx.py +3 -1
  74. pygpt_net/item/store.py +238 -0
  75. pygpt_net/migrations/Version20260102190000.py +35 -0
  76. pygpt_net/migrations/__init__.py +3 -1
  77. pygpt_net/plugin/cmd_mouse_control/config.py +470 -1
  78. pygpt_net/plugin/cmd_mouse_control/plugin.py +488 -22
  79. pygpt_net/plugin/cmd_mouse_control/worker.py +464 -87
  80. pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +729 -0
  81. pygpt_net/plugin/idx_llama_index/config.py +2 -2
  82. pygpt_net/provider/api/google/__init__.py +16 -54
  83. pygpt_net/provider/api/google/chat.py +546 -129
  84. pygpt_net/provider/api/google/computer.py +190 -0
  85. pygpt_net/provider/api/google/realtime/realtime.py +2 -2
  86. pygpt_net/provider/api/google/remote_tools.py +93 -0
  87. pygpt_net/provider/api/google/store.py +546 -0
  88. pygpt_net/provider/api/google/worker/__init__.py +0 -0
  89. pygpt_net/provider/api/google/worker/importer.py +392 -0
  90. pygpt_net/provider/api/openai/computer.py +10 -1
  91. pygpt_net/provider/api/openai/store.py +6 -6
  92. pygpt_net/provider/api/openai/worker/importer.py +24 -24
  93. pygpt_net/provider/core/config/patch.py +16 -1
  94. pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +3 -3
  95. pygpt_net/provider/core/model/patch.py +17 -3
  96. pygpt_net/provider/core/preset/json_file.py +13 -7
  97. pygpt_net/provider/core/{assistant_file → remote_file}/__init__.py +1 -1
  98. pygpt_net/provider/core/{assistant_file → remote_file}/base.py +9 -9
  99. pygpt_net/provider/core/remote_file/db_sqlite/__init__.py +12 -0
  100. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/patch.py +1 -1
  101. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/provider.py +23 -20
  102. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/storage.py +35 -27
  103. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/utils.py +5 -4
  104. pygpt_net/provider/core/{assistant_store → remote_store}/__init__.py +1 -1
  105. pygpt_net/provider/core/{assistant_store → remote_store}/base.py +10 -10
  106. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/__init__.py +1 -1
  107. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/patch.py +1 -1
  108. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/provider.py +16 -15
  109. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/storage.py +30 -23
  110. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/utils.py +5 -4
  111. pygpt_net/provider/core/{assistant_store → remote_store}/json_file.py +9 -9
  112. pygpt_net/provider/llms/google.py +2 -2
  113. pygpt_net/ui/base/config_dialog.py +3 -2
  114. pygpt_net/ui/dialog/assistant.py +3 -3
  115. pygpt_net/ui/dialog/plugins.py +3 -1
  116. pygpt_net/ui/dialog/remote_store_google.py +539 -0
  117. pygpt_net/ui/dialog/{assistant_store.py → remote_store_openai.py} +95 -95
  118. pygpt_net/ui/dialogs.py +5 -3
  119. pygpt_net/ui/layout/chat/attachments_uploaded.py +3 -3
  120. pygpt_net/ui/layout/toolbox/computer_env.py +26 -8
  121. pygpt_net/ui/menu/tools.py +13 -5
  122. pygpt_net/ui/widget/dialog/remote_store_google.py +56 -0
  123. pygpt_net/ui/widget/dialog/{assistant_store.py → remote_store_openai.py} +9 -9
  124. pygpt_net/ui/widget/element/button.py +4 -4
  125. pygpt_net/ui/widget/lists/remote_store_google.py +248 -0
  126. pygpt_net/ui/widget/lists/{assistant_store.py → remote_store_openai.py} +21 -21
  127. pygpt_net/ui/widget/option/checkbox_list.py +47 -9
  128. pygpt_net/ui/widget/option/combo.py +39 -3
  129. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/METADATA +33 -2
  130. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/RECORD +133 -108
  131. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/LICENSE +0 -0
  132. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/WHEEL +0 -0
  133. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/chat.py
@@ -6,15 +6,16 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date: 2025.08.28 20:00:00 #
+# Updated Date: 2026.01.03 02:10:00 #
 # ================================================== #
 
-from typing import Optional, Dict, Any, List
+import os
+from typing import Optional, Dict, Any, List, Tuple
 
 from google.genai import types as gtypes
 from google.genai.types import Content, Part
 
-from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO, MODE_COMPUTER, MODE_RESEARCH
 from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
 from pygpt_net.item.attachment import AttachmentItem
 from pygpt_net.item.ctx import CtxItem
@@ -35,7 +36,7 @@ class Chat:
         extra: Optional[Dict[str, Any]] = None
     ):
         """
-        Call Google GenAI for chat / multimodal / audio.
+        Call Google GenAI for chat / multimodal / audio / computer use.
 
         :param context: BridgeContext with prompt, model, history, mode, etc.
         :param extra: Extra parameters (not used currently)
@@ -62,7 +63,6 @@
 
         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
         if mode == MODE_AUDIO and has_audio_input:
-            # Build minimal transcription request: [instruction text, audio part]
             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
@@ -73,13 +73,10 @@
                 ])
             ]
             trans_cfg = gtypes.GenerateContentConfig(
-                # Keep minimal; no tools/system for transcription
                 temperature=self.window.core.config.get('temperature'),
                 top_p=self.window.core.config.get('top_p'),
                 max_output_tokens=context.max_tokens if context.max_tokens else None,
             )
-
-            # Always non-stream here (we immediately need the text for TTS)
             trans_resp = client.models.generate_content(
                 model=transcribe_model,
                 contents=trans_inputs,
@@ -87,20 +84,17 @@
             )
             transcribed_text = self.extract_text(trans_resp).strip()
             if transcribed_text:
-                # Feed transcription into TTS as the final prompt
                 prompt = transcribed_text
                 ctx.input = transcribed_text
                 try:
-                    # optional: store for debugging/UX
                     if isinstance(ctx.extra, dict):
                         ctx.extra["transcription"] = transcribed_text
                 except Exception:
                     pass
-                ctx.is_audio = False  # transcription is text
-                multimodal_ctx.is_audio_input = False  # disable audio input for TTS below
+                ctx.is_audio = False
+                multimodal_ctx.is_audio_input = False
 
-        # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
-        # Build contents for chat/multimodal (will be overridden for TTS below)
+        # ---------------------- REGULAR CHAT/COMPUTER PATH ----------------------
         inputs = self.build_input(
             prompt=prompt,
             system_prompt=system_prompt,
@@ -108,23 +102,35 @@
             history=context.history,
             attachments=attachments,
             multimodal_ctx=multimodal_ctx,
+            mode=mode,
         )
 
         # Best-effort input tokens estimate
         self.reset_tokens()
-        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history, mode)
        self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
 
         # Tools -> merge app-defined tools with remote tools
         base_tools = self.window.core.api.google.tools.prepare(model, functions)
-        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        remote_tools = self.window.core.api.google.remote_tools.build_remote_tools(model)
 
-        # Check tools compatibility
+        # Note: Combining native (remote) tools with function declarations is documented as Live API-only.
         if base_tools:
-            remote_tools = []  # remote tools are not allowed if function calling is used
+            remote_tools = []
         tools = (base_tools or []) + (remote_tools or [])
-        if "-image" in model.id:
-            tools = None  # function calling is not supported for image models
+
+        # Enable Computer Use tool in computer mode (use the official Tool/ComputerUse object)
+        if mode == MODE_COMPUTER or (model and isinstance(model.id, str) and "computer-use" in model.id.lower()):
+            comp_env = gtypes.Environment.ENVIRONMENT_BROWSER
+            tools = [gtypes.Tool(
+                computer_use=gtypes.ComputerUse(
+                    environment=comp_env,
+                )
+            )]  # reset tools to only Computer Use (multiple tools not supported together)
+
+        # Some models cannot use tools; keep behavior for image-only models
+        if model and isinstance(model.id, str) and "-image" in model.id:
+            tools = None
 
         # Sampling
         temperature = self.window.core.config.get('temperature')
@@ -145,10 +151,9 @@
             stream = False  # TTS non-stream in this app
             supports_tts = self._supports_tts(model.id)
 
-            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
 
-            # Remove params not used by TTS flow (and that sometimes cause issues)
+            # Remove params not used by TTS flow
             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
                 if key in cfg_kwargs:
                     del cfg_kwargs[key]
@@ -171,11 +176,93 @@
                         prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
                     )
                 )
-            # else: fallback to text-only below
-
         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
         params = dict(model=model.id, contents=inputs, config=cfg)
 
+        if mode == MODE_RESEARCH:
+            ctx.use_google_interactions_api = True
+
+            # Deep Research does not support audio inputs; if an audio snippet is present, transcribe it to text first.
+            if has_audio_input:
+                try:
+                    transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+                    transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+                    audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+                    trans_inputs = [
+                        Content(role="user", parts=[
+                            Part.from_text(text=transcribe_prompt),
+                            audio_part,
+                        ])
+                    ]
+                    trans_cfg = gtypes.GenerateContentConfig(
+                        temperature=self.window.core.config.get('temperature'),
+                        top_p=self.window.core.config.get('top_p'),
+                        max_output_tokens=context.max_tokens if context.max_tokens else None,
+                    )
+                    trans_resp = client.models.generate_content(
+                        model=transcribe_model,
+                        contents=trans_inputs,
+                        config=trans_cfg,
+                    )
+                    transcribed_text = self.extract_text(trans_resp).strip()
+                    if transcribed_text:
+                        prompt = (str(prompt or "").strip() + "\n\n" + transcribed_text).strip() if prompt else transcribed_text
+                        ctx.input = transcribed_text
+                        try:
+                            if isinstance(ctx.extra, dict):
+                                ctx.extra["transcription"] = transcribed_text
+                        except Exception:
+                            pass
+                except Exception:
+                    pass
+            # Ensure we don't send raw audio to Interactions API
+            if multimodal_ctx:
+                multimodal_ctx.is_audio_input = False
+
+            # Build single-turn multimodal input for Interactions API (no full chat history)
+            research_parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+            interactions_input = self._parts_to_interactions_input(research_parts)
+
+            # Try to continue context with the last completed interaction (server-side state)
+            prev_interaction_id, last_event_id, last_status = self._find_last_interaction_state(
+                history=context.history,
+                ctx=ctx,
+            )
+            try:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                if prev_interaction_id:
+                    ctx.extra["previous_interaction_id"] = prev_interaction_id
+                if last_event_id:
+                    ctx.extra["google_last_event_id"] = last_event_id
+                if last_status:
+                    ctx.extra["google_interaction_status"] = last_status
+            except Exception:
+                pass
+
+            # Deep Research agent must use background=True; stream=True enables live progress updates.
+            create_kwargs: Dict[str, Any] = {
+                "agent": model.id,
+                "input": interactions_input if interactions_input else (str(prompt or "") or " "),
+                "background": True,
+                "stream": stream,
+                "agent_config": {
+                    "type": "deep-research",
+                    "thinking_summaries": "auto"
+                }
+            }
+
+            # Continue conversation on server using previous_interaction_id if available
+            if prev_interaction_id:
+                create_kwargs["previous_interaction_id"] = prev_interaction_id
+
+            # Do not pass custom tools here; Deep Research manages its own built-in tools.
+            return client.interactions.create(**create_kwargs)
+
         if stream and mode != MODE_AUDIO:
             return client.models.generate_content_stream(**params)
         else:
@@ -189,28 +276,21 @@
         """
         Unpack non-streaming response from Google GenAI and set context.
 
-        :param mode: MODE_CHAT or MODE_AUDIO
+        :param mode: MODE_CHAT, MODE_AUDIO or MODE_COMPUTER
         :param response: Response object
         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
         """
         if mode == MODE_AUDIO:
-            # Prefer audio if present
             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
             if audio_bytes:
-                # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
-                # https://ai.google.dev/gemini-api/docs/speech-generation
-                if mime == "audio/pcm" or mime.startswith("audio/"):
+                if mime == "audio/pcm" or (isinstance(mime, str) and mime.startswith("audio/")):
                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
                     ctx.audio_output = wav_b64
                     ctx.is_audio = True
-                # Text transcript is typically not present for TTS; still try:
                 txt = self.extract_text(response)
                 ctx.output = txt or "..."
             else:
-                # No audio present -> fallback to text
                 ctx.output = self.extract_text(response)
-
-            # Usage
             try:
                 usage = getattr(response, "usage_metadata", None)
                 if usage:
@@ -219,17 +299,58 @@
                     ctx.set_tokens(p, c)
             except Exception:
                 pass
+            return
 
-            return  # audio path done
+        # ---- chat / computer ----
+        ctx.output = self.extract_text(response) or ""
 
-        # ---- regular chat/completion ----
-        ctx.output = self.extract_text(response)
-
-        # Extract function calls
+        # 1) Extract tool calls and store in ctx.tool_calls (backward-compatible shape)
         calls = self.extract_tool_calls(response)
         if calls:
             ctx.tool_calls = calls
 
+        # 2) In MODE_COMPUTER: capture raw model parts (with thought_signature) for next FunctionResponse turn
+        # and translate Computer Use calls into plugin commands now.
+        if mode == MODE_COMPUTER:
+            candidate = None
+            try:
+                cands = getattr(response, "candidates", None) or []
+                if cands:
+                    candidate = cands[0]
+            except Exception:
+                pass
+
+            if candidate and getattr(candidate, "content", None):
+                parts = getattr(candidate.content, "parts", None) or []
+                dump = self._dump_model_parts(parts)
+                if dump:
+                    if ctx.extra is None:
+                        ctx.extra = {}
+                    ctx.extra["prev_model_parts"] = dump
+
+            tool_calls: List[dict] = []
+            try:
+                tool_calls, has_calls = self.window.core.api.google.computer.handle_stream_chunk(
+                    ctx=ctx,
+                    chunk=response,
+                    tool_calls=tool_calls,
+                )
+            except Exception as e:
+                has_calls = False
+                print(f"Gemini computer-use mapping error: {e}")
+
+            if has_calls and tool_calls:
+                ctx.force_call = True
+                self.window.core.debug.info("[chat] Google tool calls found, unpacking...")
+                self.window.core.command.unpack_tool_calls_chunks(ctx, tool_calls)
+
+            if calls:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                ctx.extra["function_response_required"] = True
+                ctx.extra["function_response_source"] = "ctx.tool_calls"
+                ctx.extra["function_response_reason"] = "computer_use"
+
         # Usage if available
         try:
             usage = getattr(response, "usage_metadata", None)
@@ -283,12 +404,11 @@
         :return: List of tool calls
         """
         def _to_plain_dict(obj):
-            # Convert pydantic/genai objects to plain dict recursively
             try:
                 if hasattr(obj, "to_json_dict"):
                     return obj.to_json_dict()
                 if hasattr(obj, "model_dump"):
-                    return obj.model_dump()  # pydantic v2
+                    return obj.model_dump()
                 if hasattr(obj, "to_dict"):
                     return obj.to_dict()
             except Exception:
@@ -307,7 +427,6 @@
                     name = getattr(fc, "name", "") or ""
                     args_obj = getattr(fc, "args", {}) or {}
                     args_dict = _to_plain_dict(args_obj) or {}
-                    # if str, try to parse
                     if isinstance(args_dict, str):
                         try:
                             import json
@@ -319,7 +438,7 @@
                         "type": "function",
                         "function": {
                             "name": name,
-                            "arguments": args_dict,  # <--- DICT, not string
+                            "arguments": args_dict,
                         }
                     })
 
@@ -345,11 +464,11 @@
                 except Exception:
                     args_dict = {}
                 out.append({
-                    "id": "",
+                    "id": getattr(fn, "id", "") or "",
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict,  # <--- DICT
+                        "arguments": args_dict,
                     }
                 })
         except Exception:
@@ -357,17 +476,274 @@
 
         return out
 
+    def build_input(
+        self,
+        prompt: str,
+        system_prompt: str,
+        model: ModelItem,
+        history: Optional[List[CtxItem]] = None,
+        attachments: Optional[Dict[str, AttachmentItem]] = None,
+        multimodal_ctx: Optional[MultimodalContext] = None,
+        mode: str = MODE_CHAT,
+    ) -> List[Content]:
+        """
+        Build Google GenAI contents list
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :param attachments: Dict of AttachmentItem for images/screenshots
+        :param multimodal_ctx: MultimodalContext for audio
+        :param mode: MODE_CHAT / MODE_AUDIO / MODE_COMPUTER
+        :return: List of Content
+        """
+        # FunctionResponse turn for Computer Use (strictly immediate after functionCall)
+        if mode == MODE_COMPUTER and self.window.core.config.get('use_context'):
+            hist = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+            fr_contents = self._build_function_responses_from_history(hist, attachments)
+            if fr_contents:
+                return fr_contents
+
+        # Build conversation history first to detect "first input"
+        items: List[CtxItem] = []
+        if self.window.core.config.get('use_context'):
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+
+        is_first_turn = (len(items) == 0)
+        is_sandbox = bool(self.window.core.config.get("remote_tools.computer_use.sandbox", False))
+
+        contents: List[Content] = []
+
+        # Append conversation history (text only)
+        for item in items:
+            if item.final_input:
+                contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+            if item.final_output:
+                contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+        # Current user message:
+        # - In MODE_COMPUTER attach initial screenshot only on the very first turn
+        if mode == MODE_COMPUTER:
+            initial_attachments = {}
+            if is_first_turn and not attachments and not is_sandbox:
+                self.window.controller.attachment.clear_silent()
+                self.window.controller.painter.capture.screenshot(attach_cursor=True, silent=True)
+                initial_attachments = self.window.core.attachments.get_all(mode)
+            send_attachments = initial_attachments if initial_attachments else attachments
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=send_attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        else:
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        contents.append(Content(role="user", parts=parts))
+
+        return contents
+
+    def _build_function_responses_from_history(
+        self,
+        history: Optional[List[CtxItem]],
+        attachments: Optional[Dict[str, AttachmentItem]],
+    ) -> Optional[List[Content]]:
+        """
+        Build FunctionResponse contents for the immediate next turn after executing
+        Computer Use function calls. It reconstructs the last user -> model(functionCall) turn
+        and returns [user_content, model_function_call_content, tool_function_response_content].
+        """
+        if not self.window.core.config.get('use_context') or not history:
+            return None
+
+        last_item = history[-1]
+        if not getattr(last_item, "extra", None):
+            return None
+        if not last_item.extra.get("function_response_required"):
+            return None
+
+        # 1) Find the user message that started the current turn (previous item's input)
+        prior_user_text = ""
+        if len(history) >= 2:
+            prev = history[-2]
+            if getattr(prev, "final_input", None):
+                prior_user_text = str(prev.final_input)
+
+        if not prior_user_text and getattr(last_item, "input", None):
+            prior_user_text = str(last_item.input)
+
+        if not prior_user_text:
+            prior_user_text = "..."
+
+        user_content = Content(role="user", parts=[Part.from_text(text=prior_user_text)])
+
+        # 2) Rebuild the model functionCall content with thought_signature preserved
+        raw_parts = last_item.extra.get("prev_model_parts", [])
+        model_parts = self._rehydrate_model_parts(raw_parts)
+        if not model_parts:
+            model_parts = self._rehydrate_from_tool_calls(getattr(last_item, "tool_calls", []))
+        # append also text part if not empty
+        if getattr(last_item, "final_output", None):
+            output_text = str(last_item.final_output).strip()
+            if output_text:
+                model_parts.append(Part.from_text(text=output_text))
+
+        model_fc_content = Content(role="model", parts=model_parts)
+
+        # 3) Build a single tool content with N FunctionResponse parts (one per functionCall)
+        screenshot_part = self._screenshot_function_response_part(attachments)
+        fr_parts: List[Part] = []
+        for p in model_parts:
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                fr = Part.from_function_response(
+                    name=fn.name,
+                    response=self._minimal_tool_response(last_item),
+                    parts=[screenshot_part] if screenshot_part else None
+                )
+                fr_parts.append(fr)
+
+        if not fr_parts:
+            return None
+
+        tool_content = Content(role="tool", parts=fr_parts)
+
+        return [user_content, model_fc_content, tool_content]
+
+    def _rehydrate_from_tool_calls(self, calls: List[dict]) -> List[Part]:
+        """
+        Fallback rehydration when prev_model_parts are unavailable (no thought signatures).
+        """
+        parts: List[Part] = []
+        for c in calls or []:
+            if not isinstance(c, dict):
+                continue
+            if c.get("type") != "function":
+                continue
+            fn = c.get("function") or {}
+            name = fn.get("name")
+            args = fn.get("arguments") or {}
+            if not name:
+                continue
+            parts.append(Part.from_function_call(name=name, args=args))
+        return parts
+
+    def _dump_model_parts(self, parts: List[Part]) -> List[dict]:
+        """
+        Dump model parts into a JSON-serializable structure, preserving thought_signature.
+        """
+        out: List[dict] = []
+        for p in parts or []:
+            ts = getattr(p, "thought_signature", None)
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                name = getattr(fn, "name", "") or ""
+                args = getattr(fn, "args", {}) or {}
+                out.append({
+                    "type": "function_call",
+                    "name": name,
+                    "args": args,
+                    "thought_signature": ts,
+                })
+            elif getattr(p, "text", None):
+                out.append({"type": "text", "text": str(p.text)})
+        return out
+
+    def _rehydrate_model_parts(self, raw_parts: List[dict]) -> List[Part]:
+        """
+        Recreate SDK Part objects from dumped parts, including thought_signature on the Part.
+        """
+        parts: List[Part] = []
+        for it in raw_parts or []:
+            t = (it.get("type") or "").lower()
+            if t == "function_call":
+                name = it.get("name")
+                args = it.get("args") or {}
+                ts = it.get("thought_signature")
+                if name:
+                    parts.append(Part(function_call=gtypes.FunctionCall(name=name, args=args),
+                                      thought_signature=ts))
+            elif t == "text":
+                parts.append(Part.from_text(text=str(it.get("text", ""))))
+        return parts
+
+    def _screenshot_function_response_part(
+        self,
+        attachments: Optional[Dict[str, AttachmentItem]]
+    ) -> Optional[gtypes.FunctionResponsePart]:
+        """
+        Build FunctionResponsePart with inlineData PNG/JPEG screenshot from attachments.
+        """
+        if not attachments:
+            return None
+
+        chosen_path = None
+        for _, att in attachments.items():
+            if not att or not att.path:
+                continue
+            p = att.path
+            if isinstance(p, str) and os.path.exists(p):
+                ext = os.path.splitext(p)[1].lower()
+                if ext in (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"):
+                    chosen_path = p
+                    if ext == ".png":
+                        break
+
+        if not chosen_path:
+            return None
+
+        try:
+            with open(chosen_path, "rb") as f:
+                data = f.read()
+            blob = gtypes.FunctionResponseBlob(
+                mime_type="image/png" if chosen_path.lower().endswith(".png") else "image/jpeg",
+                data=data,
+            )
+            return gtypes.FunctionResponsePart(inline_data=blob)
+        except Exception:
+            return None
+
+    @staticmethod
+    def _minimal_tool_response(item: CtxItem) -> Dict[str, Any]:
+        """
+        Construct a minimal structured payload for FunctionResponse.response.
+        """
+        resp: Dict[str, Any] = {"ok": True}
+        try:
+            if item and item.extra and isinstance(item.extra, dict):
+                outputs = item.extra.get("tool_output")
+                if isinstance(outputs, list) and len(outputs) > 0:
+                    last = outputs[-1]
+                    if isinstance(last, dict):
+                        if "result" in last and isinstance(last["result"], dict):
+                            resp = last["result"]
+                        if "error" in last:
+                            resp["error"] = last["error"]
+        except Exception:
+            pass
+        return resp
+
     def _extract_inline_images_and_links(
         self,
         response, ctx: CtxItem
     ) -> None:
         """
         Extract inline image parts (Gemini image output) and file links.
-        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
-        - Appends HTTP(S) image URIs from file_data to ctx.urls.
-
-        :param response: Response object
-        :param ctx: CtxItem to set images and urls
         """
         images: list[str] = []
         urls: list[str] = []
@@ -378,7 +754,6 @@
                 content = getattr(cand, "content", None)
                 parts = getattr(content, "parts", None) or []
                 for p in parts:
-                    # Inline image bytes (image preview / image generation in chat)
                     blob = getattr(p, "inline_data", None)
                     if blob:
                         mime = (getattr(blob, "mime_type", "") or "").lower()
@@ -392,16 +767,14 @@
                                 f.write(img_bytes)
                             images.append(img_path)
 
-                    # File data URI (may contain http/https or gs://)
                     fdata = getattr(p, "file_data", None)
                     if fdata:
                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
                         mime = (getattr(fdata, "mime_type", "") or "").lower()
                         if uri and mime.startswith("image/"):
-                            # Store only as URL; downloading is out of scope here.
                            if uri.startswith("http://") or uri.startswith("https://"):
                                urls.append(uri)
-        except Exception as e:
+        except Exception:
             pass
 
         if images:
@@ -418,9 +791,6 @@
     def _ensure_bytes(data) -> bytes | None:
         """
         Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string.
-
-        :param data: bytes or str
-        :return: bytes or None
         """
         try:
             if isinstance(data, (bytes, bytearray)):
@@ -432,56 +802,6 @@
                 return None
         return None
 
-    def build_input(
-        self,
-        prompt: str,
-        system_prompt: str,
-        model: ModelItem,
-        history: Optional[List[CtxItem]] = None,
-        attachments: Optional[Dict[str, AttachmentItem]] = None,
-        multimodal_ctx: Optional[MultimodalContext] = None,
-    ) -> List[Content]:
-        """
-        Build Google GenAI contents list
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Content
-        """
-        contents: List[Content] = []
-
-        # System instruction is passed separately (system_instruction),
-        # so we do not build an explicit system role part here.
-
-        # Append conversation history
-        if self.window.core.config.get('use_context'):
-            items = self.window.core.ctx.get_history(
-                history,
-                model.id,
-                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
-                self.window.core.tokens.from_user(prompt, system_prompt),
-                self._fit_ctx(model),
-            )
-            for item in items:
-                if item.final_input:
-                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
-                if item.final_output:
-                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
-
-        # Current user message with multimodal parts
-        parts = self._build_user_parts(
-            content=str(prompt),
-            attachments=attachments,
-            multimodal_ctx=multimodal_ctx,
-        )
-        contents.append(Content(role="user", parts=parts))
-
-        return contents
-
     def _build_user_parts(
         self,
         content: str,
@@ -490,11 +810,6 @@
     ) -> List[Part]:
         """
         Build user message parts (text + images + audio)
-
-        :param content: User text content
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Part
         """
         self.window.core.api.google.vision.reset()
         parts: List[Part] = []
@@ -515,9 +830,6 @@
     def _fit_ctx(self, model: ModelItem) -> int:
        """
        Fit to max model tokens (best-effort, uses model.ctx if present)
-
-        :param model: ModelItem
-        :return: max context tokens
        """
        max_ctx_tokens = self.window.core.config.get('max_total_tokens')
        if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
@@ -530,15 +842,10 @@
         system_prompt: str,
         model: ModelItem,
         history: Optional[List[CtxItem]] = None,
+        mode: str = MODE_CHAT,
     ) -> List[dict]:
         """
         Build simple messages structure for local token estimation
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :return: List of messages dicts with 'role' and 'content' keys
         """
         messages = []
         if system_prompt:
@@ -562,7 +869,6 @@
         messages.append({"role": "user", "content": str(prompt)})
         return messages
 
-
     def reset_tokens(self):
         """Reset input tokens counter"""
         self.input_tokens = 0
@@ -570,8 +876,6 @@
     def get_used_tokens(self) -> int:
         """
         Get input tokens counter (estimated before sending)
-
-        :return: input tokens count
         """
         return self.input_tokens
 
@@ -579,13 +883,126 @@
     def _supports_tts(model_id: Optional[str]) -> bool:
         """
         Heuristic check if the model supports native TTS.
-        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
-        - Future/preview names may contain 'native-audio'.
-
-        :param model_id: Model ID
-        :return: True if supports TTS, False otherwise
         """
         if not model_id:
             return False
         mid = model_id.lower()
-        return ("-tts" in mid) or ("native-audio" in mid)
+        return ("-tts" in mid) or ("native-audio" in mid)
+
+    @staticmethod
+    def _find_last_interaction_state(
+        history: Optional[List[CtxItem]],
+        ctx: CtxItem,
+    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Resolve last known Interactions state:
+        - previous_interaction_id: to continue conversation context
+        - last_event_id: to resume streaming (not used here, but returned for completeness)
+        - last_status: last known status string if available
+
+        Looks at current ctx.extra first, then scans history from newest to oldest.
+        """
+        prev_interaction_id: Optional[str] = None
+        last_event_id: Optional[str] = None
+        last_status: Optional[str] = None
+
+        try:
+            if getattr(ctx, "extra", None) and isinstance(ctx.extra, dict):
+                prev_interaction_id = (
+                    ctx.extra.get("previous_interaction_id")
+                    or ctx.extra.get("google_interaction_id")
+                    or ctx.extra.get("google_last_interaction_id")
+                )
+                last_event_id = ctx.extra.get("google_last_event_id")
+                last_status = ctx.extra.get("google_interaction_status")
+        except Exception:
+            pass
+
+        if not prev_interaction_id and history:
+            for item in reversed(history or []):
+                ex = getattr(item, "extra", None)
+                if not ex or not isinstance(ex, dict):
+                    continue
+                prev_interaction_id = (
+                    ex.get("previous_interaction_id")
+                    or ex.get("google_interaction_id")
+                    or ex.get("google_last_interaction_id")
+                    or prev_interaction_id
+                )
+                last_event_id = ex.get("google_last_event_id") or last_event_id
+                last_status = ex.get("google_interaction_status") or last_status
+                if prev_interaction_id and last_event_id:
+                    break
+
+        return prev_interaction_id, last_event_id, last_status
+
+    @staticmethod
+    def _mime_to_interactions_type(mime: str) -> Optional[str]:
+        """
+        Map MIME type to Interactions input type.
+        """
+        if not mime:
+            return None
+        m = mime.lower()
+        if m.startswith("image/"):
+            return "image"
+        if m.startswith("audio/"):
+            return "audio"
+        if m.startswith("video/"):
+            return "video"
+        return None
+
+    @staticmethod
+    def _ensure_base64(data) -> Optional[str]:
+        """
+        Return base64 string from raw bytes or a base64 string.
+        """
+        try:
+            if data is None:
+                return None
+            if isinstance(data, str):
+                return data
+            if isinstance(data, (bytes, bytearray)):
+                import base64
+                return base64.b64encode(bytes(data)).decode("utf-8")
+        except Exception:
+            return None
+        return None
+
+    def _parts_to_interactions_input(self, parts: List[Part]) -> List[Dict[str, Any]]:
+        """
+        Convert Responses API Part list into Interactions API input payload.
+        """
+        out: List[Dict[str, Any]] = []
+
+        for p in parts or []:
+            # Text
+            t = getattr(p, "text", None)
+            if t is not None:
+                s = str(t).strip()
+                if s:
+                    out.append({"type": "text", "text": s})
+                continue
+
+            # Inline data (images/audio/video)
+            inline = getattr(p, "inline_data", None)
+            if inline:
+                mime = (getattr(inline, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                data = getattr(inline, "data", None)
+                b64 = self._ensure_base64(data)
+                if typ and b64:
+                    out.append({"type": typ, "data": b64, "mime_type": mime})
+                continue
+
+            # File references (prefer URIs from Gemini Files API)
+            fdata = getattr(p, "file_data", None)
+            if fdata:
+                uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                mime = (getattr(fdata, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                if typ and uri:
+                    out.append({"type": typ, "uri": uri})
+                continue
+
+        return out