pygpt-net 2.7.4__py3-none-any.whl → 2.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +4 -4
  3. pygpt_net/app_core.py +4 -2
  4. pygpt_net/controller/__init__.py +5 -1
  5. pygpt_net/controller/assistant/assistant.py +1 -4
  6. pygpt_net/controller/assistant/batch.py +5 -504
  7. pygpt_net/controller/assistant/editor.py +5 -5
  8. pygpt_net/controller/assistant/files.py +16 -16
  9. pygpt_net/controller/chat/handler/google_stream.py +307 -1
  10. pygpt_net/controller/chat/handler/worker.py +10 -25
  11. pygpt_net/controller/chat/handler/xai_stream.py +621 -52
  12. pygpt_net/controller/chat/image.py +2 -2
  13. pygpt_net/controller/debug/fixtures.py +3 -2
  14. pygpt_net/controller/dialogs/confirm.py +73 -101
  15. pygpt_net/controller/files/files.py +65 -4
  16. pygpt_net/controller/lang/mapping.py +9 -9
  17. pygpt_net/controller/painter/capture.py +50 -1
  18. pygpt_net/controller/presets/presets.py +2 -1
  19. pygpt_net/controller/remote_store/__init__.py +12 -0
  20. pygpt_net/{provider/core/assistant_file/db_sqlite → controller/remote_store/google}/__init__.py +2 -2
  21. pygpt_net/controller/remote_store/google/batch.py +402 -0
  22. pygpt_net/controller/remote_store/google/store.py +615 -0
  23. pygpt_net/controller/remote_store/openai/__init__.py +12 -0
  24. pygpt_net/controller/remote_store/openai/batch.py +524 -0
  25. pygpt_net/controller/{assistant → remote_store/openai}/store.py +63 -60
  26. pygpt_net/controller/remote_store/remote_store.py +35 -0
  27. pygpt_net/controller/ui/ui.py +20 -1
  28. pygpt_net/core/assistants/assistants.py +3 -15
  29. pygpt_net/core/db/database.py +5 -3
  30. pygpt_net/core/filesystem/url.py +4 -1
  31. pygpt_net/core/locale/placeholder.py +35 -0
  32. pygpt_net/core/remote_store/__init__.py +12 -0
  33. pygpt_net/core/remote_store/google/__init__.py +11 -0
  34. pygpt_net/core/remote_store/google/files.py +224 -0
  35. pygpt_net/core/remote_store/google/store.py +248 -0
  36. pygpt_net/core/remote_store/openai/__init__.py +11 -0
  37. pygpt_net/core/{assistants → remote_store/openai}/files.py +26 -19
  38. pygpt_net/core/{assistants → remote_store/openai}/store.py +32 -15
  39. pygpt_net/core/remote_store/remote_store.py +24 -0
  40. pygpt_net/core/render/web/body.py +3 -2
  41. pygpt_net/core/types/chunk.py +27 -0
  42. pygpt_net/data/config/config.json +8 -4
  43. pygpt_net/data/config/models.json +77 -3
  44. pygpt_net/data/config/settings.json +45 -0
  45. pygpt_net/data/js/app/template.js +1 -1
  46. pygpt_net/data/js/app.min.js +2 -2
  47. pygpt_net/data/locale/locale.de.ini +44 -41
  48. pygpt_net/data/locale/locale.en.ini +56 -43
  49. pygpt_net/data/locale/locale.es.ini +44 -41
  50. pygpt_net/data/locale/locale.fr.ini +44 -41
  51. pygpt_net/data/locale/locale.it.ini +44 -41
  52. pygpt_net/data/locale/locale.pl.ini +45 -42
  53. pygpt_net/data/locale/locale.uk.ini +44 -41
  54. pygpt_net/data/locale/locale.zh.ini +44 -41
  55. pygpt_net/data/locale/plugin.cmd_history.de.ini +1 -1
  56. pygpt_net/data/locale/plugin.cmd_history.en.ini +1 -1
  57. pygpt_net/data/locale/plugin.cmd_history.es.ini +1 -1
  58. pygpt_net/data/locale/plugin.cmd_history.fr.ini +1 -1
  59. pygpt_net/data/locale/plugin.cmd_history.it.ini +1 -1
  60. pygpt_net/data/locale/plugin.cmd_history.pl.ini +1 -1
  61. pygpt_net/data/locale/plugin.cmd_history.uk.ini +1 -1
  62. pygpt_net/data/locale/plugin.cmd_history.zh.ini +1 -1
  63. pygpt_net/data/locale/plugin.cmd_mouse_control.en.ini +14 -0
  64. pygpt_net/data/locale/plugin.cmd_web.de.ini +1 -1
  65. pygpt_net/data/locale/plugin.cmd_web.en.ini +1 -1
  66. pygpt_net/data/locale/plugin.cmd_web.es.ini +1 -1
  67. pygpt_net/data/locale/plugin.cmd_web.fr.ini +1 -1
  68. pygpt_net/data/locale/plugin.cmd_web.it.ini +1 -1
  69. pygpt_net/data/locale/plugin.cmd_web.pl.ini +1 -1
  70. pygpt_net/data/locale/plugin.cmd_web.uk.ini +1 -1
  71. pygpt_net/data/locale/plugin.cmd_web.zh.ini +1 -1
  72. pygpt_net/data/locale/plugin.idx_llama_index.de.ini +2 -2
  73. pygpt_net/data/locale/plugin.idx_llama_index.en.ini +2 -2
  74. pygpt_net/data/locale/plugin.idx_llama_index.es.ini +2 -2
  75. pygpt_net/data/locale/plugin.idx_llama_index.fr.ini +2 -2
  76. pygpt_net/data/locale/plugin.idx_llama_index.it.ini +2 -2
  77. pygpt_net/data/locale/plugin.idx_llama_index.pl.ini +2 -2
  78. pygpt_net/data/locale/plugin.idx_llama_index.uk.ini +2 -2
  79. pygpt_net/data/locale/plugin.idx_llama_index.zh.ini +2 -2
  80. pygpt_net/item/assistant.py +1 -211
  81. pygpt_net/item/ctx.py +3 -3
  82. pygpt_net/item/store.py +238 -0
  83. pygpt_net/js_rc.py +2449 -2447
  84. pygpt_net/migrations/Version20260102190000.py +35 -0
  85. pygpt_net/migrations/__init__.py +3 -1
  86. pygpt_net/plugin/cmd_mouse_control/config.py +471 -1
  87. pygpt_net/plugin/cmd_mouse_control/plugin.py +487 -22
  88. pygpt_net/plugin/cmd_mouse_control/worker.py +464 -87
  89. pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +729 -0
  90. pygpt_net/plugin/idx_llama_index/config.py +2 -2
  91. pygpt_net/provider/api/anthropic/__init__.py +10 -8
  92. pygpt_net/provider/api/google/__init__.py +21 -58
  93. pygpt_net/provider/api/google/chat.py +545 -129
  94. pygpt_net/provider/api/google/computer.py +190 -0
  95. pygpt_net/provider/api/google/realtime/realtime.py +2 -2
  96. pygpt_net/provider/api/google/remote_tools.py +93 -0
  97. pygpt_net/provider/api/google/store.py +546 -0
  98. pygpt_net/provider/api/google/worker/__init__.py +0 -0
  99. pygpt_net/provider/api/google/worker/importer.py +392 -0
  100. pygpt_net/provider/api/openai/__init__.py +7 -3
  101. pygpt_net/provider/api/openai/computer.py +10 -1
  102. pygpt_net/provider/api/openai/responses.py +0 -0
  103. pygpt_net/provider/api/openai/store.py +6 -6
  104. pygpt_net/provider/api/openai/worker/importer.py +24 -24
  105. pygpt_net/provider/api/x_ai/__init__.py +10 -9
  106. pygpt_net/provider/api/x_ai/chat.py +272 -102
  107. pygpt_net/provider/core/config/patch.py +16 -1
  108. pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +3 -3
  109. pygpt_net/provider/core/model/patch.py +17 -3
  110. pygpt_net/provider/core/preset/json_file.py +13 -7
  111. pygpt_net/provider/core/{assistant_file → remote_file}/__init__.py +1 -1
  112. pygpt_net/provider/core/{assistant_file → remote_file}/base.py +9 -9
  113. pygpt_net/provider/core/remote_file/db_sqlite/__init__.py +12 -0
  114. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/patch.py +1 -1
  115. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/provider.py +23 -20
  116. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/storage.py +35 -27
  117. pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/utils.py +5 -4
  118. pygpt_net/provider/core/{assistant_store → remote_store}/__init__.py +1 -1
  119. pygpt_net/provider/core/{assistant_store → remote_store}/base.py +10 -10
  120. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/__init__.py +1 -1
  121. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/patch.py +1 -1
  122. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/provider.py +16 -15
  123. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/storage.py +30 -23
  124. pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/utils.py +5 -4
  125. pygpt_net/provider/core/{assistant_store → remote_store}/json_file.py +9 -9
  126. pygpt_net/provider/llms/google.py +2 -2
  127. pygpt_net/tools/image_viewer/ui/dialogs.py +298 -12
  128. pygpt_net/tools/text_editor/ui/widgets.py +5 -1
  129. pygpt_net/ui/base/config_dialog.py +3 -2
  130. pygpt_net/ui/base/context_menu.py +44 -1
  131. pygpt_net/ui/dialog/assistant.py +3 -3
  132. pygpt_net/ui/dialog/plugins.py +3 -1
  133. pygpt_net/ui/dialog/remote_store_google.py +539 -0
  134. pygpt_net/ui/dialog/{assistant_store.py → remote_store_openai.py} +95 -95
  135. pygpt_net/ui/dialogs.py +5 -3
  136. pygpt_net/ui/layout/chat/attachments_uploaded.py +3 -3
  137. pygpt_net/ui/layout/toolbox/computer_env.py +26 -8
  138. pygpt_net/ui/layout/toolbox/indexes.py +22 -19
  139. pygpt_net/ui/layout/toolbox/model.py +28 -5
  140. pygpt_net/ui/menu/tools.py +13 -5
  141. pygpt_net/ui/widget/dialog/remote_store_google.py +56 -0
  142. pygpt_net/ui/widget/dialog/{assistant_store.py → remote_store_openai.py} +9 -9
  143. pygpt_net/ui/widget/element/button.py +4 -4
  144. pygpt_net/ui/widget/image/display.py +25 -8
  145. pygpt_net/ui/widget/lists/remote_store_google.py +248 -0
  146. pygpt_net/ui/widget/lists/{assistant_store.py → remote_store_openai.py} +21 -21
  147. pygpt_net/ui/widget/option/checkbox_list.py +47 -9
  148. pygpt_net/ui/widget/option/combo.py +39 -3
  149. pygpt_net/ui/widget/tabs/output.py +9 -1
  150. pygpt_net/ui/widget/textarea/editor.py +14 -1
  151. pygpt_net/ui/widget/textarea/input.py +20 -7
  152. pygpt_net/ui/widget/textarea/notepad.py +24 -1
  153. pygpt_net/ui/widget/textarea/output.py +23 -1
  154. pygpt_net/ui/widget/textarea/web.py +16 -1
  155. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/METADATA +41 -2
  156. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/RECORD +158 -132
  157. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/LICENSE +0 -0
  158. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/WHEEL +0 -0
  159. {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/chat.py
@@ -6,15 +6,16 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date: 2025.08.28 20:00:00 #
+# Updated Date: 2026.01.03 17:00:00 #
 # ================================================== #
 
-from typing import Optional, Dict, Any, List
+import os
+from typing import Optional, Dict, Any, List, Tuple
 
 from google.genai import types as gtypes
 from google.genai.types import Content, Part
 
-from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO, MODE_COMPUTER, MODE_RESEARCH
 from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
 from pygpt_net.item.attachment import AttachmentItem
 from pygpt_net.item.ctx import CtxItem
@@ -35,7 +36,7 @@ class Chat:
             extra: Optional[Dict[str, Any]] = None
     ):
         """
-        Call Google GenAI for chat / multimodal / audio.
+        Call Google GenAI for chat / multimodal / audio / computer use.
 
         :param context: BridgeContext with prompt, model, history, mode, etc.
         :param extra: Extra parameters (not used currently)
@@ -62,7 +63,6 @@ class Chat:
 
         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
         if mode == MODE_AUDIO and has_audio_input:
-            # Build minimal transcription request: [instruction text, audio part]
             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
@@ -73,13 +73,10 @@ class Chat:
                 ])
             ]
             trans_cfg = gtypes.GenerateContentConfig(
-                # Keep minimal; no tools/system for transcription
                 temperature=self.window.core.config.get('temperature'),
                 top_p=self.window.core.config.get('top_p'),
                 max_output_tokens=context.max_tokens if context.max_tokens else None,
             )
-
-            # Always non-stream here (we immediately need the text for TTS)
             trans_resp = client.models.generate_content(
                 model=transcribe_model,
                 contents=trans_inputs,
@@ -87,20 +84,17 @@ class Chat:
             )
             transcribed_text = self.extract_text(trans_resp).strip()
             if transcribed_text:
-                # Feed transcription into TTS as the final prompt
                 prompt = transcribed_text
                 ctx.input = transcribed_text
                 try:
-                    # optional: store for debugging/UX
                     if isinstance(ctx.extra, dict):
                         ctx.extra["transcription"] = transcribed_text
                 except Exception:
                     pass
-                ctx.is_audio = False # transcription is text
-                multimodal_ctx.is_audio_input = False # disable audio input for TTS below
+                ctx.is_audio = False
+                multimodal_ctx.is_audio_input = False
 
-        # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
-        # Build contents for chat/multimodal (will be overridden for TTS below)
+        # ---------------------- REGULAR CHAT/COMPUTER PATH ----------------------
         inputs = self.build_input(
             prompt=prompt,
             system_prompt=system_prompt,
@@ -108,23 +102,35 @@ class Chat:
             history=context.history,
             attachments=attachments,
             multimodal_ctx=multimodal_ctx,
+            mode=mode,
         )
 
         # Best-effort input tokens estimate
         self.reset_tokens()
-        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history, mode)
         self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
 
         # Tools -> merge app-defined tools with remote tools
         base_tools = self.window.core.api.google.tools.prepare(model, functions)
-        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        remote_tools = self.window.core.api.google.remote_tools.build_remote_tools(model)
 
-        # Check tools compatibility
+        # Note: Combining native (remote) tools with function declarations is documented as Live API-only.
        if base_tools:
-            remote_tools = [] # remote tools are not allowed if function calling is used
+            remote_tools = []
         tools = (base_tools or []) + (remote_tools or [])
-        if "-image" in model.id:
-            tools = None # function calling is not supported for image models
+
+        # Enable Computer Use tool in computer mode (use the official Tool/ComputerUse object)
+        if mode == MODE_COMPUTER or (model and isinstance(model.id, str) and "computer-use" in model.id.lower()):
+            comp_env = gtypes.Environment.ENVIRONMENT_BROWSER
+            tools = [gtypes.Tool(
+                computer_use=gtypes.ComputerUse(
+                    environment=comp_env,
+                )
+            )] # reset tools to only Computer Use (multiple tools not supported together)
+
+        # Some models cannot use tools; keep behavior for image-only models
+        if model and isinstance(model.id, str) and "-image" in model.id:
+            tools = None
 
         # Sampling
         temperature = self.window.core.config.get('temperature')
@@ -145,10 +151,9 @@ class Chat:
             stream = False # TTS non-stream in this app
             supports_tts = self._supports_tts(model.id)
 
-            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
 
-            # Remove params not used by TTS flow (and that sometimes cause issues)
+            # Remove params not used by TTS flow
             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
                 if key in cfg_kwargs:
                     del cfg_kwargs[key]
@@ -171,11 +176,92 @@ class Chat:
                         prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
                     )
                 )
-            # else: fallback to text-only below
-
         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
         params = dict(model=model.id, contents=inputs, config=cfg)
 
+        if mode == MODE_RESEARCH:
+
+            # Deep Research does not support audio inputs; if an audio snippet is present, transcribe it to text first.
+            if has_audio_input:
+                try:
+                    transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+                    transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+                    audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+                    trans_inputs = [
+                        Content(role="user", parts=[
+                            Part.from_text(text=transcribe_prompt),
+                            audio_part,
+                        ])
+                    ]
+                    trans_cfg = gtypes.GenerateContentConfig(
+                        temperature=self.window.core.config.get('temperature'),
+                        top_p=self.window.core.config.get('top_p'),
+                        max_output_tokens=context.max_tokens if context.max_tokens else None,
+                    )
+                    trans_resp = client.models.generate_content(
+                        model=transcribe_model,
+                        contents=trans_inputs,
+                        config=trans_cfg,
+                    )
+                    transcribed_text = self.extract_text(trans_resp).strip()
+                    if transcribed_text:
+                        prompt = (str(prompt or "").strip() + "\n\n" + transcribed_text).strip() if prompt else transcribed_text
+                        ctx.input = transcribed_text
+                        try:
+                            if isinstance(ctx.extra, dict):
+                                ctx.extra["transcription"] = transcribed_text
+                        except Exception:
+                            pass
+                except Exception:
+                    pass
+            # Ensure we don't send raw audio to Interactions API
+            if multimodal_ctx:
+                multimodal_ctx.is_audio_input = False
+
+            # Build single-turn multimodal input for Interactions API (no full chat history)
+            research_parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+            interactions_input = self._parts_to_interactions_input(research_parts)
+
+            # Try to continue context with the last completed interaction (server-side state)
+            prev_interaction_id, last_event_id, last_status = self._find_last_interaction_state(
+                history=context.history,
+                ctx=ctx,
+            )
+            try:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                if prev_interaction_id:
+                    ctx.extra["previous_interaction_id"] = prev_interaction_id
+                if last_event_id:
+                    ctx.extra["google_last_event_id"] = last_event_id
+                if last_status:
+                    ctx.extra["google_interaction_status"] = last_status
+            except Exception:
+                pass
+
+            # Deep Research agent must use background=True; stream=True enables live progress updates.
+            create_kwargs: Dict[str, Any] = {
+                "agent": model.id,
+                "input": interactions_input if interactions_input else (str(prompt or "") or " "),
+                "background": True,
+                "stream": stream,
+                "agent_config": {
+                    "type": "deep-research",
+                    "thinking_summaries": "auto"
+                }
+            }
+
+            # Continue conversation on server using previous_interaction_id if available
+            if prev_interaction_id:
+                create_kwargs["previous_interaction_id"] = prev_interaction_id
+
+            # Do not pass custom tools here; Deep Research manages its own built-in tools.
+            return client.interactions.create(**create_kwargs)
+
         if stream and mode != MODE_AUDIO:
             return client.models.generate_content_stream(**params)
         else:
@@ -189,28 +275,21 @@ class Chat:
         """
         Unpack non-streaming response from Google GenAI and set context.
 
-        :param mode: MODE_CHAT or MODE_AUDIO
+        :param mode: MODE_CHAT, MODE_AUDIO or MODE_COMPUTER
         :param response: Response object
         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
         """
         if mode == MODE_AUDIO:
-            # Prefer audio if present
             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
             if audio_bytes:
-                # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
-                # https://ai.google.dev/gemini-api/docs/speech-generation
-                if mime == "audio/pcm" or mime.startswith("audio/"):
+                if mime == "audio/pcm" or (isinstance(mime, str) and mime.startswith("audio/")):
                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
                     ctx.audio_output = wav_b64
                     ctx.is_audio = True
-                # Text transcript is typically not present for TTS; still try:
                 txt = self.extract_text(response)
                 ctx.output = txt or "..."
             else:
-                # No audio present -> fallback to text
                 ctx.output = self.extract_text(response)
-
-            # Usage
             try:
                 usage = getattr(response, "usage_metadata", None)
                 if usage:
@@ -219,17 +298,58 @@ class Chat:
                     ctx.set_tokens(p, c)
             except Exception:
                 pass
+            return
 
-            return # audio path done
+        # ---- chat / computer ----
+        ctx.output = self.extract_text(response) or ""
 
-        # ---- regular chat/completion ----
-        ctx.output = self.extract_text(response)
-
-        # Extract function calls
+        # 1) Extract tool calls and store in ctx.tool_calls (backward-compatible shape)
         calls = self.extract_tool_calls(response)
         if calls:
             ctx.tool_calls = calls
 
+        # 2) In MODE_COMPUTER: capture raw model parts (with thought_signature) for next FunctionResponse turn
+        #    and translate Computer Use calls into plugin commands now.
+        if mode == MODE_COMPUTER:
+            candidate = None
+            try:
+                cands = getattr(response, "candidates", None) or []
+                if cands:
+                    candidate = cands[0]
+            except Exception:
+                pass
+
+            if candidate and getattr(candidate, "content", None):
+                parts = getattr(candidate.content, "parts", None) or []
+                dump = self._dump_model_parts(parts)
+                if dump:
+                    if ctx.extra is None:
+                        ctx.extra = {}
+                    ctx.extra["prev_model_parts"] = dump
+
+            tool_calls: List[dict] = []
+            try:
+                tool_calls, has_calls = self.window.core.api.google.computer.handle_stream_chunk(
+                    ctx=ctx,
+                    chunk=response,
+                    tool_calls=tool_calls,
+                )
+            except Exception as e:
+                has_calls = False
+                print(f"Gemini computer-use mapping error: {e}")
+
+            if has_calls and tool_calls:
+                ctx.force_call = True
+                self.window.core.debug.info("[chat] Google tool calls found, unpacking...")
+                self.window.core.command.unpack_tool_calls_chunks(ctx, tool_calls)
+
+            if calls:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                ctx.extra["function_response_required"] = True
+                ctx.extra["function_response_source"] = "ctx.tool_calls"
+                ctx.extra["function_response_reason"] = "computer_use"
+
         # Usage if available
         try:
             usage = getattr(response, "usage_metadata", None)
@@ -283,12 +403,11 @@ class Chat:
         :return: List of tool calls
         """
         def _to_plain_dict(obj):
-            # Convert pydantic/genai objects to plain dict recursively
             try:
                 if hasattr(obj, "to_json_dict"):
                     return obj.to_json_dict()
                 if hasattr(obj, "model_dump"):
-                    return obj.model_dump() # pydantic v2
+                    return obj.model_dump()
                 if hasattr(obj, "to_dict"):
                     return obj.to_dict()
             except Exception:
@@ -307,7 +426,6 @@ class Chat:
                 name = getattr(fc, "name", "") or ""
                 args_obj = getattr(fc, "args", {}) or {}
                 args_dict = _to_plain_dict(args_obj) or {}
-                # if str, try to parse
                 if isinstance(args_dict, str):
                     try:
                         import json
@@ -319,7 +437,7 @@ class Chat:
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict, # <--- DICT, not string
+                        "arguments": args_dict,
                     }
                 })
 
@@ -345,11 +463,11 @@ class Chat:
                         except Exception:
                             args_dict = {}
                         out.append({
-                            "id": "",
+                            "id": getattr(fn, "id", "") or "",
                            "type": "function",
                             "function": {
                                 "name": name,
-                                "arguments": args_dict, # <--- DICT
+                                "arguments": args_dict,
                             }
                         })
         except Exception:
@@ -357,17 +475,274 @@ class Chat:
 
         return out
 
+    def build_input(
+            self,
+            prompt: str,
+            system_prompt: str,
+            model: ModelItem,
+            history: Optional[List[CtxItem]] = None,
+            attachments: Optional[Dict[str, AttachmentItem]] = None,
+            multimodal_ctx: Optional[MultimodalContext] = None,
+            mode: str = MODE_CHAT,
+    ) -> List[Content]:
+        """
+        Build Google GenAI contents list
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :param attachments: Dict of AttachmentItem for images/screenshots
+        :param multimodal_ctx: MultimodalContext for audio
+        :param mode: MODE_CHAT / MODE_AUDIO / MODE_COMPUTER
+        :return: List of Content
+        """
+        # FunctionResponse turn for Computer Use (strictly immediate after functionCall)
+        if mode == MODE_COMPUTER and self.window.core.config.get('use_context'):
+            hist = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+            fr_contents = self._build_function_responses_from_history(hist, attachments)
+            if fr_contents:
+                return fr_contents
+
+        # Build conversation history first to detect "first input"
+        items: List[CtxItem] = []
+        if self.window.core.config.get('use_context'):
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+
+        is_first_turn = (len(items) == 0)
+        is_sandbox = bool(self.window.core.config.get("remote_tools.computer_use.sandbox", False))
+
+        contents: List[Content] = []
+
+        # Append conversation history (text only)
+        for item in items:
+            if item.final_input:
+                contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+            if item.final_output:
+                contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+        # Current user message:
+        # - In MODE_COMPUTER attach initial screenshot only on the very first turn
+        if mode == MODE_COMPUTER:
+            initial_attachments = {}
+            if is_first_turn and not attachments and not is_sandbox:
+                self.window.controller.attachment.clear_silent()
+                self.window.controller.painter.capture.screenshot(attach_cursor=True, silent=True)
+                initial_attachments = self.window.core.attachments.get_all(mode)
+            send_attachments = initial_attachments if initial_attachments else attachments
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=send_attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        else:
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        contents.append(Content(role="user", parts=parts))
+
+        return contents
+
+    def _build_function_responses_from_history(
+            self,
+            history: Optional[List[CtxItem]],
+            attachments: Optional[Dict[str, AttachmentItem]],
+    ) -> Optional[List[Content]]:
+        """
+        Build FunctionResponse contents for the immediate next turn after executing
+        Computer Use function calls. It reconstructs the last user -> model(functionCall) turn
+        and returns [user_content, model_function_call_content, tool_function_response_content].
+        """
+        if not self.window.core.config.get('use_context') or not history:
+            return None
+
+        last_item = history[-1]
+        if not getattr(last_item, "extra", None):
+            return None
+        if not last_item.extra.get("function_response_required"):
+            return None
+
+        # 1) Find the user message that started the current turn (previous item's input)
+        prior_user_text = ""
+        if len(history) >= 2:
+            prev = history[-2]
+            if getattr(prev, "final_input", None):
+                prior_user_text = str(prev.final_input)
+
+        if not prior_user_text and getattr(last_item, "input", None):
+            prior_user_text = str(last_item.input)
+
+        if not prior_user_text:
+            prior_user_text = "..."
+
+        user_content = Content(role="user", parts=[Part.from_text(text=prior_user_text)])
+
+        # 2) Rebuild the model functionCall content with thought_signature preserved
+        raw_parts = last_item.extra.get("prev_model_parts", [])
+        model_parts = self._rehydrate_model_parts(raw_parts)
+        if not model_parts:
+            model_parts = self._rehydrate_from_tool_calls(getattr(last_item, "tool_calls", []))
+        # append also text part if not empty
+        if getattr(last_item, "final_output", None):
+            output_text = str(last_item.final_output).strip()
+            if output_text:
+                model_parts.append(Part.from_text(text=output_text))
+
+        model_fc_content = Content(role="model", parts=model_parts)
+
+        # 3) Build a single tool content with N FunctionResponse parts (one per functionCall)
+        screenshot_part = self._screenshot_function_response_part(attachments)
+        fr_parts: List[Part] = []
+        for p in model_parts:
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                fr = Part.from_function_response(
+                    name=fn.name,
+                    response=self._minimal_tool_response(last_item),
+                    parts=[screenshot_part] if screenshot_part else None
+                )
+                fr_parts.append(fr)
+
+        if not fr_parts:
+            return None
+
+        tool_content = Content(role="tool", parts=fr_parts)
+
+        return [user_content, model_fc_content, tool_content]
+
+    def _rehydrate_from_tool_calls(self, calls: List[dict]) -> List[Part]:
+        """
+        Fallback rehydration when prev_model_parts are unavailable (no thought signatures).
+        """
+        parts: List[Part] = []
+        for c in calls or []:
+            if not isinstance(c, dict):
+                continue
+            if c.get("type") != "function":
+                continue
+            fn = c.get("function") or {}
+            name = fn.get("name")
+            args = fn.get("arguments") or {}
+            if not name:
+                continue
+            parts.append(Part.from_function_call(name=name, args=args))
+        return parts
+
+    def _dump_model_parts(self, parts: List[Part]) -> List[dict]:
+        """
+        Dump model parts into a JSON-serializable structure, preserving thought_signature.
+        """
+        out: List[dict] = []
+        for p in parts or []:
+            ts = getattr(p, "thought_signature", None)
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                name = getattr(fn, "name", "") or ""
+                args = getattr(fn, "args", {}) or {}
+                out.append({
+                    "type": "function_call",
+                    "name": name,
+                    "args": args,
+                    "thought_signature": ts,
+                })
+            elif getattr(p, "text", None):
+                out.append({"type": "text", "text": str(p.text)})
+        return out
+
+    def _rehydrate_model_parts(self, raw_parts: List[dict]) -> List[Part]:
+        """
+        Recreate SDK Part objects from dumped parts, including thought_signature on the Part.
+        """
+        parts: List[Part] = []
+        for it in raw_parts or []:
+            t = (it.get("type") or "").lower()
+            if t == "function_call":
+                name = it.get("name")
+                args = it.get("args") or {}
+                ts = it.get("thought_signature")
+                if name:
+                    parts.append(Part(function_call=gtypes.FunctionCall(name=name, args=args),
+                                      thought_signature=ts))
+            elif t == "text":
+                parts.append(Part.from_text(text=str(it.get("text", ""))))
+        return parts
+
+    def _screenshot_function_response_part(
+            self,
+            attachments: Optional[Dict[str, AttachmentItem]]
+    ) -> Optional[gtypes.FunctionResponsePart]:
+        """
+        Build FunctionResponsePart with inlineData PNG/JPEG screenshot from attachments.
+        """
+        if not attachments:
+            return None
+
+        chosen_path = None
+        for _, att in attachments.items():
+            if not att or not att.path:
+                continue
+            p = att.path
+            if isinstance(p, str) and os.path.exists(p):
+                ext = os.path.splitext(p)[1].lower()
+                if ext in (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"):
+                    chosen_path = p
+                    if ext == ".png":
+                        break
+
+        if not chosen_path:
+            return None
+
+        try:
+            with open(chosen_path, "rb") as f:
+                data = f.read()
+            blob = gtypes.FunctionResponseBlob(
+                mime_type="image/png" if chosen_path.lower().endswith(".png") else "image/jpeg",
+                data=data,
+            )
+            return gtypes.FunctionResponsePart(inline_data=blob)
+        except Exception:
+            return None
+
+    @staticmethod
+    def _minimal_tool_response(item: CtxItem) -> Dict[str, Any]:
+        """
+        Construct a minimal structured payload for FunctionResponse.response.
+        """
+        resp: Dict[str, Any] = {"ok": True}
+        try:
+            if item and item.extra and isinstance(item.extra, dict):
+                outputs = item.extra.get("tool_output")
+                if isinstance(outputs, list) and len(outputs) > 0:
+                    last = outputs[-1]
+                    if isinstance(last, dict):
+                        if "result" in last and isinstance(last["result"], dict):
+                            resp = last["result"]
+                        if "error" in last:
+                            resp["error"] = last["error"]
+        except Exception:
+            pass
+        return resp
+
     def _extract_inline_images_and_links(
             self,
             response, ctx: CtxItem
     ) -> None:
         """
         Extract inline image parts (Gemini image output) and file links.
-        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
-        - Appends HTTP(S) image URIs from file_data to ctx.urls.
-
-        :param response: Response object
-        :param ctx: CtxItem to set images and urls
         """
         images: list[str] = []
         urls: list[str] = []
@@ -378,7 +753,6 @@ class Chat:
                 content = getattr(cand, "content", None)
                 parts = getattr(content, "parts", None) or []
                 for p in parts:
-                    # Inline image bytes (image preview / image generation in chat)
                     blob = getattr(p, "inline_data", None)
                     if blob:
                         mime = (getattr(blob, "mime_type", "") or "").lower()
@@ -392,16 +766,14 @@ class Chat:
                                 f.write(img_bytes)
                             images.append(img_path)
 
-                    # File data URI (may contain http/https or gs://)
                     fdata = getattr(p, "file_data", None)
                     if fdata:
                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
                         mime = (getattr(fdata, "mime_type", "") or "").lower()
                         if uri and mime.startswith("image/"):
-                            # Store only as URL; downloading is out of scope here.
                             if uri.startswith("http://") or uri.startswith("https://"):
                                 urls.append(uri)
-        except Exception as e:
+        except Exception:
             pass
 
         if images:
@@ -418,9 +790,6 @@ class Chat:
     def _ensure_bytes(data) -> bytes | None:
         """
         Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string.
-
-        :param data: bytes or str
-        :return: bytes or None
         """
         try:
             if isinstance(data, (bytes, bytearray)):
@@ -432,56 +801,6 @@ class Chat:
                 return None
         return None
 
-    def build_input(
-            self,
-            prompt: str,
-            system_prompt: str,
-            model: ModelItem,
-            history: Optional[List[CtxItem]] = None,
-            attachments: Optional[Dict[str, AttachmentItem]] = None,
-            multimodal_ctx: Optional[MultimodalContext] = None,
-    ) -> List[Content]:
-        """
-        Build Google GenAI contents list
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Content
-        """
-        contents: List[Content] = []
-
-        # System instruction is passed separately (system_instruction),
-        # so we do not build an explicit system role part here.
-
-        # Append conversation history
-        if self.window.core.config.get('use_context'):
-            items = self.window.core.ctx.get_history(
-                history,
-                model.id,
-                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
-                self.window.core.tokens.from_user(prompt, system_prompt),
-                self._fit_ctx(model),
-            )
-            for item in items:
-                if item.final_input:
-                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
-                if item.final_output:
-                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
-
-        # Current user message with multimodal parts
-        parts = self._build_user_parts(
-            content=str(prompt),
-            attachments=attachments,
-            multimodal_ctx=multimodal_ctx,
-        )
-        contents.append(Content(role="user", parts=parts))
-
-        return contents
-
     def _build_user_parts(
             self,
             content: str,
@@ -490,11 +809,6 @@ class Chat:
     ) -> List[Part]:
         """
         Build user message parts (text + images + audio)
-
-        :param content: User text content
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Part
         """
         self.window.core.api.google.vision.reset()
         parts: List[Part] = []
@@ -515,9 +829,6 @@ class Chat:
     def _fit_ctx(self, model: ModelItem) -> int:
         """
         Fit to max model tokens (best-effort, uses model.ctx if present)
-
-        :param model: ModelItem
-        :return: max context tokens
         """
         max_ctx_tokens = self.window.core.config.get('max_total_tokens')
         if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
@@ -530,15 +841,10 @@ class Chat:
             system_prompt: str,
             model: ModelItem,
             history: Optional[List[CtxItem]] = None,
+            mode: str = MODE_CHAT,
     ) -> List[dict]:
         """
         Build simple messages structure for local token estimation
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :return: List of messages dicts with 'role' and 'content' keys
         """
         messages = []
         if system_prompt:
@@ -562,7 +868,6 @@ class Chat:
         messages.append({"role": "user", "content": str(prompt)})
         return messages
 
-
     def reset_tokens(self):
         """Reset input tokens counter"""
         self.input_tokens = 0
@@ -570,8 +875,6 @@ class Chat:
     def get_used_tokens(self) -> int:
         """
        Get input tokens counter (estimated before sending)
-
-        :return: input tokens count
         """
         return self.input_tokens
 
@@ -579,13 +882,126 @@ class Chat:
     def _supports_tts(model_id: Optional[str]) -> bool:
         """
         Heuristic check if the model supports native TTS.
-        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
-        - Future/preview names may contain 'native-audio'.
-
-        :param model_id: Model ID
-        :return: True if supports TTS, False otherwise
         """
         if not model_id:
             return False
         mid = model_id.lower()
-        return ("-tts" in mid) or ("native-audio" in mid)
+        return ("-tts" in mid) or ("native-audio" in mid)
+
+    @staticmethod
+    def _find_last_interaction_state(
+            history: Optional[List[CtxItem]],
+            ctx: CtxItem,
+    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Resolve last known Interactions state:
+        - previous_interaction_id: to continue conversation context
+        - last_event_id: to resume streaming (not used here, but returned for completeness)
+        - last_status: last known status string if available
+
+        Looks at current ctx.extra first, then scans history from newest to oldest.
+        """
+        prev_interaction_id: Optional[str] = None
+        last_event_id: Optional[str] = None
+        last_status: Optional[str] = None
+
+        try:
+            if getattr(ctx, "extra", None) and isinstance(ctx.extra, dict):
+                prev_interaction_id = (
+                    ctx.extra.get("previous_interaction_id")
+                    or ctx.extra.get("google_interaction_id")
+                    or ctx.extra.get("google_last_interaction_id")
+                )
+                last_event_id = ctx.extra.get("google_last_event_id")
+                last_status = ctx.extra.get("google_interaction_status")
+        except Exception:
+            pass
+
+        if not prev_interaction_id and history:
+            for item in reversed(history or []):
+                ex = getattr(item, "extra", None)
+                if not ex or not isinstance(ex, dict):
+                    continue
+                prev_interaction_id = (
+                    ex.get("previous_interaction_id")
+                    or ex.get("google_interaction_id")
+                    or ex.get("google_last_interaction_id")
+                    or prev_interaction_id
+                )
+                last_event_id = ex.get("google_last_event_id") or last_event_id
+                last_status = ex.get("google_interaction_status") or last_status
+                if prev_interaction_id and last_event_id:
+                    break
+
+        return prev_interaction_id, last_event_id, last_status
+
+    @staticmethod
+    def _mime_to_interactions_type(mime: str) -> Optional[str]:
+        """
+        Map MIME type to Interactions input type.
+        """
+        if not mime:
+            return None
+        m = mime.lower()
+        if m.startswith("image/"):
+            return "image"
+        if m.startswith("audio/"):
+            return "audio"
+        if m.startswith("video/"):
+            return "video"
+        return None
+
+    @staticmethod
+    def _ensure_base64(data) -> Optional[str]:
+        """
+        Return base64 string from raw bytes or a base64 string.
+        """
+        try:
+            if data is None:
+                return None
+            if isinstance(data, str):
+                return data
+            if isinstance(data, (bytes, bytearray)):
+                import base64
+                return base64.b64encode(bytes(data)).decode("utf-8")
+        except Exception:
+            return None
+        return None
+
+    def _parts_to_interactions_input(self, parts: List[Part]) -> List[Dict[str, Any]]:
+        """
+        Convert Responses API Part list into Interactions API input payload.
+        """
+        out: List[Dict[str, Any]] = []
+
+        for p in parts or []:
+            # Text
+            t = getattr(p, "text", None)
+            if t is not None:
+                s = str(t).strip()
+                if s:
+                    out.append({"type": "text", "text": s})
+                continue
+
+            # Inline data (images/audio/video)
+            inline = getattr(p, "inline_data", None)
+            if inline:
+                mime = (getattr(inline, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                data = getattr(inline, "data", None)
+                b64 = self._ensure_base64(data)
+                if typ and b64:
+                    out.append({"type": typ, "data": b64, "mime_type": mime})
+                continue
+
+            # File references (prefer URIs from Gemini Files API)
+            fdata = getattr(p, "file_data", None)
+            if fdata:
+                uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                mime = (getattr(fdata, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                if typ and uri:
+                    out.append({"type": typ, "uri": uri})
+                continue
+
+        return out
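For orientation, below is a minimal sketch (not part of the diff) of the three-content turn that the new _build_function_responses_from_history() assembles after a Computer Use function call: the user message that opened the turn, the model's functionCall rehydrated with its thought_signature, and a "tool" content carrying one FunctionResponse per call with the post-action screenshot inlined. It only restates the structure shown above using the google-genai types the diff imports; the function name "click_at", its arguments, the signature bytes and the screenshot bytes are illustrative placeholders.

from google.genai import types as gtypes
from google.genai.types import Content, Part

# 1) User message that started the turn
user_content = Content(role="user", parts=[Part.from_text(text="Open the settings page")])

# 2) Model functionCall, rebuilt with its thought_signature preserved (see _rehydrate_model_parts)
model_content = Content(role="model", parts=[
    Part(
        function_call=gtypes.FunctionCall(name="click_at", args={"x": 100, "y": 200}),  # placeholder call
        thought_signature=b"...",  # placeholder; taken from ctx.extra["prev_model_parts"] in the diff
    ),
])

# 3) One FunctionResponse per functionCall, each carrying the screenshot taken after the action
screenshot = gtypes.FunctionResponsePart(
    inline_data=gtypes.FunctionResponseBlob(mime_type="image/png", data=b"..."),  # placeholder PNG bytes
)
tool_content = Content(role="tool", parts=[
    Part.from_function_response(name="click_at", response={"ok": True}, parts=[screenshot]),
])

# Sent as `contents` on the next generate_content() turn, in exactly this order
contents = [user_content, model_content, tool_content]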