pygpt-net 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- pygpt_net/CHANGELOG.txt +7 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/app_core.py +4 -2
- pygpt_net/controller/__init__.py +5 -1
- pygpt_net/controller/assistant/assistant.py +1 -4
- pygpt_net/controller/assistant/batch.py +5 -504
- pygpt_net/controller/assistant/editor.py +5 -5
- pygpt_net/controller/assistant/files.py +16 -16
- pygpt_net/controller/chat/handler/google_stream.py +307 -1
- pygpt_net/controller/chat/handler/worker.py +8 -1
- pygpt_net/controller/chat/image.py +2 -2
- pygpt_net/controller/dialogs/confirm.py +73 -101
- pygpt_net/controller/lang/mapping.py +9 -9
- pygpt_net/controller/painter/capture.py +50 -1
- pygpt_net/controller/presets/presets.py +2 -1
- pygpt_net/controller/remote_store/__init__.py +12 -0
- pygpt_net/{provider/core/assistant_file/db_sqlite → controller/remote_store/google}/__init__.py +2 -2
- pygpt_net/controller/remote_store/google/batch.py +402 -0
- pygpt_net/controller/remote_store/google/store.py +615 -0
- pygpt_net/controller/remote_store/openai/__init__.py +12 -0
- pygpt_net/controller/remote_store/openai/batch.py +524 -0
- pygpt_net/controller/{assistant → remote_store/openai}/store.py +63 -60
- pygpt_net/controller/remote_store/remote_store.py +35 -0
- pygpt_net/controller/ui/ui.py +20 -1
- pygpt_net/core/assistants/assistants.py +3 -15
- pygpt_net/core/db/database.py +5 -3
- pygpt_net/core/locale/placeholder.py +35 -0
- pygpt_net/core/remote_store/__init__.py +12 -0
- pygpt_net/core/remote_store/google/__init__.py +11 -0
- pygpt_net/core/remote_store/google/files.py +224 -0
- pygpt_net/core/remote_store/google/store.py +248 -0
- pygpt_net/core/remote_store/openai/__init__.py +11 -0
- pygpt_net/core/{assistants → remote_store/openai}/files.py +26 -19
- pygpt_net/core/{assistants → remote_store/openai}/store.py +32 -15
- pygpt_net/core/remote_store/remote_store.py +24 -0
- pygpt_net/data/config/config.json +8 -4
- pygpt_net/data/config/models.json +77 -3
- pygpt_net/data/config/settings.json +45 -0
- pygpt_net/data/locale/locale.de.ini +41 -41
- pygpt_net/data/locale/locale.en.ini +53 -43
- pygpt_net/data/locale/locale.es.ini +41 -41
- pygpt_net/data/locale/locale.fr.ini +41 -41
- pygpt_net/data/locale/locale.it.ini +41 -41
- pygpt_net/data/locale/locale.pl.ini +42 -42
- pygpt_net/data/locale/locale.uk.ini +41 -41
- pygpt_net/data/locale/locale.zh.ini +41 -41
- pygpt_net/data/locale/plugin.cmd_history.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.zh.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_mouse_control.en.ini +14 -0
- pygpt_net/data/locale/plugin.cmd_web.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.zh.ini +1 -1
- pygpt_net/data/locale/plugin.idx_llama_index.de.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.en.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.es.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.fr.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.it.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.pl.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.uk.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.zh.ini +2 -2
- pygpt_net/item/assistant.py +1 -211
- pygpt_net/item/ctx.py +3 -1
- pygpt_net/item/store.py +238 -0
- pygpt_net/migrations/Version20260102190000.py +35 -0
- pygpt_net/migrations/__init__.py +3 -1
- pygpt_net/plugin/cmd_mouse_control/config.py +470 -1
- pygpt_net/plugin/cmd_mouse_control/plugin.py +488 -22
- pygpt_net/plugin/cmd_mouse_control/worker.py +464 -87
- pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +729 -0
- pygpt_net/plugin/idx_llama_index/config.py +2 -2
- pygpt_net/provider/api/google/__init__.py +16 -54
- pygpt_net/provider/api/google/chat.py +546 -129
- pygpt_net/provider/api/google/computer.py +190 -0
- pygpt_net/provider/api/google/realtime/realtime.py +2 -2
- pygpt_net/provider/api/google/remote_tools.py +93 -0
- pygpt_net/provider/api/google/store.py +546 -0
- pygpt_net/provider/api/google/worker/__init__.py +0 -0
- pygpt_net/provider/api/google/worker/importer.py +392 -0
- pygpt_net/provider/api/openai/computer.py +10 -1
- pygpt_net/provider/api/openai/store.py +6 -6
- pygpt_net/provider/api/openai/worker/importer.py +24 -24
- pygpt_net/provider/core/config/patch.py +16 -1
- pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +3 -3
- pygpt_net/provider/core/model/patch.py +17 -3
- pygpt_net/provider/core/preset/json_file.py +13 -7
- pygpt_net/provider/core/{assistant_file → remote_file}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/base.py +9 -9
- pygpt_net/provider/core/remote_file/db_sqlite/__init__.py +12 -0
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/provider.py +23 -20
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/storage.py +35 -27
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/base.py +10 -10
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/provider.py +16 -15
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/storage.py +30 -23
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/json_file.py +9 -9
- pygpt_net/provider/llms/google.py +2 -2
- pygpt_net/ui/base/config_dialog.py +3 -2
- pygpt_net/ui/dialog/assistant.py +3 -3
- pygpt_net/ui/dialog/plugins.py +3 -1
- pygpt_net/ui/dialog/remote_store_google.py +539 -0
- pygpt_net/ui/dialog/{assistant_store.py → remote_store_openai.py} +95 -95
- pygpt_net/ui/dialogs.py +5 -3
- pygpt_net/ui/layout/chat/attachments_uploaded.py +3 -3
- pygpt_net/ui/layout/toolbox/computer_env.py +26 -8
- pygpt_net/ui/menu/tools.py +13 -5
- pygpt_net/ui/widget/dialog/remote_store_google.py +56 -0
- pygpt_net/ui/widget/dialog/{assistant_store.py → remote_store_openai.py} +9 -9
- pygpt_net/ui/widget/element/button.py +4 -4
- pygpt_net/ui/widget/lists/remote_store_google.py +248 -0
- pygpt_net/ui/widget/lists/{assistant_store.py → remote_store_openai.py} +21 -21
- pygpt_net/ui/widget/option/checkbox_list.py +47 -9
- pygpt_net/ui/widget/option/combo.py +39 -3
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/METADATA +33 -2
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/RECORD +133 -108
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/LICENSE +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/WHEEL +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.5.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/chat.py

@@ -6,15 +6,16 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date:
+# Updated Date: 2026.01.03 02:10:00 #
 # ================================================== #
 
-
+import os
+from typing import Optional, Dict, Any, List, Tuple
 
 from google.genai import types as gtypes
 from google.genai.types import Content, Part
 
-from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO, MODE_COMPUTER, MODE_RESEARCH
 from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
 from pygpt_net.item.attachment import AttachmentItem
 from pygpt_net.item.ctx import CtxItem
@@ -35,7 +36,7 @@ class Chat:
 extra: Optional[Dict[str, Any]] = None
 ):
 """
-Call Google GenAI for chat / multimodal / audio.
+Call Google GenAI for chat / multimodal / audio / computer use.
 
 :param context: BridgeContext with prompt, model, history, mode, etc.
 :param extra: Extra parameters (not used currently)
@@ -62,7 +63,6 @@ class Chat:
 
 # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
 if mode == MODE_AUDIO and has_audio_input:
-# Build minimal transcription request: [instruction text, audio part]
 transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
 transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
 audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
@@ -73,13 +73,10 @@
 ])
 ]
 trans_cfg = gtypes.GenerateContentConfig(
-# Keep minimal; no tools/system for transcription
 temperature=self.window.core.config.get('temperature'),
 top_p=self.window.core.config.get('top_p'),
 max_output_tokens=context.max_tokens if context.max_tokens else None,
 )
-
-# Always non-stream here (we immediately need the text for TTS)
 trans_resp = client.models.generate_content(
 model=transcribe_model,
 contents=trans_inputs,
@@ -87,20 +84,17 @@
 )
 transcribed_text = self.extract_text(trans_resp).strip()
 if transcribed_text:
-# Feed transcription into TTS as the final prompt
 prompt = transcribed_text
 ctx.input = transcribed_text
 try:
-# optional: store for debugging/UX
 if isinstance(ctx.extra, dict):
 ctx.extra["transcription"] = transcribed_text
 except Exception:
 pass
-ctx.is_audio = False
-multimodal_ctx.is_audio_input = False
+ctx.is_audio = False
+multimodal_ctx.is_audio_input = False
 
-# ---------------------- REGULAR CHAT PATH
-# Build contents for chat/multimodal (will be overridden for TTS below)
+# ---------------------- REGULAR CHAT/COMPUTER PATH ----------------------
 inputs = self.build_input(
 prompt=prompt,
 system_prompt=system_prompt,
@@ -108,23 +102,35 @@
 history=context.history,
 attachments=attachments,
 multimodal_ctx=multimodal_ctx,
+mode=mode,
 )
 
 # Best-effort input tokens estimate
 self.reset_tokens()
-count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history, mode)
 self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
 
 # Tools -> merge app-defined tools with remote tools
 base_tools = self.window.core.api.google.tools.prepare(model, functions)
-remote_tools = self.window.core.api.google.build_remote_tools(model)
+remote_tools = self.window.core.api.google.remote_tools.build_remote_tools(model)
 
-#
+# Note: Combining native (remote) tools with function declarations is documented as Live API-only.
 if base_tools:
-remote_tools = []
+remote_tools = []
 tools = (base_tools or []) + (remote_tools or [])
-
-
+
+# Enable Computer Use tool in computer mode (use the official Tool/ComputerUse object)
+if mode == MODE_COMPUTER or (model and isinstance(model.id, str) and "computer-use" in model.id.lower()):
+comp_env = gtypes.Environment.ENVIRONMENT_BROWSER
+tools = [gtypes.Tool(
+computer_use=gtypes.ComputerUse(
+environment=comp_env,
+)
+)] # reset tools to only Computer Use (multiple tools not supported together)
+
+# Some models cannot use tools; keep behavior for image-only models
+if model and isinstance(model.id, str) and "-image" in model.id:
+tools = None
 
 # Sampling
 temperature = self.window.core.config.get('temperature')
@@ -145,10 +151,9 @@
 stream = False # TTS non-stream in this app
 supports_tts = self._supports_tts(model.id)
 
-# Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
 inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
 
-# Remove params not used by TTS flow
+# Remove params not used by TTS flow
 for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
 if key in cfg_kwargs:
 del cfg_kwargs[key]
@@ -171,11 +176,93 @@
 prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
 )
 )
-# else: fallback to text-only below
-
 cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
 params = dict(model=model.id, contents=inputs, config=cfg)
 
+if mode == MODE_RESEARCH:
+ctx.use_google_interactions_api = True
+
+# Deep Research does not support audio inputs; if an audio snippet is present, transcribe it to text first.
+if has_audio_input:
+try:
+transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+trans_inputs = [
+Content(role="user", parts=[
+Part.from_text(text=transcribe_prompt),
+audio_part,
+])
+]
+trans_cfg = gtypes.GenerateContentConfig(
+temperature=self.window.core.config.get('temperature'),
+top_p=self.window.core.config.get('top_p'),
+max_output_tokens=context.max_tokens if context.max_tokens else None,
+)
+trans_resp = client.models.generate_content(
+model=transcribe_model,
+contents=trans_inputs,
+config=trans_cfg,
+)
+transcribed_text = self.extract_text(trans_resp).strip()
+if transcribed_text:
+prompt = (str(prompt or "").strip() + "\n\n" + transcribed_text).strip() if prompt else transcribed_text
+ctx.input = transcribed_text
+try:
+if isinstance(ctx.extra, dict):
+ctx.extra["transcription"] = transcribed_text
+except Exception:
+pass
+except Exception:
+pass
+# Ensure we don't send raw audio to Interactions API
+if multimodal_ctx:
+multimodal_ctx.is_audio_input = False
+
+# Build single-turn multimodal input for Interactions API (no full chat history)
+research_parts = self._build_user_parts(
+content=str(prompt),
+attachments=attachments,
+multimodal_ctx=multimodal_ctx,
+)
+interactions_input = self._parts_to_interactions_input(research_parts)
+
+# Try to continue context with the last completed interaction (server-side state)
+prev_interaction_id, last_event_id, last_status = self._find_last_interaction_state(
+history=context.history,
+ctx=ctx,
+)
+try:
+if ctx.extra is None:
+ctx.extra = {}
+if prev_interaction_id:
+ctx.extra["previous_interaction_id"] = prev_interaction_id
+if last_event_id:
+ctx.extra["google_last_event_id"] = last_event_id
+if last_status:
+ctx.extra["google_interaction_status"] = last_status
+except Exception:
+pass
+
+# Deep Research agent must use background=True; stream=True enables live progress updates.
+create_kwargs: Dict[str, Any] = {
+"agent": model.id,
+"input": interactions_input if interactions_input else (str(prompt or "") or " "),
+"background": True,
+"stream": stream,
+"agent_config": {
+"type": "deep-research",
+"thinking_summaries": "auto"
+}
+}
+
+# Continue conversation on server using previous_interaction_id if available
+if prev_interaction_id:
+create_kwargs["previous_interaction_id"] = prev_interaction_id
+
+# Do not pass custom tools here; Deep Research manages its own built-in tools.
+return client.interactions.create(**create_kwargs)
+
 if stream and mode != MODE_AUDIO:
 return client.models.generate_content_stream(**params)
 else:
@@ -189,28 +276,21 @@ class Chat:
 """
 Unpack non-streaming response from Google GenAI and set context.
 
-:param mode: MODE_CHAT or
+:param mode: MODE_CHAT, MODE_AUDIO or MODE_COMPUTER
 :param response: Response object
 :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
 """
 if mode == MODE_AUDIO:
-# Prefer audio if present
 audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
 if audio_bytes:
-
-# https://ai.google.dev/gemini-api/docs/speech-generation
-if mime == "audio/pcm" or mime.startswith("audio/"):
+if mime == "audio/pcm" or (isinstance(mime, str) and mime.startswith("audio/")):
 wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
 ctx.audio_output = wav_b64
 ctx.is_audio = True
-# Text transcript is typically not present for TTS; still try:
 txt = self.extract_text(response)
 ctx.output = txt or "..."
 else:
-# No audio present -> fallback to text
 ctx.output = self.extract_text(response)
-
-# Usage
 try:
 usage = getattr(response, "usage_metadata", None)
 if usage:
@@ -219,17 +299,58 @@
 ctx.set_tokens(p, c)
 except Exception:
 pass
+return
 
-
+# ---- chat / computer ----
+ctx.output = self.extract_text(response) or ""
 
-#
-ctx.output = self.extract_text(response)
-
-# Extract function calls
+# 1) Extract tool calls and store in ctx.tool_calls (backward-compatible shape)
 calls = self.extract_tool_calls(response)
 if calls:
 ctx.tool_calls = calls
 
+# 2) In MODE_COMPUTER: capture raw model parts (with thought_signature) for next FunctionResponse turn
+# and translate Computer Use calls into plugin commands now.
+if mode == MODE_COMPUTER:
+candidate = None
+try:
+cands = getattr(response, "candidates", None) or []
+if cands:
+candidate = cands[0]
+except Exception:
+pass
+
+if candidate and getattr(candidate, "content", None):
+parts = getattr(candidate.content, "parts", None) or []
+dump = self._dump_model_parts(parts)
+if dump:
+if ctx.extra is None:
+ctx.extra = {}
+ctx.extra["prev_model_parts"] = dump
+
+tool_calls: List[dict] = []
+try:
+tool_calls, has_calls = self.window.core.api.google.computer.handle_stream_chunk(
+ctx=ctx,
+chunk=response,
+tool_calls=tool_calls,
+)
+except Exception as e:
+has_calls = False
+print(f"Gemini computer-use mapping error: {e}")
+
+if has_calls and tool_calls:
+ctx.force_call = True
+self.window.core.debug.info("[chat] Google tool calls found, unpacking...")
+self.window.core.command.unpack_tool_calls_chunks(ctx, tool_calls)
+
+if calls:
+if ctx.extra is None:
+ctx.extra = {}
+ctx.extra["function_response_required"] = True
+ctx.extra["function_response_source"] = "ctx.tool_calls"
+ctx.extra["function_response_reason"] = "computer_use"
+
 # Usage if available
 try:
 usage = getattr(response, "usage_metadata", None)
@@ -283,12 +404,11 @@ class Chat:
 :return: List of tool calls
 """
 def _to_plain_dict(obj):
-# Convert pydantic/genai objects to plain dict recursively
 try:
 if hasattr(obj, "to_json_dict"):
 return obj.to_json_dict()
 if hasattr(obj, "model_dump"):
-return obj.model_dump()
+return obj.model_dump()
 if hasattr(obj, "to_dict"):
 return obj.to_dict()
 except Exception:
@@ -307,7 +427,6 @@ class Chat:
 name = getattr(fc, "name", "") or ""
 args_obj = getattr(fc, "args", {}) or {}
 args_dict = _to_plain_dict(args_obj) or {}
-# if str, try to parse
 if isinstance(args_dict, str):
 try:
 import json
@@ -319,7 +438,7 @@
 "type": "function",
 "function": {
 "name": name,
-"arguments": args_dict,
+"arguments": args_dict,
 }
 })
 
@@ -345,11 +464,11 @@
 except Exception:
 args_dict = {}
 out.append({
-"id": "",
+"id": getattr(fn, "id", "") or "",
 "type": "function",
 "function": {
 "name": name,
-"arguments": args_dict,
+"arguments": args_dict,
 }
 })
 except Exception:
@@ -357,17 +476,274 @@ class Chat:
 
 return out
 
+def build_input(
+self,
+prompt: str,
+system_prompt: str,
+model: ModelItem,
+history: Optional[List[CtxItem]] = None,
+attachments: Optional[Dict[str, AttachmentItem]] = None,
+multimodal_ctx: Optional[MultimodalContext] = None,
+mode: str = MODE_CHAT,
+) -> List[Content]:
+"""
+Build Google GenAI contents list
+
+:param prompt: User prompt
+:param system_prompt: System prompt/instruction
+:param model: ModelItem
+:param history: List of CtxItem for history
+:param attachments: Dict of AttachmentItem for images/screenshots
+:param multimodal_ctx: MultimodalContext for audio
+:param mode: MODE_CHAT / MODE_AUDIO / MODE_COMPUTER
+:return: List of Content
+"""
+# FunctionResponse turn for Computer Use (strictly immediate after functionCall)
+if mode == MODE_COMPUTER and self.window.core.config.get('use_context'):
+hist = self.window.core.ctx.get_history(
+history,
+model.id,
+MODE_CHAT,
+self.window.core.tokens.from_user(prompt, system_prompt),
+self._fit_ctx(model),
+)
+fr_contents = self._build_function_responses_from_history(hist, attachments)
+if fr_contents:
+return fr_contents
+
+# Build conversation history first to detect "first input"
+items: List[CtxItem] = []
+if self.window.core.config.get('use_context'):
+items = self.window.core.ctx.get_history(
+history,
+model.id,
+MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+self.window.core.tokens.from_user(prompt, system_prompt),
+self._fit_ctx(model),
+)
+
+is_first_turn = (len(items) == 0)
+is_sandbox = bool(self.window.core.config.get("remote_tools.computer_use.sandbox", False))
+
+contents: List[Content] = []
+
+# Append conversation history (text only)
+for item in items:
+if item.final_input:
+contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+if item.final_output:
+contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+# Current user message:
+# - In MODE_COMPUTER attach initial screenshot only on the very first turn
+if mode == MODE_COMPUTER:
+initial_attachments = {}
+if is_first_turn and not attachments and not is_sandbox:
+self.window.controller.attachment.clear_silent()
+self.window.controller.painter.capture.screenshot(attach_cursor=True, silent=True)
+initial_attachments = self.window.core.attachments.get_all(mode)
+send_attachments = initial_attachments if initial_attachments else attachments
+parts = self._build_user_parts(
+content=str(prompt),
+attachments=send_attachments,
+multimodal_ctx=multimodal_ctx,
+)
+else:
+parts = self._build_user_parts(
+content=str(prompt),
+attachments=attachments,
+multimodal_ctx=multimodal_ctx,
+)
+contents.append(Content(role="user", parts=parts))
+
+return contents
+
+def _build_function_responses_from_history(
+self,
+history: Optional[List[CtxItem]],
+attachments: Optional[Dict[str, AttachmentItem]],
+) -> Optional[List[Content]]:
+"""
+Build FunctionResponse contents for the immediate next turn after executing
+Computer Use function calls. It reconstructs the last user -> model(functionCall) turn
+and returns [user_content, model_function_call_content, tool_function_response_content].
+"""
+if not self.window.core.config.get('use_context') or not history:
+return None
+
+last_item = history[-1]
+if not getattr(last_item, "extra", None):
+return None
+if not last_item.extra.get("function_response_required"):
+return None
+
+# 1) Find the user message that started the current turn (previous item's input)
+prior_user_text = ""
+if len(history) >= 2:
+prev = history[-2]
+if getattr(prev, "final_input", None):
+prior_user_text = str(prev.final_input)
+
+if not prior_user_text and getattr(last_item, "input", None):
+prior_user_text = str(last_item.input)
+
+if not prior_user_text:
+prior_user_text = "..."
+
+user_content = Content(role="user", parts=[Part.from_text(text=prior_user_text)])
+
+# 2) Rebuild the model functionCall content with thought_signature preserved
+raw_parts = last_item.extra.get("prev_model_parts", [])
+model_parts = self._rehydrate_model_parts(raw_parts)
+if not model_parts:
+model_parts = self._rehydrate_from_tool_calls(getattr(last_item, "tool_calls", []))
+# append also text part if not empty
+if getattr(last_item, "final_output", None):
+output_text = str(last_item.final_output).strip()
+if output_text:
+model_parts.append(Part.from_text(text=output_text))
+
+model_fc_content = Content(role="model", parts=model_parts)
+
+# 3) Build a single tool content with N FunctionResponse parts (one per functionCall)
+screenshot_part = self._screenshot_function_response_part(attachments)
+fr_parts: List[Part] = []
+for p in model_parts:
+if getattr(p, "function_call", None):
+fn = p.function_call
+fr = Part.from_function_response(
+name=fn.name,
+response=self._minimal_tool_response(last_item),
+parts=[screenshot_part] if screenshot_part else None
+)
+fr_parts.append(fr)
+
+if not fr_parts:
+return None
+
+tool_content = Content(role="tool", parts=fr_parts)
+
+return [user_content, model_fc_content, tool_content]
+
+def _rehydrate_from_tool_calls(self, calls: List[dict]) -> List[Part]:
+"""
+Fallback rehydration when prev_model_parts are unavailable (no thought signatures).
+"""
+parts: List[Part] = []
+for c in calls or []:
+if not isinstance(c, dict):
+continue
+if c.get("type") != "function":
+continue
+fn = c.get("function") or {}
+name = fn.get("name")
+args = fn.get("arguments") or {}
+if not name:
+continue
+parts.append(Part.from_function_call(name=name, args=args))
+return parts
+
+def _dump_model_parts(self, parts: List[Part]) -> List[dict]:
+"""
+Dump model parts into a JSON-serializable structure, preserving thought_signature.
+"""
+out: List[dict] = []
+for p in parts or []:
+ts = getattr(p, "thought_signature", None)
+if getattr(p, "function_call", None):
+fn = p.function_call
+name = getattr(fn, "name", "") or ""
+args = getattr(fn, "args", {}) or {}
+out.append({
+"type": "function_call",
+"name": name,
+"args": args,
+"thought_signature": ts,
+})
+elif getattr(p, "text", None):
+out.append({"type": "text", "text": str(p.text)})
+return out
+
+def _rehydrate_model_parts(self, raw_parts: List[dict]) -> List[Part]:
+"""
+Recreate SDK Part objects from dumped parts, including thought_signature on the Part.
+"""
+parts: List[Part] = []
+for it in raw_parts or []:
+t = (it.get("type") or "").lower()
+if t == "function_call":
+name = it.get("name")
+args = it.get("args") or {}
+ts = it.get("thought_signature")
+if name:
+parts.append(Part(function_call=gtypes.FunctionCall(name=name, args=args),
+thought_signature=ts))
+elif t == "text":
+parts.append(Part.from_text(text=str(it.get("text", ""))))
+return parts
+
+def _screenshot_function_response_part(
+self,
+attachments: Optional[Dict[str, AttachmentItem]]
+) -> Optional[gtypes.FunctionResponsePart]:
+"""
+Build FunctionResponsePart with inlineData PNG/JPEG screenshot from attachments.
+"""
+if not attachments:
+return None
+
+chosen_path = None
+for _, att in attachments.items():
+if not att or not att.path:
+continue
+p = att.path
+if isinstance(p, str) and os.path.exists(p):
+ext = os.path.splitext(p)[1].lower()
+if ext in (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"):
+chosen_path = p
+if ext == ".png":
+break
+
+if not chosen_path:
+return None
+
+try:
+with open(chosen_path, "rb") as f:
+data = f.read()
+blob = gtypes.FunctionResponseBlob(
+mime_type="image/png" if chosen_path.lower().endswith(".png") else "image/jpeg",
+data=data,
+)
+return gtypes.FunctionResponsePart(inline_data=blob)
+except Exception:
+return None
+
+@staticmethod
+def _minimal_tool_response(item: CtxItem) -> Dict[str, Any]:
+"""
+Construct a minimal structured payload for FunctionResponse.response.
+"""
+resp: Dict[str, Any] = {"ok": True}
+try:
+if item and item.extra and isinstance(item.extra, dict):
+outputs = item.extra.get("tool_output")
+if isinstance(outputs, list) and len(outputs) > 0:
+last = outputs[-1]
+if isinstance(last, dict):
+if "result" in last and isinstance(last["result"], dict):
+resp = last["result"]
+if "error" in last:
+resp["error"] = last["error"]
+except Exception:
+pass
+return resp
+
 def _extract_inline_images_and_links(
 self,
 response, ctx: CtxItem
 ) -> None:
 """
 Extract inline image parts (Gemini image output) and file links.
-- Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
-- Appends HTTP(S) image URIs from file_data to ctx.urls.
-
-:param response: Response object
-:param ctx: CtxItem to set images and urls
 """
 images: list[str] = []
 urls: list[str] = []
@@ -378,7 +754,6 @@ class Chat:
 content = getattr(cand, "content", None)
 parts = getattr(content, "parts", None) or []
 for p in parts:
-# Inline image bytes (image preview / image generation in chat)
 blob = getattr(p, "inline_data", None)
 if blob:
 mime = (getattr(blob, "mime_type", "") or "").lower()
@@ -392,16 +767,14 @@ class Chat:
 f.write(img_bytes)
 images.append(img_path)
 
-# File data URI (may contain http/https or gs://)
 fdata = getattr(p, "file_data", None)
 if fdata:
 uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
 mime = (getattr(fdata, "mime_type", "") or "").lower()
 if uri and mime.startswith("image/"):
-# Store only as URL; downloading is out of scope here.
 if uri.startswith("http://") or uri.startswith("https://"):
 urls.append(uri)
-except Exception
+except Exception:
 pass
 
 if images:
@@ -418,9 +791,6 @@ class Chat:
 def _ensure_bytes(data) -> bytes | None:
 """
 Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string.
-
-:param data: bytes or str
-:return: bytes or None
 """
 try:
 if isinstance(data, (bytes, bytearray)):
@@ -432,56 +802,6 @@ class Chat:
 return None
 return None
 
-def build_input(
-self,
-prompt: str,
-system_prompt: str,
-model: ModelItem,
-history: Optional[List[CtxItem]] = None,
-attachments: Optional[Dict[str, AttachmentItem]] = None,
-multimodal_ctx: Optional[MultimodalContext] = None,
-) -> List[Content]:
-"""
-Build Google GenAI contents list
-
-:param prompt: User prompt
-:param system_prompt: System prompt/instruction
-:param model: ModelItem
-:param history: List of CtxItem for history
-:param attachments: Dict of AttachmentItem for images
-:param multimodal_ctx: MultimodalContext for audio
-:return: List of Content
-"""
-contents: List[Content] = []
-
-# System instruction is passed separately (system_instruction),
-# so we do not build an explicit system role part here.
-
-# Append conversation history
-if self.window.core.config.get('use_context'):
-items = self.window.core.ctx.get_history(
-history,
-model.id,
-MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
-self.window.core.tokens.from_user(prompt, system_prompt),
-self._fit_ctx(model),
-)
-for item in items:
-if item.final_input:
-contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
-if item.final_output:
-contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
-
-# Current user message with multimodal parts
-parts = self._build_user_parts(
-content=str(prompt),
-attachments=attachments,
-multimodal_ctx=multimodal_ctx,
-)
-contents.append(Content(role="user", parts=parts))
-
-return contents
-
 def _build_user_parts(
 self,
 content: str,
@@ -490,11 +810,6 @@ class Chat:
 ) -> List[Part]:
 """
 Build user message parts (text + images + audio)
-
-:param content: User text content
-:param attachments: Dict of AttachmentItem for images
-:param multimodal_ctx: MultimodalContext for audio
-:return: List of Part
 """
 self.window.core.api.google.vision.reset()
 parts: List[Part] = []
@@ -515,9 +830,6 @@ class Chat:
 def _fit_ctx(self, model: ModelItem) -> int:
 """
 Fit to max model tokens (best-effort, uses model.ctx if present)
-
-:param model: ModelItem
-:return: max context tokens
 """
 max_ctx_tokens = self.window.core.config.get('max_total_tokens')
 if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
@@ -530,15 +842,10 @@ class Chat:
 system_prompt: str,
 model: ModelItem,
 history: Optional[List[CtxItem]] = None,
+mode: str = MODE_CHAT,
 ) -> List[dict]:
 """
 Build simple messages structure for local token estimation
-
-:param prompt: User prompt
-:param system_prompt: System prompt/instruction
-:param model: ModelItem
-:param history: List of CtxItem for history
-:return: List of messages dicts with 'role' and 'content' keys
 """
 messages = []
 if system_prompt:
@@ -562,7 +869,6 @@ class Chat:
 messages.append({"role": "user", "content": str(prompt)})
 return messages
 
-
 def reset_tokens(self):
 """Reset input tokens counter"""
 self.input_tokens = 0
@@ -570,8 +876,6 @@ class Chat:
 def get_used_tokens(self) -> int:
 """
 Get input tokens counter (estimated before sending)
-
-:return: input tokens count
 """
 return self.input_tokens
 
@@ -579,13 +883,126 @@ class Chat:
 def _supports_tts(model_id: Optional[str]) -> bool:
 """
 Heuristic check if the model supports native TTS.
-- Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
-- Future/preview names may contain 'native-audio'.
-
-:param model_id: Model ID
-:return: True if supports TTS, False otherwise
 """
 if not model_id:
 return False
 mid = model_id.lower()
-return ("-tts" in mid) or ("native-audio" in mid)
+return ("-tts" in mid) or ("native-audio" in mid)
+
+@staticmethod
+def _find_last_interaction_state(
+history: Optional[List[CtxItem]],
+ctx: CtxItem,
+) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+"""
+Resolve last known Interactions state:
+- previous_interaction_id: to continue conversation context
+- last_event_id: to resume streaming (not used here, but returned for completeness)
+- last_status: last known status string if available
+
+Looks at current ctx.extra first, then scans history from newest to oldest.
+"""
+prev_interaction_id: Optional[str] = None
+last_event_id: Optional[str] = None
+last_status: Optional[str] = None
+
+try:
+if getattr(ctx, "extra", None) and isinstance(ctx.extra, dict):
+prev_interaction_id = (
+ctx.extra.get("previous_interaction_id")
+or ctx.extra.get("google_interaction_id")
+or ctx.extra.get("google_last_interaction_id")
+)
+last_event_id = ctx.extra.get("google_last_event_id")
+last_status = ctx.extra.get("google_interaction_status")
+except Exception:
+pass
+
+if not prev_interaction_id and history:
+for item in reversed(history or []):
+ex = getattr(item, "extra", None)
+if not ex or not isinstance(ex, dict):
+continue
+prev_interaction_id = (
+ex.get("previous_interaction_id")
+or ex.get("google_interaction_id")
+or ex.get("google_last_interaction_id")
+or prev_interaction_id
+)
+last_event_id = ex.get("google_last_event_id") or last_event_id
+last_status = ex.get("google_interaction_status") or last_status
+if prev_interaction_id and last_event_id:
+break
+
+return prev_interaction_id, last_event_id, last_status
+
+@staticmethod
+def _mime_to_interactions_type(mime: str) -> Optional[str]:
+"""
+Map MIME type to Interactions input type.
+"""
+if not mime:
+return None
+m = mime.lower()
+if m.startswith("image/"):
+return "image"
+if m.startswith("audio/"):
+return "audio"
+if m.startswith("video/"):
+return "video"
+return None
+
+@staticmethod
+def _ensure_base64(data) -> Optional[str]:
+"""
+Return base64 string from raw bytes or a base64 string.
+"""
+try:
+if data is None:
+return None
+if isinstance(data, str):
+return data
+if isinstance(data, (bytes, bytearray)):
+import base64
+return base64.b64encode(bytes(data)).decode("utf-8")
+except Exception:
+return None
+return None
+
+def _parts_to_interactions_input(self, parts: List[Part]) -> List[Dict[str, Any]]:
+"""
+Convert Responses API Part list into Interactions API input payload.
+"""
+out: List[Dict[str, Any]] = []
+
+for p in parts or []:
+# Text
+t = getattr(p, "text", None)
+if t is not None:
+s = str(t).strip()
+if s:
+out.append({"type": "text", "text": s})
+continue
+
+# Inline data (images/audio/video)
+inline = getattr(p, "inline_data", None)
+if inline:
+mime = (getattr(inline, "mime_type", "") or "").lower()
+typ = self._mime_to_interactions_type(mime)
+data = getattr(inline, "data", None)
+b64 = self._ensure_base64(data)
+if typ and b64:
+out.append({"type": typ, "data": b64, "mime_type": mime})
+continue
+
+# File references (prefer URIs from Gemini Files API)
+fdata = getattr(p, "file_data", None)
+if fdata:
+uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+mime = (getattr(fdata, "mime_type", "") or "").lower()
+typ = self._mime_to_interactions_type(mime)
+if typ and uri:
+out.append({"type": typ, "uri": uri})
+continue
+
+return out