pygpt-net 2.7.4__py3-none-any.whl → 2.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +15 -0
- pygpt_net/__init__.py +4 -4
- pygpt_net/app_core.py +4 -2
- pygpt_net/controller/__init__.py +5 -1
- pygpt_net/controller/assistant/assistant.py +1 -4
- pygpt_net/controller/assistant/batch.py +5 -504
- pygpt_net/controller/assistant/editor.py +5 -5
- pygpt_net/controller/assistant/files.py +16 -16
- pygpt_net/controller/chat/handler/google_stream.py +307 -1
- pygpt_net/controller/chat/handler/worker.py +10 -25
- pygpt_net/controller/chat/handler/xai_stream.py +621 -52
- pygpt_net/controller/chat/image.py +2 -2
- pygpt_net/controller/debug/fixtures.py +3 -2
- pygpt_net/controller/dialogs/confirm.py +73 -101
- pygpt_net/controller/files/files.py +65 -4
- pygpt_net/controller/lang/mapping.py +9 -9
- pygpt_net/controller/painter/capture.py +50 -1
- pygpt_net/controller/presets/presets.py +2 -1
- pygpt_net/controller/remote_store/__init__.py +12 -0
- pygpt_net/{provider/core/assistant_file/db_sqlite → controller/remote_store/google}/__init__.py +2 -2
- pygpt_net/controller/remote_store/google/batch.py +402 -0
- pygpt_net/controller/remote_store/google/store.py +615 -0
- pygpt_net/controller/remote_store/openai/__init__.py +12 -0
- pygpt_net/controller/remote_store/openai/batch.py +524 -0
- pygpt_net/controller/{assistant → remote_store/openai}/store.py +63 -60
- pygpt_net/controller/remote_store/remote_store.py +35 -0
- pygpt_net/controller/ui/ui.py +20 -1
- pygpt_net/core/assistants/assistants.py +3 -15
- pygpt_net/core/db/database.py +5 -3
- pygpt_net/core/filesystem/url.py +4 -1
- pygpt_net/core/locale/placeholder.py +35 -0
- pygpt_net/core/remote_store/__init__.py +12 -0
- pygpt_net/core/remote_store/google/__init__.py +11 -0
- pygpt_net/core/remote_store/google/files.py +224 -0
- pygpt_net/core/remote_store/google/store.py +248 -0
- pygpt_net/core/remote_store/openai/__init__.py +11 -0
- pygpt_net/core/{assistants → remote_store/openai}/files.py +26 -19
- pygpt_net/core/{assistants → remote_store/openai}/store.py +32 -15
- pygpt_net/core/remote_store/remote_store.py +24 -0
- pygpt_net/core/render/web/body.py +3 -2
- pygpt_net/core/types/chunk.py +27 -0
- pygpt_net/data/config/config.json +8 -4
- pygpt_net/data/config/models.json +77 -3
- pygpt_net/data/config/settings.json +45 -0
- pygpt_net/data/js/app/template.js +1 -1
- pygpt_net/data/js/app.min.js +2 -2
- pygpt_net/data/locale/locale.de.ini +44 -41
- pygpt_net/data/locale/locale.en.ini +56 -43
- pygpt_net/data/locale/locale.es.ini +44 -41
- pygpt_net/data/locale/locale.fr.ini +44 -41
- pygpt_net/data/locale/locale.it.ini +44 -41
- pygpt_net/data/locale/locale.pl.ini +45 -42
- pygpt_net/data/locale/locale.uk.ini +44 -41
- pygpt_net/data/locale/locale.zh.ini +44 -41
- pygpt_net/data/locale/plugin.cmd_history.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.zh.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_mouse_control.en.ini +14 -0
- pygpt_net/data/locale/plugin.cmd_web.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.zh.ini +1 -1
- pygpt_net/data/locale/plugin.idx_llama_index.de.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.en.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.es.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.fr.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.it.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.pl.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.uk.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.zh.ini +2 -2
- pygpt_net/item/assistant.py +1 -211
- pygpt_net/item/ctx.py +3 -3
- pygpt_net/item/store.py +238 -0
- pygpt_net/js_rc.py +2449 -2447
- pygpt_net/migrations/Version20260102190000.py +35 -0
- pygpt_net/migrations/__init__.py +3 -1
- pygpt_net/plugin/cmd_mouse_control/config.py +471 -1
- pygpt_net/plugin/cmd_mouse_control/plugin.py +487 -22
- pygpt_net/plugin/cmd_mouse_control/worker.py +464 -87
- pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +729 -0
- pygpt_net/plugin/idx_llama_index/config.py +2 -2
- pygpt_net/provider/api/anthropic/__init__.py +10 -8
- pygpt_net/provider/api/google/__init__.py +21 -58
- pygpt_net/provider/api/google/chat.py +545 -129
- pygpt_net/provider/api/google/computer.py +190 -0
- pygpt_net/provider/api/google/realtime/realtime.py +2 -2
- pygpt_net/provider/api/google/remote_tools.py +93 -0
- pygpt_net/provider/api/google/store.py +546 -0
- pygpt_net/provider/api/google/worker/__init__.py +0 -0
- pygpt_net/provider/api/google/worker/importer.py +392 -0
- pygpt_net/provider/api/openai/__init__.py +7 -3
- pygpt_net/provider/api/openai/computer.py +10 -1
- pygpt_net/provider/api/openai/responses.py +0 -0
- pygpt_net/provider/api/openai/store.py +6 -6
- pygpt_net/provider/api/openai/worker/importer.py +24 -24
- pygpt_net/provider/api/x_ai/__init__.py +10 -9
- pygpt_net/provider/api/x_ai/chat.py +272 -102
- pygpt_net/provider/core/config/patch.py +16 -1
- pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +3 -3
- pygpt_net/provider/core/model/patch.py +17 -3
- pygpt_net/provider/core/preset/json_file.py +13 -7
- pygpt_net/provider/core/{assistant_file → remote_file}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/base.py +9 -9
- pygpt_net/provider/core/remote_file/db_sqlite/__init__.py +12 -0
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/provider.py +23 -20
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/storage.py +35 -27
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/base.py +10 -10
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/provider.py +16 -15
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/storage.py +30 -23
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/json_file.py +9 -9
- pygpt_net/provider/llms/google.py +2 -2
- pygpt_net/tools/image_viewer/ui/dialogs.py +298 -12
- pygpt_net/tools/text_editor/ui/widgets.py +5 -1
- pygpt_net/ui/base/config_dialog.py +3 -2
- pygpt_net/ui/base/context_menu.py +44 -1
- pygpt_net/ui/dialog/assistant.py +3 -3
- pygpt_net/ui/dialog/plugins.py +3 -1
- pygpt_net/ui/dialog/remote_store_google.py +539 -0
- pygpt_net/ui/dialog/{assistant_store.py → remote_store_openai.py} +95 -95
- pygpt_net/ui/dialogs.py +5 -3
- pygpt_net/ui/layout/chat/attachments_uploaded.py +3 -3
- pygpt_net/ui/layout/toolbox/computer_env.py +26 -8
- pygpt_net/ui/layout/toolbox/indexes.py +22 -19
- pygpt_net/ui/layout/toolbox/model.py +28 -5
- pygpt_net/ui/menu/tools.py +13 -5
- pygpt_net/ui/widget/dialog/remote_store_google.py +56 -0
- pygpt_net/ui/widget/dialog/{assistant_store.py → remote_store_openai.py} +9 -9
- pygpt_net/ui/widget/element/button.py +4 -4
- pygpt_net/ui/widget/image/display.py +25 -8
- pygpt_net/ui/widget/lists/remote_store_google.py +248 -0
- pygpt_net/ui/widget/lists/{assistant_store.py → remote_store_openai.py} +21 -21
- pygpt_net/ui/widget/option/checkbox_list.py +47 -9
- pygpt_net/ui/widget/option/combo.py +39 -3
- pygpt_net/ui/widget/tabs/output.py +9 -1
- pygpt_net/ui/widget/textarea/editor.py +14 -1
- pygpt_net/ui/widget/textarea/input.py +20 -7
- pygpt_net/ui/widget/textarea/notepad.py +24 -1
- pygpt_net/ui/widget/textarea/output.py +23 -1
- pygpt_net/ui/widget/textarea/web.py +16 -1
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/METADATA +41 -2
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/RECORD +158 -132
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/LICENSE +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/WHEEL +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/entry_points.txt +0 -0
--- a/pygpt_net/provider/api/google/chat.py
+++ b/pygpt_net/provider/api/google/chat.py
@@ -6,15 +6,16 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date:
+# Updated Date: 2026.01.03 17:00:00 #
 # ================================================== #
 
-
+import os
+from typing import Optional, Dict, Any, List, Tuple
 
 from google.genai import types as gtypes
 from google.genai.types import Content, Part
 
-from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO, MODE_COMPUTER, MODE_RESEARCH
 from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
 from pygpt_net.item.attachment import AttachmentItem
 from pygpt_net.item.ctx import CtxItem
@@ -35,7 +36,7 @@ class Chat:
             extra: Optional[Dict[str, Any]] = None
     ):
         """
-        Call Google GenAI for chat / multimodal / audio.
+        Call Google GenAI for chat / multimodal / audio / computer use.
 
         :param context: BridgeContext with prompt, model, history, mode, etc.
         :param extra: Extra parameters (not used currently)
@@ -62,7 +63,6 @@ class Chat:
 
         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
         if mode == MODE_AUDIO and has_audio_input:
-            # Build minimal transcription request: [instruction text, audio part]
             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
@@ -73,13 +73,10 @@ class Chat:
                 ])
             ]
             trans_cfg = gtypes.GenerateContentConfig(
-                # Keep minimal; no tools/system for transcription
                 temperature=self.window.core.config.get('temperature'),
                 top_p=self.window.core.config.get('top_p'),
                 max_output_tokens=context.max_tokens if context.max_tokens else None,
             )
-
-            # Always non-stream here (we immediately need the text for TTS)
             trans_resp = client.models.generate_content(
                 model=transcribe_model,
                 contents=trans_inputs,
@@ -87,20 +84,17 @@ class Chat:
             )
             transcribed_text = self.extract_text(trans_resp).strip()
             if transcribed_text:
-                # Feed transcription into TTS as the final prompt
                 prompt = transcribed_text
                 ctx.input = transcribed_text
                 try:
-                    # optional: store for debugging/UX
                     if isinstance(ctx.extra, dict):
                         ctx.extra["transcription"] = transcribed_text
                 except Exception:
                     pass
-                ctx.is_audio = False
-                multimodal_ctx.is_audio_input = False
+            ctx.is_audio = False
+            multimodal_ctx.is_audio_input = False
 
-        # ---------------------- REGULAR CHAT PATH
-        # Build contents for chat/multimodal (will be overridden for TTS below)
+        # ---------------------- REGULAR CHAT/COMPUTER PATH ----------------------
         inputs = self.build_input(
             prompt=prompt,
             system_prompt=system_prompt,
@@ -108,23 +102,35 @@ class Chat:
             history=context.history,
             attachments=attachments,
             multimodal_ctx=multimodal_ctx,
+            mode=mode,
         )
 
         # Best-effort input tokens estimate
         self.reset_tokens()
-        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history, mode)
         self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
 
         # Tools -> merge app-defined tools with remote tools
         base_tools = self.window.core.api.google.tools.prepare(model, functions)
-        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        remote_tools = self.window.core.api.google.remote_tools.build_remote_tools(model)
 
-        #
+        # Note: Combining native (remote) tools with function declarations is documented as Live API-only.
         if base_tools:
-            remote_tools = []
+            remote_tools = []
         tools = (base_tools or []) + (remote_tools or [])
-
-
+
+        # Enable Computer Use tool in computer mode (use the official Tool/ComputerUse object)
+        if mode == MODE_COMPUTER or (model and isinstance(model.id, str) and "computer-use" in model.id.lower()):
+            comp_env = gtypes.Environment.ENVIRONMENT_BROWSER
+            tools = [gtypes.Tool(
+                computer_use=gtypes.ComputerUse(
+                    environment=comp_env,
+                )
+            )] # reset tools to only Computer Use (multiple tools not supported together)
+
+        # Some models cannot use tools; keep behavior for image-only models
+        if model and isinstance(model.id, str) and "-image" in model.id:
+            tools = None
 
         # Sampling
         temperature = self.window.core.config.get('temperature')
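The Computer Use branch above discards any merged tool list and sends a single `gtypes.Tool(computer_use=...)` entry, since Gemini does not accept the Computer Use tool alongside other tools in one request. A minimal standalone sketch of the same wiring, assuming a configured `google-genai` client; the model id and prompt are illustrative:

```python
from google import genai
from google.genai import types as gtypes

client = genai.Client()  # picks up the API key from the environment

# Only the Computer Use tool may be present in the request.
tools = [gtypes.Tool(
    computer_use=gtypes.ComputerUse(
        environment=gtypes.Environment.ENVIRONMENT_BROWSER,
    )
)]

resp = client.models.generate_content(
    model="gemini-2.5-computer-use-preview-10-2025",  # illustrative model id
    contents="Open the downloads page and click the latest release.",
    config=gtypes.GenerateContentConfig(tools=tools),
)
# The reply is expected to carry function_call parts describing UI actions.
print(resp.candidates[0].content.parts)
```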
@@ -145,10 +151,9 @@ class Chat:
             stream = False # TTS non-stream in this app
             supports_tts = self._supports_tts(model.id)
 
-            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
 
-            # Remove params not used by TTS flow
+            # Remove params not used by TTS flow
             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
                 if key in cfg_kwargs:
                     del cfg_kwargs[key]
@@ -171,11 +176,92 @@ class Chat:
                     prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
                 )
             )
-            # else: fallback to text-only below
-
         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
         params = dict(model=model.id, contents=inputs, config=cfg)
 
+        if mode == MODE_RESEARCH:
+
+            # Deep Research does not support audio inputs; if an audio snippet is present, transcribe it to text first.
+            if has_audio_input:
+                try:
+                    transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+                    transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+                    audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+                    trans_inputs = [
+                        Content(role="user", parts=[
+                            Part.from_text(text=transcribe_prompt),
+                            audio_part,
+                        ])
+                    ]
+                    trans_cfg = gtypes.GenerateContentConfig(
+                        temperature=self.window.core.config.get('temperature'),
+                        top_p=self.window.core.config.get('top_p'),
+                        max_output_tokens=context.max_tokens if context.max_tokens else None,
+                    )
+                    trans_resp = client.models.generate_content(
+                        model=transcribe_model,
+                        contents=trans_inputs,
+                        config=trans_cfg,
+                    )
+                    transcribed_text = self.extract_text(trans_resp).strip()
+                    if transcribed_text:
+                        prompt = (str(prompt or "").strip() + "\n\n" + transcribed_text).strip() if prompt else transcribed_text
+                        ctx.input = transcribed_text
+                        try:
+                            if isinstance(ctx.extra, dict):
+                                ctx.extra["transcription"] = transcribed_text
+                        except Exception:
+                            pass
+                except Exception:
+                    pass
+                # Ensure we don't send raw audio to Interactions API
+                if multimodal_ctx:
+                    multimodal_ctx.is_audio_input = False
+
+            # Build single-turn multimodal input for Interactions API (no full chat history)
+            research_parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+            interactions_input = self._parts_to_interactions_input(research_parts)
+
+            # Try to continue context with the last completed interaction (server-side state)
+            prev_interaction_id, last_event_id, last_status = self._find_last_interaction_state(
+                history=context.history,
+                ctx=ctx,
+            )
+            try:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                if prev_interaction_id:
+                    ctx.extra["previous_interaction_id"] = prev_interaction_id
+                if last_event_id:
+                    ctx.extra["google_last_event_id"] = last_event_id
+                if last_status:
+                    ctx.extra["google_interaction_status"] = last_status
+            except Exception:
+                pass
+
+            # Deep Research agent must use background=True; stream=True enables live progress updates.
+            create_kwargs: Dict[str, Any] = {
+                "agent": model.id,
+                "input": interactions_input if interactions_input else (str(prompt or "") or " "),
+                "background": True,
+                "stream": stream,
+                "agent_config": {
+                    "type": "deep-research",
+                    "thinking_summaries": "auto"
+                }
+            }
+
+            # Continue conversation on server using previous_interaction_id if available
+            if prev_interaction_id:
+                create_kwargs["previous_interaction_id"] = prev_interaction_id
+
+            # Do not pass custom tools here; Deep Research manages its own built-in tools.
+            return client.interactions.create(**create_kwargs)
+
         if stream and mode != MODE_AUDIO:
             return client.models.generate_content_stream(**params)
         else:
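The `MODE_RESEARCH` branch above bypasses `generate_content` entirely and targets the Interactions API: the Deep Research agent runs as a background interaction, optionally streams progress events, and continues conversation state server-side via `previous_interaction_id` instead of resending chat history. A sketch mirroring the `create_kwargs` shape from the hunk, assuming a `client` as in the earlier sketch; the agent id and interaction id are illustrative placeholders:

```python
previous_interaction_id = None  # e.g. recovered from ctx.extra / history

create_kwargs = {
    "agent": "deep-research",  # illustrative agent/model id
    "input": [{"type": "text", "text": "Survey recent changes in Gemini tooling."}],
    "background": True,        # required for the Deep Research agent
    "stream": True,            # live progress updates
    "agent_config": {
        "type": "deep-research",
        "thinking_summaries": "auto",
    },
}
if previous_interaction_id:
    # Reuse server-side conversation state instead of resending history.
    create_kwargs["previous_interaction_id"] = previous_interaction_id

result = client.interactions.create(**create_kwargs)
```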
@@ -189,28 +275,21 @@ class Chat:
         """
         Unpack non-streaming response from Google GenAI and set context.
 
-        :param mode: MODE_CHAT or
+        :param mode: MODE_CHAT, MODE_AUDIO or MODE_COMPUTER
         :param response: Response object
         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
         """
         if mode == MODE_AUDIO:
-            # Prefer audio if present
             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
             if audio_bytes:
-
-                # https://ai.google.dev/gemini-api/docs/speech-generation
-                if mime == "audio/pcm" or mime.startswith("audio/"):
+                if mime == "audio/pcm" or (isinstance(mime, str) and mime.startswith("audio/")):
                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
                     ctx.audio_output = wav_b64
                     ctx.is_audio = True
-                # Text transcript is typically not present for TTS; still try:
                 txt = self.extract_text(response)
                 ctx.output = txt or "..."
             else:
-                # No audio present -> fallback to text
                 ctx.output = self.extract_text(response)
-
-            # Usage
             try:
                 usage = getattr(response, "usage_metadata", None)
                 if usage:
@@ -219,17 +298,58 @@ class Chat:
                     ctx.set_tokens(p, c)
             except Exception:
                 pass
+            return
 
-
+        # ---- chat / computer ----
+        ctx.output = self.extract_text(response) or ""
 
-        #
-        ctx.output = self.extract_text(response)
-
-        # Extract function calls
+        # 1) Extract tool calls and store in ctx.tool_calls (backward-compatible shape)
         calls = self.extract_tool_calls(response)
         if calls:
             ctx.tool_calls = calls
 
+        # 2) In MODE_COMPUTER: capture raw model parts (with thought_signature) for next FunctionResponse turn
+        # and translate Computer Use calls into plugin commands now.
+        if mode == MODE_COMPUTER:
+            candidate = None
+            try:
+                cands = getattr(response, "candidates", None) or []
+                if cands:
+                    candidate = cands[0]
+            except Exception:
+                pass
+
+            if candidate and getattr(candidate, "content", None):
+                parts = getattr(candidate.content, "parts", None) or []
+                dump = self._dump_model_parts(parts)
+                if dump:
+                    if ctx.extra is None:
+                        ctx.extra = {}
+                    ctx.extra["prev_model_parts"] = dump
+
+            tool_calls: List[dict] = []
+            try:
+                tool_calls, has_calls = self.window.core.api.google.computer.handle_stream_chunk(
+                    ctx=ctx,
+                    chunk=response,
+                    tool_calls=tool_calls,
+                )
+            except Exception as e:
+                has_calls = False
+                print(f"Gemini computer-use mapping error: {e}")
+
+            if has_calls and tool_calls:
+                ctx.force_call = True
+                self.window.core.debug.info("[chat] Google tool calls found, unpacking...")
+                self.window.core.command.unpack_tool_calls_chunks(ctx, tool_calls)
+
+            if calls:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                ctx.extra["function_response_required"] = True
+                ctx.extra["function_response_source"] = "ctx.tool_calls"
+                ctx.extra["function_response_reason"] = "computer_use"
+
         # Usage if available
         try:
             usage = getattr(response, "usage_metadata", None)
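The `ctx.extra` flags written at the end of this hunk form a handshake with the relocated `build_input()` further down: on the next turn, `_build_function_responses_from_history()` sees `function_response_required` on the last history item and emits a strict user -> model(functionCall) -> tool(functionResponse) triple instead of plain text history. The persisted state looks roughly like this; the action name, arguments and signature bytes are illustrative:

```python
# State carried on the context item between Computer Use turns.
ctx_extra = {
    "prev_model_parts": [
        {
            "type": "function_call",
            "name": "click_at",                    # hypothetical Computer Use action
            "args": {"x": 412, "y": 230},
            "thought_signature": b"opaque-bytes",  # must be replayed verbatim next turn
        },
        {"type": "text", "text": "Clicking the search button."},
    ],
    "function_response_required": True,
    "function_response_source": "ctx.tool_calls",
    "function_response_reason": "computer_use",
}
```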
@@ -283,12 +403,11 @@ class Chat:
         :return: List of tool calls
         """
         def _to_plain_dict(obj):
-            # Convert pydantic/genai objects to plain dict recursively
             try:
                 if hasattr(obj, "to_json_dict"):
                     return obj.to_json_dict()
                 if hasattr(obj, "model_dump"):
-                    return obj.model_dump()
+                    return obj.model_dump()
                 if hasattr(obj, "to_dict"):
                     return obj.to_dict()
             except Exception:
@@ -307,7 +426,6 @@ class Chat:
                 name = getattr(fc, "name", "") or ""
                 args_obj = getattr(fc, "args", {}) or {}
                 args_dict = _to_plain_dict(args_obj) or {}
-                # if str, try to parse
                 if isinstance(args_dict, str):
                     try:
                         import json
@@ -319,7 +437,7 @@ class Chat:
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict,
+                        "arguments": args_dict,
                     }
                 })
 
@@ -345,11 +463,11 @@ class Chat:
                 except Exception:
                     args_dict = {}
                 out.append({
-                    "id": "",
+                    "id": getattr(fn, "id", "") or "",
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict,
+                        "arguments": args_dict,
                     }
                 })
             except Exception:
@@ -357,17 +475,274 @@ class Chat:
 
         return out
 
+    def build_input(
+            self,
+            prompt: str,
+            system_prompt: str,
+            model: ModelItem,
+            history: Optional[List[CtxItem]] = None,
+            attachments: Optional[Dict[str, AttachmentItem]] = None,
+            multimodal_ctx: Optional[MultimodalContext] = None,
+            mode: str = MODE_CHAT,
+    ) -> List[Content]:
+        """
+        Build Google GenAI contents list
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :param attachments: Dict of AttachmentItem for images/screenshots
+        :param multimodal_ctx: MultimodalContext for audio
+        :param mode: MODE_CHAT / MODE_AUDIO / MODE_COMPUTER
+        :return: List of Content
+        """
+        # FunctionResponse turn for Computer Use (strictly immediate after functionCall)
+        if mode == MODE_COMPUTER and self.window.core.config.get('use_context'):
+            hist = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+            fr_contents = self._build_function_responses_from_history(hist, attachments)
+            if fr_contents:
+                return fr_contents
+
+        # Build conversation history first to detect "first input"
+        items: List[CtxItem] = []
+        if self.window.core.config.get('use_context'):
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+
+        is_first_turn = (len(items) == 0)
+        is_sandbox = bool(self.window.core.config.get("remote_tools.computer_use.sandbox", False))
+
+        contents: List[Content] = []
+
+        # Append conversation history (text only)
+        for item in items:
+            if item.final_input:
+                contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+            if item.final_output:
+                contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+        # Current user message:
+        # - In MODE_COMPUTER attach initial screenshot only on the very first turn
+        if mode == MODE_COMPUTER:
+            initial_attachments = {}
+            if is_first_turn and not attachments and not is_sandbox:
+                self.window.controller.attachment.clear_silent()
+                self.window.controller.painter.capture.screenshot(attach_cursor=True, silent=True)
+                initial_attachments = self.window.core.attachments.get_all(mode)
+            send_attachments = initial_attachments if initial_attachments else attachments
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=send_attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        else:
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        contents.append(Content(role="user", parts=parts))
+
+        return contents
+
+    def _build_function_responses_from_history(
+            self,
+            history: Optional[List[CtxItem]],
+            attachments: Optional[Dict[str, AttachmentItem]],
+    ) -> Optional[List[Content]]:
+        """
+        Build FunctionResponse contents for the immediate next turn after executing
+        Computer Use function calls. It reconstructs the last user -> model(functionCall) turn
+        and returns [user_content, model_function_call_content, tool_function_response_content].
+        """
+        if not self.window.core.config.get('use_context') or not history:
+            return None
+
+        last_item = history[-1]
+        if not getattr(last_item, "extra", None):
+            return None
+        if not last_item.extra.get("function_response_required"):
+            return None
+
+        # 1) Find the user message that started the current turn (previous item's input)
+        prior_user_text = ""
+        if len(history) >= 2:
+            prev = history[-2]
+            if getattr(prev, "final_input", None):
+                prior_user_text = str(prev.final_input)
+
+        if not prior_user_text and getattr(last_item, "input", None):
+            prior_user_text = str(last_item.input)
+
+        if not prior_user_text:
+            prior_user_text = "..."
+
+        user_content = Content(role="user", parts=[Part.from_text(text=prior_user_text)])
+
+        # 2) Rebuild the model functionCall content with thought_signature preserved
+        raw_parts = last_item.extra.get("prev_model_parts", [])
+        model_parts = self._rehydrate_model_parts(raw_parts)
+        if not model_parts:
+            model_parts = self._rehydrate_from_tool_calls(getattr(last_item, "tool_calls", []))
+        # append also text part if not empty
+        if getattr(last_item, "final_output", None):
+            output_text = str(last_item.final_output).strip()
+            if output_text:
+                model_parts.append(Part.from_text(text=output_text))
+
+        model_fc_content = Content(role="model", parts=model_parts)
+
+        # 3) Build a single tool content with N FunctionResponse parts (one per functionCall)
+        screenshot_part = self._screenshot_function_response_part(attachments)
+        fr_parts: List[Part] = []
+        for p in model_parts:
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                fr = Part.from_function_response(
+                    name=fn.name,
+                    response=self._minimal_tool_response(last_item),
+                    parts=[screenshot_part] if screenshot_part else None
+                )
+                fr_parts.append(fr)
+
+        if not fr_parts:
+            return None
+
+        tool_content = Content(role="tool", parts=fr_parts)
+
+        return [user_content, model_fc_content, tool_content]
+
+    def _rehydrate_from_tool_calls(self, calls: List[dict]) -> List[Part]:
+        """
+        Fallback rehydration when prev_model_parts are unavailable (no thought signatures).
+        """
+        parts: List[Part] = []
+        for c in calls or []:
+            if not isinstance(c, dict):
+                continue
+            if c.get("type") != "function":
+                continue
+            fn = c.get("function") or {}
+            name = fn.get("name")
+            args = fn.get("arguments") or {}
+            if not name:
+                continue
+            parts.append(Part.from_function_call(name=name, args=args))
+        return parts
+
+    def _dump_model_parts(self, parts: List[Part]) -> List[dict]:
+        """
+        Dump model parts into a JSON-serializable structure, preserving thought_signature.
+        """
+        out: List[dict] = []
+        for p in parts or []:
+            ts = getattr(p, "thought_signature", None)
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                name = getattr(fn, "name", "") or ""
+                args = getattr(fn, "args", {}) or {}
+                out.append({
+                    "type": "function_call",
+                    "name": name,
+                    "args": args,
+                    "thought_signature": ts,
+                })
+            elif getattr(p, "text", None):
+                out.append({"type": "text", "text": str(p.text)})
+        return out
+
+    def _rehydrate_model_parts(self, raw_parts: List[dict]) -> List[Part]:
+        """
+        Recreate SDK Part objects from dumped parts, including thought_signature on the Part.
+        """
+        parts: List[Part] = []
+        for it in raw_parts or []:
+            t = (it.get("type") or "").lower()
+            if t == "function_call":
+                name = it.get("name")
+                args = it.get("args") or {}
+                ts = it.get("thought_signature")
+                if name:
+                    parts.append(Part(function_call=gtypes.FunctionCall(name=name, args=args),
+                                      thought_signature=ts))
+            elif t == "text":
+                parts.append(Part.from_text(text=str(it.get("text", ""))))
+        return parts
+
+    def _screenshot_function_response_part(
+            self,
+            attachments: Optional[Dict[str, AttachmentItem]]
+    ) -> Optional[gtypes.FunctionResponsePart]:
+        """
+        Build FunctionResponsePart with inlineData PNG/JPEG screenshot from attachments.
+        """
+        if not attachments:
+            return None
+
+        chosen_path = None
+        for _, att in attachments.items():
+            if not att or not att.path:
+                continue
+            p = att.path
+            if isinstance(p, str) and os.path.exists(p):
+                ext = os.path.splitext(p)[1].lower()
+                if ext in (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"):
+                    chosen_path = p
+                    if ext == ".png":
+                        break
+
+        if not chosen_path:
+            return None
+
+        try:
+            with open(chosen_path, "rb") as f:
+                data = f.read()
+            blob = gtypes.FunctionResponseBlob(
+                mime_type="image/png" if chosen_path.lower().endswith(".png") else "image/jpeg",
+                data=data,
+            )
+            return gtypes.FunctionResponsePart(inline_data=blob)
+        except Exception:
+            return None
+
+    @staticmethod
+    def _minimal_tool_response(item: CtxItem) -> Dict[str, Any]:
+        """
+        Construct a minimal structured payload for FunctionResponse.response.
+        """
+        resp: Dict[str, Any] = {"ok": True}
+        try:
+            if item and item.extra and isinstance(item.extra, dict):
+                outputs = item.extra.get("tool_output")
+                if isinstance(outputs, list) and len(outputs) > 0:
+                    last = outputs[-1]
+                    if isinstance(last, dict):
+                        if "result" in last and isinstance(last["result"], dict):
+                            resp = last["result"]
+                        if "error" in last:
+                            resp["error"] = last["error"]
+        except Exception:
+            pass
+        return resp
+
     def _extract_inline_images_and_links(
             self,
             response, ctx: CtxItem
     ) -> None:
         """
         Extract inline image parts (Gemini image output) and file links.
-        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
-        - Appends HTTP(S) image URIs from file_data to ctx.urls.
-
-        :param response: Response object
-        :param ctx: CtxItem to set images and urls
         """
         images: list[str] = []
         urls: list[str] = []
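The dump/rehydrate pair added above exists because the follow-up FunctionResponse turn must replay the model's functionCall parts with their opaque `thought_signature` intact; the backward-compatible `ctx.tool_calls` shape drops that field, hence the raw-part serialization. A round-trip sketch using the same constructors the diff uses; the tool name, args and signature bytes are illustrative:

```python
from google.genai import types as gtypes
from google.genai.types import Part

# Serialized form as produced by _dump_model_parts():
dumped = {
    "type": "function_call",
    "name": "type_text",
    "args": {"text": "hello"},
    "thought_signature": b"opaque-signature-bytes",
}

# _rehydrate_model_parts() rebuilds the SDK Part with the signature attached:
part = Part(
    function_call=gtypes.FunctionCall(name=dumped["name"], args=dumped["args"]),
    thought_signature=dumped["thought_signature"],
)
assert part.function_call.name == "type_text"
```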
@@ -378,7 +753,6 @@ class Chat:
                 content = getattr(cand, "content", None)
                 parts = getattr(content, "parts", None) or []
                 for p in parts:
-                    # Inline image bytes (image preview / image generation in chat)
                     blob = getattr(p, "inline_data", None)
                     if blob:
                         mime = (getattr(blob, "mime_type", "") or "").lower()
@@ -392,16 +766,14 @@ class Chat:
                                 f.write(img_bytes)
                             images.append(img_path)
 
-                    # File data URI (may contain http/https or gs://)
                     fdata = getattr(p, "file_data", None)
                     if fdata:
                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
                         mime = (getattr(fdata, "mime_type", "") or "").lower()
                         if uri and mime.startswith("image/"):
-                            # Store only as URL; downloading is out of scope here.
                             if uri.startswith("http://") or uri.startswith("https://"):
                                 urls.append(uri)
-        except Exception
+        except Exception:
             pass
 
         if images:
@@ -418,9 +790,6 @@ class Chat:
     def _ensure_bytes(data) -> bytes | None:
         """
         Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string.
-
-        :param data: bytes or str
-        :return: bytes or None
         """
         try:
             if isinstance(data, (bytes, bytearray)):
@@ -432,56 +801,6 @@ class Chat:
             return None
         return None
 
-    def build_input(
-            self,
-            prompt: str,
-            system_prompt: str,
-            model: ModelItem,
-            history: Optional[List[CtxItem]] = None,
-            attachments: Optional[Dict[str, AttachmentItem]] = None,
-            multimodal_ctx: Optional[MultimodalContext] = None,
-    ) -> List[Content]:
-        """
-        Build Google GenAI contents list
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Content
-        """
-        contents: List[Content] = []
-
-        # System instruction is passed separately (system_instruction),
-        # so we do not build an explicit system role part here.
-
-        # Append conversation history
-        if self.window.core.config.get('use_context'):
-            items = self.window.core.ctx.get_history(
-                history,
-                model.id,
-                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
-                self.window.core.tokens.from_user(prompt, system_prompt),
-                self._fit_ctx(model),
-            )
-            for item in items:
-                if item.final_input:
-                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
-                if item.final_output:
-                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
-
-        # Current user message with multimodal parts
-        parts = self._build_user_parts(
-            content=str(prompt),
-            attachments=attachments,
-            multimodal_ctx=multimodal_ctx,
-        )
-        contents.append(Content(role="user", parts=parts))
-
-        return contents
-
     def _build_user_parts(
             self,
             content: str,
@@ -490,11 +809,6 @@ class Chat:
     ) -> List[Part]:
         """
         Build user message parts (text + images + audio)
-
-        :param content: User text content
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Part
         """
         self.window.core.api.google.vision.reset()
         parts: List[Part] = []
@@ -515,9 +829,6 @@ class Chat:
     def _fit_ctx(self, model: ModelItem) -> int:
         """
         Fit to max model tokens (best-effort, uses model.ctx if present)
-
-        :param model: ModelItem
-        :return: max context tokens
         """
         max_ctx_tokens = self.window.core.config.get('max_total_tokens')
         if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
@@ -530,15 +841,10 @@ class Chat:
             system_prompt: str,
             model: ModelItem,
             history: Optional[List[CtxItem]] = None,
+            mode: str = MODE_CHAT,
     ) -> List[dict]:
         """
         Build simple messages structure for local token estimation
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :return: List of messages dicts with 'role' and 'content' keys
         """
         messages = []
         if system_prompt:
@@ -562,7 +868,6 @@ class Chat:
         messages.append({"role": "user", "content": str(prompt)})
         return messages
 
-
     def reset_tokens(self):
         """Reset input tokens counter"""
         self.input_tokens = 0
@@ -570,8 +875,6 @@ class Chat:
     def get_used_tokens(self) -> int:
         """
         Get input tokens counter (estimated before sending)
-
-        :return: input tokens count
         """
         return self.input_tokens
 
@@ -579,13 +882,126 @@ class Chat:
     def _supports_tts(model_id: Optional[str]) -> bool:
         """
         Heuristic check if the model supports native TTS.
-        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
-        - Future/preview names may contain 'native-audio'.
-
-        :param model_id: Model ID
-        :return: True if supports TTS, False otherwise
         """
         if not model_id:
             return False
         mid = model_id.lower()
-        return ("-tts" in mid) or ("native-audio" in mid)
+        return ("-tts" in mid) or ("native-audio" in mid)
+
+    @staticmethod
+    def _find_last_interaction_state(
+            history: Optional[List[CtxItem]],
+            ctx: CtxItem,
+    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Resolve last known Interactions state:
+        - previous_interaction_id: to continue conversation context
+        - last_event_id: to resume streaming (not used here, but returned for completeness)
+        - last_status: last known status string if available
+
+        Looks at current ctx.extra first, then scans history from newest to oldest.
+        """
+        prev_interaction_id: Optional[str] = None
+        last_event_id: Optional[str] = None
+        last_status: Optional[str] = None
+
+        try:
+            if getattr(ctx, "extra", None) and isinstance(ctx.extra, dict):
+                prev_interaction_id = (
+                    ctx.extra.get("previous_interaction_id")
+                    or ctx.extra.get("google_interaction_id")
+                    or ctx.extra.get("google_last_interaction_id")
+                )
+                last_event_id = ctx.extra.get("google_last_event_id")
+                last_status = ctx.extra.get("google_interaction_status")
+        except Exception:
+            pass
+
+        if not prev_interaction_id and history:
+            for item in reversed(history or []):
+                ex = getattr(item, "extra", None)
+                if not ex or not isinstance(ex, dict):
+                    continue
+                prev_interaction_id = (
+                    ex.get("previous_interaction_id")
+                    or ex.get("google_interaction_id")
+                    or ex.get("google_last_interaction_id")
+                    or prev_interaction_id
+                )
+                last_event_id = ex.get("google_last_event_id") or last_event_id
+                last_status = ex.get("google_interaction_status") or last_status
+                if prev_interaction_id and last_event_id:
+                    break
+
+        return prev_interaction_id, last_event_id, last_status
+
+    @staticmethod
+    def _mime_to_interactions_type(mime: str) -> Optional[str]:
+        """
+        Map MIME type to Interactions input type.
+        """
+        if not mime:
+            return None
+        m = mime.lower()
+        if m.startswith("image/"):
+            return "image"
+        if m.startswith("audio/"):
+            return "audio"
+        if m.startswith("video/"):
+            return "video"
+        return None
+
+    @staticmethod
+    def _ensure_base64(data) -> Optional[str]:
+        """
+        Return base64 string from raw bytes or a base64 string.
+        """
+        try:
+            if data is None:
+                return None
+            if isinstance(data, str):
+                return data
+            if isinstance(data, (bytes, bytearray)):
+                import base64
+                return base64.b64encode(bytes(data)).decode("utf-8")
+        except Exception:
+            return None
+        return None
+
+    def _parts_to_interactions_input(self, parts: List[Part]) -> List[Dict[str, Any]]:
+        """
+        Convert Responses API Part list into Interactions API input payload.
+        """
+        out: List[Dict[str, Any]] = []
+
+        for p in parts or []:
+            # Text
+            t = getattr(p, "text", None)
+            if t is not None:
+                s = str(t).strip()
+                if s:
+                    out.append({"type": "text", "text": s})
+                continue
+
+            # Inline data (images/audio/video)
+            inline = getattr(p, "inline_data", None)
+            if inline:
+                mime = (getattr(inline, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                data = getattr(inline, "data", None)
+                b64 = self._ensure_base64(data)
+                if typ and b64:
+                    out.append({"type": typ, "data": b64, "mime_type": mime})
+                continue
+
+            # File references (prefer URIs from Gemini Files API)
+            fdata = getattr(p, "file_data", None)
+            if fdata:
+                uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                mime = (getattr(fdata, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                if typ and uri:
+                    out.append({"type": typ, "uri": uri})
+                continue
+
+        return out
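`_parts_to_interactions_input()` flattens SDK `Part` objects into the plain dict payload sent to the Interactions API: text parts become `{"type": "text", ...}`, inline blobs are base64-encoded alongside their MIME type, and file references pass through as URIs. For example, with illustrative values:

```python
from google.genai.types import Part

parts = [
    Part.from_text(text="Compare these two screenshots."),
    Part.from_bytes(data=b"\x89PNG\r\n fake image bytes", mime_type="image/png"),
]
# _parts_to_interactions_input(parts) would yield (base64 shortened):
# [
#     {"type": "text", "text": "Compare these two screenshots."},
#     {"type": "image", "data": "iVBOR...", "mime_type": "image/png"},
# ]
```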