pygpt-net 2.6.29__py3-none-any.whl → 2.6.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +4 -0
  4. pygpt_net/{container.py → app_core.py} +5 -6
  5. pygpt_net/controller/__init__.py +5 -2
  6. pygpt_net/controller/access/control.py +1 -9
  7. pygpt_net/controller/assistant/assistant.py +4 -4
  8. pygpt_net/controller/assistant/batch.py +7 -7
  9. pygpt_net/controller/assistant/files.py +4 -4
  10. pygpt_net/controller/assistant/threads.py +3 -3
  11. pygpt_net/controller/attachment/attachment.py +4 -7
  12. pygpt_net/controller/audio/audio.py +25 -1
  13. pygpt_net/controller/audio/ui.py +2 -2
  14. pygpt_net/controller/chat/audio.py +1 -8
  15. pygpt_net/controller/chat/common.py +30 -4
  16. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  17. pygpt_net/controller/chat/output.py +8 -3
  18. pygpt_net/controller/chat/stream.py +4 -405
  19. pygpt_net/controller/chat/text.py +3 -2
  20. pygpt_net/controller/chat/vision.py +11 -19
  21. pygpt_net/controller/config/placeholder.py +1 -1
  22. pygpt_net/controller/ctx/ctx.py +1 -1
  23. pygpt_net/controller/ctx/summarizer.py +1 -1
  24. pygpt_net/controller/kernel/kernel.py +11 -3
  25. pygpt_net/controller/kernel/reply.py +5 -1
  26. pygpt_net/controller/mode/mode.py +21 -12
  27. pygpt_net/controller/plugins/settings.py +3 -2
  28. pygpt_net/controller/presets/editor.py +112 -99
  29. pygpt_net/controller/realtime/__init__.py +12 -0
  30. pygpt_net/controller/realtime/manager.py +53 -0
  31. pygpt_net/controller/realtime/realtime.py +268 -0
  32. pygpt_net/controller/theme/theme.py +3 -2
  33. pygpt_net/controller/ui/mode.py +7 -0
  34. pygpt_net/controller/ui/ui.py +19 -1
  35. pygpt_net/controller/ui/vision.py +4 -4
  36. pygpt_net/core/agents/legacy.py +2 -2
  37. pygpt_net/core/agents/runners/openai_workflow.py +2 -2
  38. pygpt_net/core/assistants/files.py +5 -5
  39. pygpt_net/core/assistants/store.py +4 -4
  40. pygpt_net/core/audio/audio.py +6 -1
  41. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  42. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  43. pygpt_net/core/audio/backend/native/player.py +139 -0
  44. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  45. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  46. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  47. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  48. pygpt_net/core/audio/backend/pyaudio/realtime.py +275 -0
  49. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  50. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  51. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  52. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  53. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  54. pygpt_net/core/audio/backend/shared/player.py +137 -0
  55. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  56. pygpt_net/core/audio/capture.py +5 -0
  57. pygpt_net/core/audio/output.py +13 -2
  58. pygpt_net/core/audio/whisper.py +6 -2
  59. pygpt_net/core/bridge/bridge.py +4 -3
  60. pygpt_net/core/bridge/worker.py +31 -9
  61. pygpt_net/core/debug/console/console.py +2 -2
  62. pygpt_net/core/debug/presets.py +2 -2
  63. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  64. pygpt_net/core/events/__init__.py +2 -1
  65. pygpt_net/core/events/realtime.py +55 -0
  66. pygpt_net/core/experts/experts.py +2 -2
  67. pygpt_net/core/image/image.py +51 -1
  68. pygpt_net/core/modes/modes.py +2 -2
  69. pygpt_net/core/presets/presets.py +3 -3
  70. pygpt_net/core/realtime/options.py +87 -0
  71. pygpt_net/core/realtime/shared/__init__.py +0 -0
  72. pygpt_net/core/realtime/shared/audio.py +213 -0
  73. pygpt_net/core/realtime/shared/loop.py +64 -0
  74. pygpt_net/core/realtime/shared/session.py +59 -0
  75. pygpt_net/core/realtime/shared/text.py +37 -0
  76. pygpt_net/core/realtime/shared/tools.py +276 -0
  77. pygpt_net/core/realtime/shared/turn.py +38 -0
  78. pygpt_net/core/realtime/shared/types.py +16 -0
  79. pygpt_net/core/realtime/worker.py +164 -0
  80. pygpt_net/core/tokens/tokens.py +4 -4
  81. pygpt_net/core/types/__init__.py +1 -0
  82. pygpt_net/core/types/image.py +48 -0
  83. pygpt_net/core/types/mode.py +5 -2
  84. pygpt_net/core/vision/analyzer.py +1 -1
  85. pygpt_net/data/config/config.json +13 -4
  86. pygpt_net/data/config/models.json +219 -101
  87. pygpt_net/data/config/modes.json +3 -9
  88. pygpt_net/data/config/settings.json +135 -27
  89. pygpt_net/data/config/settings_section.json +2 -2
  90. pygpt_net/data/locale/locale.de.ini +7 -7
  91. pygpt_net/data/locale/locale.en.ini +25 -12
  92. pygpt_net/data/locale/locale.es.ini +7 -7
  93. pygpt_net/data/locale/locale.fr.ini +7 -7
  94. pygpt_net/data/locale/locale.it.ini +7 -7
  95. pygpt_net/data/locale/locale.pl.ini +8 -8
  96. pygpt_net/data/locale/locale.uk.ini +7 -7
  97. pygpt_net/data/locale/locale.zh.ini +3 -3
  98. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  99. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  100. pygpt_net/item/model.py +23 -3
  101. pygpt_net/plugin/audio_input/plugin.py +37 -4
  102. pygpt_net/plugin/audio_input/simple.py +57 -8
  103. pygpt_net/plugin/cmd_files/worker.py +3 -0
  104. pygpt_net/plugin/openai_dalle/plugin.py +4 -4
  105. pygpt_net/plugin/openai_vision/plugin.py +12 -13
  106. pygpt_net/provider/agents/openai/agent.py +5 -5
  107. pygpt_net/provider/agents/openai/agent_b2b.py +5 -5
  108. pygpt_net/provider/agents/openai/agent_planner.py +5 -6
  109. pygpt_net/provider/agents/openai/agent_with_experts.py +5 -5
  110. pygpt_net/provider/agents/openai/agent_with_experts_feedback.py +4 -4
  111. pygpt_net/provider/agents/openai/agent_with_feedback.py +4 -4
  112. pygpt_net/provider/agents/openai/bot_researcher.py +2 -2
  113. pygpt_net/provider/agents/openai/bots/research_bot/agents/planner_agent.py +1 -1
  114. pygpt_net/provider/agents/openai/bots/research_bot/agents/search_agent.py +1 -1
  115. pygpt_net/provider/agents/openai/bots/research_bot/agents/writer_agent.py +1 -1
  116. pygpt_net/provider/agents/openai/evolve.py +5 -5
  117. pygpt_net/provider/agents/openai/supervisor.py +4 -4
  118. pygpt_net/provider/api/__init__.py +27 -0
  119. pygpt_net/provider/api/anthropic/__init__.py +68 -0
  120. pygpt_net/provider/api/google/__init__.py +295 -0
  121. pygpt_net/provider/api/google/audio.py +121 -0
  122. pygpt_net/provider/api/google/chat.py +591 -0
  123. pygpt_net/provider/api/google/image.py +427 -0
  124. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  125. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  126. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  127. pygpt_net/provider/api/google/tools.py +222 -0
  128. pygpt_net/provider/api/google/vision.py +129 -0
  129. pygpt_net/provider/{gpt → api/openai}/__init__.py +24 -4
  130. pygpt_net/provider/api/openai/agents/__init__.py +0 -0
  131. pygpt_net/provider/{gpt → api/openai}/agents/computer.py +1 -1
  132. pygpt_net/provider/{gpt → api/openai}/agents/experts.py +1 -1
  133. pygpt_net/provider/{gpt → api/openai}/agents/response.py +1 -1
  134. pygpt_net/provider/{gpt → api/openai}/assistants.py +1 -1
  135. pygpt_net/provider/{gpt → api/openai}/chat.py +15 -8
  136. pygpt_net/provider/{gpt → api/openai}/completion.py +1 -1
  137. pygpt_net/provider/{gpt → api/openai}/image.py +1 -1
  138. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  139. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  140. pygpt_net/provider/api/openai/realtime/realtime.py +194 -0
  141. pygpt_net/provider/{gpt → api/openai}/remote_tools.py +1 -1
  142. pygpt_net/provider/{gpt → api/openai}/responses.py +34 -20
  143. pygpt_net/provider/{gpt → api/openai}/store.py +2 -2
  144. pygpt_net/provider/{gpt → api/openai}/vision.py +1 -1
  145. pygpt_net/provider/api/openai/worker/__init__.py +0 -0
  146. pygpt_net/provider/{gpt → api/openai}/worker/assistants.py +4 -4
  147. pygpt_net/provider/{gpt → api/openai}/worker/importer.py +10 -10
  148. pygpt_net/provider/audio_input/google_genai.py +103 -0
  149. pygpt_net/provider/audio_input/openai_whisper.py +1 -1
  150. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  151. pygpt_net/provider/audio_output/openai_tts.py +9 -6
  152. pygpt_net/provider/core/config/patch.py +26 -0
  153. pygpt_net/provider/core/model/patch.py +20 -0
  154. pygpt_net/provider/core/preset/json_file.py +2 -4
  155. pygpt_net/provider/llms/anthropic.py +2 -5
  156. pygpt_net/provider/llms/base.py +4 -3
  157. pygpt_net/provider/llms/google.py +8 -9
  158. pygpt_net/provider/llms/openai.py +1 -1
  159. pygpt_net/provider/loaders/hub/image_vision/base.py +1 -1
  160. pygpt_net/ui/dialog/preset.py +71 -55
  161. pygpt_net/ui/layout/toolbox/footer.py +16 -0
  162. pygpt_net/ui/layout/toolbox/image.py +5 -0
  163. pygpt_net/ui/main.py +6 -4
  164. pygpt_net/ui/widget/option/combo.py +15 -1
  165. pygpt_net/utils.py +9 -0
  166. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/METADATA +55 -55
  167. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/RECORD +181 -135
  168. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  169. /pygpt_net/{provider/gpt/agents → controller/chat/handler}/__init__.py +0 -0
  170. /pygpt_net/{provider/gpt/worker → core/realtime}/__init__.py +0 -0
  171. /pygpt_net/provider/{gpt → api/openai}/agents/client.py +0 -0
  172. /pygpt_net/provider/{gpt → api/openai}/agents/remote_tools.py +0 -0
  173. /pygpt_net/provider/{gpt → api/openai}/agents/utils.py +0 -0
  174. /pygpt_net/provider/{gpt → api/openai}/audio.py +0 -0
  175. /pygpt_net/provider/{gpt → api/openai}/computer.py +0 -0
  176. /pygpt_net/provider/{gpt → api/openai}/container.py +0 -0
  177. /pygpt_net/provider/{gpt → api/openai}/summarizer.py +0 -0
  178. /pygpt_net/provider/{gpt → api/openai}/tools.py +0 -0
  179. /pygpt_net/provider/{gpt → api/openai}/utils.py +0 -0
  180. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/LICENSE +0 -0
  181. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/WHEEL +0 -0
  182. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/chat.py
@@ -0,0 +1,591 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # ================================================== #
+ # This file is a part of PYGPT package               #
+ # Website: https://pygpt.net                         #
+ # GitHub: https://github.com/szczyglis-dev/py-gpt    #
+ # MIT License                                        #
+ # Created By : Marcin Szczygliński                   #
+ # Updated Date: 2025.08.28 20:00:00                  #
+ # ================================================== #
+
+ from typing import Optional, Dict, Any, List
+
+ from google.genai import types as gtypes
+ from google.genai.types import Content, Part
+
+ from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+ from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
+ from pygpt_net.item.attachment import AttachmentItem
+ from pygpt_net.item.ctx import CtxItem
+ from pygpt_net.item.model import ModelItem
+
+
+ class Chat:
+     def __init__(self, window=None):
+         """
+         Chat wrapper for Google GenAI.
+         """
+         self.window = window
+         self.input_tokens = 0
+
+     def send(
+             self,
+             context: BridgeContext,
+             extra: Optional[Dict[str, Any]] = None
+     ):
+         """
+         Call Google GenAI for chat / multimodal / audio.
+
+         :param context: BridgeContext with prompt, model, history, mode, etc.
+         :param extra: Extra parameters (not used currently)
+         :return: Response object or generator (if streaming)
+         """
+         prompt = context.prompt
+         stream = context.stream
+         system_prompt = context.system_prompt
+         model = context.model
+         functions = context.external_functions
+         attachments = context.attachments
+         multimodal_ctx = context.multimodal_ctx
+         mode = context.mode
+         ctx = context.ctx or CtxItem()
+
+         client = self.window.core.api.google.get_client(context.mode, model)
+
+         # Detect audio-input present
+         has_audio_input = bool(
+             multimodal_ctx
+             and getattr(multimodal_ctx, "is_audio_input", False)
+             and getattr(multimodal_ctx, "audio_data", None)
+         )
+
+         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
+         if mode == MODE_AUDIO and has_audio_input:
+             # Build minimal transcription request: [instruction text, audio part]
+             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+             trans_inputs = [
+                 Content(role="user", parts=[
+                     Part.from_text(text=transcribe_prompt),
+                     audio_part,
+                 ])
+             ]
+             trans_cfg = gtypes.GenerateContentConfig(
+                 # Keep minimal; no tools/system for transcription
+                 temperature=self.window.core.config.get('temperature'),
+                 top_p=self.window.core.config.get('top_p'),
+                 max_output_tokens=context.max_tokens if context.max_tokens else None,
+             )
+
+             # Always non-stream here (we immediately need the text for TTS)
+             trans_resp = client.models.generate_content(
+                 model=transcribe_model,
+                 contents=trans_inputs,
+                 config=trans_cfg,
+             )
+             transcribed_text = self.extract_text(trans_resp).strip()
+             if transcribed_text:
+                 # Feed transcription into TTS as the final prompt
+                 prompt = transcribed_text
+                 ctx.input = transcribed_text
+                 try:
+                     # optional: store for debugging/UX
+                     if isinstance(ctx.extra, dict):
+                         ctx.extra["transcription"] = transcribed_text
+                 except Exception:
+                     pass
+                 ctx.is_audio = False  # transcription is text
+                 multimodal_ctx.is_audio_input = False  # disable audio input for TTS below
+
+         # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
+         # Build contents for chat/multimodal (will be overridden for TTS below)
+         inputs = self.build_input(
+             prompt=prompt,
+             system_prompt=system_prompt,
+             model=model,
+             history=context.history,
+             attachments=attachments,
+             multimodal_ctx=multimodal_ctx,
+         )
+
+         # Best-effort input tokens estimate
+         self.reset_tokens()
+         count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+         self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
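+         # Local estimate only; the authoritative token counts arrive with the
+         # response in usage_metadata and are read in unpack_response() below.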
+
+         # Tools -> merge app-defined tools with remote tools
+         base_tools = self.window.core.api.google.tools.prepare(model, functions)
+         remote_tools = self.window.core.api.google.build_remote_tools(model)
+
+         # Check tools compatibility
+         if base_tools:
+             remote_tools = []  # remote tools are not allowed if function calling is used
+         tools = (base_tools or []) + (remote_tools or [])
+         if "-image" in model.id:
+             tools = None  # function calling is not supported for image models
+
+         # Sampling
+         temperature = self.window.core.config.get('temperature')
+         top_p = self.window.core.config.get('top_p')
+         max_tokens = context.max_tokens if context.max_tokens else None
+
+         # Base config
+         cfg_kwargs: Dict[str, Any] = dict(
+             temperature=temperature,
+             top_p=top_p,
+             max_output_tokens=max_tokens,
+             system_instruction=system_prompt if system_prompt else None,
+             tools=tools if tools else None,
+         )
+
+         # ---------- AUDIO MODE (output TTS) ----------
+         if mode == MODE_AUDIO:
+             stream = False  # TTS is non-stream in this app
+             supports_tts = self._supports_tts(model.id)
+
+             # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
+             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
+
+             # Remove params not used by the TTS flow (and that sometimes cause issues)
+             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
+                 if key in cfg_kwargs:
+                     del cfg_kwargs[key]
+
+             # Voice selection (case-sensitive name)
+             voice_name = "Kore"
+             try:
+                 tmp = self.window.core.plugins.get_option("audio_output", "google_genai_tts_voice")
+                 if tmp:
+                     name = str(tmp).strip()
+                     mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
+                     voice_name = mapping.get(name.lower(), name)
+             except Exception:
+                 pass
+
+             if supports_tts:
+                 cfg_kwargs["response_modalities"] = ["AUDIO"]
+                 cfg_kwargs["speech_config"] = gtypes.SpeechConfig(
+                     voice_config=gtypes.VoiceConfig(
+                         prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
+                     )
+                 )
+             # else: fall back to text-only below
+
+         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
+         params = dict(model=model.id, contents=inputs, config=cfg)
+
+         if stream and mode != MODE_AUDIO:
+             return client.models.generate_content_stream(**params)
+         else:
+             return client.models.generate_content(**params)
+
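+     # With stream=True, send() returns the SDK's chunk generator as-is; the
+     # non-streaming response object is consumed by unpack_response() below.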
+     def unpack_response(
+             self,
+             mode: str,
+             response, ctx: CtxItem
+     ):
+         """
+         Unpack non-streaming response from Google GenAI and set context.
+
+         :param mode: MODE_CHAT or MODE_AUDIO
+         :param response: Response object
+         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
+         """
+         if mode == MODE_AUDIO:
+             # Prefer audio if present
+             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
+             if audio_bytes:
+                 # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
+                 # https://ai.google.dev/gemini-api/docs/speech-generation
+                 if mime.startswith("audio/"):
+                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
+                     ctx.audio_output = wav_b64
+                     ctx.is_audio = True
+                 # Text transcript is typically not present for TTS; still try:
+                 txt = self.extract_text(response)
+                 ctx.output = txt or "..."
+             else:
+                 # No audio present -> fall back to text
+                 ctx.output = self.extract_text(response)
+
+             # Usage
+             try:
+                 usage = getattr(response, "usage_metadata", None)
+                 if usage:
+                     p = getattr(usage, "prompt_token_count", 0) or 0
+                     c = getattr(usage, "candidates_token_count", 0) or 0
+                     ctx.set_tokens(p, c)
+             except Exception:
+                 pass
+
+             return  # audio path done
+
+         # ---- regular chat/completion ----
+         ctx.output = self.extract_text(response)
+
+         # Extract function calls
+         calls = self.extract_tool_calls(response)
+         if calls:
+             ctx.tool_calls = calls
+
+         # Usage if available
+         try:
+             usage = getattr(response, "usage_metadata", None)
+             if usage:
+                 p = getattr(usage, "prompt_token_count", 0) or 0
+                 c = getattr(usage, "candidates_token_count", 0) or 0
+                 ctx.set_tokens(p, c)
+         except Exception:
+             pass
+
+         # Best-effort: inline images / links (image output in chat)
+         try:
+             self._extract_inline_images_and_links(response, ctx)
+         except Exception:
+             pass
+
+     def extract_text(self, response) -> str:
+         """
+         Extract output text.
+
+         Prefer response.text (Python SDK), then fall back to parts[].text.
+
+         :param response: Response object
+         :return: Extracted text
+         """
+         txt = getattr(response, "text", None) or getattr(response, "output_text", None)
+         if txt:
+             return str(txt).strip()
+         try:
+             cands = getattr(response, "candidates", None) or []
+             if cands:
+                 parts = getattr(cands[0], "content", None)
+                 parts = getattr(parts, "parts", None) or []
+                 out = []
+                 for p in parts:
+                     t = getattr(p, "text", None)
+                     if t:
+                         out.append(str(t))
+                 return "".join(out).strip()
+         except Exception:
+             pass
+         return ""
+
+     def extract_tool_calls(self, response) -> List[dict]:
+         """
+         Extract tool calls in a format compatible with the app's tool execution.
+         Prefer response.function_calls (Python SDK), then fall back to parts[].function_call.
+         Returns arguments as a Python dict (not a JSON string).
+
+         :param response: Response object
+         :return: List of tool calls
+         """
+         def _to_plain_dict(obj):
+             # Convert pydantic/genai objects to plain dicts recursively
+             try:
+                 if hasattr(obj, "to_json_dict"):
+                     return obj.to_json_dict()
+                 if hasattr(obj, "model_dump"):
+                     return obj.model_dump()  # pydantic v2
+                 if hasattr(obj, "to_dict"):
+                     return obj.to_dict()
+             except Exception:
+                 pass
+             if isinstance(obj, dict):
+                 return {k: _to_plain_dict(v) for k, v in obj.items()}
+             if isinstance(obj, (list, tuple)):
+                 return [_to_plain_dict(x) for x in obj]
+             return obj
+
+         out: List[dict] = []
+
+         # 1) response.function_calls
+         fcs = getattr(response, "function_calls", None) or []
+         for fc in fcs:
+             name = getattr(fc, "name", "") or ""
+             args_obj = getattr(fc, "args", {}) or {}
+             args_dict = _to_plain_dict(args_obj) or {}
+             # if str, try to parse
+             if isinstance(args_dict, str):
+                 try:
+                     import json
+                     args_dict = json.loads(args_dict)
+                 except Exception:
+                     args_dict = {}
+             out.append({
+                 "id": getattr(fc, "id", "") or "",
+                 "type": "function",
+                 "function": {
+                     "name": name,
+                     "arguments": args_dict,  # <--- DICT, not string
+                 }
+             })
+
+         if out:
+             return out
+
+         # 2) Fallback: candidates -> parts[].function_call
+         try:
+             cands = getattr(response, "candidates", None) or []
+             for cand in cands:
+                 parts = getattr(getattr(cand, "content", None), "parts", None) or []
+                 for part in parts:
+                     fn = getattr(part, "function_call", None)
+                     if not fn:
+                         continue
+                     name = getattr(fn, "name", "") or ""
+                     args_obj = getattr(fn, "args", {}) or {}
+                     args_dict = _to_plain_dict(args_obj) or {}
+                     if isinstance(args_dict, str):
+                         try:
+                             import json
+                             args_dict = json.loads(args_dict)
+                         except Exception:
+                             args_dict = {}
+                     out.append({
+                         "id": "",
+                         "type": "function",
+                         "function": {
+                             "name": name,
+                             "arguments": args_dict,  # <--- DICT
+                         }
+                     })
+         except Exception:
+             pass
+
+         return out
+
+     def _extract_inline_images_and_links(
+             self,
+             response, ctx: CtxItem
+     ) -> None:
+         """
+         Extract inline image parts (Gemini image output) and file links.
+
+         - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
+         - Appends HTTP(S) image URIs from file_data to ctx.urls.
+
+         :param response: Response object
+         :param ctx: CtxItem to set images and urls
+         """
+         images: list[str] = []
+         urls: list[str] = []
+
+         try:
+             cands = getattr(response, "candidates", None) or []
+             for cand in cands:
+                 content = getattr(cand, "content", None)
+                 parts = getattr(content, "parts", None) or []
+                 for p in parts:
+                     # Inline image bytes (image preview / image generation in chat)
+                     blob = getattr(p, "inline_data", None)
+                     if blob:
+                         mime = (getattr(blob, "mime_type", "") or "").lower()
+                         if mime.startswith("image/"):
+                             data = getattr(blob, "data", None)
+                             if data:
+                                 img_bytes = self._ensure_bytes(data)
+                                 if img_bytes:
+                                     img_path = self.window.core.image.gen_unique_path(ctx)
+                                     with open(img_path, "wb") as f:
+                                         f.write(img_bytes)
+                                     images.append(img_path)
+
+                     # File data URI (may contain http/https or gs://)
+                     fdata = getattr(p, "file_data", None)
+                     if fdata:
+                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                         mime = (getattr(fdata, "mime_type", "") or "").lower()
+                         if uri and mime.startswith("image/"):
+                             # Store only as URL; downloading is out of scope here.
+                             if uri.startswith("http://") or uri.startswith("https://"):
+                                 urls.append(uri)
+         except Exception:
+             # Best-effort only
+             pass
+
+         if images:
+             if not isinstance(ctx.images, list):
+                 ctx.images = []
+             ctx.images.extend(images)
+
+         if urls:
+             if ctx.urls is None:
+                 ctx.urls = []
+             ctx.urls.extend(urls)
+
+     @staticmethod
+     def _ensure_bytes(data) -> bytes | None:
+         """
+         Return raw bytes from SDK part.inline_data.data, which can be bytes or a base64 string.
+
+         :param data: bytes or str
+         :return: bytes or None
+         """
+         try:
+             if isinstance(data, (bytes, bytearray)):
+                 return bytes(data)
+             if isinstance(data, str):
+                 import base64
+                 return base64.b64decode(data)
+         except Exception:
+             return None
+         return None
+
+     def build_input(
+             self,
+             prompt: str,
+             system_prompt: str,
+             model: ModelItem,
+             history: Optional[List[CtxItem]] = None,
+             attachments: Optional[Dict[str, AttachmentItem]] = None,
+             multimodal_ctx: Optional[MultimodalContext] = None,
+     ) -> List[Content]:
+         """
+         Build the Google GenAI contents list.
+
+         :param prompt: User prompt
+         :param system_prompt: System prompt/instruction
+         :param model: ModelItem
+         :param history: List of CtxItem for history
+         :param attachments: Dict of AttachmentItem for images
+         :param multimodal_ctx: MultimodalContext for audio
+         :return: List of Content
+         """
+         contents: List[Content] = []
+
+         # System instruction is passed separately (system_instruction),
+         # so we do not build an explicit system role part here.
+
+         # Append conversation history
+         if self.window.core.config.get('use_context'):
+             items = self.window.core.ctx.get_history(
+                 history,
+                 model.id,
+                 MODE_CHAT,
+                 self.window.core.tokens.from_user(prompt, system_prompt),
+                 self._fit_ctx(model),
+             )
+             for item in items:
+                 if item.final_input:
+                     contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+                 if item.final_output:
+                     contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+         # Current user message with multimodal parts
+         parts = self._build_user_parts(
+             content=str(prompt),
+             attachments=attachments,
+             multimodal_ctx=multimodal_ctx,
+         )
+         contents.append(Content(role="user", parts=parts))
+
+         return contents
+
+     def _build_user_parts(
+             self,
+             content: str,
+             attachments: Optional[Dict[str, AttachmentItem]] = None,
+             multimodal_ctx: Optional[MultimodalContext] = None,
+     ) -> List[Part]:
+         """
+         Build user message parts (text + images + audio).
+
+         :param content: User text content
+         :param attachments: Dict of AttachmentItem for images
+         :param multimodal_ctx: MultimodalContext for audio
+         :return: List of Part
+         """
+         parts: List[Part] = []
+         if content:
+             parts.append(Part.from_text(text=str(content)))
+
+         if attachments:
+             img_parts = self.window.core.api.google.vision.build_parts(content, attachments)
+             parts.extend(img_parts)
+
+         if multimodal_ctx and multimodal_ctx.is_audio_input and multimodal_ctx.audio_data:
+             audio_format = (multimodal_ctx.audio_format or "wav").lower()
+             mime = f"audio/{audio_format}"
+             parts.append(Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime))
+
+         return parts
+
+     def _fit_ctx(self, model: ModelItem) -> int:
+         """
+         Fit to max model tokens (best-effort, uses model.ctx if present).
+
+         :param model: ModelItem
+         :return: max context tokens
+         """
+         max_ctx_tokens = self.window.core.config.get('max_total_tokens')
+         if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
+             max_ctx_tokens = model.ctx
+         return max_ctx_tokens
+
+     def _build_count_messages(
+             self,
+             prompt: str,
+             system_prompt: str,
+             model: ModelItem,
+             history: Optional[List[CtxItem]] = None,
+     ) -> List[dict]:
+         """
+         Build a simple messages structure for local token estimation.
+
+         :param prompt: User prompt
+         :param system_prompt: System prompt/instruction
+         :param model: ModelItem
+         :param history: List of CtxItem for history
+         :return: List of message dicts with 'role' and 'content' keys
+         """
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         if self.window.core.config.get('use_context'):
+             used_tokens = self.window.core.tokens.from_user(prompt, system_prompt)
+             items = self.window.core.ctx.get_history(
+                 history,
+                 model.id,
+                 MODE_CHAT,
+                 used_tokens,
+                 self._fit_ctx(model),
+             )
+             for item in items:
+                 if item.final_input:
+                     messages.append({"role": "user", "content": str(item.final_input)})
+                 if item.final_output:
+                     messages.append({"role": "assistant", "content": str(item.final_output)})
+
+         messages.append({"role": "user", "content": str(prompt)})
+         return messages
+
+     def reset_tokens(self):
+         """Reset the input tokens counter."""
+         self.input_tokens = 0
+
+     def get_used_tokens(self) -> int:
+         """
+         Get the input tokens counter (estimated before sending).
+
+         :return: input tokens count
+         """
+         return self.input_tokens
+
+     @staticmethod
+     def _supports_tts(model_id: Optional[str]) -> bool:
+         """
+         Heuristic check whether the model supports native TTS.
+
+         - Official TTS models contain '-tts' in the id (e.g. 'gemini-2.5-flash-preview-tts').
+         - Future/preview names may contain 'native-audio'.
+
+         :param model_id: Model ID
+         :return: True if supports TTS, False otherwise
+         """
+         if not model_id:
+             return False
+         mid = model_id.lower()
+         return ("-tts" in mid) or ("native-audio" in mid)