pygpt-net 2.6.28__py3-none-any.whl → 2.6.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. pygpt_net/CHANGELOG.txt +13 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/{container.py → app_core.py} +5 -6
  4. pygpt_net/controller/access/control.py +1 -9
  5. pygpt_net/controller/assistant/assistant.py +4 -4
  6. pygpt_net/controller/assistant/batch.py +7 -7
  7. pygpt_net/controller/assistant/files.py +4 -4
  8. pygpt_net/controller/assistant/threads.py +3 -3
  9. pygpt_net/controller/attachment/attachment.py +4 -7
  10. pygpt_net/controller/chat/common.py +1 -1
  11. pygpt_net/controller/chat/stream.py +961 -294
  12. pygpt_net/controller/chat/vision.py +11 -19
  13. pygpt_net/controller/config/placeholder.py +1 -1
  14. pygpt_net/controller/ctx/ctx.py +1 -1
  15. pygpt_net/controller/ctx/summarizer.py +1 -1
  16. pygpt_net/controller/mode/mode.py +21 -12
  17. pygpt_net/controller/plugins/settings.py +3 -2
  18. pygpt_net/controller/presets/editor.py +112 -99
  19. pygpt_net/controller/theme/common.py +2 -0
  20. pygpt_net/controller/theme/theme.py +6 -2
  21. pygpt_net/controller/ui/vision.py +4 -4
  22. pygpt_net/core/agents/legacy.py +2 -2
  23. pygpt_net/core/agents/runners/openai_workflow.py +2 -2
  24. pygpt_net/core/assistants/files.py +5 -5
  25. pygpt_net/core/assistants/store.py +4 -4
  26. pygpt_net/core/bridge/bridge.py +3 -3
  27. pygpt_net/core/bridge/worker.py +28 -9
  28. pygpt_net/core/debug/console/console.py +2 -2
  29. pygpt_net/core/debug/presets.py +2 -2
  30. pygpt_net/core/experts/experts.py +2 -2
  31. pygpt_net/core/idx/llm.py +21 -3
  32. pygpt_net/core/modes/modes.py +2 -2
  33. pygpt_net/core/presets/presets.py +3 -3
  34. pygpt_net/core/tokens/tokens.py +4 -4
  35. pygpt_net/core/types/mode.py +5 -2
  36. pygpt_net/core/vision/analyzer.py +1 -1
  37. pygpt_net/data/config/config.json +6 -3
  38. pygpt_net/data/config/models.json +75 -3
  39. pygpt_net/data/config/modes.json +3 -9
  40. pygpt_net/data/config/settings.json +112 -55
  41. pygpt_net/data/config/settings_section.json +2 -2
  42. pygpt_net/data/locale/locale.de.ini +2 -2
  43. pygpt_net/data/locale/locale.en.ini +9 -2
  44. pygpt_net/data/locale/locale.es.ini +2 -2
  45. pygpt_net/data/locale/locale.fr.ini +2 -2
  46. pygpt_net/data/locale/locale.it.ini +2 -2
  47. pygpt_net/data/locale/locale.pl.ini +3 -3
  48. pygpt_net/data/locale/locale.uk.ini +2 -2
  49. pygpt_net/data/locale/locale.zh.ini +2 -2
  50. pygpt_net/item/model.py +23 -3
  51. pygpt_net/plugin/openai_dalle/plugin.py +4 -4
  52. pygpt_net/plugin/openai_vision/plugin.py +12 -13
  53. pygpt_net/provider/agents/openai/agent.py +5 -5
  54. pygpt_net/provider/agents/openai/agent_b2b.py +5 -5
  55. pygpt_net/provider/agents/openai/agent_planner.py +5 -6
  56. pygpt_net/provider/agents/openai/agent_with_experts.py +5 -5
  57. pygpt_net/provider/agents/openai/agent_with_experts_feedback.py +4 -4
  58. pygpt_net/provider/agents/openai/agent_with_feedback.py +4 -4
  59. pygpt_net/provider/agents/openai/bot_researcher.py +2 -2
  60. pygpt_net/provider/agents/openai/bots/research_bot/agents/planner_agent.py +1 -1
  61. pygpt_net/provider/agents/openai/bots/research_bot/agents/search_agent.py +1 -1
  62. pygpt_net/provider/agents/openai/bots/research_bot/agents/writer_agent.py +1 -1
  63. pygpt_net/provider/agents/openai/evolve.py +5 -5
  64. pygpt_net/provider/agents/openai/supervisor.py +4 -4
  65. pygpt_net/provider/api/__init__.py +27 -0
  66. pygpt_net/provider/api/anthropic/__init__.py +68 -0
  67. pygpt_net/provider/api/google/__init__.py +262 -0
  68. pygpt_net/provider/api/google/audio.py +114 -0
  69. pygpt_net/provider/api/google/chat.py +552 -0
  70. pygpt_net/provider/api/google/image.py +287 -0
  71. pygpt_net/provider/api/google/tools.py +222 -0
  72. pygpt_net/provider/api/google/vision.py +129 -0
  73. pygpt_net/provider/{gpt → api/openai}/__init__.py +2 -2
  74. pygpt_net/provider/{gpt → api/openai}/agents/computer.py +1 -1
  75. pygpt_net/provider/{gpt → api/openai}/agents/experts.py +1 -1
  76. pygpt_net/provider/{gpt → api/openai}/agents/response.py +1 -1
  77. pygpt_net/provider/{gpt → api/openai}/assistants.py +1 -1
  78. pygpt_net/provider/{gpt → api/openai}/chat.py +15 -8
  79. pygpt_net/provider/{gpt → api/openai}/completion.py +1 -1
  80. pygpt_net/provider/{gpt → api/openai}/image.py +1 -1
  81. pygpt_net/provider/{gpt → api/openai}/remote_tools.py +1 -1
  82. pygpt_net/provider/{gpt → api/openai}/responses.py +34 -20
  83. pygpt_net/provider/{gpt → api/openai}/store.py +2 -2
  84. pygpt_net/provider/{gpt → api/openai}/vision.py +1 -1
  85. pygpt_net/provider/{gpt → api/openai}/worker/assistants.py +4 -4
  86. pygpt_net/provider/{gpt → api/openai}/worker/importer.py +10 -10
  87. pygpt_net/provider/audio_input/openai_whisper.py +1 -1
  88. pygpt_net/provider/audio_output/google_tts.py +12 -0
  89. pygpt_net/provider/audio_output/openai_tts.py +1 -1
  90. pygpt_net/provider/core/config/patch.py +11 -0
  91. pygpt_net/provider/core/model/patch.py +9 -0
  92. pygpt_net/provider/core/preset/json_file.py +2 -4
  93. pygpt_net/provider/llms/anthropic.py +2 -5
  94. pygpt_net/provider/llms/base.py +4 -3
  95. pygpt_net/provider/llms/openai.py +1 -1
  96. pygpt_net/provider/loaders/hub/image_vision/base.py +1 -1
  97. pygpt_net/ui/dialog/preset.py +71 -55
  98. pygpt_net/ui/main.py +6 -4
  99. pygpt_net/utils.py +9 -0
  100. {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/METADATA +42 -48
  101. {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/RECORD +115 -107
  102. /pygpt_net/provider/{gpt → api/openai}/agents/__init__.py +0 -0
  103. /pygpt_net/provider/{gpt → api/openai}/agents/client.py +0 -0
  104. /pygpt_net/provider/{gpt → api/openai}/agents/remote_tools.py +0 -0
  105. /pygpt_net/provider/{gpt → api/openai}/agents/utils.py +0 -0
  106. /pygpt_net/provider/{gpt → api/openai}/audio.py +0 -0
  107. /pygpt_net/provider/{gpt → api/openai}/computer.py +0 -0
  108. /pygpt_net/provider/{gpt → api/openai}/container.py +0 -0
  109. /pygpt_net/provider/{gpt → api/openai}/summarizer.py +0 -0
  110. /pygpt_net/provider/{gpt → api/openai}/tools.py +0 -0
  111. /pygpt_net/provider/{gpt → api/openai}/utils.py +0 -0
  112. /pygpt_net/provider/{gpt → api/openai}/worker/__init__.py +0 -0
  113. {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/LICENSE +0 -0
  114. {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/WHEEL +0 -0
  115. {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/entry_points.txt +0 -0
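Note on the layout change visible in the list above: the former pygpt_net/provider/gpt package has been moved to pygpt_net/provider/api/openai, and new pygpt_net/provider/api/google and pygpt_net/provider/api/anthropic packages have been added. A rough sketch of the module paths after the move; only module paths are shown, since the class names inside the OpenAI modules are not part of this diff:

    # Before (2.6.28): provider modules lived under pygpt_net.provider.gpt
    # from pygpt_net.provider.gpt import chat
    # After (2.6.30): the same modules live under pygpt_net.provider.api.openai
    from pygpt_net.provider.api import openai as openai_api          # moved package
    from pygpt_net.provider.api.google import chat as google_chat    # new Google GenAI wrapper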
pygpt_net/provider/api/google/audio.py (new file)
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package #
+# Website: https://pygpt.net #
+# GitHub: https://github.com/szczyglis-dev/py-gpt #
+# MIT License #
+# Created By : Marcin Szczygliński #
+# Updated Date: 2025.08.28 20:00:00 #
+# ================================================== #
+
+import base64
+import io
+import wave
+from typing import Optional, Tuple
+
+from google.genai.types import Part
+from pygpt_net.core.bridge.context import MultimodalContext
+
+
+class Audio:
+    def __init__(self, window=None):
+        """
+        Audio helpers for Google GenAI.
+        - Build audio input parts for requests
+        - Convert Google PCM output to WAV (base64) for UI compatibility
+        """
+        self.window = window
+
+    # ---------- INPUT (user -> model) ----------
+
+    def build_part(
+            self,
+            multimodal_ctx: Optional[MultimodalContext]
+    ) -> Optional[Part]:
+        """
+        Build audio Part from multimodal context (inline bytes).
+
+        :param multimodal_ctx: MultimodalContext
+        :return: Part or None
+        """
+        if not multimodal_ctx or not multimodal_ctx.is_audio_input or not multimodal_ctx.audio_data:
+            return None
+        audio_format = (multimodal_ctx.audio_format or "wav").lower()
+        mime = f"audio/{audio_format}"
+        return Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime)
+
+    # ---------- OUTPUT (model -> UI) ----------
+
+    def extract_first_audio_part(
+            self,
+            response
+    ) -> Tuple[Optional[bytes], Optional[str]]:
+        """
+        Extract first audio inline_data from a non-streaming response.
+
+        :param response: Google response object
+        :return: (audio_bytes, mime_type) or (None, None)
+        """
+        try:
+            candidates = getattr(response, "candidates", None) or []
+            for cand in candidates:
+                content = getattr(cand, "content", None)
+                parts = getattr(content, "parts", None) or []
+                for p in parts:
+                    inline = getattr(p, "inline_data", None)
+                    if not inline:
+                        continue
+                    mime = (getattr(inline, "mime_type", "") or "").lower()
+                    if not mime.startswith("audio/"):
+                        continue
+                    data = getattr(inline, "data", None)
+                    audio_bytes = self._ensure_bytes(data)
+                    if audio_bytes:
+                        return audio_bytes, mime
+        except Exception:
+            pass
+        return None, None
+
+    def pcm16_to_wav_base64(
+            self,
+            pcm_bytes: bytes,
+            rate: int = 24000,
+            channels: int = 1,
+            sample_width: int = 2
+    ) -> str:
+        """
+        Wrap raw PCM16 mono @ 24kHz into WAV and return base64-encoded payload.
+
+        :param pcm_bytes: Raw PCM16 bytes
+        :param rate: Sample rate (Hz), default 24000 for Google TTS
+        :param channels: Channels, default 1
+        :param sample_width: Bytes per sample, default 2 for PCM16
+        :return: Base64-encoded WAV
+        """
+        buf = io.BytesIO()
+        with wave.open(buf, "wb") as wf:
+            wf.setnchannels(channels)
+            wf.setsampwidth(sample_width)
+            wf.setframerate(rate)
+            wf.writeframes(pcm_bytes)
+        return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+    @staticmethod
+    def _ensure_bytes(data) -> Optional[bytes]:
+        """Return raw bytes from inline_data.data (bytes or base64 string)."""
+        try:
+            if isinstance(data, (bytes, bytearray)):
+                return bytes(data)
+            if isinstance(data, str):
+                return base64.b64decode(data)
+        except Exception:
+            return None
+        return None
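For reference, the WAV wrapping performed by pcm16_to_wav_base64() above can be exercised on its own; a minimal standalone sketch, assuming 24 kHz mono PCM16 input as described in the method's docstring (the silence buffer stands in for real TTS output):

    import base64
    import io
    import wave

    def pcm16_to_wav_base64(pcm_bytes: bytes, rate: int = 24000, channels: int = 1, sample_width: int = 2) -> str:
        # Same approach as Audio.pcm16_to_wav_base64: wrap raw PCM16 frames in a WAV container.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_bytes)
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    silence = b"\x00\x00" * 2400  # 0.1 s of 24 kHz mono PCM16 silence
    wav_b64 = pcm16_to_wav_base64(silence)

    # Round-trip: decode and confirm the WAV header carries the expected parameters.
    with wave.open(io.BytesIO(base64.b64decode(wav_b64)), "rb") as wf:
        assert (wf.getnchannels(), wf.getsampwidth(), wf.getframerate()) == (1, 2, 24000)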
pygpt_net/provider/api/google/chat.py (new file)
@@ -0,0 +1,552 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package #
+# Website: https://pygpt.net #
+# GitHub: https://github.com/szczyglis-dev/py-gpt #
+# MIT License #
+# Created By : Marcin Szczygliński #
+# Updated Date: 2025.08.28 20:00:00 #
+# ================================================== #
+
+from typing import Optional, Dict, Any, List
+
+from google.genai import types as gtypes
+from google.genai.types import Content, Part
+
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
+from pygpt_net.item.attachment import AttachmentItem
+from pygpt_net.item.ctx import CtxItem
+from pygpt_net.item.model import ModelItem
+
+
+class Chat:
+    def __init__(self, window=None):
+        """
+        Chat wrapper for Google GenAI.
+        """
+        self.window = window
+        self.input_tokens = 0
+
+    def send(self, context: BridgeContext, extra: Optional[Dict[str, Any]] = None):
+        """
+        Call Google GenAI for chat / multimodal / audio.
+        """
+        prompt = context.prompt
+        stream = context.stream
+        system_prompt = context.system_prompt
+        model = context.model
+        functions = context.external_functions
+        attachments = context.attachments
+        multimodal_ctx = context.multimodal_ctx
+        mode = context.mode
+        ctx = context.ctx or CtxItem()
+
+        client = self.window.core.api.google.get_client(context.mode, model)
+
+        # Detect audio-input present
+        has_audio_input = bool(
+            multimodal_ctx
+            and getattr(multimodal_ctx, "is_audio_input", False)
+            and getattr(multimodal_ctx, "audio_data", None)
+        )
+
+        # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
+        if mode == MODE_AUDIO and has_audio_input:
+            # Build minimal transcription request: [instruction text, audio part]
+            transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+            transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+            audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+            trans_inputs = [
+                Content(role="user", parts=[
+                    Part.from_text(text=transcribe_prompt),
+                    audio_part,
+                ])
+            ]
+            trans_cfg = gtypes.GenerateContentConfig(
+                # Keep minimal; no tools/system for transcription
+                temperature=self.window.core.config.get('temperature'),
+                top_p=self.window.core.config.get('top_p'),
+                max_output_tokens=context.max_tokens if context.max_tokens else None,
+            )
+
+            # Always non-stream here (we immediately need the text for TTS)
+            trans_resp = client.models.generate_content(
+                model=transcribe_model,
+                contents=trans_inputs,
+                config=trans_cfg,
+            )
+            transcribed_text = self.extract_text(trans_resp).strip()
+            if transcribed_text:
+                # Feed transcription into TTS as the final prompt
+                prompt = transcribed_text
+                ctx.input = transcribed_text
+                try:
+                    # optional: store for debugging/UX
+                    if isinstance(ctx.extra, dict):
+                        ctx.extra["transcription"] = transcribed_text
+                except Exception:
+                    pass
+                ctx.is_audio = False  # transcription is text
+                multimodal_ctx.is_audio_input = False  # disable audio input for TTS below
+
+        # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
+        # Build contents for chat/multimodal (will be overridden for TTS below)
+        inputs = self.build_input(
+            prompt=prompt,
+            system_prompt=system_prompt,
+            model=model,
+            history=context.history,
+            attachments=attachments,
+            multimodal_ctx=multimodal_ctx,
+        )
+
+        # Best-effort input tokens estimate
+        self.reset_tokens()
+        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+        self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
+
+        # Tools -> merge app-defined tools with remote tools
+        base_tools = self.window.core.api.google.tools.prepare(model, functions)
+        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        if base_tools:
+            remote_tools = []  # do not mix local and remote tools
+        tools = (base_tools or []) + (remote_tools or [])
+
+        # Sampling
+        temperature = self.window.core.config.get('temperature')
+        top_p = self.window.core.config.get('top_p')
+        max_tokens = context.max_tokens if context.max_tokens else None
+
+        # Base config
+        cfg_kwargs: Dict[str, Any] = dict(
+            temperature=temperature,
+            top_p=top_p,
+            max_output_tokens=max_tokens,
+            system_instruction=system_prompt if system_prompt else None,
+            tools=tools if tools else None,
+        )
+
+        # ---------- AUDIO MODE (output TTS) ----------
+        if mode == MODE_AUDIO:
+            stream = False  # TTS non-stream in this app
+            supports_tts = self._supports_tts(model.id)
+
+            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
+            inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
+
+            # Remove params not used by TTS flow (and that sometimes cause issues)
+            for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
+                if key in cfg_kwargs:
+                    del cfg_kwargs[key]
+
+            # Voice selection (case-sensitive name)
+            voice_name = "Kore"
+            try:
+                tmp = self.window.core.plugins.get_option("audio_output", "google_voice_native")
+                if tmp:
+                    name = str(tmp).strip()
+                    mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
+                    voice_name = mapping.get(name.lower(), name)
+            except Exception:
+                pass
+
+            if supports_tts:
+                cfg_kwargs["response_modalities"] = ["AUDIO"]
+                cfg_kwargs["speech_config"] = gtypes.SpeechConfig(
+                    voice_config=gtypes.VoiceConfig(
+                        prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
+                    )
+                )
+            # else: fallback to text-only below
+
+        cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
+        params = dict(model=model.id, contents=inputs, config=cfg)
+
+        if stream and mode != MODE_AUDIO:
+            return client.models.generate_content_stream(**params)
+        else:
+            return client.models.generate_content(**params)
+
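Outside the application, the non-streaming/streaming split at the end of send() maps directly onto the google-genai client; a minimal sketch, assuming the google-genai package is installed and an API key is available in the environment (model id and prompt are placeholders):

    from google import genai
    from google.genai import types as gtypes

    client = genai.Client()  # picks up the API key from the environment

    cfg = gtypes.GenerateContentConfig(
        temperature=0.7,
        top_p=1.0,
        max_output_tokens=512,
        system_instruction="You are a helpful assistant.",
    )
    contents = [gtypes.Content(role="user", parts=[gtypes.Part.from_text(text="Hello")])]

    # Non-streaming call, as used above for MODE_AUDIO and non-stream chat:
    resp = client.models.generate_content(model="gemini-2.5-flash", contents=contents, config=cfg)
    print(resp.text)

    # Streaming call, as used above when stream is enabled and mode != MODE_AUDIO:
    for chunk in client.models.generate_content_stream(model="gemini-2.5-flash", contents=contents, config=cfg):
        print(chunk.text or "", end="")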
+    def unpack_response(self, mode: str, response, ctx: CtxItem):
+        """
+        Unpack non-streaming response from Google GenAI and set context.
+        """
+        if mode == MODE_AUDIO:
+            # Prefer audio if present
+            audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
+            if audio_bytes:
+                # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
+                # https://ai.google.dev/gemini-api/docs/speech-generation
+                if mime == "audio/pcm" or mime.startswith("audio/"):
+                    wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
+                    ctx.audio_output = wav_b64
+                    ctx.is_audio = True
+                # Text transcript is typically not present for TTS; still try:
+                txt = self.extract_text(response)
+                ctx.output = txt or "..."
+            else:
+                # No audio present -> fallback to text
+                ctx.output = self.extract_text(response)
+
+            # Usage
+            try:
+                usage = getattr(response, "usage_metadata", None)
+                if usage:
+                    p = getattr(usage, "prompt_token_count", 0) or 0
+                    c = getattr(usage, "candidates_token_count", 0) or 0
+                    ctx.set_tokens(p, c)
+            except Exception:
+                pass
+
+            return  # audio path done
+
+        # ---- regular chat/completion ----
+        ctx.output = self.extract_text(response)
+
+        # Extract function calls
+        calls = self.extract_tool_calls(response)
+        if calls:
+            ctx.tool_calls = calls
+
+        # Usage if available
+        try:
+            usage = getattr(response, "usage_metadata", None)
+            if usage:
+                p = getattr(usage, "prompt_token_count", 0) or 0
+                c = getattr(usage, "candidates_token_count", 0) or 0
+                ctx.set_tokens(p, c)
+        except Exception:
+            pass
+
+        # Best-effort: inline images / links (image-output in chat)
+        try:
+            self._extract_inline_images_and_links(response, ctx)
+        except Exception:
+            pass
+
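Downstream of unpack_response(), the audio path leaves a base64-encoded WAV in ctx.audio_output; a small sketch of consuming it (the helper function and output path are illustrative, not part of pygpt-net):

    import base64

    def save_audio_output(audio_output_b64: str, path: str = "reply.wav") -> str:
        # ctx.audio_output holds the base64 WAV produced by pcm16_to_wav_base64() above.
        with open(path, "wb") as f:
            f.write(base64.b64decode(audio_output_b64))
        return path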
+    def extract_text(self, response) -> str:
+        """
+        Extract output text.
+        """
+        txt = getattr(response, "text", None) or getattr(response, "output_text", None)
+        if txt:
+            return str(txt).strip()
+        try:
+            cands = getattr(response, "candidates", None) or []
+            if cands:
+                parts = getattr(cands[0], "content", None)
+                parts = getattr(parts, "parts", None) or []
+                out = []
+                for p in parts:
+                    t = getattr(p, "text", None)
+                    if t:
+                        out.append(str(t))
+                return "".join(out).strip()
+        except Exception:
+            pass
+        return ""
+
+    def extract_tool_calls(self, response) -> List[dict]:
+        """
+        Extract tool calls in a format compatible with app's tool execution.
+        Prefer response.function_calls (Python SDK), then fallback to parts[].function_call.
+        Returns arguments as a Python dict (not a JSON string).
+
+        :param response: Response object
+        :return: List of tool calls
+        """
+        def _to_plain_dict(obj):
+            # Convert pydantic/genai objects to plain dict recursively
+            try:
+                if hasattr(obj, "to_json_dict"):
+                    return obj.to_json_dict()
+                if hasattr(obj, "model_dump"):
+                    return obj.model_dump()  # pydantic v2
+                if hasattr(obj, "to_dict"):
+                    return obj.to_dict()
+            except Exception:
+                pass
+            if isinstance(obj, dict):
+                return {k: _to_plain_dict(v) for k, v in obj.items()}
+            if isinstance(obj, (list, tuple)):
+                return [_to_plain_dict(x) for x in obj]
+            return obj
+
+        out: List[dict] = []
+
+        # 1) response.function_calls
+        fcs = getattr(response, "function_calls", None) or []
+        for fc in fcs:
+            name = getattr(fc, "name", "") or ""
+            args_obj = getattr(fc, "args", {}) or {}
+            args_dict = _to_plain_dict(args_obj) or {}
+            # if str, try to parse
+            if isinstance(args_dict, str):
+                try:
+                    import json
+                    args_dict = json.loads(args_dict)
+                except Exception:
+                    args_dict = {}
+            out.append({
+                "id": getattr(fc, "id", "") or "",
+                "type": "function",
+                "function": {
+                    "name": name,
+                    "arguments": args_dict,  # <--- DICT, not string
+                }
+            })
+
+        if out:
+            return out
+
+        # 2) Fallback: candidates -> parts[].function_call
+        try:
+            cands = getattr(response, "candidates", None) or []
+            for cand in cands:
+                parts = getattr(getattr(cand, "content", None), "parts", None) or []
+                for part in parts:
+                    fn = getattr(part, "function_call", None)
+                    if not fn:
+                        continue
+                    name = getattr(fn, "name", "") or ""
+                    args_obj = getattr(fn, "args", {}) or {}
+                    args_dict = _to_plain_dict(args_obj) or {}
+                    if isinstance(args_dict, str):
+                        try:
+                            import json
+                            args_dict = json.loads(args_dict)
+                        except Exception:
+                            args_dict = {}
+                    out.append({
+                        "id": "",
+                        "type": "function",
+                        "function": {
+                            "name": name,
+                            "arguments": args_dict,  # <--- DICT
+                        }
+                    })
+        except Exception:
+            pass
+
+        return out
+
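The dicts returned by extract_tool_calls() follow an OpenAI-style tool-call layout, with 'arguments' kept as a Python dict rather than a JSON string; a sketch of how a dispatcher could consume that shape (the handler registry is illustrative, not part of pygpt-net):

    from typing import Any, Callable, Dict, List

    def dispatch_tool_calls(calls: List[dict], handlers: Dict[str, Callable[..., Any]]) -> List[Any]:
        results = []
        for call in calls:
            fn = call.get("function", {})
            name = fn.get("name", "")
            args = fn.get("arguments", {}) or {}  # already a dict, no json.loads() needed
            if name in handlers:
                results.append(handlers[name](**args))
        return results

    # Example with the structure produced above:
    calls = [{"id": "", "type": "function", "function": {"name": "add", "arguments": {"a": 2, "b": 3}}}]
    print(dispatch_tool_calls(calls, {"add": lambda a, b: a + b}))  # [5]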
+    def _extract_inline_images_and_links(self, response, ctx: CtxItem) -> None:
+        """
+        Extract inline image parts (Gemini image output) and file links.
+        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
+        - Appends HTTP(S) image URIs from file_data to ctx.urls.
+        """
+        images: list[str] = []
+        urls: list[str] = []
+
+        try:
+            cands = getattr(response, "candidates", None) or []
+            for cand in cands:
+                content = getattr(cand, "content", None)
+                parts = getattr(content, "parts", None) or []
+                for p in parts:
+                    # Inline image bytes (image preview / image generation in chat)
+                    blob = getattr(p, "inline_data", None)
+                    if blob:
+                        mime = (getattr(blob, "mime_type", "") or "").lower()
+                        if mime.startswith("image/"):
+                            data = getattr(blob, "data", None)
+                            if data:
+                                img_bytes = self._ensure_bytes(data)
+                                if img_bytes:
+                                    img_path = self.window.core.image.gen_unique_path(ctx)
+                                    with open(img_path, "wb") as f:
+                                        f.write(img_bytes)
+                                    images.append(img_path)
+
+                    # File data URI (may contain http/https or gs://)
+                    fdata = getattr(p, "file_data", None)
+                    if fdata:
+                        uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                        mime = (getattr(fdata, "mime_type", "") or "").lower()
+                        if uri and mime.startswith("image/"):
+                            # Store only as URL; downloading is out of scope here.
+                            if uri.startswith("http://") or uri.startswith("https://"):
+                                urls.append(uri)
+        except Exception:
+            # Best-effort only
+            pass
+
+        if images:
+            if not isinstance(ctx.images, list):
+                ctx.images = []
+            ctx.images.extend(images)
+
+        if urls:
+            if ctx.urls is None:
+                ctx.urls = []
+            ctx.urls.extend(urls)
+
+    @staticmethod
+    def _ensure_bytes(data) -> bytes | None:
+        """Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string."""
+        try:
+            if isinstance(data, (bytes, bytearray)):
+                return bytes(data)
+            if isinstance(data, str):
+                import base64
+                return base64.b64decode(data)
+        except Exception:
+            return None
+        return None
+
+    def build_input(
+            self,
+            prompt: str,
+            system_prompt: str,
+            model: ModelItem,
+            history: Optional[List[CtxItem]] = None,
+            attachments: Optional[Dict[str, AttachmentItem]] = None,
+            multimodal_ctx: Optional[MultimodalContext] = None,
+    ) -> List[Content]:
+        """
+        Build Google GenAI contents list
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :param attachments: Dict of AttachmentItem for images
+        :param multimodal_ctx: MultimodalContext for audio
+        :return: List of Content
+        """
+        contents: List[Content] = []
+
+        # System instruction is passed separately (system_instruction),
+        # so we do not build an explicit system role part here.
+
+        # Append conversation history
+        if self.window.core.config.get('use_context'):
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+            for item in items:
+                if item.final_input:
+                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+                if item.final_output:
+                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+        # Current user message with multimodal parts
+        parts = self._build_user_parts(
+            content=str(prompt),
+            attachments=attachments,
+            multimodal_ctx=multimodal_ctx,
+        )
+        contents.append(Content(role="user", parts=parts))
+
+        return contents
+
+    def _build_user_parts(
+            self,
+            content: str,
+            attachments: Optional[Dict[str, AttachmentItem]] = None,
+            multimodal_ctx: Optional[MultimodalContext] = None,
+    ) -> List[Part]:
+        """
+        Build user message parts (text + images + audio)
+
+        :param content: User text content
+        :param attachments: Dict of AttachmentItem for images
+        :param multimodal_ctx: MultimodalContext for audio
+        :return: List of Part
+        """
+        parts: List[Part] = []
+        if content:
+            parts.append(Part.from_text(text=str(content)))
+
+        if attachments:
+            img_parts = self.window.core.api.google.vision.build_parts(content, attachments)
+            parts.extend(img_parts)
+
+        if multimodal_ctx and multimodal_ctx.is_audio_input and multimodal_ctx.audio_data:
+            audio_format = (multimodal_ctx.audio_format or "wav").lower()
+            mime = f"audio/{audio_format}"
+            parts.append(Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime))
+
+        return parts
+
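The contents list built by build_input() alternates user and model turns and appends the current prompt (plus any image/audio parts) last; a standalone sketch of that shape using the same google-genai types (the history strings are placeholders):

    from google.genai.types import Content, Part

    history = [
        ("Hi", "Hello! How can I help?"),
        ("What is WAV?", "A container format for PCM audio."),
    ]

    contents = []
    for user_text, model_text in history:
        contents.append(Content(role="user", parts=[Part.from_text(text=user_text)]))
        contents.append(Content(role="model", parts=[Part.from_text(text=model_text)]))

    # Current user turn; image/audio Part.from_bytes(...) parts would be appended to this parts list.
    contents.append(Content(role="user", parts=[Part.from_text(text="And FLAC?")]))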
+    def _fit_ctx(self, model: ModelItem) -> int:
+        """
+        Fit to max model tokens (best-effort, uses model.ctx if present)
+
+        :param model: ModelItem
+        :return: max context tokens
+        """
+        max_ctx_tokens = self.window.core.config.get('max_total_tokens')
+        if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
+            max_ctx_tokens = model.ctx
+        return max_ctx_tokens
+
+    def _build_count_messages(
+            self,
+            prompt: str,
+            system_prompt: str,
+            model: ModelItem,
+            history: Optional[List[CtxItem]] = None,
+    ) -> List[dict]:
+        """
+        Build simple messages structure for local token estimation
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :return: List of messages dicts with 'role' and 'content' keys
+        """
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+
+        if self.window.core.config.get('use_context'):
+            used_tokens = self.window.core.tokens.from_user(prompt, system_prompt)
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT,
+                used_tokens,
+                self._fit_ctx(model),
+            )
+            for item in items:
+                if item.final_input:
+                    messages.append({"role": "user", "content": str(item.final_input)})
+                if item.final_output:
+                    messages.append({"role": "assistant", "content": str(item.final_output)})
+
+        messages.append({"role": "user", "content": str(prompt)})
+        return messages
+
+
+    def reset_tokens(self):
+        """Reset input tokens counter"""
+        self.input_tokens = 0
+
+    def get_used_tokens(self) -> int:
+        """
+        Get input tokens counter (estimated before sending)
+
+        :return: input tokens count
+        """
+        return self.input_tokens
+
+    @staticmethod
+    def _supports_tts(model_id: Optional[str]) -> bool:
+        """
+        Heuristic check if the model supports native TTS.
+        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
+        - Future/preview names may contain 'native-audio'.
+        """
+        if not model_id:
+            return False
+        mid = model_id.lower()
+        return ("-tts" in mid) or ("native-audio" in mid)
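Finally, _supports_tts() is a plain substring heuristic over the model id; a tiny standalone sketch with example ids (the ids are illustrative):

    def supports_tts(model_id: str | None) -> bool:
        # Same heuristic as Chat._supports_tts above: native TTS models carry
        # '-tts' or 'native-audio' in their id.
        if not model_id:
            return False
        mid = model_id.lower()
        return ("-tts" in mid) or ("native-audio" in mid)

    assert supports_tts("gemini-2.5-flash-preview-tts") is True
    assert supports_tts("gemini-2.5-flash") is False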