pygpt-net 2.6.28__py3-none-any.whl → 2.6.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +13 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/{container.py → app_core.py} +5 -6
- pygpt_net/controller/access/control.py +1 -9
- pygpt_net/controller/assistant/assistant.py +4 -4
- pygpt_net/controller/assistant/batch.py +7 -7
- pygpt_net/controller/assistant/files.py +4 -4
- pygpt_net/controller/assistant/threads.py +3 -3
- pygpt_net/controller/attachment/attachment.py +4 -7
- pygpt_net/controller/chat/common.py +1 -1
- pygpt_net/controller/chat/stream.py +961 -294
- pygpt_net/controller/chat/vision.py +11 -19
- pygpt_net/controller/config/placeholder.py +1 -1
- pygpt_net/controller/ctx/ctx.py +1 -1
- pygpt_net/controller/ctx/summarizer.py +1 -1
- pygpt_net/controller/mode/mode.py +21 -12
- pygpt_net/controller/plugins/settings.py +3 -2
- pygpt_net/controller/presets/editor.py +112 -99
- pygpt_net/controller/theme/common.py +2 -0
- pygpt_net/controller/theme/theme.py +6 -2
- pygpt_net/controller/ui/vision.py +4 -4
- pygpt_net/core/agents/legacy.py +2 -2
- pygpt_net/core/agents/runners/openai_workflow.py +2 -2
- pygpt_net/core/assistants/files.py +5 -5
- pygpt_net/core/assistants/store.py +4 -4
- pygpt_net/core/bridge/bridge.py +3 -3
- pygpt_net/core/bridge/worker.py +28 -9
- pygpt_net/core/debug/console/console.py +2 -2
- pygpt_net/core/debug/presets.py +2 -2
- pygpt_net/core/experts/experts.py +2 -2
- pygpt_net/core/idx/llm.py +21 -3
- pygpt_net/core/modes/modes.py +2 -2
- pygpt_net/core/presets/presets.py +3 -3
- pygpt_net/core/tokens/tokens.py +4 -4
- pygpt_net/core/types/mode.py +5 -2
- pygpt_net/core/vision/analyzer.py +1 -1
- pygpt_net/data/config/config.json +6 -3
- pygpt_net/data/config/models.json +75 -3
- pygpt_net/data/config/modes.json +3 -9
- pygpt_net/data/config/settings.json +112 -55
- pygpt_net/data/config/settings_section.json +2 -2
- pygpt_net/data/locale/locale.de.ini +2 -2
- pygpt_net/data/locale/locale.en.ini +9 -2
- pygpt_net/data/locale/locale.es.ini +2 -2
- pygpt_net/data/locale/locale.fr.ini +2 -2
- pygpt_net/data/locale/locale.it.ini +2 -2
- pygpt_net/data/locale/locale.pl.ini +3 -3
- pygpt_net/data/locale/locale.uk.ini +2 -2
- pygpt_net/data/locale/locale.zh.ini +2 -2
- pygpt_net/item/model.py +23 -3
- pygpt_net/plugin/openai_dalle/plugin.py +4 -4
- pygpt_net/plugin/openai_vision/plugin.py +12 -13
- pygpt_net/provider/agents/openai/agent.py +5 -5
- pygpt_net/provider/agents/openai/agent_b2b.py +5 -5
- pygpt_net/provider/agents/openai/agent_planner.py +5 -6
- pygpt_net/provider/agents/openai/agent_with_experts.py +5 -5
- pygpt_net/provider/agents/openai/agent_with_experts_feedback.py +4 -4
- pygpt_net/provider/agents/openai/agent_with_feedback.py +4 -4
- pygpt_net/provider/agents/openai/bot_researcher.py +2 -2
- pygpt_net/provider/agents/openai/bots/research_bot/agents/planner_agent.py +1 -1
- pygpt_net/provider/agents/openai/bots/research_bot/agents/search_agent.py +1 -1
- pygpt_net/provider/agents/openai/bots/research_bot/agents/writer_agent.py +1 -1
- pygpt_net/provider/agents/openai/evolve.py +5 -5
- pygpt_net/provider/agents/openai/supervisor.py +4 -4
- pygpt_net/provider/api/__init__.py +27 -0
- pygpt_net/provider/api/anthropic/__init__.py +68 -0
- pygpt_net/provider/api/google/__init__.py +262 -0
- pygpt_net/provider/api/google/audio.py +114 -0
- pygpt_net/provider/api/google/chat.py +552 -0
- pygpt_net/provider/api/google/image.py +287 -0
- pygpt_net/provider/api/google/tools.py +222 -0
- pygpt_net/provider/api/google/vision.py +129 -0
- pygpt_net/provider/{gpt → api/openai}/__init__.py +2 -2
- pygpt_net/provider/{gpt → api/openai}/agents/computer.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/agents/experts.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/agents/response.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/assistants.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/chat.py +15 -8
- pygpt_net/provider/{gpt → api/openai}/completion.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/image.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/remote_tools.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/responses.py +34 -20
- pygpt_net/provider/{gpt → api/openai}/store.py +2 -2
- pygpt_net/provider/{gpt → api/openai}/vision.py +1 -1
- pygpt_net/provider/{gpt → api/openai}/worker/assistants.py +4 -4
- pygpt_net/provider/{gpt → api/openai}/worker/importer.py +10 -10
- pygpt_net/provider/audio_input/openai_whisper.py +1 -1
- pygpt_net/provider/audio_output/google_tts.py +12 -0
- pygpt_net/provider/audio_output/openai_tts.py +1 -1
- pygpt_net/provider/core/config/patch.py +11 -0
- pygpt_net/provider/core/model/patch.py +9 -0
- pygpt_net/provider/core/preset/json_file.py +2 -4
- pygpt_net/provider/llms/anthropic.py +2 -5
- pygpt_net/provider/llms/base.py +4 -3
- pygpt_net/provider/llms/openai.py +1 -1
- pygpt_net/provider/loaders/hub/image_vision/base.py +1 -1
- pygpt_net/ui/dialog/preset.py +71 -55
- pygpt_net/ui/main.py +6 -4
- pygpt_net/utils.py +9 -0
- {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/METADATA +42 -48
- {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/RECORD +115 -107
- /pygpt_net/provider/{gpt → api/openai}/agents/__init__.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/agents/client.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/agents/remote_tools.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/agents/utils.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/audio.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/computer.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/container.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/summarizer.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/tools.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/utils.py +0 -0
- /pygpt_net/provider/{gpt → api/openai}/worker/__init__.py +0 -0
- {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/LICENSE +0 -0
- {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/WHEEL +0 -0
- {pygpt_net-2.6.28.dist-info → pygpt_net-2.6.30.dist-info}/entry_points.txt +0 -0
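Most of the entries under pygpt_net/provider are a package rename: the OpenAI wrapper moves from pygpt_net/provider/gpt/ to pygpt_net/provider/api/openai/, and new api/google and api/anthropic packages are added alongside it. A minimal sketch of what the import change would look like for code that referenced the old layout (the Chat class names are assumptions inferred from the module names, not confirmed by this diff):

# 2.6.28 layout, hypothetical downstream import:
#     from pygpt_net.provider.gpt.chat import Chat
# layout introduced by this release:
from pygpt_net.provider.api.openai.chat import Chat              # moved OpenAI wrapper (class name assumed)
from pygpt_net.provider.api.google.chat import Chat as GChat     # new Google GenAI wrapper (shown below)
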
pygpt_net/provider/api/google/audio.py (new file)
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ================================================== #
# This file is a part of PYGPT package               #
# Website: https://pygpt.net                         #
# GitHub:  https://github.com/szczyglis-dev/py-gpt   #
# MIT License                                        #
# Created By  : Marcin Szczygliński                  #
# Updated Date: 2025.08.28 20:00:00                  #
# ================================================== #

import base64
import io
import wave
from typing import Optional, Tuple

from google.genai.types import Part
from pygpt_net.core.bridge.context import MultimodalContext


class Audio:
    def __init__(self, window=None):
        """
        Audio helpers for Google GenAI.
        - Build audio input parts for requests
        - Convert Google PCM output to WAV (base64) for UI compatibility
        """
        self.window = window

    # ---------- INPUT (user -> model) ----------

    def build_part(
            self,
            multimodal_ctx: Optional[MultimodalContext]
    ) -> Optional[Part]:
        """
        Build audio Part from multimodal context (inline bytes).

        :param multimodal_ctx: MultimodalContext
        :return: Part or None
        """
        if not multimodal_ctx or not multimodal_ctx.is_audio_input or not multimodal_ctx.audio_data:
            return None
        audio_format = (multimodal_ctx.audio_format or "wav").lower()
        mime = f"audio/{audio_format}"
        return Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime)

    # ---------- OUTPUT (model -> UI) ----------

    def extract_first_audio_part(
            self,
            response
    ) -> Tuple[Optional[bytes], Optional[str]]:
        """
        Extract first audio inline_data from a non-streaming response.

        :param response: Google response object
        :return: (audio_bytes, mime_type) or (None, None)
        """
        try:
            candidates = getattr(response, "candidates", None) or []
            for cand in candidates:
                content = getattr(cand, "content", None)
                parts = getattr(content, "parts", None) or []
                for p in parts:
                    inline = getattr(p, "inline_data", None)
                    if not inline:
                        continue
                    mime = (getattr(inline, "mime_type", "") or "").lower()
                    if not mime.startswith("audio/"):
                        continue
                    data = getattr(inline, "data", None)
                    audio_bytes = self._ensure_bytes(data)
                    if audio_bytes:
                        return audio_bytes, mime
        except Exception:
            pass
        return None, None

    def pcm16_to_wav_base64(
            self,
            pcm_bytes: bytes,
            rate: int = 24000,
            channels: int = 1,
            sample_width: int = 2
    ) -> str:
        """
        Wrap raw PCM16 mono @ 24kHz into WAV and return base64-encoded payload.

        :param pcm_bytes: Raw PCM16 bytes
        :param rate: Sample rate (Hz), default 24000 for Google TTS
        :param channels: Channels, default 1
        :param sample_width: Bytes per sample, default 2 for PCM16
        :return: Base64-encoded WAV
        """
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_bytes)
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    @staticmethod
    def _ensure_bytes(data) -> Optional[bytes]:
        """Return raw bytes from inline_data.data (bytes or base64 string)."""
        try:
            if isinstance(data, (bytes, bytearray)):
                return bytes(data)
            if isinstance(data, str):
                return base64.b64decode(data)
        except Exception:
            return None
        return None
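Audio.pcm16_to_wav_base64() hands back a base64 string rather than raw bytes, so the UI can treat Google TTS output the same way as other providers' audio. A small, self-contained sketch of consuming that payload (the helper names below are illustrative, not part of the package):

import base64
import io
import wave

def save_wav_from_b64(wav_b64: str, path: str = "output.wav") -> None:
    # Decode the base64 WAV produced by Audio.pcm16_to_wav_base64() and write it to disk
    with open(path, "wb") as f:
        f.write(base64.b64decode(wav_b64))

def wav_duration_seconds(wav_b64: str) -> float:
    # Read the WAV header to get clip length; 24 kHz mono PCM16 is what the wrapper emits
    with wave.open(io.BytesIO(base64.b64decode(wav_b64)), "rb") as wf:
        return wf.getnframes() / float(wf.getframerate())
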
pygpt_net/provider/api/google/chat.py (new file)
@@ -0,0 +1,552 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ================================================== #
# This file is a part of PYGPT package               #
# Website: https://pygpt.net                         #
# GitHub:  https://github.com/szczyglis-dev/py-gpt   #
# MIT License                                        #
# Created By  : Marcin Szczygliński                  #
# Updated Date: 2025.08.28 20:00:00                  #
# ================================================== #

from typing import Optional, Dict, Any, List

from google.genai import types as gtypes
from google.genai.types import Content, Part

from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
from pygpt_net.item.attachment import AttachmentItem
from pygpt_net.item.ctx import CtxItem
from pygpt_net.item.model import ModelItem


class Chat:
    def __init__(self, window=None):
        """
        Chat wrapper for Google GenAI.
        """
        self.window = window
        self.input_tokens = 0

    def send(self, context: BridgeContext, extra: Optional[Dict[str, Any]] = None):
        """
        Call Google GenAI for chat / multimodal / audio.
        """
        prompt = context.prompt
        stream = context.stream
        system_prompt = context.system_prompt
        model = context.model
        functions = context.external_functions
        attachments = context.attachments
        multimodal_ctx = context.multimodal_ctx
        mode = context.mode
        ctx = context.ctx or CtxItem()

        client = self.window.core.api.google.get_client(context.mode, model)

        # Detect audio-input present
        has_audio_input = bool(
            multimodal_ctx
            and getattr(multimodal_ctx, "is_audio_input", False)
            and getattr(multimodal_ctx, "audio_data", None)
        )

        # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
        if mode == MODE_AUDIO and has_audio_input:
            # Build minimal transcription request: [instruction text, audio part]
            transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
            transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
            audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
            trans_inputs = [
                Content(role="user", parts=[
                    Part.from_text(text=transcribe_prompt),
                    audio_part,
                ])
            ]
            trans_cfg = gtypes.GenerateContentConfig(
                # Keep minimal; no tools/system for transcription
                temperature=self.window.core.config.get('temperature'),
                top_p=self.window.core.config.get('top_p'),
                max_output_tokens=context.max_tokens if context.max_tokens else None,
            )

            # Always non-stream here (we immediately need the text for TTS)
            trans_resp = client.models.generate_content(
                model=transcribe_model,
                contents=trans_inputs,
                config=trans_cfg,
            )
            transcribed_text = self.extract_text(trans_resp).strip()
            if transcribed_text:
                # Feed transcription into TTS as the final prompt
                prompt = transcribed_text
                ctx.input = transcribed_text
                try:
                    # optional: store for debugging/UX
                    if isinstance(ctx.extra, dict):
                        ctx.extra["transcription"] = transcribed_text
                except Exception:
                    pass
                ctx.is_audio = False  # transcription is text
            multimodal_ctx.is_audio_input = False  # disable audio input for TTS below

        # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
        # Build contents for chat/multimodal (will be overridden for TTS below)
        inputs = self.build_input(
            prompt=prompt,
            system_prompt=system_prompt,
            model=model,
            history=context.history,
            attachments=attachments,
            multimodal_ctx=multimodal_ctx,
        )

        # Best-effort input tokens estimate
        self.reset_tokens()
        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
        self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)

        # Tools -> merge app-defined tools with remote tools
        base_tools = self.window.core.api.google.tools.prepare(model, functions)
        remote_tools = self.window.core.api.google.build_remote_tools(model)
        if base_tools:
            remote_tools = []  # do not mix local and remote tools
        tools = (base_tools or []) + (remote_tools or [])

        # Sampling
        temperature = self.window.core.config.get('temperature')
        top_p = self.window.core.config.get('top_p')
        max_tokens = context.max_tokens if context.max_tokens else None

        # Base config
        cfg_kwargs: Dict[str, Any] = dict(
            temperature=temperature,
            top_p=top_p,
            max_output_tokens=max_tokens,
            system_instruction=system_prompt if system_prompt else None,
            tools=tools if tools else None,
        )

        # ---------- AUDIO MODE (output TTS) ----------
        if mode == MODE_AUDIO:
            stream = False  # TTS non-stream in this app
            supports_tts = self._supports_tts(model.id)

            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
            inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]

            # Remove params not used by TTS flow (and that sometimes cause issues)
            for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
                if key in cfg_kwargs:
                    del cfg_kwargs[key]

            # Voice selection (case-sensitive name)
            voice_name = "Kore"
            try:
                tmp = self.window.core.plugins.get_option("audio_output", "google_voice_native")
                if tmp:
                    name = str(tmp).strip()
                    mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
                    voice_name = mapping.get(name.lower(), name)
            except Exception:
                pass

            if supports_tts:
                cfg_kwargs["response_modalities"] = ["AUDIO"]
                cfg_kwargs["speech_config"] = gtypes.SpeechConfig(
                    voice_config=gtypes.VoiceConfig(
                        prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                )
            # else: fallback to text-only below

        cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
        params = dict(model=model.id, contents=inputs, config=cfg)

        if stream and mode != MODE_AUDIO:
            return client.models.generate_content_stream(**params)
        else:
            return client.models.generate_content(**params)

    def unpack_response(self, mode: str, response, ctx: CtxItem):
        """
        Unpack non-streaming response from Google GenAI and set context.
        """
        if mode == MODE_AUDIO:
            # Prefer audio if present
            audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
            if audio_bytes:
                # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
                # https://ai.google.dev/gemini-api/docs/speech-generation
                if mime == "audio/pcm" or mime.startswith("audio/"):
                    wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
                    ctx.audio_output = wav_b64
                    ctx.is_audio = True
                # Text transcript is typically not present for TTS; still try:
                txt = self.extract_text(response)
                ctx.output = txt or "..."
            else:
                # No audio present -> fallback to text
                ctx.output = self.extract_text(response)

            # Usage
            try:
                usage = getattr(response, "usage_metadata", None)
                if usage:
                    p = getattr(usage, "prompt_token_count", 0) or 0
                    c = getattr(usage, "candidates_token_count", 0) or 0
                    ctx.set_tokens(p, c)
            except Exception:
                pass

            return  # audio path done

        # ---- regular chat/completion ----
        ctx.output = self.extract_text(response)

        # Extract function calls
        calls = self.extract_tool_calls(response)
        if calls:
            ctx.tool_calls = calls

        # Usage if available
        try:
            usage = getattr(response, "usage_metadata", None)
            if usage:
                p = getattr(usage, "prompt_token_count", 0) or 0
                c = getattr(usage, "candidates_token_count", 0) or 0
                ctx.set_tokens(p, c)
        except Exception:
            pass

        # Best-effort: inline images / links (image-output in chat)
        try:
            self._extract_inline_images_and_links(response, ctx)
        except Exception:
            pass

    def extract_text(self, response) -> str:
        """
        Extract output text.
        """
        txt = getattr(response, "text", None) or getattr(response, "output_text", None)
        if txt:
            return str(txt).strip()
        try:
            cands = getattr(response, "candidates", None) or []
            if cands:
                parts = getattr(cands[0], "content", None)
                parts = getattr(parts, "parts", None) or []
                out = []
                for p in parts:
                    t = getattr(p, "text", None)
                    if t:
                        out.append(str(t))
                return "".join(out).strip()
        except Exception:
            pass
        return ""

    def extract_tool_calls(self, response) -> List[dict]:
        """
        Extract tool calls in a format compatible with app's tool execution.
        Prefer response.function_calls (Python SDK), then fallback to parts[].function_call.
        Returns arguments as a Python dict (not a JSON string).

        :param response: Response object
        :return: List of tool calls
        """
        def _to_plain_dict(obj):
            # Convert pydantic/genai objects to plain dict recursively
            try:
                if hasattr(obj, "to_json_dict"):
                    return obj.to_json_dict()
                if hasattr(obj, "model_dump"):
                    return obj.model_dump()  # pydantic v2
                if hasattr(obj, "to_dict"):
                    return obj.to_dict()
            except Exception:
                pass
            if isinstance(obj, dict):
                return {k: _to_plain_dict(v) for k, v in obj.items()}
            if isinstance(obj, (list, tuple)):
                return [_to_plain_dict(x) for x in obj]
            return obj

        out: List[dict] = []

        # 1) response.function_calls
        fcs = getattr(response, "function_calls", None) or []
        for fc in fcs:
            name = getattr(fc, "name", "") or ""
            args_obj = getattr(fc, "args", {}) or {}
            args_dict = _to_plain_dict(args_obj) or {}
            # if str, try to parse
            if isinstance(args_dict, str):
                try:
                    import json
                    args_dict = json.loads(args_dict)
                except Exception:
                    args_dict = {}
            out.append({
                "id": getattr(fc, "id", "") or "",
                "type": "function",
                "function": {
                    "name": name,
                    "arguments": args_dict,  # <--- DICT, not string
                }
            })

        if out:
            return out

        # 2) Fallback: candidates -> parts[].function_call
        try:
            cands = getattr(response, "candidates", None) or []
            for cand in cands:
                parts = getattr(getattr(cand, "content", None), "parts", None) or []
                for part in parts:
                    fn = getattr(part, "function_call", None)
                    if not fn:
                        continue
                    name = getattr(fn, "name", "") or ""
                    args_obj = getattr(fn, "args", {}) or {}
                    args_dict = _to_plain_dict(args_obj) or {}
                    if isinstance(args_dict, str):
                        try:
                            import json
                            args_dict = json.loads(args_dict)
                        except Exception:
                            args_dict = {}
                    out.append({
                        "id": "",
                        "type": "function",
                        "function": {
                            "name": name,
                            "arguments": args_dict,  # <--- DICT
                        }
                    })
        except Exception:
            pass

        return out

    def _extract_inline_images_and_links(self, response, ctx: CtxItem) -> None:
        """
        Extract inline image parts (Gemini image output) and file links.
        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
        - Appends HTTP(S) image URIs from file_data to ctx.urls.
        """
        images: list[str] = []
        urls: list[str] = []

        try:
            cands = getattr(response, "candidates", None) or []
            for cand in cands:
                content = getattr(cand, "content", None)
                parts = getattr(content, "parts", None) or []
                for p in parts:
                    # Inline image bytes (image preview / image generation in chat)
                    blob = getattr(p, "inline_data", None)
                    if blob:
                        mime = (getattr(blob, "mime_type", "") or "").lower()
                        if mime.startswith("image/"):
                            data = getattr(blob, "data", None)
                            if data:
                                img_bytes = self._ensure_bytes(data)
                                if img_bytes:
                                    img_path = self.window.core.image.gen_unique_path(ctx)
                                    with open(img_path, "wb") as f:
                                        f.write(img_bytes)
                                    images.append(img_path)

                    # File data URI (may contain http/https or gs://)
                    fdata = getattr(p, "file_data", None)
                    if fdata:
                        uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
                        mime = (getattr(fdata, "mime_type", "") or "").lower()
                        if uri and mime.startswith("image/"):
                            # Store only as URL; downloading is out of scope here.
                            if uri.startswith("http://") or uri.startswith("https://"):
                                urls.append(uri)
        except Exception:
            # Best-effort only
            pass

        if images:
            if not isinstance(ctx.images, list):
                ctx.images = []
            ctx.images.extend(images)

        if urls:
            if ctx.urls is None:
                ctx.urls = []
            ctx.urls.extend(urls)

    @staticmethod
    def _ensure_bytes(data) -> bytes | None:
        """Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string."""
        try:
            if isinstance(data, (bytes, bytearray)):
                return bytes(data)
            if isinstance(data, str):
                import base64
                return base64.b64decode(data)
        except Exception:
            return None
        return None

    def build_input(
            self,
            prompt: str,
            system_prompt: str,
            model: ModelItem,
            history: Optional[List[CtxItem]] = None,
            attachments: Optional[Dict[str, AttachmentItem]] = None,
            multimodal_ctx: Optional[MultimodalContext] = None,
    ) -> List[Content]:
        """
        Build Google GenAI contents list

        :param prompt: User prompt
        :param system_prompt: System prompt/instruction
        :param model: ModelItem
        :param history: List of CtxItem for history
        :param attachments: Dict of AttachmentItem for images
        :param multimodal_ctx: MultimodalContext for audio
        :return: List of Content
        """
        contents: List[Content] = []

        # System instruction is passed separately (system_instruction),
        # so we do not build an explicit system role part here.

        # Append conversation history
        if self.window.core.config.get('use_context'):
            items = self.window.core.ctx.get_history(
                history,
                model.id,
                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
                self.window.core.tokens.from_user(prompt, system_prompt),
                self._fit_ctx(model),
            )
            for item in items:
                if item.final_input:
                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
                if item.final_output:
                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))

        # Current user message with multimodal parts
        parts = self._build_user_parts(
            content=str(prompt),
            attachments=attachments,
            multimodal_ctx=multimodal_ctx,
        )
        contents.append(Content(role="user", parts=parts))

        return contents

    def _build_user_parts(
            self,
            content: str,
            attachments: Optional[Dict[str, AttachmentItem]] = None,
            multimodal_ctx: Optional[MultimodalContext] = None,
    ) -> List[Part]:
        """
        Build user message parts (text + images + audio)

        :param content: User text content
        :param attachments: Dict of AttachmentItem for images
        :param multimodal_ctx: MultimodalContext for audio
        :return: List of Part
        """
        parts: List[Part] = []
        if content:
            parts.append(Part.from_text(text=str(content)))

        if attachments:
            img_parts = self.window.core.api.google.vision.build_parts(content, attachments)
            parts.extend(img_parts)

        if multimodal_ctx and multimodal_ctx.is_audio_input and multimodal_ctx.audio_data:
            audio_format = (multimodal_ctx.audio_format or "wav").lower()
            mime = f"audio/{audio_format}"
            parts.append(Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime))

        return parts

    def _fit_ctx(self, model: ModelItem) -> int:
        """
        Fit to max model tokens (best-effort, uses model.ctx if present)

        :param model: ModelItem
        :return: max context tokens
        """
        max_ctx_tokens = self.window.core.config.get('max_total_tokens')
        if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
            max_ctx_tokens = model.ctx
        return max_ctx_tokens

    def _build_count_messages(
            self,
            prompt: str,
            system_prompt: str,
            model: ModelItem,
            history: Optional[List[CtxItem]] = None,
    ) -> List[dict]:
        """
        Build simple messages structure for local token estimation

        :param prompt: User prompt
        :param system_prompt: System prompt/instruction
        :param model: ModelItem
        :param history: List of CtxItem for history
        :return: List of messages dicts with 'role' and 'content' keys
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        if self.window.core.config.get('use_context'):
            used_tokens = self.window.core.tokens.from_user(prompt, system_prompt)
            items = self.window.core.ctx.get_history(
                history,
                model.id,
                MODE_CHAT,
                used_tokens,
                self._fit_ctx(model),
            )
            for item in items:
                if item.final_input:
                    messages.append({"role": "user", "content": str(item.final_input)})
                if item.final_output:
                    messages.append({"role": "assistant", "content": str(item.final_output)})

        messages.append({"role": "user", "content": str(prompt)})
        return messages


    def reset_tokens(self):
        """Reset input tokens counter"""
        self.input_tokens = 0

    def get_used_tokens(self) -> int:
        """
        Get input tokens counter (estimated before sending)

        :return: input tokens count
        """
        return self.input_tokens

    @staticmethod
    def _supports_tts(model_id: Optional[str]) -> bool:
        """
        Heuristic check if the model supports native TTS.
        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
        - Future/preview names may contain 'native-audio'.
        """
        if not model_id:
            return False
        mid = model_id.lower()
        return ("-tts" in mid) or ("native-audio" in mid)
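For downstream consumers, extract_tool_calls() normalizes Gemini function calls into the OpenAI-style envelope used elsewhere in the app, keeping arguments as a Python dict rather than a JSON string. A sketch of the resulting shape (the tool name and argument values below are illustrative, not from this diff):

calls = chat.extract_tool_calls(response)
# Example result:
# [
#     {
#         "id": "",                           # Gemini may not provide a call id
#         "type": "function",
#         "function": {
#             "name": "get_weather",          # illustrative tool name
#             "arguments": {"city": "Paris"}  # dict, not a JSON string
#         },
#     },
# ]
for call in calls:
    name = call["function"]["name"]
    args = call["function"]["arguments"]      # already a dict; no json.loads() needed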