pygpt-net 2.6.30 → 2.6.32 (py3-none-any.whl)
This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- pygpt_net/CHANGELOG.txt +15 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/app.py +7 -1
- pygpt_net/app_core.py +3 -1
- pygpt_net/config.py +3 -1
- pygpt_net/controller/__init__.py +9 -2
- pygpt_net/controller/audio/audio.py +38 -1
- pygpt_net/controller/audio/ui.py +2 -2
- pygpt_net/controller/chat/audio.py +1 -8
- pygpt_net/controller/chat/common.py +23 -62
- pygpt_net/controller/chat/handler/__init__.py +0 -0
- pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
- pygpt_net/controller/chat/output.py +8 -3
- pygpt_net/controller/chat/stream.py +3 -1071
- pygpt_net/controller/chat/text.py +3 -2
- pygpt_net/controller/kernel/kernel.py +11 -3
- pygpt_net/controller/kernel/reply.py +5 -1
- pygpt_net/controller/lang/custom.py +2 -2
- pygpt_net/controller/media/__init__.py +12 -0
- pygpt_net/controller/media/media.py +115 -0
- pygpt_net/controller/realtime/__init__.py +12 -0
- pygpt_net/controller/realtime/manager.py +53 -0
- pygpt_net/controller/realtime/realtime.py +293 -0
- pygpt_net/controller/ui/mode.py +23 -2
- pygpt_net/controller/ui/ui.py +19 -1
- pygpt_net/core/audio/audio.py +6 -1
- pygpt_net/core/audio/backend/native/__init__.py +12 -0
- pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
- pygpt_net/core/audio/backend/native/player.py +139 -0
- pygpt_net/core/audio/backend/native/realtime.py +250 -0
- pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
- pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
- pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
- pygpt_net/core/audio/backend/pyaudio/realtime.py +312 -0
- pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
- pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
- pygpt_net/core/audio/backend/shared/__init__.py +38 -0
- pygpt_net/core/audio/backend/shared/conversions.py +211 -0
- pygpt_net/core/audio/backend/shared/envelope.py +38 -0
- pygpt_net/core/audio/backend/shared/player.py +137 -0
- pygpt_net/core/audio/backend/shared/rt.py +52 -0
- pygpt_net/core/audio/capture.py +5 -0
- pygpt_net/core/audio/output.py +14 -2
- pygpt_net/core/audio/whisper.py +6 -2
- pygpt_net/core/bridge/bridge.py +2 -1
- pygpt_net/core/bridge/worker.py +4 -1
- pygpt_net/core/dispatcher/dispatcher.py +37 -1
- pygpt_net/core/events/__init__.py +2 -1
- pygpt_net/core/events/realtime.py +55 -0
- pygpt_net/core/image/image.py +56 -5
- pygpt_net/core/realtime/__init__.py +0 -0
- pygpt_net/core/realtime/options.py +87 -0
- pygpt_net/core/realtime/shared/__init__.py +0 -0
- pygpt_net/core/realtime/shared/audio.py +213 -0
- pygpt_net/core/realtime/shared/loop.py +64 -0
- pygpt_net/core/realtime/shared/session.py +59 -0
- pygpt_net/core/realtime/shared/text.py +37 -0
- pygpt_net/core/realtime/shared/tools.py +276 -0
- pygpt_net/core/realtime/shared/turn.py +38 -0
- pygpt_net/core/realtime/shared/types.py +16 -0
- pygpt_net/core/realtime/worker.py +160 -0
- pygpt_net/core/render/web/body.py +24 -3
- pygpt_net/core/text/utils.py +54 -2
- pygpt_net/core/types/__init__.py +1 -0
- pygpt_net/core/types/image.py +54 -0
- pygpt_net/core/video/__init__.py +12 -0
- pygpt_net/core/video/video.py +290 -0
- pygpt_net/data/config/config.json +26 -5
- pygpt_net/data/config/models.json +221 -103
- pygpt_net/data/config/settings.json +244 -6
- pygpt_net/data/css/web-blocks.css +6 -0
- pygpt_net/data/css/web-chatgpt.css +6 -0
- pygpt_net/data/css/web-chatgpt_wide.css +6 -0
- pygpt_net/data/locale/locale.de.ini +35 -7
- pygpt_net/data/locale/locale.en.ini +56 -17
- pygpt_net/data/locale/locale.es.ini +35 -7
- pygpt_net/data/locale/locale.fr.ini +35 -7
- pygpt_net/data/locale/locale.it.ini +35 -7
- pygpt_net/data/locale/locale.pl.ini +38 -7
- pygpt_net/data/locale/locale.uk.ini +35 -7
- pygpt_net/data/locale/locale.zh.ini +31 -3
- pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
- pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
- pygpt_net/data/locale/plugin.cmd_web.en.ini +8 -0
- pygpt_net/item/model.py +22 -1
- pygpt_net/plugin/audio_input/plugin.py +37 -4
- pygpt_net/plugin/audio_input/simple.py +57 -8
- pygpt_net/plugin/cmd_files/worker.py +3 -0
- pygpt_net/provider/api/google/__init__.py +76 -7
- pygpt_net/provider/api/google/audio.py +8 -1
- pygpt_net/provider/api/google/chat.py +45 -6
- pygpt_net/provider/api/google/image.py +226 -86
- pygpt_net/provider/api/google/realtime/__init__.py +12 -0
- pygpt_net/provider/api/google/realtime/client.py +1945 -0
- pygpt_net/provider/api/google/realtime/realtime.py +186 -0
- pygpt_net/provider/api/google/video.py +364 -0
- pygpt_net/provider/api/openai/__init__.py +22 -2
- pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
- pygpt_net/provider/api/openai/realtime/client.py +1828 -0
- pygpt_net/provider/api/openai/realtime/realtime.py +193 -0
- pygpt_net/provider/audio_input/google_genai.py +103 -0
- pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
- pygpt_net/provider/audio_output/google_tts.py +0 -12
- pygpt_net/provider/audio_output/openai_tts.py +8 -5
- pygpt_net/provider/core/config/patch.py +241 -178
- pygpt_net/provider/core/model/patch.py +28 -2
- pygpt_net/provider/llms/google.py +8 -9
- pygpt_net/provider/web/duckduck_search.py +212 -0
- pygpt_net/ui/layout/toolbox/audio.py +55 -0
- pygpt_net/ui/layout/toolbox/footer.py +14 -42
- pygpt_net/ui/layout/toolbox/image.py +7 -13
- pygpt_net/ui/layout/toolbox/raw.py +52 -0
- pygpt_net/ui/layout/toolbox/split.py +48 -0
- pygpt_net/ui/layout/toolbox/toolbox.py +8 -8
- pygpt_net/ui/layout/toolbox/video.py +49 -0
- pygpt_net/ui/widget/option/combo.py +15 -1
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/METADATA +46 -22
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/RECORD +121 -73
- pygpt_net/core/audio/backend/pyaudio.py +0 -554
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/LICENSE +0 -0
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/WHEEL +0 -0
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/entry_points.txt +0 -0

pygpt_net/provider/api/openai/realtime/realtime.py (new file)
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package               #
+# Website: https://pygpt.net                         #
+# GitHub:  https://github.com/szczyglis-dev/py-gpt   #
+# MIT License                                        #
+# Created By  : Marcin Szczygliński                  #
+# Updated Date: 2025.09.01 23:00:00                  #
+# ================================================== #
+
+import json
+from typing import Optional, Dict, Any
+
+from pygpt_net.core.bridge import BridgeContext
+from pygpt_net.core.events import RealtimeEvent
+from pygpt_net.core.realtime.options import RealtimeOptions
+from pygpt_net.core.realtime.shared.session import extract_last_session_id
+from pygpt_net.item.model import ModelItem
+from pygpt_net.utils import trans
+
+from .client import OpenAIRealtimeClient
+
+
+class Realtime:
+
+    PROVIDER = "openai"
+
+    def __init__(self, window=None):
+        """
+        OpenAI API realtime controller
+
+        :param window: Window instance
+        """
+        self.window = window
+        self.handler = OpenAIRealtimeClient(window)
+        self.prev_auto_turn = False
+        self.prev_vad_silence = 2000
+        self.prev_vad_prefix = 300
+
+    def begin(
+            self,
+            context: BridgeContext,
+            model: Optional[ModelItem] = None,
+            extra: Optional[Dict[str, Any]] = None,
+            rt_signals=None
+    ) -> bool:
+        """
+        Begin realtime session if applicable
+
+        :param context: BridgeContext
+        :param model: Optional[ModelItem]
+        :param extra: Optional dict with extra parameters
+        :param rt_signals: RealtimeSignals
+        :return: True if realtime session started, False otherwise
+        """
+        mm = context.multimodal_ctx
+        audio_bytes = getattr(mm, "audio_data", None) if mm and getattr(mm, "is_audio_input", False) else None
+        audio_format = getattr(mm, "audio_format", None) if mm else None
+        audio_rate = getattr(mm, "audio_rate", None) if mm else None
+        is_debug = self.window.core.config.get("log.realtime", False)
+        auto_turn = self.window.core.config.get("audio.input.auto_turn", True)
+        opt_vad_silence = self.window.core.config.get("audio.input.vad.silence", 2000)
+        opt_vad_prefix = self.window.core.config.get("audio.input.vad.prefix", 300)
+
+        # setup manager
+        self.window.controller.realtime.set_current_active(self.PROVIDER)
+        self.window.controller.realtime.set_busy()
+        self.handler.set_debug(is_debug)
+
+        # tools
+        tools = self.window.core.api.openai.tools.prepare(model, context.external_functions)
+
+        # remote tools
+        remote_tools = []
+        remote_tools = self.window.core.api.openai.remote_tools.append_to_tools(
+            mode=context.mode,
+            model=model,
+            stream=context.stream,
+            is_expert_call=context.is_expert_call,
+            tools=remote_tools,
+            preset=context.preset,
+        )
+
+        # handle sub-reply (tool results from tool calls)
+        if context.ctx.internal:
+            if context.ctx.prev_ctx and context.ctx.prev_ctx.extra.get("prev_tool_calls"):
+                tool_calls = context.ctx.prev_ctx.extra.get("prev_tool_calls", [])
+                tool_call_id = None
+                if isinstance(tool_calls, list) and len(tool_calls) > 0:
+                    tool_call_id = tool_calls[0].get("call_id", "")  # get first call_id
+                    if not tool_call_id:
+                        tool_call_id = tool_calls[0].get("id", "")  # fallback to id
+                if tool_call_id:
+                    tool_results = context.ctx.input
+                    try:
+                        tool_results = json.loads(tool_results)
+                    except Exception:
+                        pass
+                    self.handler.send_tool_results_sync({
+                        tool_call_id: tool_results
+                    })
+                    self.handler.update_ctx(context.ctx)
+                    return True  # do not start new session, just send tool results
+
+        # update auto-turn in active session
+        if (self.handler.is_session_active()
+                and (auto_turn != self.prev_auto_turn
+                     or opt_vad_silence != self.prev_vad_silence
+                     or opt_vad_prefix != self.prev_vad_prefix)):
+            self.handler.update_session_autoturn_sync(auto_turn, opt_vad_silence, opt_vad_prefix)
+
+        # if auto-turn is enabled and prompt is empty, update session and context only
+        if auto_turn and self.handler.is_session_active() and (context.prompt.strip() == "" or context.prompt == "..."):
+            self.handler.update_session_tools_sync(tools, remote_tools)
+            self.handler.update_ctx(context.ctx)
+            self.window.update_status(trans("speech.listening"))
+            return True  # do not send new request if session is active
+
+        # Last session ID
+        last_session_id = extract_last_session_id(context.history)
+        if is_debug:
+            print("[realtime session] Last ID", last_session_id)
+
+        # Voice
+        voice = "alloy"
+        try:
+            v = self.window.core.plugins.get_option("audio_output", "openai_voice")
+            if v:
+                voice = str(v)
+        except Exception:
+            pass
+
+        # Options
+        opts = RealtimeOptions(
+            provider=self.PROVIDER,
+            model=context.model.id,
+            system_prompt=context.system_prompt,
+            prompt=context.prompt,
+            voice=voice,
+            audio_data=audio_bytes,
+            audio_format=audio_format,
+            audio_rate=audio_rate,
+            vad="server_vad",
+            extra=extra or {},
+            tools=tools,
+            remote_tools=remote_tools,
+            rt_signals=rt_signals,
+            rt_session_id=last_session_id,
+            auto_turn=auto_turn,
+            vad_end_silence_ms=opt_vad_silence,
+            vad_prefix_padding_ms=opt_vad_prefix,
+        )
+
+        # Start or append to realtime session via manager
+        try:
+            if is_debug:
+                print("[realtime] Starting session with options:", opts.to_dict())
+            rt = self.window.controller.realtime.manager
+            rt.start(context.ctx, opts)
+
+            self.prev_auto_turn = auto_turn
+            self.prev_vad_silence = opt_vad_silence
+            self.prev_vad_prefix = opt_vad_prefix
+            return True
+        except Exception as e:
+            self.window.core.debug.log(e)
+            return False  # fallback to non-live path
+
+    def handle_audio_input(self, event: RealtimeEvent):
+        """
+        Handle Realtime audio input event
+
+        :param event: RealtimeEvent
+        """
+        self.handler.rt_handle_audio_input_sync(event)
+
+    def manual_commit(self):
+        """Manually commit audio input to realtime session"""
+        self.handler.force_response_now_sync()
+
+    def shutdown(self):
+        """Shutdown realtime loops"""
+        if self.handler.is_session_active():
+            self.handler.close_session_sync()
+        try:
+            self.handler.stop_loop_sync()
+        except Exception:
+            pass
+
+    def reset(self):
+        """Close realtime session"""
+        if self.handler.is_session_active():
+            self.handler.close_session_sync()
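
Worth a closer look is the sub-reply branch in begin() above: for an internal context it pulls the first call_id out of prev_tool_calls (falling back to id), decodes the tool output as JSON where possible, and pushes the result into the already-open session instead of opening a new one. Below is a minimal standalone sketch of just that extraction step; the helper names and sample data are hypothetical, not part of the package.

import json
from typing import Any, Dict, List, Optional


def first_tool_call_id(tool_calls: List[Dict[str, Any]]) -> Optional[str]:
    """Return the first usable id, preferring 'call_id' over 'id' (hypothetical helper)."""
    if isinstance(tool_calls, list) and tool_calls:
        return tool_calls[0].get("call_id") or tool_calls[0].get("id") or None
    return None


def decode_tool_results(raw: str) -> Any:
    """Tool output is passed through json.loads() when possible, else kept as raw text."""
    try:
        return json.loads(raw)
    except Exception:
        return raw


# Sample data shaped like ctx.prev_ctx.extra["prev_tool_calls"] and ctx.input
calls = [{"call_id": "call_abc123", "id": "fc_1"}]
payload = {first_tool_call_id(calls): decode_tool_results('{"ok": true}')}
print(payload)  # {'call_abc123': {'ok': True}}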

pygpt_net/provider/audio_input/google_genai.py (new file)
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package               #
+# Website: https://pygpt.net                         #
+# GitHub:  https://github.com/szczyglis-dev/py-gpt   #
+# MIT License                                        #
+# Created By  : Marcin Szczygliński                  #
+# Updated Date: 2025.08.29 18:00:00                  #
+# ================================================== #
+
+from .base import BaseProvider
+
+
+class GoogleGenAIAudioInput(BaseProvider):
+
+    PROMPT_TRANSCRIBE = (
+        "You are a speech-to-text transcriber. "
+        "Return only the verbatim transcript as plain text. "
+        "Do not add any explanations, timestamps, labels or formatting."
+    )
+
+    def __init__(self, *args, **kwargs):
+        """
+        Google GenAI (Gemini) audio provider for transcription (via API).
+
+        :param args: args
+        :param kwargs: kwargs
+        """
+        super(GoogleGenAIAudioInput, self).__init__(*args, **kwargs)
+        self.plugin = kwargs.get("plugin")
+        self.id = "google_genai"
+        self.name = "Google GenAI"
+
+    def init_options(self):
+        """Initialize options"""
+        # Keep option shape consistent with Whisper provider
+        self.plugin.add_option(
+            "google_genai_audio_model",
+            type="text",
+            value="gemini-2.5-flash",
+            label="Model",
+            tab="google_genai",
+            description="Specify Gemini model supporting audio, e.g., gemini-2.5-flash",
+        )
+        self.plugin.add_option(
+            "google_genai_audio_prompt",
+            type="textarea",
+            value=self.PROMPT_TRANSCRIBE,
+            label="System Prompt",
+            tab="google_genai",
+            description="System prompt to guide the transcription output",
+            tooltip="System prompt for transcription",
+            persist=True,
+        )
+
+    def transcribe(self, path: str) -> str:
+        """
+        Audio to text transcription using Google GenAI (Gemini).
+
+        :param path: path to audio file to transcribe
+        :return: transcribed text
+        """
+        # Get pre-configured GenAI client from the app core
+        client = self.plugin.window.core.api.google.get_client()
+
+        # Upload the audio file via the Files API
+        uploaded_file = client.files.upload(file=path)
+
+        # Ask the model to produce a plain text transcript only
+        # Using system_instruction keeps the public API surface simple (no extra options needed)
+        config = {
+            "system_instruction": self.plugin.get_option_value("google_genai_audio_prompt") or self.PROMPT_TRANSCRIBE,
+            "temperature": 0.0,
+        }
+
+        # Generate content (transcription) with the selected model
+        model_name = self.plugin.get_option_value("google_genai_audio_model")
+        response = client.models.generate_content(
+            model=model_name,
+            contents=[uploaded_file],
+            config=config,
+        )
+
+        # The SDK exposes the unified .text property for convenience
+        return response.text or ""
+
+    def is_configured(self) -> bool:
+        """
+        Check if provider is configured
+
+        :return: True if configured, False otherwise
+        """
+        api_key = self.plugin.window.core.config.get("api_key_google")
+        return api_key is not None and api_key != ""
+
+    def get_config_message(self) -> str:
+        """
+        Return message to display when provider is not configured
+
+        :return: message
+        """
+        return "Google GenAI API key is not set yet. Please configure it in settings."
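
For context, the same transcription round trip can be reproduced outside of PyGPT with the google-genai SDK directly, mirroring the calls the provider makes. A minimal sketch, assuming GOOGLE_API_KEY is exported and voice_note.wav is a local recording (both placeholders):

from google import genai

client = genai.Client()  # reads GOOGLE_API_KEY from the environment
uploaded = client.files.upload(file="voice_note.wav")  # placeholder input file
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[uploaded],
    config={
        "system_instruction": "You are a speech-to-text transcriber. "
                              "Return only the verbatim transcript as plain text.",
        "temperature": 0.0,  # deterministic output, as in the provider above
    },
)
print(response.text or "")  # .text is the SDK's unified convenience property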

pygpt_net/provider/audio_output/google_genai_tts.py (new file)
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package               #
+# Website: https://pygpt.net                         #
+# GitHub:  https://github.com/szczyglis-dev/py-gpt   #
+# MIT License                                        #
+# Created By  : Marcin Szczygliński                  #
+# Updated Date: 2025.08.29 18:00:00                  #
+# ================================================== #
+
+import os
+import wave
+import base64
+
+from .base import BaseProvider
+
+
+class GoogleGenAITextToSpeech(BaseProvider):
+    def __init__(self, *args, **kwargs):
+        """
+        Google GenAI Text-to-Speech provider (Gemini TTS via API).
+
+        :param args: args
+        :param kwargs: kwargs
+        """
+        super(GoogleGenAITextToSpeech, self).__init__(*args, **kwargs)
+        self.plugin = kwargs.get("plugin")
+        self.id = "google_genai_tts"
+        self.name = "Google GenAI TTS"
+
+        # Supported preview TTS models (fallback to flash if invalid)
+        self.allowed_models = [
+            "gemini-2.5-flash-preview-tts",
+            "gemini-2.5-pro-preview-tts",
+        ]
+
+        # Prebuilt voice names exposed by Gemini TTS
+        # Keep list in sync with official docs; fallback to "Puck" if invalid.
+        self.allowed_voices = [
+            "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus",
+            "Aoede", "Callirrhoe", "Autonoe", "Enceladus", "Iapetus",
+            "Umbriel", "Algieba", "Despina", "Erinome", "Algenib",
+            "Rasalgethi", "Laomedeia", "Achernar", "Alnilam", "Schedar",
+            "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
+            "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
+        ]
+
+    def init_options(self):
+        """Initialize options"""
+        # Keep option names consistent with the app style; simple text fields are enough.
+        self.plugin.add_option(
+            "google_genai_tts_model",
+            type="text",
+            value="gemini-2.5-flash-preview-tts",
+            label="Model",
+            tab="google_genai_tts",
+            description="Specify Gemini TTS model, e.g.: gemini-2.5-flash-preview-tts or gemini-2.5-pro-preview-tts",
+        )
+        self.plugin.add_option(
+            "google_genai_tts_voice",
+            type="text",
+            value="Kore",
+            label="Voice",
+            tab="google_genai_tts",
+            description="Specify voice, e.g.: Puck, Kore, Charon, Leda, Zephyr... (case-sensitive)",
+            urls={
+                "Voices": "https://ai.google.dev/gemini-api/docs/speech-generation"
+            },
+        )
+
+    def speech(self, text: str) -> str:
+        """
+        Text to speech synthesis using Google GenAI (Gemini TTS).
+
+        :param text: text to synthesize
+        :return: path to generated audio file
+        """
+        # Get pre-configured GenAI client
+        client = self.plugin.window.core.api.google.get_client()
+
+        # Resolve path where audio should be written
+        output_file = self.plugin.output_file
+        path = os.path.join(self.plugin.window.core.config.path, output_file)
+
+        # Validate/select model
+        model = self.plugin.get_option_value("google_genai_tts_model") or "gemini-2.5-flash-preview-tts"
+        model = self._normalize_model_name(model)
+        if model not in self.allowed_models:
+            model = "gemini-2.5-flash-preview-tts"
+
+        # Validate/select voice
+        voice = self.plugin.get_option_value("google_genai_tts_voice") or "Kore"
+        # if voice not in self.allowed_voices:
+        #     voice = "Kore"
+
+        # Build generation config for audio modality + voice
+        # Using explicit types for clarity and forward-compatibility
+        try:
+            from google.genai import types
+        except Exception as ex:
+            # Fail fast if SDK is missing or incompatible
+            raise RuntimeError("google.genai SDK is not available. Please install/update Google GenAI SDK.") from ex
+
+        gen_config = types.GenerateContentConfig(
+            response_modalities=["AUDIO"],
+            speech_config=types.SpeechConfig(
+                voice_config=types.VoiceConfig(
+                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                        voice_name=voice
+                    )
+                )
+            ),
+            temperature=0.8,  # balanced default; keep configurable later if needed
+        )
+
+        # Perform TTS request
+        response = client.models.generate_content(
+            model=model,
+            contents=text,
+            config=gen_config,
+        )
+
+        # Extract PCM bytes from the first candidate/part
+        pcm = self._extract_pcm_bytes(response)
+
+        # Persist as standard WAV (PCM 16-bit, mono, 24 kHz)
+        self._save_wav(path, pcm, channels=1, rate=24000, sample_width=2)
+
+        return str(path)
+
+    def _extract_pcm_bytes(self, response) -> bytes:
+        """
+        Extract PCM bytes from generate_content response.
+
+        :param response: Google GenAI response object
+        :return: raw PCM byte data
+        """
+        # Defensive extraction to support minor SDK variations
+        data = None
+        try:
+            cand = response.candidates[0]
+            part = cand.content.parts[0]
+            if getattr(part, "inline_data", None) and getattr(part.inline_data, "data", None):
+                data = part.inline_data.data
+        except Exception:
+            pass
+
+        if data is None:
+            # Some SDK builds may return base64 str; try resolving alternative layout
+            try:
+                parts = getattr(response, "candidates", [])[0].content.parts
+                for p in parts:
+                    if getattr(p, "inline_data", None) and getattr(p.inline_data, "data", None):
+                        data = p.inline_data.data
+                        break
+            except Exception:
+                pass
+
+        if data is None:
+            raise RuntimeError("No audio data returned by Gemini TTS response.")
+
+        # Normalize to raw bytes
+        if isinstance(data, (bytes, bytearray)):
+            return bytes(data)
+        if isinstance(data, str):
+            # Fallback: treat as base64-encoded PCM
+            return base64.b64decode(data)
+
+        # Last resort: try bytes() cast
+        try:
+            return bytes(data)
+        except Exception as ex:
+            raise RuntimeError("Unsupported audio payload type returned by Gemini TTS.") from ex
+
+    def _save_wav(
+            self,
+            filename: str,
+            pcm_bytes: bytes,
+            channels: int = 1,
+            rate: int = 24000,
+            sample_width: int = 2
+    ):
+        """
+        Save raw PCM bytes to a WAV file.
+
+        :param filename: output WAV file path
+        :param pcm_bytes: raw PCM byte data
+        :param channels: number of audio channels (1=mono, 2=stereo)
+        :param rate: sample rate in Hz (e.g., 24000)
+        :param sample_width: sample width in bytes (e.g., 2 for 16-bit)
+        """
+        # Ensure parent directory exists
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+        # Write PCM payload as WAV
+        with wave.open(filename, "wb") as wf:
+            wf.setnchannels(channels)
+            wf.setsampwidth(sample_width)  # bytes per sample (2 -> 16-bit)
+            wf.setframerate(rate)
+            wf.writeframes(pcm_bytes)
+
+    def _normalize_model_name(self, model: str) -> str:
+        """
+        Normalize model id (strip optional 'models/' prefix).
+
+        :param model: model id
+        """
+        try:
+            return model.split("/")[-1]
+        except Exception:
+            return model
+
+    def is_configured(self) -> bool:
+        """
+        Check if provider is configured
+
+        :return: True if configured, False otherwise
+        """
+        api_key = self.plugin.window.core.config.get("api_key_google")
+        return api_key is not None and api_key != ""
+
+    def get_config_message(self) -> str:
+        """
+        Return message to display when provider is not configured
+
+        :return: message
+        """
+        return "Google GenAI API key is not set yet. Please configure it in settings."
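
The provider above boils down to one generate_content() call plus a WAV wrapper: Gemini TTS returns raw 16-bit mono PCM at 24 kHz with no container, which is why _save_wav() has to add the header itself. A minimal standalone sketch of the same round trip, assuming GOOGLE_API_KEY is set; speech.wav is a placeholder path, and the base64-string fallback that _extract_pcm_bytes() handles defensively is omitted here:

import wave

from google import genai
from google.genai import types

client = genai.Client()  # reads GOOGLE_API_KEY from the environment
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Hello from PyGPT!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

pcm = response.candidates[0].content.parts[0].inline_data.data  # raw PCM bytes
with wave.open("speech.wav", "wb") as wf:  # placeholder output path
    wf.setnchannels(1)       # mono
    wf.setsampwidth(2)       # 16-bit samples
    wf.setframerate(24000)   # Gemini TTS output rate
    wf.writeframes(pcm)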

pygpt_net/provider/audio_output/google_tts.py
@@ -58,18 +58,6 @@ class GoogleTextToSpeech(BaseProvider):
                 "Voices": "https://cloud.google.com/text-to-speech/docs/voices"
             },
         )
-        self.plugin.add_option(
-            "google_voice_native",
-            type="text",
-            value="Kore",
-            label="Voice (Gemini API)",
-            tab="google",
-            description="Specify voice for Gemini API (supported voices may differ)",
-            tooltip="Voice name",
-            urls={
-                "Voices": "https://ai.google.dev/gemini-api/docs/speech-generation"
-            },
-        )
         self.plugin.add_option(
             "google_lang",
             type="text",

pygpt_net/provider/audio_output/openai_tts.py
@@ -6,7 +6,7 @@
 # GitHub:  https://github.com/szczyglis-dev/py-gpt   #
 # MIT License                                        #
 # Created By  : Marcin Szczygliński                  #
-# Updated Date: 2025.08.
+# Updated Date: 2025.08.29 18:00:00                  #
 # ================================================== #
 
 import os
@@ -51,6 +51,9 @@ class OpenAITextToSpeech(BaseProvider):
             use="audio_tts_whisper_voices",
             description="Specify voice, available voices: "
                         "alloy, echo, fable, onyx, nova, shimmer",
+            urls={
+                "Voices": "https://platform.openai.com/docs/guides/text-to-speech/voice-options"
+            },
         )
 
     def speech(self, text: str) -> str:
@@ -65,10 +68,10 @@ class OpenAITextToSpeech(BaseProvider):
         voice = self.plugin.get_option_value('openai_voice')
         model = self.plugin.get_option_value('openai_model')
         allowed_voices = self.plugin.window.core.audio.whisper.get_voices()
-        if model not in self.allowed_models:
-            model = 'tts-1'
-        if voice not in allowed_voices:
-            voice = 'alloy'
+        # if model not in self.allowed_models:
+        #     model = 'tts-1'
+        # if voice not in allowed_voices:
+        #     voice = 'alloy'
         path = os.path.join(
             self.plugin.window.core.config.path,
             output_file,