pygpt-net 2.6.30__py3-none-any.whl → 2.6.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +7 -1
  4. pygpt_net/app_core.py +3 -1
  5. pygpt_net/config.py +3 -1
  6. pygpt_net/controller/__init__.py +9 -2
  7. pygpt_net/controller/audio/audio.py +38 -1
  8. pygpt_net/controller/audio/ui.py +2 -2
  9. pygpt_net/controller/chat/audio.py +1 -8
  10. pygpt_net/controller/chat/common.py +23 -62
  11. pygpt_net/controller/chat/handler/__init__.py +0 -0
  12. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  13. pygpt_net/controller/chat/output.py +8 -3
  14. pygpt_net/controller/chat/stream.py +3 -1071
  15. pygpt_net/controller/chat/text.py +3 -2
  16. pygpt_net/controller/kernel/kernel.py +11 -3
  17. pygpt_net/controller/kernel/reply.py +5 -1
  18. pygpt_net/controller/lang/custom.py +2 -2
  19. pygpt_net/controller/media/__init__.py +12 -0
  20. pygpt_net/controller/media/media.py +115 -0
  21. pygpt_net/controller/realtime/__init__.py +12 -0
  22. pygpt_net/controller/realtime/manager.py +53 -0
  23. pygpt_net/controller/realtime/realtime.py +293 -0
  24. pygpt_net/controller/ui/mode.py +23 -2
  25. pygpt_net/controller/ui/ui.py +19 -1
  26. pygpt_net/core/audio/audio.py +6 -1
  27. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  28. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  29. pygpt_net/core/audio/backend/native/player.py +139 -0
  30. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  31. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  32. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  33. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  34. pygpt_net/core/audio/backend/pyaudio/realtime.py +312 -0
  35. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  36. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  37. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  38. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  39. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  40. pygpt_net/core/audio/backend/shared/player.py +137 -0
  41. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  42. pygpt_net/core/audio/capture.py +5 -0
  43. pygpt_net/core/audio/output.py +14 -2
  44. pygpt_net/core/audio/whisper.py +6 -2
  45. pygpt_net/core/bridge/bridge.py +2 -1
  46. pygpt_net/core/bridge/worker.py +4 -1
  47. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  48. pygpt_net/core/events/__init__.py +2 -1
  49. pygpt_net/core/events/realtime.py +55 -0
  50. pygpt_net/core/image/image.py +56 -5
  51. pygpt_net/core/realtime/__init__.py +0 -0
  52. pygpt_net/core/realtime/options.py +87 -0
  53. pygpt_net/core/realtime/shared/__init__.py +0 -0
  54. pygpt_net/core/realtime/shared/audio.py +213 -0
  55. pygpt_net/core/realtime/shared/loop.py +64 -0
  56. pygpt_net/core/realtime/shared/session.py +59 -0
  57. pygpt_net/core/realtime/shared/text.py +37 -0
  58. pygpt_net/core/realtime/shared/tools.py +276 -0
  59. pygpt_net/core/realtime/shared/turn.py +38 -0
  60. pygpt_net/core/realtime/shared/types.py +16 -0
  61. pygpt_net/core/realtime/worker.py +160 -0
  62. pygpt_net/core/render/web/body.py +24 -3
  63. pygpt_net/core/text/utils.py +54 -2
  64. pygpt_net/core/types/__init__.py +1 -0
  65. pygpt_net/core/types/image.py +54 -0
  66. pygpt_net/core/video/__init__.py +12 -0
  67. pygpt_net/core/video/video.py +290 -0
  68. pygpt_net/data/config/config.json +26 -5
  69. pygpt_net/data/config/models.json +221 -103
  70. pygpt_net/data/config/settings.json +244 -6
  71. pygpt_net/data/css/web-blocks.css +6 -0
  72. pygpt_net/data/css/web-chatgpt.css +6 -0
  73. pygpt_net/data/css/web-chatgpt_wide.css +6 -0
  74. pygpt_net/data/locale/locale.de.ini +35 -7
  75. pygpt_net/data/locale/locale.en.ini +56 -17
  76. pygpt_net/data/locale/locale.es.ini +35 -7
  77. pygpt_net/data/locale/locale.fr.ini +35 -7
  78. pygpt_net/data/locale/locale.it.ini +35 -7
  79. pygpt_net/data/locale/locale.pl.ini +38 -7
  80. pygpt_net/data/locale/locale.uk.ini +35 -7
  81. pygpt_net/data/locale/locale.zh.ini +31 -3
  82. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  83. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  84. pygpt_net/data/locale/plugin.cmd_web.en.ini +8 -0
  85. pygpt_net/item/model.py +22 -1
  86. pygpt_net/plugin/audio_input/plugin.py +37 -4
  87. pygpt_net/plugin/audio_input/simple.py +57 -8
  88. pygpt_net/plugin/cmd_files/worker.py +3 -0
  89. pygpt_net/provider/api/google/__init__.py +76 -7
  90. pygpt_net/provider/api/google/audio.py +8 -1
  91. pygpt_net/provider/api/google/chat.py +45 -6
  92. pygpt_net/provider/api/google/image.py +226 -86
  93. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  94. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  95. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  96. pygpt_net/provider/api/google/video.py +364 -0
  97. pygpt_net/provider/api/openai/__init__.py +22 -2
  98. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  99. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  100. pygpt_net/provider/api/openai/realtime/realtime.py +193 -0
  101. pygpt_net/provider/audio_input/google_genai.py +103 -0
  102. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  103. pygpt_net/provider/audio_output/google_tts.py +0 -12
  104. pygpt_net/provider/audio_output/openai_tts.py +8 -5
  105. pygpt_net/provider/core/config/patch.py +241 -178
  106. pygpt_net/provider/core/model/patch.py +28 -2
  107. pygpt_net/provider/llms/google.py +8 -9
  108. pygpt_net/provider/web/duckduck_search.py +212 -0
  109. pygpt_net/ui/layout/toolbox/audio.py +55 -0
  110. pygpt_net/ui/layout/toolbox/footer.py +14 -42
  111. pygpt_net/ui/layout/toolbox/image.py +7 -13
  112. pygpt_net/ui/layout/toolbox/raw.py +52 -0
  113. pygpt_net/ui/layout/toolbox/split.py +48 -0
  114. pygpt_net/ui/layout/toolbox/toolbox.py +8 -8
  115. pygpt_net/ui/layout/toolbox/video.py +49 -0
  116. pygpt_net/ui/widget/option/combo.py +15 -1
  117. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/METADATA +46 -22
  118. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/RECORD +121 -73
  119. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  120. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/LICENSE +0 -0
  121. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/WHEEL +0 -0
  122. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2025.09.01 23:00:00 #
10
+ # ================================================== #
11
+
12
+ import json
13
+ from typing import Optional, Dict, Any
14
+
15
+ from pygpt_net.core.bridge import BridgeContext
16
+ from pygpt_net.core.events import RealtimeEvent
17
+ from pygpt_net.core.realtime.options import RealtimeOptions
18
+ from pygpt_net.core.realtime.shared.session import extract_last_session_id
19
+ from pygpt_net.item.model import ModelItem
20
+ from pygpt_net.utils import trans
21
+
22
+ from .client import OpenAIRealtimeClient
23
+
24
class Realtime:

    PROVIDER = "openai"

    def __init__(self, window=None):
        """
        OpenAI API realtime controller

        :param window: Window instance
        """
        self.window = window
        self.handler = OpenAIRealtimeClient(window)
        # Turn-detection settings most recently applied to the session;
        # begin() compares the current config against these to decide whether
        # an active session needs a live update.
        self.prev_auto_turn = False
        self.prev_vad_silence = 2000
        self.prev_vad_prefix = 300

    def begin(
            self,
            context: BridgeContext,
            model: Optional[ModelItem] = None,
            extra: Optional[Dict[str, Any]] = None,
            rt_signals=None
    ) -> bool:
        """
        Begin realtime session if applicable

        Depending on the state this either forwards tool results to the
        handler, refreshes an already active session (auto-turn with an empty
        prompt), or starts/appends to a session through the realtime manager.

        :param context: BridgeContext
        :param model: Optional[ModelItem]
        :param extra: Optional dict with extra parameters
        :param rt_signals: RealtimeSignals
        :return: True if the request was handled by the realtime path,
                 False if starting the session failed (caller should fall back)
        """
        config = self.window.core.config
        mm = context.multimodal_ctx
        # audio payload is only forwarded when the context marks it as audio input
        audio_bytes = getattr(mm, "audio_data", None) if mm and getattr(mm, "is_audio_input", False) else None
        audio_format = getattr(mm, "audio_format", None) if mm else None
        audio_rate = getattr(mm, "audio_rate", None) if mm else None
        is_debug = config.get("log.realtime", False)
        auto_turn = config.get("audio.input.auto_turn", True)
        opt_vad_silence = config.get("audio.input.vad.silence", 2000)
        opt_vad_prefix = config.get("audio.input.vad.prefix", 300)

        # setup manager
        self.window.controller.realtime.set_current_active(self.PROVIDER)
        self.window.controller.realtime.set_busy()
        self.handler.set_debug(is_debug)

        # tools
        tools = self.window.core.api.openai.tools.prepare(model, context.external_functions)

        # remote tools
        remote_tools = self.window.core.api.openai.remote_tools.append_to_tools(
            mode=context.mode,
            model=model,
            stream=context.stream,
            is_expert_call=context.is_expert_call,
            tools=[],
            preset=context.preset,
        )

        # handle sub-reply (tool results from tool calls)
        if self._send_tool_results(context):
            return True  # do not start new session, just send tool results

        # update auto-turn in active session
        self._sync_turn_detection(auto_turn, opt_vad_silence, opt_vad_prefix)

        # if auto-turn is enabled and prompt is empty, update session and context only
        if auto_turn and self.handler.is_session_active() and (context.prompt.strip() == "" or context.prompt == "..."):
            self.handler.update_session_tools_sync(tools, remote_tools)
            self.handler.update_ctx(context.ctx)
            self.window.update_status(trans("speech.listening"))
            return True  # do not send new request if session is active

        # Last session ID
        last_session_id = extract_last_session_id(context.history)
        if is_debug:
            print("[realtime session] Last ID", last_session_id)

        # Voice
        voice = self._resolve_voice()

        # Options
        opts = RealtimeOptions(
            provider=self.PROVIDER,
            model=context.model.id,
            system_prompt=context.system_prompt,
            prompt=context.prompt,
            voice=voice,
            audio_data=audio_bytes,
            audio_format=audio_format,
            audio_rate=audio_rate,
            vad="server_vad",
            extra=extra or {},
            tools=tools,
            remote_tools=remote_tools,
            rt_signals=rt_signals,
            rt_session_id=last_session_id,
            auto_turn=auto_turn,
            vad_end_silence_ms=opt_vad_silence,
            vad_prefix_padding_ms=opt_vad_prefix,
        )

        # Start or append to realtime session via manager
        try:
            if is_debug:
                print("[realtime] Starting session with options:", opts.to_dict())
            rt = self.window.controller.realtime.manager
            rt.start(context.ctx, opts)

            self.prev_auto_turn = auto_turn
            self.prev_vad_silence = opt_vad_silence
            self.prev_vad_prefix = opt_vad_prefix
            return True
        except Exception as e:
            self.window.core.debug.log(e)
            return False  # fallback to non-live path

    def _send_tool_results(self, context: BridgeContext) -> bool:
        """
        Forward tool results for an internal (sub-reply) context, if any

        :param context: BridgeContext
        :return: True if tool results were sent to the handler, False otherwise
        """
        ctx = context.ctx
        if not ctx.internal:
            return False
        if not (ctx.prev_ctx and ctx.prev_ctx.extra.get("prev_tool_calls")):
            return False

        tool_calls = ctx.prev_ctx.extra.get("prev_tool_calls", [])
        tool_call_id = None
        if isinstance(tool_calls, list) and len(tool_calls) > 0:
            first_call = tool_calls[0]
            tool_call_id = first_call.get("call_id", "")  # get first call_id
            if not tool_call_id:
                tool_call_id = first_call.get("id", "")  # fallback to id
        if not tool_call_id:
            return False

        tool_results = ctx.input
        try:
            tool_results = json.loads(tool_results)
        except Exception:
            # input is not JSON: pass it to the handler unchanged
            pass
        self.handler.send_tool_results_sync({
            tool_call_id: tool_results
        })
        self.handler.update_ctx(ctx)
        return True

    def _sync_turn_detection(self, auto_turn, vad_silence, vad_prefix):
        """
        Push changed auto-turn / VAD settings to the active session

        The applied values are remembered afterwards, so an unchanged
        configuration is not re-sent on every subsequent begin() call.

        :param auto_turn: auto-turn enabled flag from config
        :param vad_silence: VAD silence value from config
        :param vad_prefix: VAD prefix value from config
        """
        if not self.handler.is_session_active():
            return
        applied = (self.prev_auto_turn, self.prev_vad_silence, self.prev_vad_prefix)
        if (auto_turn, vad_silence, vad_prefix) == applied:
            return
        self.handler.update_session_autoturn_sync(auto_turn, vad_silence, vad_prefix)
        self.prev_auto_turn = auto_turn
        self.prev_vad_silence = vad_silence
        self.prev_vad_prefix = vad_prefix

    def _resolve_voice(self) -> str:
        """
        Read the voice from the audio_output plugin option "openai_voice"

        :return: configured voice, or "alloy" if unset or the lookup fails
        """
        voice = "alloy"
        try:
            v = self.window.core.plugins.get_option("audio_output", "openai_voice")
            if v:
                voice = str(v)
        except Exception:
            # best-effort lookup: keep the default voice
            pass
        return voice

    def handle_audio_input(self, event: RealtimeEvent):
        """
        Handle Realtime audio input event

        :param event: RealtimeEvent
        """
        self.handler.rt_handle_audio_input_sync(event)

    def manual_commit(self):
        """Manually commit audio input to realtime session"""
        self.handler.force_response_now_sync()

    def shutdown(self):
        """
        Shutdown realtime loops

        Closes the session if one is active and then stops the handler loop.
        The loop is stopped even when closing the session raises; that error
        still propagates to the caller.
        """
        try:
            if self.handler.is_session_active():
                self.handler.close_session_sync()
        finally:
            try:
                self.handler.stop_loop_sync()
            except Exception:
                # stopping the loop is best-effort
                pass

    def reset(self):
        """Close realtime session"""
        if self.handler.is_session_active():
            self.handler.close_session_sync()
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2025.08.29 18:00:00 #
10
+ # ================================================== #
11
+
12
+ from .base import BaseProvider
13
+
14
+
15
class GoogleGenAIAudioInput(BaseProvider):

    PROMPT_TRANSCRIBE = (
        "You are a speech-to-text transcriber. "
        "Return only the verbatim transcript as plain text. "
        "Do not add any explanations, timestamps, labels or formatting."
    )

    # Model used when the "google_genai_audio_model" option is empty
    DEFAULT_MODEL = "gemini-2.5-flash"

    def __init__(self, *args, **kwargs):
        """
        Google GenAI (Gemini) audio provider for transcription (via API).

        :param args: args
        :param kwargs: kwargs
        """
        super(GoogleGenAIAudioInput, self).__init__(*args, **kwargs)
        self.plugin = kwargs.get("plugin")
        self.id = "google_genai"
        self.name = "Google GenAI"

    def init_options(self):
        """Initialize options"""
        # Keep option shape consistent with Whisper provider
        self.plugin.add_option(
            "google_genai_audio_model",
            type="text",
            value=self.DEFAULT_MODEL,
            label="Model",
            tab="google_genai",
            description="Specify Gemini model supporting audio, e.g., gemini-2.5-flash",
        )
        self.plugin.add_option(
            "google_genai_audio_prompt",
            type="textarea",
            value=self.PROMPT_TRANSCRIBE,
            label="System Prompt",
            tab="google_genai",
            description="System prompt to guide the transcription output",
            tooltip="System prompt for transcription",
            persist=True,
        )

    def transcribe(self, path: str) -> str:
        """
        Audio to text transcription using Google GenAI (Gemini).

        The audio file is uploaded via the Files API, passed to the model as
        the only content part, and the upload is removed again afterwards.

        :param path: path to audio file to transcribe
        :return: transcribed text (empty string if the response has no text)
        """
        # Get pre-configured GenAI client from the app core
        client = self.plugin.window.core.api.google.get_client()

        # Resolve options; fall back to defaults when an option is left empty
        model_name = self.plugin.get_option_value("google_genai_audio_model") or self.DEFAULT_MODEL
        config = {
            "system_instruction": self.plugin.get_option_value("google_genai_audio_prompt") or self.PROMPT_TRANSCRIBE,
            "temperature": 0.0,
        }

        # Upload the audio file via the Files API
        uploaded_file = client.files.upload(file=path)
        try:
            # Generate content (transcription) with the selected model
            response = client.models.generate_content(
                model=model_name,
                contents=[uploaded_file],
                config=config,
            )
            # The SDK exposes the unified .text property for convenience
            return response.text or ""
        finally:
            self._delete_uploaded(client, uploaded_file)

    def _delete_uploaded(self, client, uploaded_file):
        """
        Remove a temporary upload from the Files API (best-effort).

        :param client: GenAI client used for the upload
        :param uploaded_file: file object returned by client.files.upload()
        """
        name = getattr(uploaded_file, "name", None)
        if not name:
            return
        try:
            client.files.delete(name=name)
        except Exception:
            # cleanup must never mask the transcription result or its error
            pass

    def is_configured(self) -> bool:
        """
        Check if provider is configured

        :return: True if the "api_key_google" config value is set, False otherwise
        """
        api_key = self.plugin.window.core.config.get("api_key_google")
        return api_key is not None and api_key != ""

    def get_config_message(self) -> str:
        """
        Return message to display when provider is not configured

        :return: message
        """
        return "Google GenAI API key is not set yet. Please configure it in settings."
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2025.08.29 18:00:00 #
10
+ # ================================================== #
11
+
12
+ import os
13
+ import wave
14
+ import base64
15
+
16
+ from .base import BaseProvider
17
+
18
+
19
class GoogleGenAITextToSpeech(BaseProvider):

    # Defaults used when the corresponding option is empty
    DEFAULT_MODEL = "gemini-2.5-flash-preview-tts"
    DEFAULT_VOICE = "Kore"

    def __init__(self, *args, **kwargs):
        """
        Google GenAI Text-to-Speech provider (Gemini TTS via API).

        :param args: args
        :param kwargs: kwargs
        """
        super(GoogleGenAITextToSpeech, self).__init__(*args, **kwargs)
        self.plugin = kwargs.get("plugin")
        self.id = "google_genai_tts"
        self.name = "Google GenAI TTS"

        # Supported preview TTS models (speech() falls back to flash if the
        # configured model is not in this list)
        self.allowed_models = [
            "gemini-2.5-flash-preview-tts",
            "gemini-2.5-pro-preview-tts",
        ]

        # Prebuilt voice names exposed by Gemini TTS.
        # Informational only: speech() passes the configured voice through
        # without checking it against this list.
        self.allowed_voices = [
            "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus",
            "Aoede", "Callirrhoe", "Autonoe", "Enceladus", "Iapetus",
            "Umbriel", "Algieba", "Despina", "Erinome", "Algenib",
            "Rasalgethi", "Laomedeia", "Achernar", "Alnilam", "Schedar",
            "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
            "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
        ]

    def init_options(self):
        """Initialize options"""
        # Keep option names consistent with the app style; simple text fields are enough.
        self.plugin.add_option(
            "google_genai_tts_model",
            type="text",
            value=self.DEFAULT_MODEL,
            label="Model",
            tab="google_genai_tts",
            description="Specify Gemini TTS model, e.g.: gemini-2.5-flash-preview-tts or gemini-2.5-pro-preview-tts",
        )
        self.plugin.add_option(
            "google_genai_tts_voice",
            type="text",
            value=self.DEFAULT_VOICE,
            label="Voice",
            tab="google_genai_tts",
            description="Specify voice, e.g.: Puck, Kore, Charon, Leda, Zephyr... (case-sensitive)",
            urls={
                "Voices": "https://ai.google.dev/gemini-api/docs/speech-generation"
            },
        )

    def speech(self, text: str) -> str:
        """
        Text to speech synthesis using Google GenAI (Gemini TTS).

        :param text: text to synthesize
        :return: path to generated audio file
        :raises RuntimeError: if the google.genai SDK cannot be imported or
                              the response carries no usable audio payload
        """
        # Get pre-configured GenAI client
        client = self.plugin.window.core.api.google.get_client()

        # Resolve path where audio should be written
        output_file = self.plugin.output_file
        path = os.path.join(self.plugin.window.core.config.path, output_file)

        # Validate/select model
        model = self.plugin.get_option_value("google_genai_tts_model") or self.DEFAULT_MODEL
        model = self._normalize_model_name(model)
        if model not in self.allowed_models:
            model = self.DEFAULT_MODEL

        # Select voice (not validated against allowed_voices)
        voice = self.plugin.get_option_value("google_genai_tts_voice") or self.DEFAULT_VOICE

        # Build generation config for audio modality + voice
        # Using explicit types for clarity and forward-compatibility
        try:
            from google.genai import types
        except Exception as ex:
            # Fail fast if SDK is missing or incompatible
            raise RuntimeError("google.genai SDK is not available. Please install/update Google GenAI SDK.") from ex

        gen_config = types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice
                    )
                )
            ),
            temperature=0.8,  # balanced default; keep configurable later if needed
        )

        # Perform TTS request
        response = client.models.generate_content(
            model=model,
            contents=text,
            config=gen_config,
        )

        # Extract PCM bytes from the first candidate
        pcm = self._extract_pcm_bytes(response)

        # Persist as standard WAV (PCM 16-bit, mono, 24 kHz)
        self._save_wav(path, pcm, channels=1, rate=24000, sample_width=2)

        return str(path)

    def _extract_pcm_bytes(self, response) -> bytes:
        """
        Extract PCM bytes from generate_content response.

        Scans the parts of the first candidate and uses the first one that
        carries a non-empty inline_data.data payload.

        :param response: Google GenAI response object
        :return: raw PCM byte data
        :raises RuntimeError: if no payload is found or its type cannot be
                              converted to bytes
        """
        data = None
        try:
            for part in response.candidates[0].content.parts:
                inline = getattr(part, "inline_data", None)
                payload = getattr(inline, "data", None) if inline else None
                if payload:
                    data = payload
                    break
        except Exception:
            # unexpected response layout: handled as "no audio" below
            pass

        if data is None:
            raise RuntimeError("No audio data returned by Gemini TTS response.")

        # Normalize to raw bytes
        if isinstance(data, (bytes, bytearray)):
            return bytes(data)
        if isinstance(data, str):
            # Fallback: treat as base64-encoded PCM
            return base64.b64decode(data)

        # Last resort: try bytes() cast
        try:
            return bytes(data)
        except Exception as ex:
            raise RuntimeError("Unsupported audio payload type returned by Gemini TTS.") from ex

    def _save_wav(
            self,
            filename: str,
            pcm_bytes: bytes,
            channels: int = 1,
            rate: int = 24000,
            sample_width: int = 2
    ):
        """
        Save raw PCM bytes to a WAV file.

        :param filename: output WAV file path
        :param pcm_bytes: raw PCM byte data
        :param channels: number of audio channels (1=mono, 2=stereo)
        :param rate: sample rate in Hz (e.g., 24000)
        :param sample_width: sample width in bytes (e.g., 2 for 16-bit)
        """
        # Ensure parent directory exists; a bare filename has no parent
        # component and os.makedirs("") would raise
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Write PCM payload as WAV
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)  # bytes per sample (2 -> 16-bit)
            wf.setframerate(rate)
            wf.writeframes(pcm_bytes)

    def _normalize_model_name(self, model: str) -> str:
        """
        Normalize model id (strip optional 'models/' prefix).

        :param model: model id
        :return: last "/"-separated segment of the id, or the input unchanged
                 if it cannot be split
        """
        try:
            return model.split("/")[-1]
        except Exception:
            return model

    def is_configured(self) -> bool:
        """
        Check if provider is configured

        :return: True if the "api_key_google" config value is set, False otherwise
        """
        api_key = self.plugin.window.core.config.get("api_key_google")
        return api_key is not None and api_key != ""

    def get_config_message(self) -> str:
        """
        Return message to display when provider is not configured

        :return: message
        """
        return "Google GenAI API key is not set yet. Please configure it in settings."
@@ -58,18 +58,6 @@ class GoogleTextToSpeech(BaseProvider):
58
58
  "Voices": "https://cloud.google.com/text-to-speech/docs/voices"
59
59
  },
60
60
  )
61
- self.plugin.add_option(
62
- "google_voice_native",
63
- type="text",
64
- value="Kore",
65
- label="Voice (Gemini API)",
66
- tab="google",
67
- description="Specify voice for Gemini API (supported voices may differ)",
68
- tooltip="Voice name",
69
- urls={
70
- "Voices": "https://ai.google.dev/gemini-api/docs/speech-generation"
71
- },
72
- )
73
61
  self.plugin.add_option(
74
62
  "google_lang",
75
63
  type="text",
@@ -6,7 +6,7 @@
6
6
  # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
7
  # MIT License #
8
8
  # Created By : Marcin Szczygliński #
9
- # Updated Date: 2025.08.07 22:00:00 #
9
+ # Updated Date: 2025.08.29 18:00:00 #
10
10
  # ================================================== #
11
11
 
12
12
  import os
@@ -51,6 +51,9 @@ class OpenAITextToSpeech(BaseProvider):
51
51
  use="audio_tts_whisper_voices",
52
52
  description="Specify voice, available voices: "
53
53
  "alloy, echo, fable, onyx, nova, shimmer",
54
+ urls={
55
+ "Voices": "https://platform.openai.com/docs/guides/text-to-speech/voice-options"
56
+ },
54
57
  )
55
58
 
56
59
  def speech(self, text: str) -> str:
@@ -65,10 +68,10 @@ class OpenAITextToSpeech(BaseProvider):
65
68
  voice = self.plugin.get_option_value('openai_voice')
66
69
  model = self.plugin.get_option_value('openai_model')
67
70
  allowed_voices = self.plugin.window.core.audio.whisper.get_voices()
68
- if model not in self.allowed_models:
69
- model = 'tts-1'
70
- if voice not in allowed_voices:
71
- voice = 'alloy'
71
+ # if model not in self.allowed_models:
72
+ # model = 'tts-1'
73
+ # if voice not in allowed_voices:
74
+ # voice = 'alloy'
72
75
  path = os.path.join(
73
76
  self.plugin.window.core.config.path,
74
77
  output_file,