pygpt-net 2.6.30__py3-none-any.whl → 2.6.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +7 -1
  4. pygpt_net/app_core.py +3 -1
  5. pygpt_net/config.py +3 -1
  6. pygpt_net/controller/__init__.py +9 -2
  7. pygpt_net/controller/audio/audio.py +38 -1
  8. pygpt_net/controller/audio/ui.py +2 -2
  9. pygpt_net/controller/chat/audio.py +1 -8
  10. pygpt_net/controller/chat/common.py +23 -62
  11. pygpt_net/controller/chat/handler/__init__.py +0 -0
  12. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  13. pygpt_net/controller/chat/output.py +8 -3
  14. pygpt_net/controller/chat/stream.py +3 -1071
  15. pygpt_net/controller/chat/text.py +3 -2
  16. pygpt_net/controller/kernel/kernel.py +11 -3
  17. pygpt_net/controller/kernel/reply.py +5 -1
  18. pygpt_net/controller/lang/custom.py +2 -2
  19. pygpt_net/controller/media/__init__.py +12 -0
  20. pygpt_net/controller/media/media.py +115 -0
  21. pygpt_net/controller/realtime/__init__.py +12 -0
  22. pygpt_net/controller/realtime/manager.py +53 -0
  23. pygpt_net/controller/realtime/realtime.py +293 -0
  24. pygpt_net/controller/ui/mode.py +23 -2
  25. pygpt_net/controller/ui/ui.py +19 -1
  26. pygpt_net/core/audio/audio.py +6 -1
  27. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  28. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  29. pygpt_net/core/audio/backend/native/player.py +139 -0
  30. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  31. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  32. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  33. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  34. pygpt_net/core/audio/backend/pyaudio/realtime.py +312 -0
  35. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  36. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  37. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  38. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  39. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  40. pygpt_net/core/audio/backend/shared/player.py +137 -0
  41. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  42. pygpt_net/core/audio/capture.py +5 -0
  43. pygpt_net/core/audio/output.py +14 -2
  44. pygpt_net/core/audio/whisper.py +6 -2
  45. pygpt_net/core/bridge/bridge.py +2 -1
  46. pygpt_net/core/bridge/worker.py +4 -1
  47. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  48. pygpt_net/core/events/__init__.py +2 -1
  49. pygpt_net/core/events/realtime.py +55 -0
  50. pygpt_net/core/image/image.py +56 -5
  51. pygpt_net/core/realtime/__init__.py +0 -0
  52. pygpt_net/core/realtime/options.py +87 -0
  53. pygpt_net/core/realtime/shared/__init__.py +0 -0
  54. pygpt_net/core/realtime/shared/audio.py +213 -0
  55. pygpt_net/core/realtime/shared/loop.py +64 -0
  56. pygpt_net/core/realtime/shared/session.py +59 -0
  57. pygpt_net/core/realtime/shared/text.py +37 -0
  58. pygpt_net/core/realtime/shared/tools.py +276 -0
  59. pygpt_net/core/realtime/shared/turn.py +38 -0
  60. pygpt_net/core/realtime/shared/types.py +16 -0
  61. pygpt_net/core/realtime/worker.py +160 -0
  62. pygpt_net/core/render/web/body.py +24 -3
  63. pygpt_net/core/text/utils.py +54 -2
  64. pygpt_net/core/types/__init__.py +1 -0
  65. pygpt_net/core/types/image.py +54 -0
  66. pygpt_net/core/video/__init__.py +12 -0
  67. pygpt_net/core/video/video.py +290 -0
  68. pygpt_net/data/config/config.json +26 -5
  69. pygpt_net/data/config/models.json +221 -103
  70. pygpt_net/data/config/settings.json +244 -6
  71. pygpt_net/data/css/web-blocks.css +6 -0
  72. pygpt_net/data/css/web-chatgpt.css +6 -0
  73. pygpt_net/data/css/web-chatgpt_wide.css +6 -0
  74. pygpt_net/data/locale/locale.de.ini +35 -7
  75. pygpt_net/data/locale/locale.en.ini +56 -17
  76. pygpt_net/data/locale/locale.es.ini +35 -7
  77. pygpt_net/data/locale/locale.fr.ini +35 -7
  78. pygpt_net/data/locale/locale.it.ini +35 -7
  79. pygpt_net/data/locale/locale.pl.ini +38 -7
  80. pygpt_net/data/locale/locale.uk.ini +35 -7
  81. pygpt_net/data/locale/locale.zh.ini +31 -3
  82. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  83. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  84. pygpt_net/data/locale/plugin.cmd_web.en.ini +8 -0
  85. pygpt_net/item/model.py +22 -1
  86. pygpt_net/plugin/audio_input/plugin.py +37 -4
  87. pygpt_net/plugin/audio_input/simple.py +57 -8
  88. pygpt_net/plugin/cmd_files/worker.py +3 -0
  89. pygpt_net/provider/api/google/__init__.py +76 -7
  90. pygpt_net/provider/api/google/audio.py +8 -1
  91. pygpt_net/provider/api/google/chat.py +45 -6
  92. pygpt_net/provider/api/google/image.py +226 -86
  93. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  94. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  95. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  96. pygpt_net/provider/api/google/video.py +364 -0
  97. pygpt_net/provider/api/openai/__init__.py +22 -2
  98. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  99. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  100. pygpt_net/provider/api/openai/realtime/realtime.py +193 -0
  101. pygpt_net/provider/audio_input/google_genai.py +103 -0
  102. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  103. pygpt_net/provider/audio_output/google_tts.py +0 -12
  104. pygpt_net/provider/audio_output/openai_tts.py +8 -5
  105. pygpt_net/provider/core/config/patch.py +241 -178
  106. pygpt_net/provider/core/model/patch.py +28 -2
  107. pygpt_net/provider/llms/google.py +8 -9
  108. pygpt_net/provider/web/duckduck_search.py +212 -0
  109. pygpt_net/ui/layout/toolbox/audio.py +55 -0
  110. pygpt_net/ui/layout/toolbox/footer.py +14 -42
  111. pygpt_net/ui/layout/toolbox/image.py +7 -13
  112. pygpt_net/ui/layout/toolbox/raw.py +52 -0
  113. pygpt_net/ui/layout/toolbox/split.py +48 -0
  114. pygpt_net/ui/layout/toolbox/toolbox.py +8 -8
  115. pygpt_net/ui/layout/toolbox/video.py +49 -0
  116. pygpt_net/ui/widget/option/combo.py +15 -1
  117. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/METADATA +46 -22
  118. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/RECORD +121 -73
  119. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  120. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/LICENSE +0 -0
  121. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/WHEEL +0 -0
  122. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/realtime/realtime.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package #
+# Website: https://pygpt.net #
+# GitHub: https://github.com/szczyglis-dev/py-gpt #
+# MIT License #
+# Created By : Marcin Szczygliński #
+# Updated Date: 2025.08.31 23:00:00 #
+# ================================================== #
+
+import json
+from typing import Optional, Dict, Any
+
+from pygpt_net.core.events import RealtimeEvent
+from pygpt_net.core.realtime.options import RealtimeOptions
+from pygpt_net.core.bridge.context import BridgeContext
+from pygpt_net.core.realtime.shared.session import extract_last_session_id
+from pygpt_net.item.model import ModelItem
+
+from .client import GoogleLiveClient
+
+
+class Realtime:
+
+    PROVIDER = "google"
+
+    def __init__(self, window=None):
+        """
+        Google GenAI API realtime controller
+
+        :param window: Window instance
+        """
+        self.window = window
+        self.handler = GoogleLiveClient(window)
+        self.prev_auto_turn = False
+        self.prev_vad_silence = 2000
+        self.prev_vad_prefix = 300
+
+    def begin(
+        self,
+        context: BridgeContext,
+        model: Optional[ModelItem] = None,
+        extra: Optional[Dict[str, Any]] = None,
+        rt_signals=None
+    ) -> bool:
+        """
+        Begin realtime session if applicable
+
+        :param context: BridgeContext
+        :param model: Optional[ModelItem]
+        :param extra: Optional dict with extra parameters
+        :param rt_signals: Optional RealtimeSignals
+        :return: bool - True if realtime session started, False otherwise
+        """
+        # Build realtime options
+        mm = context.multimodal_ctx
+        audio_bytes = getattr(mm, "audio_data", None) if mm and getattr(mm, "is_audio_input", False) else None
+        audio_format = getattr(mm, "audio_format", None) if mm else None
+        audio_rate = getattr(mm, "audio_rate", None) if mm else None
+        is_debug = self.window.core.config.get("log.realtime", False)
+        auto_turn = self.window.core.config.get("audio.input.auto_turn", True)
+        opt_vad_silence = self.window.core.config.get("audio.input.vad.silence", 2000)
+        opt_vad_prefix = self.window.core.config.get("audio.input.vad.prefix", 300)
+
+        # setup manager
+        self.window.controller.realtime.set_current_active(self.PROVIDER)
+        self.window.controller.realtime.set_busy()
+        self.handler.set_debug(is_debug)
+
+        # handle sub-reply (tool results from tool calls)
+        if context.ctx.internal:
+            if context.ctx.prev_ctx and context.ctx.prev_ctx.extra.get("prev_tool_calls"):
+                tool_calls = context.ctx.prev_ctx.extra.get("prev_tool_calls", [])
+                tool_call_id = None
+                if isinstance(tool_calls, list) and len(tool_calls) > 0:
+                    tool_call_id = tool_calls[0].get("call_id", "")  # get first call_id
+                    if not tool_call_id:
+                        tool_call_id = tool_calls[0].get("id", "")  # fallback to id
+                if tool_call_id:
+                    tool_results = context.ctx.input
+                    try:
+                        tool_results = json.loads(tool_results)
+                    except Exception:
+                        pass
+                    self.handler.send_tool_results_sync({
+                        tool_call_id: tool_results
+                    })
+                    return True  # do not start new session, just send tool results
+
+        # update auto-turn in active session
+        if (self.handler.is_session_active()
+                and (auto_turn != self.prev_auto_turn
+                     or opt_vad_silence != self.prev_vad_silence
+                     or opt_vad_prefix != self.prev_vad_prefix)):
+            self.handler.update_session_autoturn_sync(auto_turn, opt_vad_silence, opt_vad_prefix)
+
+        # Tools
+        tools = self.window.core.api.google.tools.prepare(model, context.external_functions)
+        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        if tools:
+            remote_tools = []  # in Google, remote tools are not allowed if function calling is used
+
+        # if auto-turn is enabled and prompt is empty, update session and context only
+        if auto_turn and self.handler.is_session_active() and (context.prompt.strip() == "" or context.prompt == "..."):
+            self.handler.update_session_tools_sync(tools, remote_tools)
+            self.handler.update_ctx(context.ctx)
+            return True  # do not send new request if session is active
+
+        # Last session ID
+        last_session_id = extract_last_session_id(context.history)
+        if is_debug:
+            print("[realtime session] Last ID", last_session_id)
+
+        # Voice
+        voice_name = "Kore"
+        try:
+            v = self.window.core.plugins.get_option("audio_output", "google_genai_tts_voice")
+            if v:
+                mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse",
+                           "legend": "Legend"}
+                voice_name = mapping.get(str(v).strip().lower(), str(v))
+        except Exception:
+            pass
+
+        # Options
+        opts = RealtimeOptions(
+            provider=self.PROVIDER,
+            model=model.id,
+            system_prompt=context.system_prompt,
+            prompt=context.prompt,
+            voice=voice_name,
+            audio_data=audio_bytes,
+            audio_format=audio_format,
+            audio_rate=audio_rate,
+            vad=None,
+            extra=extra or {},
+            tools=tools,
+            remote_tools=remote_tools,
+            rt_signals=rt_signals,
+            rt_session_id=last_session_id,
+            auto_turn=auto_turn,
+            vad_end_silence_ms=opt_vad_silence,
+            vad_prefix_padding_ms=opt_vad_prefix,
+        )
+
+        # Start or append to realtime session via manager
+        try:
+            if is_debug:
+                print("[realtime] Starting session with options:", opts.to_dict())
+            rt = self.window.controller.realtime.manager
+            rt.start(context.ctx, opts)
+
+            self.prev_auto_turn = auto_turn
+            self.prev_vad_silence = opt_vad_silence
+            self.prev_vad_prefix = opt_vad_prefix
+            return True
+        except Exception as e:
+            self.window.core.debug.log(e)
+            return False  # fallback to non-live path
+
+    def handle_audio_input(self, event: RealtimeEvent):
+        """
+        Handle Realtime audio input event
+
+        :param event: RealtimeEvent
+        """
+        self.handler.rt_handle_audio_input_sync(event)
+
+    def manual_commit(self):
+        """Manually commit audio input to realtime session"""
+        self.handler.force_response_now_sync()
+
+    def shutdown(self):
+        """Shutdown realtime loops"""
+        if self.handler.is_session_active():
+            self.handler.close_session_sync()
+        try:
+            self.handler.stop_loop_sync()
+        except Exception:
+            pass
+
+    def reset(self):
+        """Close realtime session"""
+        if self.handler.is_session_active():
+            self.handler.close_session_sync()
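
Note: a minimal sketch of how this controller is reached. The actual wiring of Realtime into the Google API object lives in pygpt_net/provider/api/google/__init__.py (changed above, +76 -7); the attribute path window.core.api.google.realtime and the exact BridgeContext fields below are assumptions for illustration, mirroring the OpenAI wiring shown later in this diff:

# Hypothetical caller -- not part of this diff; names are illustrative only.
from pygpt_net.core.bridge.context import BridgeContext

context = BridgeContext(
    ctx=ctx_item,           # current CtxItem
    prompt="Describe what you hear",
    system_prompt="You are a helpful assistant",
    history=history,        # prior items, scanned by extract_last_session_id()
)
started = window.core.api.google.realtime.begin(
    context=context,
    model=model_item,       # ModelItem for a Live-capable Gemini model
    rt_signals=rt_signals,  # RealtimeSignals that stream audio back to the UI
)
if not started:
    pass  # begin() returned False: fall back to the regular, non-live chat path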
pygpt_net/provider/api/google/video.py
@@ -0,0 +1,364 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package #
+# Website: https://pygpt.net #
+# GitHub: https://github.com/szczyglis-dev/py-gpt #
+# MIT License #
+# Created By : Marcin Szczygliński #
+# Updated Date: 2025.09.01 23:00:00 #
+# ================================================== #
+
+import base64, datetime, os, requests
+import mimetypes
+import time
+
+from typing import Optional, Dict, Any, List
+from google import genai
+from google.genai import types as gtypes
+
+from PySide6.QtCore import QObject, Signal, QRunnable, Slot
+
+from pygpt_net.core.events import KernelEvent
+from pygpt_net.core.bridge.context import BridgeContext
+from pygpt_net.item.ctx import CtxItem
+from pygpt_net.utils import trans
+
+
+class Video:
+
+    MODE_GENERATE = "generate"
+    MODE_IMAGE_TO_VIDEO = "image2video"
+
+    def __init__(self, window=None):
+        self.window = window
+        self.worker = None
+
+    def generate(
+        self,
+        context: BridgeContext,
+        extra: Optional[Dict[str, Any]] = None,
+        sync: bool = True
+    ) -> bool:
+        """
+        Generate video(s) using Google GenAI Veo.
+
+        :param context: BridgeContext with prompt, model, attachments
+        :param extra: extra parameters (num, inline, duration, aspect_ratio)
+        :param sync: run synchronously (blocking) if True
+        :return: True if started
+        """
+        extra = extra or {}
+        ctx = context.ctx or CtxItem()
+        model = context.model
+        prompt = context.prompt
+        num = int(extra.get("num", 1))
+        inline = bool(extra.get("inline", False))
+
+        # decide sub-mode based on attachments (image-to-video when image is attached)
+        sub_mode = self.MODE_GENERATE
+        attachments = context.attachments or {}
+        if self._has_image_attachment(attachments):
+            sub_mode = self.MODE_IMAGE_TO_VIDEO
+
+        # model used to improve the prompt (not video model)
+        prompt_model = self.window.core.models.from_defaults()
+        tmp = self.window.core.config.get('video.prompt_model')
+        if self.window.core.models.has(tmp):
+            prompt_model = self.window.core.models.get(tmp)
+
+        worker = VideoWorker()
+        worker.window = self.window
+        worker.client = self.window.core.api.google.get_client()
+        worker.ctx = ctx
+        worker.mode = sub_mode
+        worker.attachments = attachments
+        worker.model = model.id  # Veo model id
+        worker.input_prompt = prompt
+        worker.model_prompt = prompt_model  # LLM for prompt rewriting
+        worker.system_prompt = self.window.core.prompt.get('video')
+        worker.raw = self.window.core.config.get('img_raw')
+        worker.num = num
+        worker.inline = inline
+
+        # optional params
+        worker.aspect_ratio = str(extra.get("aspect_ratio") or self.window.core.config.get('video.aspect_ratio') or "16:9")
+        worker.duration_seconds = int(extra.get("duration") or self.window.core.config.get('video.duration') or 8)
+        worker.fps = int(extra.get("fps") or self.window.core.config.get('video.fps') or 24)
+        worker.seed = extra.get("seed") or self.window.core.config.get('video.seed') or None
+        worker.negative_prompt = extra.get("negative_prompt") or self.window.core.config.get('video.negative_prompt') or None
+        worker.generate_audio = bool(extra.get("generate_audio", self.window.core.config.get('video.generate_audio') or False))
+        worker.resolution = (extra.get("resolution") or self.window.core.config.get('video.resolution') or "720p")
+
+        self.worker = worker
+        self.worker.signals.finished.connect(self.window.core.video.handle_finished)
+        self.worker.signals.finished_inline.connect(self.window.core.video.handle_finished_inline)
+        self.worker.signals.status.connect(self.window.core.video.handle_status)
+        self.worker.signals.error.connect(self.window.core.video.handle_error)
+
+        if sync or not self.window.controller.kernel.async_allowed(ctx):
+            self.worker.run()
+            return True
+
+        self.window.dispatch(KernelEvent(KernelEvent.STATE_BUSY, {"id": "video"}))
+        self.window.threadpool.start(self.worker)
+        return True
+
+    def _has_image_attachment(self, attachments: Dict[str, Any]) -> bool:
+        """Check if at least one image attachment is present."""
+        for _, att in (attachments or {}).items():
+            try:
+                p = getattr(att, "path", None)
+                if p and os.path.exists(p):
+                    mt, _ = mimetypes.guess_type(p)
+                    if mt and mt.startswith("image/"):
+                        return True
+            except Exception:
+                continue
+        return False
+
+
+class VideoSignals(QObject):
+    finished = Signal(object, list, str)  # ctx, paths, prompt
+    finished_inline = Signal(object, list, str)  # ctx, paths, prompt
+    status = Signal(object)  # message
+    error = Signal(object)  # exception
+
+
+class VideoWorker(QRunnable):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.signals = VideoSignals()
+        self.window = None
+        self.client: Optional[genai.Client] = None
+        self.ctx: Optional[CtxItem] = None
+
+        # params
+        self.mode = Video.MODE_GENERATE
+        self.attachments: Dict[str, Any] = {}
+        self.model = "veo-3.0-generate-001"
+        self.model_prompt = None
+        self.input_prompt = ""
+        self.system_prompt = ""
+        self.inline = False
+        self.raw = False
+        self.num = 1
+
+        # video generation params
+        self.aspect_ratio = "16:9"
+        self.duration_seconds = 8
+        self.fps = 24
+        self.seed: Optional[int] = None
+        self.negative_prompt: Optional[str] = None
+        self.generate_audio: bool = False  # Veo 3 only
+        self.resolution: str = "720p"  # Veo 3 supports 720p/1080p
+
+        # limits / capabilities
+        # self.veo_max_num = 4  # Veo returns up to 4 videos
+        self.veo_max_num = 1  # limit to 1 in Gemini API
+
+        # fallbacks
+        self.DEFAULT_VEO_MODEL = "veo-3.0-generate-001"
+
+    @Slot()
+    def run(self):
+        try:
+            # optional prompt enhancement
+            if not self.raw and not self.inline and self.input_prompt:
+                try:
+                    self.signals.status.emit(trans('vid.status.prompt.wait'))
+                    bridge_context = BridgeContext(
+                        prompt=self.input_prompt,
+                        system_prompt=self.system_prompt,
+                        model=self.model_prompt,
+                        max_tokens=200,
+                        temperature=1.0,
+                    )
+                    ev = KernelEvent(KernelEvent.CALL, {'context': bridge_context, 'extra': {}})
+                    self.window.dispatch(ev)
+                    resp = ev.data.get('response')
+                    if resp:
+                        self.input_prompt = resp
+                except Exception as e:
+                    self.signals.error.emit(e)
+                    self.signals.status.emit(trans('vid.status.prompt.error') + ": " + str(e))
+
+            # prepare config
+            num = min(self.num, self.veo_max_num)
+            cfg_kwargs = {
+                "number_of_videos": num,
+                # "duration_seconds": self._duration_for_model(self.model, self.duration_seconds),
+            }
+            if self.aspect_ratio:
+                cfg_kwargs["aspect_ratio"] = self.aspect_ratio
+            if self.seed is not None:
+                cfg_kwargs["seed"] = int(self.seed)
+            if self.negative_prompt:
+                cfg_kwargs["negative_prompt"] = self.negative_prompt
+            if self._is_veo3(self.model):
+                # Veo 3 supports audio and resolution
+                # WARN: but not Gemini API:
+                pass
+                """
+                cfg_kwargs["generate_audio"] = bool(self.generate_audio)
+                if self.resolution:
+                    cfg_kwargs["resolution"] = self.resolution
+                """
+
+            config = gtypes.GenerateVideosConfig(**cfg_kwargs)
+
+            # build request
+            req_kwargs = {
+                "model": self.model or self.DEFAULT_VEO_MODEL,
+                "prompt": self.input_prompt or "",
+                "config": config,
+            }
+
+            # image-to-video if an image attachment is present and supported
+            base_img = self._first_image_attachment(self.attachments)
+            if self.mode == Video.MODE_IMAGE_TO_VIDEO and base_img is not None and self._supports_image_to_video(self.model):
+                req_kwargs["image"] = gtypes.Image.from_file(location=base_img)
+
+            self.signals.status.emit(trans('vid.status.generating') + f": {self.input_prompt}...")
+
+            # start long-running operation
+            operation = self.client.models.generate_videos(**req_kwargs)
+
+            # poll until done
+            while not getattr(operation, "done", False):
+                time.sleep(10)
+                operation = self.client.operations.get(operation)
+
+            # extract response payload
+            op_resp = getattr(operation, "response", None) or getattr(operation, "result", None)
+            if not op_resp:
+                raise RuntimeError("Empty operation response.")
+
+            gen_list = getattr(op_resp, "generated_videos", None) or []
+            if not gen_list:
+                raise RuntimeError("No videos generated.")
+
+            # download and save all outputs up to num
+            paths: List[str] = []
+            for idx, gv in enumerate(gen_list[:num]):
+                data = self._download_video_bytes(getattr(gv, "video", None))
+                p = self._save(idx, data)
+                if p:
+                    paths.append(p)
+
+            if self.inline:
+                self.signals.finished_inline.emit(self.ctx, paths, self.input_prompt)
+            else:
+                self.signals.finished.emit(self.ctx, paths, self.input_prompt)
+
+        except Exception as e:
+            self.signals.error.emit(e)
+        finally:
+            self._cleanup()
+
+    # ---------- helpers ----------
+
+    def _is_veo3(self, model_id: str) -> bool:
+        mid = str(model_id or "").lower()
+        return mid.startswith("veo-3.")
+
+    def _supports_image_to_video(self, model_id: str) -> bool:
+        """Return True if the model supports image->video."""
+        mid = str(model_id or "").lower()
+        # Official support for image-to-video on veo-2 and veo-3 preview; keep extendable.
+        return ("veo-2.0" in mid) or ("veo-3.0-generate-preview" in mid) or ("veo-3.0-fast-generate-preview" in mid)
+
+    def _duration_for_model(self, model_id: str, requested: int) -> int:
+        """Adjust duration constraints to model-specific limits."""
+        mid = str(model_id or "").lower()
+        if "veo-2.0" in mid:
+            # Veo 2 supports 5–8s, default 8s.
+            return max(5, min(8, int(requested or 8)))
+        if "veo-3.0" in mid:
+            # Veo 3 commonly uses 8s clips; honor request if provided, otherwise 8s.
+            return int(requested or 8)
+        return int(requested or 8)
+
+    def _first_image_attachment(self, attachments: Dict[str, Any]) -> Optional[str]:
+        """Return path of the first image attachment, if any."""
+        for _, att in (attachments or {}).items():
+            try:
+                p = getattr(att, "path", None)
+                if p and os.path.exists(p):
+                    mt, _ = mimetypes.guess_type(p)
+                    if mt and mt.startswith("image/"):
+                        return p
+            except Exception:
+                continue
+        return None
+
+    def _download_video_bytes(self, file_ref) -> Optional[bytes]:
+        """
+        Download video bytes using the Files service.
+        Falls back to direct URL download if necessary.
+        """
+        if not file_ref:
+            return None
+
+        # Preferred: SDK-managed download (handles URIs and sets video_bytes).
+        try:
+            data = self.client.files.download(file=file_ref)
+            if isinstance(data, (bytes, bytearray)):
+                return bytes(data)
+        except Exception:
+            pass
+
+        # Fallback: try to fetch by uri or url.
+        uri = getattr(file_ref, "uri", None) or getattr(file_ref, "url", None) or getattr(file_ref, "download_uri", None)
+        if uri:
+            try:
+                r = requests.get(uri, timeout=120)
+                if r.status_code == 200:
+                    return r.content
+            except Exception:
+                pass
+
+        # Last resort: try inline/base64 if present.
+        b64 = getattr(file_ref, "video_bytes", None)
+        if isinstance(b64, (bytes, bytearray)):
+            return bytes(b64)
+        if isinstance(b64, str):
+            try:
+                return base64.b64decode(b64)
+            except Exception:
+                return None
+        return None
+
+    def _save(self, idx: int, data: Optional[bytes]) -> Optional[str]:
+        """Save video bytes to file and return path."""
+        if not data:
+            return None
+        name = (
+            datetime.date.today().strftime("%Y-%m-%d") + "_" +
+            datetime.datetime.now().strftime("%H-%M-%S") + "-" +
+            self.window.core.video.make_safe_filename(self.input_prompt) + "-" +
+            str(idx + 1) + ".mp4"
+        )
+        path = os.path.join(self.window.core.config.get_user_dir("video"), name)
+        self.signals.status.emit(trans('vid.status.downloading') + f" ({idx + 1} / {self.num}) -> {path}")
+
+        if self.window.core.video.save_video(path, data):
+            return str(path)
+
+        try:
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            with open(path, "wb") as f:
+                f.write(data)
+            return str(path)
+        except Exception:
+            return None
+
+    def _cleanup(self):
+        """Cleanup resources."""
+        sig = self.signals
+        self.signals = None
+        if sig is not None:
+            try:
+                sig.deleteLater()
+            except RuntimeError:
+                pass
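
The worker above follows the google-genai long-running-operation pattern for Veo: start the job with models.generate_videos(), poll operations.get() until done, then fetch the result through the Files service. A standalone sketch of the same flow, assuming the google-genai package is installed and GOOGLE_API_KEY is set in the environment (model id and prompt are placeholders):

import time

from google import genai
from google.genai import types as gtypes

client = genai.Client()  # reads GOOGLE_API_KEY from the environment
operation = client.models.generate_videos(
    model="veo-3.0-generate-001",
    prompt="A close-up of a dew-covered spider web at sunrise",
    config=gtypes.GenerateVideosConfig(number_of_videos=1, aspect_ratio="16:9"),
)
while not operation.done:  # poll, as VideoWorker.run() does
    time.sleep(10)
    operation = client.operations.get(operation)

video = operation.response.generated_videos[0].video
client.files.download(file=video)  # fetch the bytes via the Files service
video.save("out.mp4")              # SDK helper; writes the mp4 to disk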
pygpt_net/provider/api/openai/__init__.py
@@ -6,7 +6,7 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date: 2025.08.19 07:00:00 #
+# Updated Date: 2025.08.30 06:00:00 #
 # ================================================== #

 from openai import OpenAI
@@ -33,6 +33,7 @@ from .container import Container
 from .image import Image
 from .remote_tools import RemoteTools
 from .responses import Responses
+from .realtime import Realtime
 from .store import Store
 from .summarizer import Summarizer
 from .tools import Tools
@@ -57,6 +58,7 @@ class ApiOpenAI:
         self.image = Image(window)
         self.remote_tools = RemoteTools(window)
         self.responses = Responses(window)
+        self.realtime = Realtime(window)
         self.store = Store(window)
         self.summarizer = Summarizer(window)
         self.tools = Tools(window)
@@ -90,12 +92,18 @@ class ApiOpenAI:
         self.last_client_args = args
         return self.client

-    def call(self, context: BridgeContext, extra: dict = None) -> bool:
+    def call(
+        self,
+        context: BridgeContext,
+        extra: dict = None,
+        rt_signals = None
+    ) -> bool:
         """
         Call OpenAI API

         :param context: Bridge context
         :param extra: Extra arguments
+        :param rt_signals: Realtime signals for audio streaming
         :return: result
         """
         mode = context.mode
@@ -145,6 +153,18 @@ class ApiOpenAI:
             MODE_RESEARCH,
             MODE_COMPUTER,
         ]:
+            if mode == MODE_AUDIO and stream:
+
+                # Realtime API for audio streaming
+                is_realtime = self.realtime.begin(
+                    context=context,
+                    model=model,
+                    extra=extra or {},
+                    rt_signals=rt_signals
+                )
+                if is_realtime:
+                    return True
+
             # responses API
             if use_responses_api:
                 response = self.responses.send(
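
The effect of the new branch: when the mode is MODE_AUDIO and streaming is enabled, call() hands the request to the realtime controller and returns early; only when begin() reports failure (it logs and returns False) does execution continue into the Responses / Chat Completions paths. Schematically (condensed from the hunk above, not the actual source):

def call(self, context, extra=None, rt_signals=None) -> bool:
    mode, stream = context.mode, context.stream
    if mode == "audio" and stream:
        if self.realtime.begin(context=context, model=context.model,
                               extra=extra or {}, rt_signals=rt_signals):
            return True  # realtime session started or reused; skip the HTTP paths
    # ... otherwise fall through to the Responses / Chat Completions requests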
pygpt_net/provider/api/openai/realtime/__init__.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ================================================== #
+# This file is a part of PYGPT package #
+# Website: https://pygpt.net #
+# GitHub: https://github.com/szczyglis-dev/py-gpt #
+# MIT License #
+# Created By : Marcin Szczygliński #
+# Updated Date: 2025.08.31 23:00:00 #
+# ================================================== #
+
+from .realtime import Realtime