pygpt-net 2.6.30__py3-none-any.whl → 2.6.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. pygpt_net/CHANGELOG.txt +8 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +4 -0
  4. pygpt_net/controller/__init__.py +5 -2
  5. pygpt_net/controller/audio/audio.py +25 -1
  6. pygpt_net/controller/audio/ui.py +2 -2
  7. pygpt_net/controller/chat/audio.py +1 -8
  8. pygpt_net/controller/chat/common.py +29 -3
  9. pygpt_net/controller/chat/handler/__init__.py +0 -0
  10. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  11. pygpt_net/controller/chat/output.py +8 -3
  12. pygpt_net/controller/chat/stream.py +3 -1071
  13. pygpt_net/controller/chat/text.py +3 -2
  14. pygpt_net/controller/kernel/kernel.py +11 -3
  15. pygpt_net/controller/kernel/reply.py +5 -1
  16. pygpt_net/controller/realtime/__init__.py +12 -0
  17. pygpt_net/controller/realtime/manager.py +53 -0
  18. pygpt_net/controller/realtime/realtime.py +268 -0
  19. pygpt_net/controller/ui/mode.py +7 -0
  20. pygpt_net/controller/ui/ui.py +19 -1
  21. pygpt_net/core/audio/audio.py +6 -1
  22. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  23. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  24. pygpt_net/core/audio/backend/native/player.py +139 -0
  25. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  26. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  27. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  28. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  29. pygpt_net/core/audio/backend/pyaudio/realtime.py +275 -0
  30. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  31. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  32. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  33. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  34. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  35. pygpt_net/core/audio/backend/shared/player.py +137 -0
  36. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  37. pygpt_net/core/audio/capture.py +5 -0
  38. pygpt_net/core/audio/output.py +13 -2
  39. pygpt_net/core/audio/whisper.py +6 -2
  40. pygpt_net/core/bridge/bridge.py +2 -1
  41. pygpt_net/core/bridge/worker.py +4 -1
  42. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  43. pygpt_net/core/events/__init__.py +2 -1
  44. pygpt_net/core/events/realtime.py +55 -0
  45. pygpt_net/core/image/image.py +51 -1
  46. pygpt_net/core/realtime/__init__.py +0 -0
  47. pygpt_net/core/realtime/options.py +87 -0
  48. pygpt_net/core/realtime/shared/__init__.py +0 -0
  49. pygpt_net/core/realtime/shared/audio.py +213 -0
  50. pygpt_net/core/realtime/shared/loop.py +64 -0
  51. pygpt_net/core/realtime/shared/session.py +59 -0
  52. pygpt_net/core/realtime/shared/text.py +37 -0
  53. pygpt_net/core/realtime/shared/tools.py +276 -0
  54. pygpt_net/core/realtime/shared/turn.py +38 -0
  55. pygpt_net/core/realtime/shared/types.py +16 -0
  56. pygpt_net/core/realtime/worker.py +164 -0
  57. pygpt_net/core/types/__init__.py +1 -0
  58. pygpt_net/core/types/image.py +48 -0
  59. pygpt_net/data/config/config.json +10 -4
  60. pygpt_net/data/config/models.json +149 -103
  61. pygpt_net/data/config/settings.json +50 -0
  62. pygpt_net/data/locale/locale.de.ini +5 -5
  63. pygpt_net/data/locale/locale.en.ini +19 -13
  64. pygpt_net/data/locale/locale.es.ini +5 -5
  65. pygpt_net/data/locale/locale.fr.ini +5 -5
  66. pygpt_net/data/locale/locale.it.ini +5 -5
  67. pygpt_net/data/locale/locale.pl.ini +5 -5
  68. pygpt_net/data/locale/locale.uk.ini +5 -5
  69. pygpt_net/data/locale/locale.zh.ini +1 -1
  70. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  71. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  72. pygpt_net/plugin/audio_input/plugin.py +37 -4
  73. pygpt_net/plugin/audio_input/simple.py +57 -8
  74. pygpt_net/plugin/cmd_files/worker.py +3 -0
  75. pygpt_net/provider/api/google/__init__.py +39 -6
  76. pygpt_net/provider/api/google/audio.py +8 -1
  77. pygpt_net/provider/api/google/chat.py +45 -6
  78. pygpt_net/provider/api/google/image.py +226 -86
  79. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  80. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  81. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  82. pygpt_net/provider/api/openai/__init__.py +22 -2
  83. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  84. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  85. pygpt_net/provider/api/openai/realtime/realtime.py +194 -0
  86. pygpt_net/provider/audio_input/google_genai.py +103 -0
  87. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  88. pygpt_net/provider/audio_output/google_tts.py +0 -12
  89. pygpt_net/provider/audio_output/openai_tts.py +8 -5
  90. pygpt_net/provider/core/config/patch.py +15 -0
  91. pygpt_net/provider/core/model/patch.py +11 -0
  92. pygpt_net/provider/llms/google.py +8 -9
  93. pygpt_net/ui/layout/toolbox/footer.py +16 -0
  94. pygpt_net/ui/layout/toolbox/image.py +5 -0
  95. pygpt_net/ui/widget/option/combo.py +15 -1
  96. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/METADATA +26 -14
  97. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/RECORD +100 -62
  98. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  99. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/LICENSE +0 -0
  100. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/WHEEL +0 -0
  101. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "__meta__": {
3
- "version": "2.6.30",
4
- "app.version": "2.6.30",
5
- "updated_at": "2025-08-29T23:07:35"
3
+ "version": "2.6.31",
4
+ "app.version": "2.6.31",
5
+ "updated_at": "2025-09-01T23:07:35"
6
6
  },
7
7
  "items": {
8
8
  "SpeakLeash/bielik-11b-v2.3-instruct:Q4_K_M": {
@@ -873,6 +873,44 @@
873
873
  "provider": "google",
874
874
  "tool_calls": true
875
875
  },
876
+ "gemini-2.5-flash-preview-native-audio-dialog": {
877
+ "id": "gemini-2.5-flash-preview-native-audio-dialog",
878
+ "name": "gemini-2.5-flash-preview-native-audio-dialog",
879
+ "mode": [
880
+ "audio"
881
+ ],
882
+ "llama_index": {
883
+ "args": [
884
+ {
885
+ "name": "model",
886
+ "value": "models/gemini-2.5-flash-preview-native-audio-dialog",
887
+ "type": "str"
888
+ }
889
+ ],
890
+ "env": [
891
+ {
892
+ "name": "GOOGLE_API_KEY",
893
+ "value": "{api_key_google}",
894
+ "type": "str"
895
+ }
896
+ ]
897
+ },
898
+ "ctx": 128000,
899
+ "tokens": 8000,
900
+ "default": false,
901
+ "input": [
902
+ "text",
903
+ "audio"
904
+ ],
905
+ "output": [
906
+ "text",
907
+ "audio"
908
+ ],
909
+ "extra": {},
910
+ "imported": true,
911
+ "provider": "google",
912
+ "tool_calls": true
913
+ },
876
914
  "gemini-2.5-pro": {
877
915
  "id": "gemini-2.5-pro",
878
916
  "name": "gemini-2.5-pro",
@@ -1457,55 +1495,6 @@
1457
1495
  "provider": "openai",
1458
1496
  "tool_calls": true
1459
1497
  },
1460
- "gpt-4o-audio-preview": {
1461
- "id": "gpt-4o-audio-preview",
1462
- "name": "gpt-4o-audio-preview",
1463
- "mode": [
1464
- "audio"
1465
- ],
1466
- "llama_index": {
1467
- "args": [
1468
- {
1469
- "name": "model",
1470
- "value": "gpt-4o-audio-preview",
1471
- "type": "str"
1472
- }
1473
- ],
1474
- "env": [
1475
- {
1476
- "name": "OPENAI_API_KEY",
1477
- "value": "{api_key}"
1478
- },
1479
- {
1480
- "name": "OPENAI_API_BASE",
1481
- "value": "{api_endpoint}"
1482
- },
1483
- {
1484
- "name": "AZURE_OPENAI_ENDPOINT",
1485
- "value": "{api_azure_endpoint}"
1486
- },
1487
- {
1488
- "name": "OPENAI_API_VERSION",
1489
- "value": "{api_azure_version}"
1490
- }
1491
- ]
1492
- },
1493
- "ctx": 128000,
1494
- "tokens": 16384,
1495
- "default": false,
1496
- "input": [
1497
- "text",
1498
- "audio"
1499
- ],
1500
- "output": [
1501
- "text",
1502
- "audio"
1503
- ],
1504
- "extra": {},
1505
- "imported": false,
1506
- "provider": "openai",
1507
- "tool_calls": true
1508
- },
1509
1498
  "gpt-4o-mini": {
1510
1499
  "id": "gpt-4o-mini",
1511
1500
  "name": "gpt-4o-mini",
@@ -1561,55 +1550,6 @@
1561
1550
  "provider": "openai",
1562
1551
  "tool_calls": true
1563
1552
  },
1564
- "gpt-4o-mini-audio-preview": {
1565
- "id": "gpt-4o-mini-audio-preview",
1566
- "name": "gpt-4o-mini-audio-preview",
1567
- "mode": [
1568
- "audio"
1569
- ],
1570
- "llama_index": {
1571
- "args": [
1572
- {
1573
- "name": "model",
1574
- "value": "gpt-4o-mini-audio-preview",
1575
- "type": "str"
1576
- }
1577
- ],
1578
- "env": [
1579
- {
1580
- "name": "OPENAI_API_KEY",
1581
- "value": "{api_key}"
1582
- },
1583
- {
1584
- "name": "OPENAI_API_BASE",
1585
- "value": "{api_endpoint}"
1586
- },
1587
- {
1588
- "name": "AZURE_OPENAI_ENDPOINT",
1589
- "value": "{api_azure_endpoint}"
1590
- },
1591
- {
1592
- "name": "OPENAI_API_VERSION",
1593
- "value": "{api_azure_version}"
1594
- }
1595
- ]
1596
- },
1597
- "ctx": 128000,
1598
- "tokens": 16384,
1599
- "default": false,
1600
- "input": [
1601
- "text",
1602
- "audio"
1603
- ],
1604
- "output": [
1605
- "text",
1606
- "audio"
1607
- ],
1608
- "extra": {},
1609
- "imported": false,
1610
- "provider": "openai",
1611
- "tool_calls": true
1612
- },
1613
1553
  "gpt-5": {
1614
1554
  "id": "gpt-5",
1615
1555
  "name": "gpt-5 (medium)",
@@ -2303,6 +2243,112 @@
2303
2243
  "provider": "ollama",
2304
2244
  "tool_calls": true
2305
2245
  },
2246
+ "gpt-realtime": {
2247
+ "id": "gpt-realtime",
2248
+ "name": "gpt-realtime",
2249
+ "mode": [
2250
+ "audio"
2251
+ ],
2252
+ "llama_index": {
2253
+ "args": [
2254
+ {
2255
+ "name": "model",
2256
+ "value": "gpt-realtime",
2257
+ "type": "str"
2258
+ }
2259
+ ],
2260
+ "env": [
2261
+ {
2262
+ "name": "OPENAI_API_KEY",
2263
+ "value": "{api_key}",
2264
+ "type": "str"
2265
+ },
2266
+ {
2267
+ "name": "OPENAI_API_BASE",
2268
+ "value": "{api_endpoint}",
2269
+ "type": "str"
2270
+ },
2271
+ {
2272
+ "name": "AZURE_OPENAI_ENDPOINT",
2273
+ "value": "{api_azure_endpoint}",
2274
+ "type": "str"
2275
+ },
2276
+ {
2277
+ "name": "OPENAI_API_VERSION",
2278
+ "value": "{api_azure_version}",
2279
+ "type": "str"
2280
+ }
2281
+ ]
2282
+ },
2283
+ "ctx": 32000,
2284
+ "tokens": 4096,
2285
+ "default": true,
2286
+ "input": [
2287
+ "text",
2288
+ "audio"
2289
+ ],
2290
+ "output": [
2291
+ "text",
2292
+ "audio"
2293
+ ],
2294
+ "extra": {},
2295
+ "imported": false,
2296
+ "provider": "openai",
2297
+ "tool_calls": true
2298
+ },
2299
+ "gpt-4o-realtime-preview": {
2300
+ "id": "gpt-4o-realtime-preview",
2301
+ "name": "gpt-4o-realtime-preview",
2302
+ "mode": [
2303
+ "audio"
2304
+ ],
2305
+ "llama_index": {
2306
+ "args": [
2307
+ {
2308
+ "name": "model",
2309
+ "value": "gpt-4o-realtime-preview",
2310
+ "type": "str"
2311
+ }
2312
+ ],
2313
+ "env": [
2314
+ {
2315
+ "name": "OPENAI_API_KEY",
2316
+ "value": "{api_key}",
2317
+ "type": "str"
2318
+ },
2319
+ {
2320
+ "name": "OPENAI_API_BASE",
2321
+ "value": "{api_endpoint}",
2322
+ "type": "str"
2323
+ },
2324
+ {
2325
+ "name": "AZURE_OPENAI_ENDPOINT",
2326
+ "value": "{api_azure_endpoint}",
2327
+ "type": "str"
2328
+ },
2329
+ {
2330
+ "name": "OPENAI_API_VERSION",
2331
+ "value": "{api_azure_version}",
2332
+ "type": "str"
2333
+ }
2334
+ ]
2335
+ },
2336
+ "ctx": 32000,
2337
+ "tokens": 4096,
2338
+ "default": false,
2339
+ "input": [
2340
+ "text",
2341
+ "audio"
2342
+ ],
2343
+ "output": [
2344
+ "text",
2345
+ "audio"
2346
+ ],
2347
+ "extra": {},
2348
+ "imported": true,
2349
+ "provider": "openai",
2350
+ "tool_calls": true
2351
+ },
2306
2352
  "grok-2-vision": {
2307
2353
  "id": "grok-2-vision",
2308
2354
  "name": "grok-2-vision",
@@ -2597,7 +2643,7 @@
2597
2643
  "image"
2598
2644
  ],
2599
2645
  "extra": {},
2600
- "imported": true,
2646
+ "imported": false,
2601
2647
  "provider": "google",
2602
2648
  "tool_calls": true
2603
2649
  },
@@ -2633,7 +2679,7 @@
2633
2679
  "image"
2634
2680
  ],
2635
2681
  "extra": {},
2636
- "imported": true,
2682
+ "imported": false,
2637
2683
  "provider": "google",
2638
2684
  "tool_calls": true
2639
2685
  },
@@ -1501,6 +1501,30 @@
1501
1501
  "advanced": false,
1502
1502
  "tab": "options"
1503
1503
  },
1504
+ "audio.input.vad.prefix": {
1505
+ "section": "audio",
1506
+ "type": "int",
1507
+ "slider": false,
1508
+ "label": "settings.audio.input.vad.prefix",
1509
+ "value": 300,
1510
+ "min": 0,
1511
+ "multiplier": 1,
1512
+ "step": 1,
1513
+ "advanced": false,
1514
+ "tab": "options"
1515
+ },
1516
+ "audio.input.vad.silence": {
1517
+ "section": "audio",
1518
+ "type": "int",
1519
+ "slider": false,
1520
+ "label": "settings.audio.input.vad.silence",
1521
+ "value": 2000,
1522
+ "min": 0,
1523
+ "multiplier": 1,
1524
+ "step": 1,
1525
+ "advanced": false,
1526
+ "tab": "options"
1527
+ },
1504
1528
  "audio.cache.enabled": {
1505
1529
  "section": "audio",
1506
1530
  "type": "bool",
@@ -1655,6 +1679,20 @@
1655
1679
  "advanced": false,
1656
1680
  "tab": "Google"
1657
1681
  },
1682
+ "remote_tools.google.url_ctx": {
1683
+ "section": "remote_tools",
1684
+ "type": "bool",
1685
+ "slider": false,
1686
+ "label": "settings.remote_tools.google.url_ctx",
1687
+ "description": "settings.remote_tools.google.url_ctx.desc",
1688
+ "value": true,
1689
+ "min": null,
1690
+ "max": null,
1691
+ "multiplier": null,
1692
+ "step": null,
1693
+ "advanced": false,
1694
+ "tab": "Google"
1695
+ },
1658
1696
  "llama.idx.list": {
1659
1697
  "section": "llama-index",
1660
1698
  "type": "dict",
@@ -2406,6 +2444,18 @@
2406
2444
  "step": null,
2407
2445
  "advanced": false
2408
2446
  },
2447
+ "log.realtime": {
2448
+ "section": "debug",
2449
+ "type": "bool",
2450
+ "slider": false,
2451
+ "label": "Log Realtime sessions to console",
2452
+ "value": false,
2453
+ "min": null,
2454
+ "max": null,
2455
+ "multiplier": null,
2456
+ "step": null,
2457
+ "advanced": false
2458
+ },
2409
2459
  "log.assistants": {
2410
2460
  "section": "debug",
2411
2461
  "type": "bool",
@@ -845,7 +845,7 @@ mode.agent_openai.tooltip = Fortgeschrittene Agenten (OpenAI)
845
845
  mode.agent.tooltip = Einfache Agenten (legacy)
846
846
  mode.assistant = Assistent
847
847
  mode.assistant.tooltip = Chat mittels Assistants API
848
- mode.audio = Chat mit Audio
848
+ mode.audio = Realtime + audio
849
849
  mode.chat = Chat
850
850
  mode.chat.tooltip = Chatmodus (Standard)
851
851
  mode.completion = Vervollständigung
@@ -1197,9 +1197,9 @@ settings.frequency_penalty = Frequenzstrafe
1197
1197
  settings.func_call.native = Native API-Funktionsaufrufe verwenden
1198
1198
  settings.func_call.native.desc = Wenn aktiviert, benutzt die Anwendung native API-Funktionsaufrufe anstelle des internen pygpt-Formats und der unten stehenden Befehlsprompten. Nur Chat- und Assistenz-Modi.
1199
1199
  settings.img_dialog_open = Bild-Dialog nach Generierung öffnen (Bildmodus)
1200
- settings.img_prompt_model = DALL-E: Modell zur Prompterzeugung
1201
- settings.img_quality = DALL-E: image quality
1202
- settings.img_resolution = DALL-E: Bildgröße
1200
+ settings.img_prompt_model = Modell zur Prompterzeugung
1201
+ settings.img_quality = image quality
1202
+ settings.img_resolution = Bildgröße
1203
1203
  settings.layout.animation.disable = Animationen deaktivieren
1204
1204
  settings.layout.animation.disable.desc = Deaktiviert Layout-Animationen, wie animierte Ladegeräte usw.
1205
1205
  settings.layout.density = Layoutdichte
@@ -1294,7 +1294,7 @@ settings.prompt.ctx.auto_summary.user = Kontext: Auto-Zusammenfassung (Benutzern
1294
1294
  settings.prompt.ctx.auto_summary.user.desc = Platzhalter: {input}, {output}
1295
1295
  settings.prompt.expert = Experte: Masteraufforderung
1296
1296
  settings.prompt.expert.desc = Anweisung (Systemaufforderung) für den Master-Experten, wie man Sklavenexperten handhabt. Anweisungen für Sklavenexperten werden aus ihren Voreinstellungen gegeben.
1297
- settings.prompt.img = DALL-E: Bildgenerierung
1297
+ settings.prompt.img = Bildgenerierung
1298
1298
  settings.prompt.img.desc = Aufforderung zur Erzeugung von Anweisungen für DALL-E (falls Rohmodus deaktiviert ist). Nur im Bildmodus.
1299
1299
  settings.remote_tools.code_interpreter = Code-Interpreter
1300
1300
  settings.remote_tools.code_interpreter.desc = Aktivieren Sie das `code_interpreter` Remote-Tool im Chat-Modus / über OpenAI Responses API.
@@ -188,6 +188,7 @@ attachments_uploaded.clear.confirm = WARNING: are you sure you want to delete al
188
188
  attachments_uploaded.delete.confirm = WARNING: are you sure you want to delete this file from the remote server?
189
189
  attachments_uploaded.sync.tip = Tip: click on 'Sync' to retrieve the file list from OpenAI
190
190
  attachments_uploaded.tab = Uploaded
191
+ audio.auto_turn = Auto (VAD)
191
192
  audio.cache.clear.confirm = Are you sure you want to delete all cached audio files?
192
193
  audio.cache.clear.success = OK. All audio cache files cleared.
193
194
  audio.control.btn = Voice control
@@ -845,7 +846,7 @@ mode.agent_openai.tooltip = Advanced agents (OpenAI)
845
846
  mode.agent.tooltip = Simple agents (legacy)
846
847
  mode.assistant = Assistants
847
848
  mode.assistant.tooltip = Chat using Assistants API
848
- mode.audio = Chat with Audio
849
+ mode.audio = Realtime + audio
849
850
  mode.chat = Chat
850
851
  mode.chat.tooltip = Chat mode (default)
851
852
  mode.completion = Completion
@@ -1219,9 +1220,9 @@ settings.frequency_penalty = Frequency Penalty
1219
1220
  settings.func_call.native = Use native API function calls
1220
1221
  settings.func_call.native.desc = If enabled, the application will use native API function calls instead of the internal pygpt format and the command prompts from below will not be used. Chat and Assistants modes ONLY.
1221
1222
  settings.img_dialog_open = Open image dialog after generation (Image mode)
1222
- settings.img_prompt_model = DALL-E: prompt generation model
1223
- settings.img_quality = DALL-E: image quality
1224
- settings.img_resolution = DALL-E: image size
1223
+ settings.img_prompt_model = Prompt generation model
1224
+ settings.img_quality = Image quality
1225
+ settings.img_resolution = Image size
1225
1226
  settings.layout.animation.disable = Disable animations
1226
1227
  settings.layout.animation.disable.desc = Disables layout animations, like animated loaders, etc.
1227
1228
  settings.layout.density = Layout density
@@ -1318,26 +1319,28 @@ settings.prompt.ctx.auto_summary.user = Context: auto-summary (user message)
1318
1319
  settings.prompt.ctx.auto_summary.user.desc = Placeholders: {input}, {output}
1319
1320
  settings.prompt.expert = Expert: Master prompt
1320
1321
  settings.prompt.expert.desc = Instruction (system prompt) for Master expert on how to handle slave experts. Instructions for slave experts are given from their presets.
1321
- settings.prompt.img = DALL-E: image generation
1322
+ settings.prompt.img = Image generation
1322
1323
  settings.prompt.img.desc = Prompt for generating prompts for DALL-E (if raw-mode is disabled). Image mode only.
1323
1324
  settings.remote_tools.code_interpreter = Code Interpreter
1324
- settings.remote_tools.code_interpreter.desc = Enable `code_interpreter` remote tool in Chat mode / via OpenAI Responses API.
1325
+ settings.remote_tools.code_interpreter.desc = Enable `code_interpreter` remote tool - Responses API only.
1325
1326
  settings.remote_tools.file_search = File search
1326
1327
  settings.remote_tools.file_search.args = File search vector store IDs
1327
1328
  settings.remote_tools.file_search.args.desc = Vector store IDs, separated by comma (,)
1328
- settings.remote_tools.file_search.desc = Enable `file_search` remote tool in Chat mode / via OpenAI Responses API.
1329
+ settings.remote_tools.file_search.desc = Enable `file_search` remote tool - Responses API only.
1329
1330
  settings.remote_tools.google.code_interpreter = Code Interpreter
1330
- settings.remote_tools.google.code_interpreter.desc = Enable Code Interpreter remote tool in Chat mode.
1331
- settings.remote_tools.google.web_search = Google Web Search
1332
- settings.remote_tools.google.web_search.desc = Enable Google Search remote tool in Chat mode.
1331
+ settings.remote_tools.google.code_interpreter.desc = Enable Code Interpreter remote tool.
1332
+ settings.remote_tools.google.url_ctx = URL Context
1333
+ settings.remote_tools.google.url_ctx.desc = Enable URL Context remote tool.
1334
+ settings.remote_tools.google.web_search = Google Search
1335
+ settings.remote_tools.google.web_search.desc = Enable Google Search remote tool.
1333
1336
  settings.remote_tools.image = Image generation
1334
- settings.remote_tools.image.desc = Enable `image_generation` remote tool in Chat mode / via OpenAI Responses API.
1337
+ settings.remote_tools.image.desc = Enable `image_generation` remote tool - Responses API only.
1335
1338
  settings.remote_tools.mcp = Remote MCP
1336
1339
  settings.remote_tools.mcp.args = Remote MCP configuration
1337
1340
  settings.remote_tools.mcp.args.desc = Configuration in JSON format (will be used in request)
1338
- settings.remote_tools.mcp.desc = Enable `mcp` remote tool in Chat mode / via OpenAI Responses API.
1341
+ settings.remote_tools.mcp.desc = Enable `mcp` remote tool - Responses API only.
1339
1342
  settings.remote_tools.web_search = Web Search
1340
- settings.remote_tools.web_search.desc = Enable `web_search` remote tool in Chat mode / via OpenAI Responses API.
1343
+ settings.remote_tools.web_search.desc = Enable `web_search` remote tool - Responses API only.
1341
1344
  settings.render.code_syntax = Code syntax highlight
1342
1345
  settings.render.engine = Rendering engine
1343
1346
  settings.render.open_gl = OpenGL hardware acceleration
@@ -1572,3 +1575,6 @@ vision.capture.manual.captured.success = Image captured from the camera:
1572
1575
  vision.capture.name.prefix = Camera capture:
1573
1576
  vision.capture.options.title = Video capture
1574
1577
  vision.checkbox.tooltip = If checked, the vision model is active. It will be automatically activated upon image upload. You can deactivate it in real-time.
1578
+
1579
+ settings.audio.input.vad.prefix = VAD prefix padding (in ms)
1580
+ settings.audio.input.vad.silence = VAD end silence (in ms)
@@ -846,7 +846,7 @@ mode.agent_openai.tooltip = Agentes avanzados (OpenAI)
846
846
  mode.agent.tooltip = Agentes simples (legacy)
847
847
  mode.assistant = Asistente
848
848
  mode.assistant.tooltip = Chatear usando la API de Asistentes
849
- mode.audio = Chat con audio
849
+ mode.audio = Realtime + audio
850
850
  mode.chat = Chat
851
851
  mode.chat.tooltip = Modo de chat (predeterminado)
852
852
  mode.completion = Finalización
@@ -1198,9 +1198,9 @@ settings.frequency_penalty = Penalización de frecuencia
1198
1198
  settings.func_call.native = Usar llamadas a funciones API nativas
1199
1199
  settings.func_call.native.desc = Si está habilitado, la aplicación usará llamadas a funciones API nativas en lugar del formato pygpt interno y los prompt de comandos a continuación no se usarán. Solo modos de Chat y Asistentes.
1200
1200
  settings.img_dialog_open = Abrir diálogo de imagen después de generar (Modo imagen)
1201
- settings.img_prompt_model = DALL-E: modelo de generación de indicaciones
1202
- settings.img_quality = DALL-E: calidad de imagen
1203
- settings.img_resolution = DALL-E: tamaño de imagen
1201
+ settings.img_prompt_model = Modelo de generación de indicaciones
1202
+ settings.img_quality = Calidad de imagen
1203
+ settings.img_resolution = Tamaño de imagen
1204
1204
  settings.layout.animation.disable = Desactivar animaciones
1205
1205
  settings.layout.animation.disable.desc = Desactivar animaciones de diseño, como cargadores animados, etc.
1206
1206
  settings.layout.density = Densidad de la disposición
@@ -1295,7 +1295,7 @@ settings.prompt.ctx.auto_summary.user = Contexto: resumen automático (mensaje d
1295
1295
  settings.prompt.ctx.auto_summary.user.desc = Marcadores de posición: {input}, {output}
1296
1296
  settings.prompt.expert = Experto: Master prompt
1297
1297
  settings.prompt.expert.desc = Instrucción (prompt del sistema) para el experto Master cómo manejar a los expertos subordinados. Las instrucciones para los expertos subordinados se dan desde sus presets.
1298
- settings.prompt.img = DALL-E: generación de imagen
1298
+ settings.prompt.img = Generación de imagen
1299
1299
  settings.prompt.img.desc = Mensaje para generar comandos para DALL-E (si el modo crudo está desactivado). Solo modo de imagen.
1300
1300
  settings.remote_tools.code_interpreter = Intérprete de Código
1301
1301
  settings.remote_tools.code_interpreter.desc = Habilitar herramienta remota `code_interpreter` en modo Chat / vía API de Respuestas de OpenAI.
@@ -845,7 +845,7 @@ mode.agent_openai.tooltip = Agents avancés (OpenAI)
845
845
  mode.agent.tooltip = Agents simples (legacy)
846
846
  mode.assistant = Assistant
847
847
  mode.assistant.tooltip = Discuter via l'API des Assistants
848
- mode.audio = Chat avec audio
848
+ mode.audio = Realtime + audio
849
849
  mode.chat = Chat
850
850
  mode.chat.tooltip = Mode chat (par défaut)
851
851
  mode.completion = Complétion
@@ -1197,9 +1197,9 @@ settings.frequency_penalty = Pénalité de fréquence
1197
1197
  settings.func_call.native = Utiliser les appels de fonction API natives
1198
1198
  settings.func_call.native.desc = Si activé, l'application utilisera les appels de fonction API natives au lieu du format interne de pygpt et les commandes prompt ci-dessous ne seront pas utilisées. Modes uniquement Chat et Assistants.
1199
1199
  settings.img_dialog_open = Ouvrir la boîte de dialogue d'image après la génération (Mode image)
1200
- settings.img_prompt_model = DALL-E : modèle de génération d'invite
1201
- settings.img_quality = DALL-E: qualité d'image
1202
- settings.img_resolution = DALL-E : taille de l'image
1200
+ settings.img_prompt_model = Modèle de génération d'invite
1201
+ settings.img_quality = Qualité d'image
1202
+ settings.img_resolution = Taille de l'image
1203
1203
  settings.layout.animation.disable = Désactiver les animations
1204
1204
  settings.layout.animation.disable.desc = Désactive les animations de mise en page, comme les chargeurs animés, etc.
1205
1205
  settings.layout.density = Densité de la disposition
@@ -1294,7 +1294,7 @@ settings.prompt.ctx.auto_summary.user = Contexte: résumé automatique (message
1294
1294
  settings.prompt.ctx.auto_summary.user.desc = Espaces réservés: {input}, {output}
1295
1295
  settings.prompt.expert = Expert : Master prompt
1296
1296
  settings.prompt.expert.desc = Instruction (prompt système) pour l'expert Master sur comment gérer les experts esclaves. Les instructions pour les experts esclaves sont données à partir de leurs presets.
1297
- settings.prompt.img = DALL-E: génération d'image
1297
+ settings.prompt.img = Génération d'image
1298
1298
  settings.prompt.img.desc = Prompt pour générer des commandes pour DALL-E (si le mode brut est désactivé). Mode image uniquement.
1299
1299
  settings.remote_tools.code_interpreter = Interpréteur de code
1300
1300
  settings.remote_tools.code_interpreter.desc = Activer l'outil distant `code_interpreter` en mode Chat/ via OpenAI Responses API.
@@ -845,7 +845,7 @@ mode.agent_openai.tooltip = Agenti avanzati (OpenAI)
845
845
  mode.agent.tooltip = Agenti semplici (legacy)
846
846
  mode.assistant = Assistente
847
847
  mode.assistant.tooltip = Chattare utilizzando l'API degli Assistenti
848
- mode.audio = Chat con audio
848
+ mode.audio = Realtime + audio
849
849
  mode.chat = Chat
850
850
  mode.chat.tooltip = Modalità chat (predefinita)
851
851
  mode.completion = Completamento
@@ -1197,9 +1197,9 @@ settings.frequency_penalty = Penale di frequenza
1197
1197
  settings.func_call.native = Usa chiamate di funzione API native
1198
1198
  settings.func_call.native.desc = Se abilitato, l'applicazione utilizzerà le chiamate di funzione API native invece del formato interno pygpt e i prompt di comando di seguito non saranno utilizzati. Solo modalità chat e assistenti.
1199
1199
  settings.img_dialog_open = Apri la finestra di dialogo dell'immagine dopo la generazione (Modalità immagine)
1200
- settings.img_prompt_model = DALL-E: modello di generazione del prompt
1201
- settings.img_quality = DALL-E: qualità dell'immagine
1202
- settings.img_resolution = DALL-E: dimensione dell'immagine
1200
+ settings.img_prompt_model = Modello di generazione del prompt
1201
+ settings.img_quality = Qualità dell'immagine
1202
+ settings.img_resolution = Dimensione dell'immagine
1203
1203
  settings.layout.animation.disable = Disabilita animazioni
1204
1204
  settings.layout.animation.disable.desc = Disabilita le animazioni del layout, come i caricamenti animati, ecc.
1205
1205
  settings.layout.density = Densità del layout
@@ -1294,7 +1294,7 @@ settings.prompt.ctx.auto_summary.user = Contesto: auto-riassunto (messaggio dell
1294
1294
  settings.prompt.ctx.auto_summary.user.desc = Placeholder: {input}, {output}
1295
1295
  settings.prompt.expert = Esperto: Master prompt
1296
1296
  settings.prompt.expert.desc = Istruzione (prompt del sistema) per l'esperto Master su come gestire gli esperti subalterni. Le istruzioni per gli esperti subalterni sono date dalle loro preimpostazioni.
1297
- settings.prompt.img = DALL-E: generazione immagine
1297
+ settings.prompt.img = Generazione immagine
1298
1298
  settings.prompt.img.desc = Prompt per generare comandi per DALL-E (se la modalità grezza è disabilitata). Solo modalità immagine.
1299
1299
  settings.remote_tools.code_interpreter = Interprete del codice
1300
1300
  settings.remote_tools.code_interpreter.desc = Abilita l'attrezzo remoto `code_interpreter` in modalità Chat / tramite API delle Risposte di OpenAI.
@@ -846,7 +846,7 @@ mode.agent_openai.tooltip = Zaawansowani agenci (OpenAI)
846
846
  mode.agent.tooltip = Prości agenci (legacy)
847
847
  mode.assistant = Asystent
848
848
  mode.assistant.tooltip = Czat przy użyciu API Asystentów
849
- mode.audio = Czat Audio
849
+ mode.audio = Realtime + audio
850
850
  mode.chat = Czat
851
851
  mode.chat.tooltip = Tryb czatu (domyślny)
852
852
  mode.completion = Uzupełnianie
@@ -1198,9 +1198,9 @@ settings.frequency_penalty = Frequency Penalty
1198
1198
  settings.func_call.native = Używaj natywnych wywołań funkcji API
1199
1199
  settings.func_call.native.desc = Jeśli włączone, aplikacja będzie używać natywnych wywołań funkcji API zamiast wewnętrznego formatu pygpt i poniższych promptów poleceń. Tylko tryby czatu i asystentów.
1200
1200
  settings.img_dialog_open = Otwórz okno dialogowe obrazu po wygenerowaniu (Tryb obrazu)
1201
- settings.img_prompt_model = DALL-E: model do generowania promptów
1202
- settings.img_quality = DALL-E: jakość obrazu
1203
- settings.img_resolution = DALL-E: rozmiar obrazu
1201
+ settings.img_prompt_model = Model do generowania promptów
1202
+ settings.img_quality = Jakość obrazu
1203
+ settings.img_resolution = Rozmiar obrazu
1204
1204
  settings.layout.animation.disable = Wyłącz animacje
1205
1205
  settings.layout.animation.disable.desc = Wyłącza animacje układu, jak animowane ładowarki, itp.
1206
1206
  settings.layout.density = Rozmiar layoutu
@@ -1295,7 +1295,7 @@ settings.prompt.ctx.auto_summary.user = Kontekst: auto-podsumowanie (wiadomość
1295
1295
  settings.prompt.ctx.auto_summary.user.desc = Placeholdery: {input}, {output}
1296
1296
  settings.prompt.expert = Ekspert: Główna wskazówka
1297
1297
  settings.prompt.expert.desc = Instrukcja (systemowa wskazówka) dla głównego eksperta, jak obsługiwać ekspertów pomocniczych. Instrukcje dla ekspertów pomocniczych są podawane z ich ustawień.
1298
- settings.prompt.img = DALL-E: generowanie obrazu
1298
+ settings.prompt.img = Generowanie obrazu
1299
1299
  settings.prompt.img.desc = Prompt do generowania poleceń dla DALL-E (jeśli surowy tryb jest wyłączony). Tylko tryb obrazu.
1300
1300
  settings.remote_tools.code_interpreter = Interpreter kodu
1301
1301
  settings.remote_tools.code_interpreter.desc = Włącz narzędzie `code_interpreter` w trybie Czat / za pośrednictwem OpenAI Responses API.
@@ -845,7 +845,7 @@ mode.agent_openai.tooltip = Просунуті агенти (OpenAI)
845
845
  mode.agent.tooltip = Прості агенти (legacy)
846
846
  mode.assistant = Помічник
847
847
  mode.assistant.tooltip = Чат за допомогою API Асистентів
848
- mode.audio = Чат з аудіо
848
+ mode.audio = Realtime + audio
849
849
  mode.chat = Чат
850
850
  mode.chat.tooltip = Режим чату (за замовчуванням)
851
851
  mode.completion = Завершення
@@ -1197,9 +1197,9 @@ settings.frequency_penalty = Частотний штраф
1197
1197
  settings.func_call.native = Використовувати рідні виклики функцій API
1198
1198
  settings.func_call.native.desc = Якщо увімкнено, програма буде використовувати рідні виклики функцій API замість внутрішнього формату pygpt, а нижченаведені запити команд не використовуватимуться. Лише режими чату та асистентів.
1199
1199
  settings.img_dialog_open = Відкрити діалогове вікно зображення після генерації (Режим зображення)
1200
- settings.img_prompt_model = DALL-E: модель генерації запиту
1201
- settings.img_quality = DALL-E: якість зображення
1202
- settings.img_resolution = DALL-E: розмір зображення
1200
+ settings.img_prompt_model = Модель генерації запиту
1201
+ settings.img_quality = Якість зображення
1202
+ settings.img_resolution = Розмір зображення
1203
1203
  settings.layout.animation.disable = Вимкнути анімації
1204
1204
  settings.layout.animation.disable.desc = Вимикає анімації макета, як анімовані завантажувачі тощо.
1205
1205
  settings.layout.density = Щільність компонування
@@ -1294,7 +1294,7 @@ settings.prompt.ctx.auto_summary.user = Контекст: авто-резюме
1294
1294
  settings.prompt.ctx.auto_summary.user.desc = Заповнювачі: {input}, {output}
1295
1295
  settings.prompt.expert = Експерт: Основний запит
1296
1296
  settings.prompt.expert.desc = Інструкція (системний запит) для ведучого експерта, як керувати підекспертами. Інструкції для підекспертів даються з їхніх налаштувань.
1297
- settings.prompt.img = DALL-E: генерація зображення
1297
+ settings.prompt.img = Генерація зображення
1298
1298
  settings.prompt.img.desc = Підказка для генерації команд для DALL-E (якщо вимкнено сирівний режим). Тільки режим зображення.
1299
1299
  settings.remote_tools.code_interpreter = Інтерпретатор коду
1300
1300
  settings.remote_tools.code_interpreter.desc = Увімкніть віддалений інструмент `code_interpreter` у режимі Чат / через Responses API OpenAI.
@@ -845,7 +845,7 @@ mode.agent_openai.tooltip = 高级代理 (OpenAI)
845
845
  mode.agent.tooltip = 简单代理(自主)
846
846
  mode.assistant = 助手
847
847
  mode.assistant.tooltip = 使用助手API进行聊天
848
- mode.audio = 语音聊天
848
+ mode.audio = Realtime + audio
849
849
  mode.chat = 聊天模式
850
850
  mode.chat.tooltip = 聊天模式(默认)
851
851
  mode.completion = 完成模式
@@ -17,6 +17,10 @@ google_args.tooltip = Provide additional keyword arguments for recognize_google(
17
17
  google_cloud_args.description = Additional keyword arguments for r.recognize_google_cloud(audio, **kwargs).
18
18
  google_cloud_args.label = Additional keyword arguments
19
19
  google_cloud_args.tooltip = Provide additional keyword arguments for recognize_google_cloud()
20
+ google_genai_audio_model.description = Specify Gemini model supporting audio, e.g., gemini-2.5-flash
21
+ google_genai_audio_model.label = Model
22
+ google_genai_audio_prompt.description = System prompt for transcription
23
+ google_genai_audio_prompt.label = System Prompt
20
24
  magic_word.description = Activate listening only after the magic word is provided, like 'Hey GPT' or 'OK GPT'. Default: False.
21
25
  magic_word.label = Magic word
22
26
  magic_word_phrase_length.description = Magic word phrase length. Default: 2.