pygpt-net 2.7.8__py3-none-any.whl → 2.7.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. pygpt_net/CHANGELOG.txt +14 -0
  2. pygpt_net/LICENSE +1 -1
  3. pygpt_net/__init__.py +3 -3
  4. pygpt_net/config.py +15 -1
  5. pygpt_net/controller/chat/common.py +5 -4
  6. pygpt_net/controller/chat/image.py +3 -3
  7. pygpt_net/controller/chat/stream.py +76 -41
  8. pygpt_net/controller/chat/stream_worker.py +3 -3
  9. pygpt_net/controller/ctx/extra.py +3 -1
  10. pygpt_net/controller/dialogs/debug.py +37 -8
  11. pygpt_net/controller/kernel/kernel.py +3 -7
  12. pygpt_net/controller/lang/custom.py +25 -12
  13. pygpt_net/controller/lang/lang.py +45 -3
  14. pygpt_net/controller/lang/mapping.py +15 -2
  15. pygpt_net/controller/notepad/notepad.py +68 -25
  16. pygpt_net/controller/presets/editor.py +5 -1
  17. pygpt_net/controller/presets/presets.py +17 -5
  18. pygpt_net/controller/realtime/realtime.py +13 -1
  19. pygpt_net/controller/theme/theme.py +11 -2
  20. pygpt_net/controller/ui/tabs.py +1 -1
  21. pygpt_net/core/ctx/output.py +38 -12
  22. pygpt_net/core/db/database.py +4 -2
  23. pygpt_net/core/debug/console/console.py +30 -2
  24. pygpt_net/core/debug/context.py +2 -1
  25. pygpt_net/core/debug/ui.py +26 -4
  26. pygpt_net/core/filesystem/filesystem.py +6 -2
  27. pygpt_net/core/notepad/notepad.py +2 -2
  28. pygpt_net/core/tabs/tabs.py +79 -19
  29. pygpt_net/data/config/config.json +4 -3
  30. pygpt_net/data/config/models.json +37 -22
  31. pygpt_net/data/config/settings.json +12 -0
  32. pygpt_net/data/locale/locale.ar.ini +1833 -0
  33. pygpt_net/data/locale/locale.bg.ini +1833 -0
  34. pygpt_net/data/locale/locale.cs.ini +1833 -0
  35. pygpt_net/data/locale/locale.da.ini +1833 -0
  36. pygpt_net/data/locale/locale.de.ini +4 -1
  37. pygpt_net/data/locale/locale.en.ini +70 -67
  38. pygpt_net/data/locale/locale.es.ini +4 -1
  39. pygpt_net/data/locale/locale.fi.ini +1833 -0
  40. pygpt_net/data/locale/locale.fr.ini +4 -1
  41. pygpt_net/data/locale/locale.he.ini +1833 -0
  42. pygpt_net/data/locale/locale.hi.ini +1833 -0
  43. pygpt_net/data/locale/locale.hu.ini +1833 -0
  44. pygpt_net/data/locale/locale.it.ini +4 -1
  45. pygpt_net/data/locale/locale.ja.ini +1833 -0
  46. pygpt_net/data/locale/locale.ko.ini +1833 -0
  47. pygpt_net/data/locale/locale.nl.ini +1833 -0
  48. pygpt_net/data/locale/locale.no.ini +1833 -0
  49. pygpt_net/data/locale/locale.pl.ini +5 -2
  50. pygpt_net/data/locale/locale.pt.ini +1833 -0
  51. pygpt_net/data/locale/locale.ro.ini +1833 -0
  52. pygpt_net/data/locale/locale.ru.ini +1833 -0
  53. pygpt_net/data/locale/locale.sk.ini +1833 -0
  54. pygpt_net/data/locale/locale.sv.ini +1833 -0
  55. pygpt_net/data/locale/locale.tr.ini +1833 -0
  56. pygpt_net/data/locale/locale.uk.ini +4 -1
  57. pygpt_net/data/locale/locale.zh.ini +4 -1
  58. pygpt_net/item/notepad.py +8 -2
  59. pygpt_net/migrations/Version20260121190000.py +25 -0
  60. pygpt_net/migrations/Version20260122140000.py +25 -0
  61. pygpt_net/migrations/__init__.py +5 -1
  62. pygpt_net/preload.py +246 -3
  63. pygpt_net/provider/api/__init__.py +16 -2
  64. pygpt_net/provider/api/anthropic/__init__.py +21 -7
  65. pygpt_net/provider/api/google/__init__.py +21 -7
  66. pygpt_net/provider/api/google/image.py +89 -2
  67. pygpt_net/provider/api/google/realtime/client.py +70 -24
  68. pygpt_net/provider/api/google/realtime/realtime.py +48 -12
  69. pygpt_net/provider/api/google/video.py +2 -2
  70. pygpt_net/provider/api/openai/__init__.py +26 -11
  71. pygpt_net/provider/api/openai/image.py +79 -3
  72. pygpt_net/provider/api/openai/realtime/realtime.py +26 -6
  73. pygpt_net/provider/api/openai/responses.py +11 -31
  74. pygpt_net/provider/api/openai/video.py +2 -2
  75. pygpt_net/provider/api/x_ai/__init__.py +21 -10
  76. pygpt_net/provider/api/x_ai/realtime/client.py +185 -146
  77. pygpt_net/provider/api/x_ai/realtime/realtime.py +30 -15
  78. pygpt_net/provider/api/x_ai/remote_tools.py +83 -0
  79. pygpt_net/provider/api/x_ai/tools.py +51 -0
  80. pygpt_net/provider/core/config/patch.py +12 -1
  81. pygpt_net/provider/core/model/patch.py +36 -1
  82. pygpt_net/provider/core/notepad/db_sqlite/storage.py +53 -10
  83. pygpt_net/tools/agent_builder/ui/dialogs.py +2 -1
  84. pygpt_net/tools/audio_transcriber/ui/dialogs.py +2 -1
  85. pygpt_net/tools/code_interpreter/ui/dialogs.py +2 -1
  86. pygpt_net/tools/html_canvas/ui/dialogs.py +2 -1
  87. pygpt_net/tools/image_viewer/ui/dialogs.py +3 -5
  88. pygpt_net/tools/indexer/ui/dialogs.py +2 -1
  89. pygpt_net/tools/media_player/ui/dialogs.py +2 -1
  90. pygpt_net/tools/translator/ui/dialogs.py +2 -1
  91. pygpt_net/tools/translator/ui/widgets.py +6 -2
  92. pygpt_net/ui/dialog/about.py +2 -2
  93. pygpt_net/ui/dialog/db.py +2 -1
  94. pygpt_net/ui/dialog/debug.py +169 -6
  95. pygpt_net/ui/dialog/logger.py +6 -2
  96. pygpt_net/ui/dialog/models.py +36 -3
  97. pygpt_net/ui/dialog/preset.py +5 -1
  98. pygpt_net/ui/dialog/remote_store.py +2 -1
  99. pygpt_net/ui/main.py +3 -2
  100. pygpt_net/ui/widget/dialog/editor_file.py +2 -1
  101. pygpt_net/ui/widget/lists/debug.py +12 -7
  102. pygpt_net/ui/widget/option/checkbox.py +2 -8
  103. pygpt_net/ui/widget/option/combo.py +10 -2
  104. pygpt_net/ui/widget/textarea/console.py +156 -7
  105. pygpt_net/ui/widget/textarea/highlight.py +66 -0
  106. pygpt_net/ui/widget/textarea/input.py +624 -57
  107. pygpt_net/ui/widget/textarea/notepad.py +294 -27
  108. {pygpt_net-2.7.8.dist-info → pygpt_net-2.7.10.dist-info}/LICENSE +1 -1
  109. {pygpt_net-2.7.8.dist-info → pygpt_net-2.7.10.dist-info}/METADATA +16 -64
  110. {pygpt_net-2.7.8.dist-info → pygpt_net-2.7.10.dist-info}/RECORD +112 -91
  111. {pygpt_net-2.7.8.dist-info → pygpt_net-2.7.10.dist-info}/WHEEL +0 -0
  112. {pygpt_net-2.7.8.dist-info → pygpt_net-2.7.10.dist-info}/entry_points.txt +0 -0
@@ -6,7 +6,7 @@
6
6
  # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
7
  # MIT License #
8
8
  # Created By : Marcin Szczygliński #
9
- # Updated Date: 2026.01.06 20:00:00 #
9
+ # Updated Date: 2026.01.07 23:00:00 #
10
10
  # ================================================== #
11
11
 
12
12
  import asyncio
@@ -19,7 +19,6 @@ from typing import Optional, Callable, Awaitable
19
19
  from urllib.parse import urlencode
20
20
 
21
21
  from pygpt_net.core.events import RealtimeEvent
22
- from pygpt_net.core.types import MODE_AUDIO
23
22
  from pygpt_net.item.ctx import CtxItem
24
23
  from pygpt_net.core.text.utils import has_unclosed_code_tag
25
24
 
@@ -33,9 +32,6 @@ from pygpt_net.core.realtime.shared.audio import (
33
32
  )
34
33
  from pygpt_net.core.realtime.shared.tools import (
35
34
  sanitize_function_tools,
36
- sanitize_remote_tools,
37
- prepare_tools_for_session,
38
- prepare_tools_for_response,
39
35
  tools_signature,
40
36
  build_tool_outputs_payload,
41
37
  )
@@ -50,15 +46,14 @@ class xAIIRealtimeClient:
50
46
  Key points:
51
47
  - A single background asyncio loop runs in its own thread for the lifetime of the client.
52
48
  - One websocket connection (session) at a time; multiple "turns" (send_turn) are serialized.
53
- - No server VAD: manual turn control via input_audio_buffer.* + response.create.
49
+ - Supports server VAD (auto-turn) and manual turn control (input_audio_buffer.* + response.create).
54
50
  - Safe to call run()/send_turn()/reset()/shutdown() from any thread or event loop.
55
51
 
56
52
  Session resumption:
57
53
  - The official Realtime API does not expose a documented server-side "resume" for closed WS sessions.
58
- We still persist the server-provided session.id and surface it via ctx.extra["rt_session_id"].
59
- - If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the
60
- connection and attempt to reconnect with a "session_id" query parameter. If that fails, we fall
61
- back to the standard URL to avoid breaking existing functionality.
54
+ We still persist the server-provided handle (session or conversation id) and surface it via ctx.extra["rt_session_id"].
55
+ If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the connection and attempt
56
+ to reconnect with a "session_id" query parameter. If that fails, we fall back to the standard URL.
62
57
  """
63
58
 
64
59
  WS_URL = "wss://api.x.ai/v1/realtime"
@@ -234,7 +229,7 @@ class xAIIRealtimeClient:
234
229
  ctx: Optional[CtxItem] = None,
235
230
  opts=None,
236
231
  on_text: Optional[Callable[[str], Awaitable[None]]] = None,
237
- on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
232
+ on_audio: Optional[Callable[[bytes, str], Awaitable[None]]] = None,
238
233
  should_stop: Optional[Callable[[], bool]] = None,
239
234
  timeout: float = 10.0,
240
235
  ):
@@ -274,6 +269,89 @@ class xAIIRealtimeClient:
274
269
  except Exception:
275
270
  pass
276
271
 
272
+ def _xai_tool_shape(self, tool: dict) -> dict:
273
+ """
274
+ Ensure xAI-compatible tool shape:
275
+ - function tools use top-level name/parameters (no nested "function" object)
276
+ - known provider tools: file_search (vector_store_ids), web_search, x_search
277
+ Unknown provider-only tools are dropped to avoid server-side validation issues.
278
+ """
279
+ try:
280
+ if not isinstance(tool, dict):
281
+ return tool
282
+
283
+ t = dict(tool)
284
+
285
+ # Convert OpenAI Realtime "function": {...} into xAI top-level form
286
+ if t.get("type") == "function":
287
+ if "function" in t and isinstance(t["function"], dict):
288
+ f = t["function"]
289
+ name = f.get("name") or t.get("name")
290
+ desc = f.get("description") or t.get("description") or ""
291
+ params = f.get("parameters") or t.get("parameters") or {"type": "object"}
292
+ return {
293
+ "type": "function",
294
+ "name": name,
295
+ "description": desc,
296
+ "parameters": params if isinstance(params, dict) else {"type": "object"},
297
+ }
298
+ # Already top-level form, return as-is
299
+ return {
300
+ "type": "function",
301
+ "name": t.get("name"),
302
+ "description": t.get("description") or "",
303
+ "parameters": t.get("parameters") or {"type": "object"},
304
+ }
305
+
306
+ # Map collections_search -> file_search
307
+ if t.get("type") == "collections_search":
308
+ vec_ids = t.get("collection_ids") or t.get("vector_store_ids") or []
309
+ max_num = t.get("max_num_results") if isinstance(t.get("max_num_results"), int) else None
310
+ out = {
311
+ "type": "file_search",
312
+ "vector_store_ids": vec_ids if isinstance(vec_ids, list) else [],
313
+ }
314
+ if max_num is not None:
315
+ out["max_num_results"] = max_num
316
+ return out
317
+
318
+ # Pass-through for known provider tools
319
+ if t.get("type") in ("file_search", "web_search", "x_search"):
320
+ return t
321
+
322
+ # code_interpreter is not documented for xAI Voice Agent; drop it
323
+ if t.get("type") == "code_interpreter":
324
+ return {}
325
+
326
+ return t
327
+ except Exception:
328
+ return tool
329
+
330
def _compose_xai_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> list:
    """
    Compose a single list of tools in xAI shape; filters out unsupported ones.

    Provider (remote) tools come first, then function tools. Each tool is passed
    through _xai_tool_shape(); results that are not non-empty dicts are skipped.

    :param tools: function tool definitions (may be None)
    :param remote_tools: provider tool definitions (may be None)
    :return: merged list of xAI-shaped tool dicts (possibly empty)
    """
    function_tools = tools or []
    provider_tools = remote_tools or []

    # Sanitize function tools with the shared helper first. This is best-effort:
    # if the helper fails or returns nothing, keep the raw list so that a sanitizer
    # problem does not discard every tool (provider tools included).
    try:
        function_tools = sanitize_function_tools(function_tools) or function_tools
    except Exception:
        pass

    out: list = []
    # Merge order: provider tools first (as in xAI docs), then function tools
    for group in (provider_tools, function_tools):
        try:
            for t in group:
                shaped = self._xai_tool_shape(t)
                if isinstance(shaped, dict) and shaped:
                    out.append(shaped)
        except Exception:
            # Malformed (e.g. non-iterable) group: skip it, keep what was collected
            continue
    return out
354
+
277
355
  # -----------------------------
278
356
  # Internal: background loop/dispatch
279
357
  # -----------------------------
@@ -327,20 +405,17 @@ class xAIIRealtimeClient:
327
405
  except Exception:
328
406
  pass
329
407
 
330
- # Build WS URL with model and optional session_id for best-effort resume
331
- base_q = {"model": model_id}
408
+ # Prefer plain WS URL; fallback to query-parameter variant
409
+ url_plain = self.WS_URL
410
+ q = {"model": model_id}
332
411
  if resume_sid:
333
- base_q["session_id"] = resume_sid # if unsupported by server, connect fallback will ignore
334
- url_with_sid = f"{self.WS_URL}?{urlencode(base_q)}"
335
- url_no_sid = f"{self.WS_URL}?{urlencode({'model': model_id})}"
412
+ q["session_id"] = resume_sid
413
+ url_with_q = f"{self.WS_URL}?{urlencode(q)}"
336
414
 
337
415
  headers = {
338
416
  "Authorization": f"Bearer {api_key}",
339
417
  }
340
418
 
341
- # Transcription toggle
342
- transcribe_enabled = bool(getattr(opts, "transcribe", False))
343
-
344
419
  # Save callbacks and context
345
420
  self._on_text = on_text
346
421
  self._on_audio = on_audio
@@ -355,11 +430,10 @@ class xAIIRealtimeClient:
355
430
  if self.debug:
356
431
  print(f"[open_session] owner_loop={id(asyncio.get_running_loop())}")
357
432
 
358
- # Connect WS: first try with session_id if provided; on failure, fall back to plain URL.
433
+ # Connect WS with robust fallback
359
434
  try:
360
- target_url = url_with_sid if resume_sid else url_no_sid
361
435
  self.ws = await websockets.connect(
362
- target_url,
436
+ url_plain,
363
437
  additional_headers=headers,
364
438
  max_size=16 * 1024 * 1024,
365
439
  ping_interval=20,
@@ -367,39 +441,53 @@ class xAIIRealtimeClient:
367
441
  close_timeout=5,
368
442
  )
369
443
  except Exception as e:
370
- if resume_sid and self.debug:
371
- print(f"[open_session] connect with session_id failed ({e!r}); falling back to plain URL")
372
- if resume_sid:
444
+ if self.debug:
445
+ print(f"[open_session] connect plain failed: {e!r}")
446
+ try:
373
447
  self.ws = await websockets.connect(
374
- url_no_sid,
448
+ url_with_q,
375
449
  additional_headers=headers,
376
450
  max_size=16 * 1024 * 1024,
377
451
  ping_interval=20,
378
452
  ping_timeout=20,
379
453
  close_timeout=5,
380
454
  )
455
+ except Exception as e2:
456
+ if self.debug:
457
+ print(f"[open_session] fallback connect failed: {e2!r}")
458
+ self.ws = None
459
+
460
+ if not self.ws:
461
+ raise RuntimeError("xAI Realtime: WebSocket connect failed")
462
+
381
463
  if self.debug:
382
464
  print("[open_session] WS connected")
383
465
 
384
- # Session payload (manual by default; prepared for auto)
466
+ # Session payload compatible with xAI Voice Agent
385
467
  session_payload = {
386
468
  "type": "session.update",
387
469
  "session": {
388
- "modalities": ["text", "audio"],
389
470
  "voice": voice,
390
- "input_audio_format": "pcm16",
391
- "output_audio_format": "pcm16",
392
- # turn_detection set below via apply_turn_mode_openai
393
- **({"instructions": str(getattr(opts, "system_prompt"))} if getattr(opts, "system_prompt", None) else {}),
471
+ "audio": {
472
+ "input": {"format": {"type": "audio/pcm", "rate": self._DEFAULT_RATE}},
473
+ "output": {"format": {"type": "audio/pcm", "rate": self._DEFAULT_RATE}},
474
+ },
394
475
  },
395
476
  }
477
+ if getattr(opts, "system_prompt", None):
478
+ session_payload["session"]["instructions"] = str(getattr(opts, "system_prompt"))
479
+
480
+ # Turn detection (server VAD) or manual turns
396
481
  turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
397
482
  apply_turn_mode_openai(session_payload, turn_mode)
398
483
  self._tune_openai_vad(session_payload, opts)
399
484
 
400
- # Attach tools to session (remote + functions)
485
+ # Attach tools to session (xAI expects tools only in session.update)
401
486
  try:
402
- session_tools = prepare_tools_for_session(opts)
487
+ session_tools = self._compose_xai_tools(
488
+ getattr(opts, "tools", None),
489
+ getattr(opts, "remote_tools", None),
490
+ )
403
491
  if session_tools:
404
492
  session_payload["session"]["tools"] = session_tools
405
493
  self._cached_session_tools_sig = tools_signature(session_tools)
@@ -412,17 +500,6 @@ class xAIIRealtimeClient:
412
500
  print(f"[open_session] tools sanitize error: {_e}")
413
501
  self._cached_session_tools_sig = tools_signature([])
414
502
 
415
- # Attach native input transcription if requested
416
- try:
417
- if transcribe_enabled:
418
- iat = {"model": "whisper-1"}
419
- lang = getattr(opts, "transcribe_language", None) or getattr(opts, "language", None)
420
- if lang:
421
- iat["language"] = str(lang)
422
- session_payload["session"]["input_audio_transcription"] = iat
423
- except Exception:
424
- pass
425
-
426
503
  if self.debug:
427
504
  print(f"[open_session] session_payload: {json.dumps(session_payload)}")
428
505
 
@@ -542,9 +619,11 @@ class xAIIRealtimeClient:
542
619
  return False
543
620
 
544
621
  is_auto_turn = _bool(getattr(self._last_opts or object(), "auto_turn", False))
545
- has_text = bool(prompt and str(prompt).strip() and str(prompt).strip() != "...")
622
+ has_text = False
623
+ if prompt is not None:
624
+ p = str(prompt).strip()
625
+ has_text = bool(p and p != "...")
546
626
  has_audio = bool(audio_data)
547
- # Honor explicit "reply" hint if provided by caller (e.g., opts.extra.reply == True)
548
627
  reply_hint = False
549
628
  try:
550
629
  extra = getattr(self._last_opts, "extra", None)
@@ -553,7 +632,6 @@ class xAIIRealtimeClient:
553
632
  except Exception:
554
633
  pass
555
634
 
556
- # In manual mode, do not auto-trigger response.create when there is no user input and no explicit reply request.
557
635
  if not has_text and not has_audio and not reply_hint:
558
636
  if self.debug:
559
637
  print("[send_turn] skipped: manual mode with empty input; waiting for explicit commit")
@@ -580,7 +658,7 @@ class xAIIRealtimeClient:
580
658
  },
581
659
  }))
582
660
 
583
- # Optional audio
661
+ # Optional audio (manual turn control flow)
584
662
  if has_audio:
585
663
  sr, _ch, pcm = coerce_to_pcm16_mono(audio_data, audio_format, audio_rate, fallback_rate=self._DEFAULT_RATE)
586
664
 
@@ -594,7 +672,7 @@ class xAIIRealtimeClient:
594
672
  if self.debug:
595
673
  print(f"[audio] resample failed {sr}->{self._DEFAULT_RATE}: {e}")
596
674
 
597
- await self.ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
675
+ # Append PCM and commit input buffer
598
676
  for chunk in iter_pcm_chunks(pcm, sr, ms=50):
599
677
  if not chunk:
600
678
  continue
@@ -623,23 +701,11 @@ class xAIIRealtimeClient:
623
701
  self._response_done = asyncio.Event()
624
702
  wait_curr = self._response_done # snapshot for race-free waiting
625
703
 
626
- # Build optional response payload (modalities + tools/tool_choice)
627
- resp_obj = {"modalities": ["text", "audio"]}
628
- try:
629
- resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
630
- if resp_tools:
631
- resp_obj["tools"] = resp_tools
632
- if tool_choice is None:
633
- tool_choice = "auto"
634
- if tool_choice:
635
- resp_obj["tool_choice"] = tool_choice
636
- except Exception as _e:
637
- if self.debug:
638
- print(f"[send_turn] response tools compose error: {_e}")
639
-
640
- payload = {"type": "response.create"}
641
- if len(resp_obj) > 0:
642
- payload["response"] = resp_obj
704
+ # Build minimal response payload for xAI (tools are configured only via session.update)
705
+ payload = {
706
+ "type": "response.create",
707
+ "response": {"modalities": ["text", "audio"]},
708
+ }
643
709
 
644
710
  await self.ws.send(json.dumps(payload))
645
711
  if self.debug:
@@ -694,7 +760,6 @@ class xAIIRealtimeClient:
694
760
  """
695
761
  Owner-loop implementation: push live audio to input buffer in auto-turn mode.
696
762
  """
697
- # Session must be open and auto-turn must be enabled
698
763
  if not self.ws or not self._running:
699
764
  if self.debug:
700
765
  print("[_rt_handle_audio_input] Socket not open!")
@@ -727,7 +792,6 @@ class xAIIRealtimeClient:
727
792
  pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
728
793
  sr = self._DEFAULT_RATE
729
794
  except Exception:
730
- # On resample failure, still try to send raw chunk as-is (defensive)
731
795
  sr = self._DEFAULT_RATE
732
796
  except Exception:
733
797
  return
@@ -749,14 +813,10 @@ class xAIIRealtimeClient:
749
813
  except Exception:
750
814
  return
751
815
 
752
- # If plugin reported stream end, flush the buffer once.
816
+ # With server VAD enabled, the server commits the buffer automatically.
753
817
  if is_final:
754
- try:
755
- if self.debug:
756
- print("[_rt_handle_audio_input] final chunk; committing")
757
- await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
758
- except Exception:
759
- pass
818
+ if self.debug:
819
+ print("[_rt_handle_audio_input] final chunk sent (server VAD will commit)")
760
820
 
761
821
  def commit_audio_input_sync(self, timeout: float = 0.5):
762
822
  """
@@ -827,22 +887,12 @@ class xAIIRealtimeClient:
827
887
  except Exception:
828
888
  self._response_done = asyncio.Event()
829
889
 
830
- # 3) Build response payload (modalities + tools/tool_choice like in _send_turn_internal)
831
- resp_obj = {"modalities": ["text", "audio"]}
832
- try:
833
- resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
834
- if resp_tools:
835
- resp_obj["tools"] = resp_tools
836
- if tool_choice is None:
837
- tool_choice = "auto"
838
- if tool_choice:
839
- resp_obj["tool_choice"] = tool_choice
840
- except Exception:
841
- pass
842
-
843
- # 4) Trigger the assistant response now
890
+ # 3) Trigger the assistant response now
844
891
  try:
845
- await self.ws.send(json.dumps({"type": "response.create", "response": resp_obj}))
892
+ await self.ws.send(json.dumps({
893
+ "type": "response.create",
894
+ "response": {"modalities": ["text", "audio"]},
895
+ }))
846
896
  except Exception:
847
897
  return
848
898
 
@@ -893,11 +943,12 @@ class xAIIRealtimeClient:
893
943
  print("[update_session_tools] WS not open; cached for next session")
894
944
  return
895
945
 
896
- # Sanitize/compose session tools
946
+ # Compose xAI-shaped session tools (provider tools + function tools)
897
947
  try:
898
- fn = sanitize_function_tools(tools if tools is not None else getattr(self._last_opts, "tools", None))
899
- rt = sanitize_remote_tools(remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None))
900
- session_tools = (rt or []) + (fn or [])
948
+ session_tools = self._compose_xai_tools(
949
+ tools if tools is not None else getattr(self._last_opts, "tools", None),
950
+ remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None),
951
+ )
901
952
  except Exception as e:
902
953
  if self.debug:
903
954
  print(f"[update_session_tools] sanitize error: {e}")
@@ -990,7 +1041,7 @@ class xAIIRealtimeClient:
990
1041
  "item": {
991
1042
  "type": "function_call_output",
992
1043
  "call_id": it["call_id"],
993
- "output": it["output"], # must be a string (JSON-encoded when dict/list)
1044
+ "output": it["output"],
994
1045
  },
995
1046
  }
996
1047
  if it.get("previous_item_id"):
@@ -1007,7 +1058,10 @@ class xAIIRealtimeClient:
1007
1058
  except Exception:
1008
1059
  self._response_done = asyncio.Event()
1009
1060
  wait_ev = self._response_done # snapshot for race-free waiting
1010
- await self.ws.send(json.dumps({"type": "response.create"}))
1061
+ await self.ws.send(json.dumps({
1062
+ "type": "response.create",
1063
+ "response": {"modalities": ["text", "audio"]},
1064
+ }))
1011
1065
 
1012
1066
  # Wait for the follow-up response to complete
1013
1067
  if continue_turn and wait_for_done and wait_ev:
@@ -1047,7 +1101,6 @@ class xAIIRealtimeClient:
1047
1101
  break
1048
1102
 
1049
1103
  if isinstance(raw, bytes):
1050
- # Realtime sends JSON text frames; ignore unexpected binary
1051
1104
  continue
1052
1105
 
1053
1106
  try:
@@ -1057,7 +1110,7 @@ class xAIIRealtimeClient:
1057
1110
 
1058
1111
  etype = ev.get("type")
1059
1112
 
1060
- # ---- session lifecycle (capture server handle) ----
1113
+ # ---- session / conversation lifecycle ----
1061
1114
  if etype in ("session.created", "session.updated"):
1062
1115
  sess = ev.get("session") or {}
1063
1116
  sid = sess.get("id")
@@ -1066,7 +1119,6 @@ class xAIIRealtimeClient:
1066
1119
  set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
1067
1120
  if self.debug:
1068
1121
  print(f"[_recv_loop] session id: {self._rt_session_id}")
1069
- # Optional: expires_at if present (not always provided)
1070
1122
  exp = sess.get("expires_at") or sess.get("expiresAt")
1071
1123
  try:
1072
1124
  if isinstance(exp, (int, float)) and exp > 0:
@@ -1076,6 +1128,16 @@ class xAIIRealtimeClient:
1076
1128
  pass
1077
1129
  continue
1078
1130
 
1131
+ if etype == "conversation.created":
1132
+ conv = ev.get("conversation") or {}
1133
+ cid = conv.get("id")
1134
+ if isinstance(cid, str) and cid.strip():
1135
+ self._rt_session_id = cid.strip()
1136
+ set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
1137
+ if self.debug:
1138
+ print(f"[_recv_loop] conversation id: {self._rt_session_id}")
1139
+ continue
1140
+
1079
1141
  if etype == "response.created":
1080
1142
  if self.debug:
1081
1143
  print("[_recv_loop] response created")
@@ -1091,16 +1153,18 @@ class xAIIRealtimeClient:
1091
1153
  if self.debug:
1092
1154
  print("[_recv_loop] speech_stopped")
1093
1155
 
1094
- elif etype == "input_audio_buffer.committed":
1156
+ elif etype in ("conversation.item.committed", "input_audio_buffer.committed"):
1095
1157
  if self.debug:
1096
- print("[_recv_loop] audio_buffer.committed")
1097
-
1098
- # disable mic input if auto-commit
1158
+ print("[_recv_loop] audio_buffer committed")
1099
1159
  if self._last_opts:
1100
1160
  self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {
1101
1161
  "ctx": self._ctx,
1102
1162
  }))
1103
1163
 
1164
+ elif etype == "input_audio_buffer.cleared":
1165
+ if self.debug:
1166
+ print("[_recv_loop] audio_buffer.cleared")
1167
+
1104
1168
  # ---- input transcription (user speech) ----
1105
1169
  elif etype == "conversation.item.input_audio_transcription.delta":
1106
1170
  if self._transcribe_enabled():
@@ -1131,15 +1195,9 @@ class xAIIRealtimeClient:
1131
1195
  if tr:
1132
1196
  self._save_input_transcript(tr)
1133
1197
 
1134
- elif etype == "conversation.item.input_audio_transcription.failed":
1135
- if self.debug:
1136
- err = (ev.get("error") or {}).get("message") or "input transcription failed"
1137
- print(f"[_recv_loop] {err}")
1138
-
1139
- elif etype == "conversation.item.created":
1198
+ elif etype in ("conversation.item.created", "conversation.item.added"):
1140
1199
  if self.debug:
1141
- print("[_recv_loop] conversation.item.created")
1142
- # Fallback: some servers may include transcript inside the created user item
1200
+ print("[_recv_loop] conversation item event")
1143
1201
  if self._transcribe_enabled():
1144
1202
  item = ev.get("item") or {}
1145
1203
  if item.get("role") == "user":
@@ -1161,7 +1219,8 @@ class xAIIRealtimeClient:
1161
1219
  await self._on_text(str(delta))
1162
1220
  except Exception:
1163
1221
  pass
1164
- elif etype == "response.audio_transcript.delta":
1222
+
1223
+ elif etype in ("response.audio_transcript.delta", "response.output_audio_transcript.delta"):
1165
1224
  if self._transcribe_enabled():
1166
1225
  delta = ev.get("delta") or ev.get("text")
1167
1226
  if isinstance(delta, dict) and "text" in delta:
@@ -1174,9 +1233,10 @@ class xAIIRealtimeClient:
1174
1233
  except Exception:
1175
1234
  pass
1176
1235
 
1177
- elif etype in ("response.text.done", "response.output_text.done", "response.audio_transcript.done"):
1236
+ elif etype in ("response.text.done", "response.output_text.done",
1237
+ "response.audio_transcript.done", "response.output_audio_transcript.done"):
1178
1238
  if self.debug:
1179
- print("[_recv_loop] text done")
1239
+ print("[_recv_loop] text/transcript done")
1180
1240
 
1181
1241
  elif etype == "response.content_part.added":
1182
1242
  part = ev.get("part") or {}
@@ -1207,7 +1267,7 @@ class xAIIRealtimeClient:
1207
1267
  except Exception:
1208
1268
  pass
1209
1269
 
1210
- elif etype == "response.audio.delta":
1270
+ elif etype in ("response.audio.delta", "response.output_audio.delta"):
1211
1271
  b64 = ev.get("delta")
1212
1272
  if b64 and self._on_audio:
1213
1273
  try:
@@ -1216,7 +1276,7 @@ class xAIIRealtimeClient:
1216
1276
  except Exception:
1217
1277
  pass
1218
1278
 
1219
- elif etype == "response.audio.done":
1279
+ elif etype in ("response.audio.done", "response.output_audio.done"):
1220
1280
  if self.debug:
1221
1281
  print("[_recv_loop] audio done")
1222
1282
  if not audio_done and self._on_audio:
@@ -1358,7 +1418,6 @@ class xAIIRealtimeClient:
1358
1418
  elif etype == "response.done":
1359
1419
  if self.debug:
1360
1420
  print("[_recv_loop] response done")
1361
- # Ensure audio finalized
1362
1421
  if not audio_done and self._on_audio:
1363
1422
  try:
1364
1423
  await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
@@ -1368,14 +1427,12 @@ class xAIIRealtimeClient:
1368
1427
 
1369
1428
  self._response_active = False
1370
1429
 
1371
- # Capture usage if present on response
1372
1430
  try:
1373
1431
  resp_obj = ev.get("response") or {}
1374
1432
  self._rt_capture_usage(resp_obj)
1375
1433
  except Exception:
1376
1434
  pass
1377
1435
 
1378
- # Build final output text
1379
1436
  output = "".join(self._rt_state["output_parts"]) if self._rt_state else ""
1380
1437
  if has_unclosed_code_tag(output):
1381
1438
  output += "\n```"
@@ -1387,7 +1444,6 @@ class xAIIRealtimeClient:
1387
1444
  except Exception:
1388
1445
  pass
1389
1446
 
1390
- # Persist into ctx
1391
1447
  try:
1392
1448
  if self._ctx:
1393
1449
  self._ctx.output = output or (self._ctx.output or "")
@@ -1413,7 +1469,6 @@ class xAIIRealtimeClient:
1413
1469
  except Exception:
1414
1470
  pass
1415
1471
 
1416
- # Citations
1417
1472
  if self._rt_state and self._rt_state["citations"]:
1418
1473
  if self._ctx.urls is None:
1419
1474
  self._ctx.urls = []
@@ -1421,7 +1476,6 @@ class xAIIRealtimeClient:
1421
1476
  if u not in self._ctx.urls:
1422
1477
  self._ctx.urls.append(u)
1423
1478
 
1424
- # Images
1425
1479
  if self._rt_state and self._rt_state["image_paths"]:
1426
1480
  if not isinstance(self._ctx.images, list):
1427
1481
  self._ctx.images = []
@@ -1433,7 +1487,6 @@ class xAIIRealtimeClient:
1433
1487
  except Exception:
1434
1488
  pass
1435
1489
 
1436
- # Download container files if any
1437
1490
  try:
1438
1491
  files = (self._rt_state or {}).get("files") or []
1439
1492
  if files:
@@ -1441,7 +1494,6 @@ class xAIIRealtimeClient:
1441
1494
  except Exception:
1442
1495
  pass
1443
1496
 
1444
- # Unpack tool calls if any
1445
1497
  try:
1446
1498
  tcs = (self._rt_state or {}).get("tool_calls") or []
1447
1499
  if tcs:
@@ -1456,7 +1508,6 @@ class xAIIRealtimeClient:
1456
1508
  except Exception:
1457
1509
  pass
1458
1510
 
1459
- # Persist last tool calls snapshot for mapping tool outputs
1460
1511
  try:
1461
1512
  tcs = (self._rt_state or {}).get("tool_calls") or []
1462
1513
  if tcs:
@@ -1464,23 +1515,19 @@ class xAIIRealtimeClient:
1464
1515
  except Exception:
1465
1516
  pass
1466
1517
 
1467
- # Unblock waiters
1468
1518
  if self._response_done:
1469
1519
  self._response_done.set()
1470
1520
 
1471
- # send RT_OUTPUT_TURN_END signal
1472
1521
  if self._last_opts:
1473
1522
  self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
1474
1523
  "ctx": self._ctx,
1475
1524
  }))
1476
1525
 
1477
- # Reset per-response extraction state
1478
1526
  self._rt_state = None
1479
1527
 
1480
1528
  elif etype == "error":
1481
1529
  if self.debug:
1482
1530
  print(f"[_recv_loop] error event: {ev}")
1483
- # Session expiration and other errors
1484
1531
  err = ev.get("error") or {}
1485
1532
  msg = (err.get("message") or "")
1486
1533
  code = (err.get("code") or "")
@@ -1505,7 +1552,6 @@ class xAIIRealtimeClient:
1505
1552
  finally:
1506
1553
  if self.debug:
1507
1554
  print("[_recv_loop] stopped")
1508
- # Ensure any waiters are unblocked on socket teardown
1509
1555
  try:
1510
1556
  if self._response_done and not self._response_done.is_set():
1511
1557
  self._response_done.set()
@@ -1533,7 +1579,7 @@ class xAIIRealtimeClient:
1533
1579
  return str(v)
1534
1580
  except Exception:
1535
1581
  pass
1536
- return "alloy"
1582
+ return "Ara"
1537
1583
 
1538
1584
  def _extract_text_from_response_done(self, ev: dict) -> str:
1539
1585
  """
@@ -1652,7 +1698,7 @@ class xAIIRealtimeClient:
1652
1698
  if self._ctx:
1653
1699
  if not isinstance(self._ctx.extra, dict):
1654
1700
  self._ctx.extra = {}
1655
- self._ctx.input.extra["input_transcript"] = str(transcript)
1701
+ self._ctx.extra["input_transcript"] = str(transcript)
1656
1702
  if not getattr(self._last_opts, "prompt", None):
1657
1703
  self._ctx.input = str(transcript)
1658
1704
  self.window.core.ctx.update_item(self._ctx)
@@ -1667,18 +1713,15 @@ class xAIIRealtimeClient:
1667
1713
  sess = session_payload.get("session") or {}
1668
1714
  td = sess.get("turn_detection")
1669
1715
  if not isinstance(td, dict):
1670
- return # manual mode or VAD disabled
1716
+ return
1671
1717
 
1672
- # Resolve target silence (default +2000 ms)
1673
1718
  target_ms = getattr(opts, "vad_end_silence_ms", None)
1674
1719
  if not isinstance(target_ms, (int, float)) or target_ms <= 0:
1675
- # If user didn't override, ensure at least 2000 ms
1676
1720
  base = int(td.get("silence_duration_ms") or 500)
1677
1721
  target_ms = max(base, 2000)
1678
1722
 
1679
1723
  td["silence_duration_ms"] = int(target_ms)
1680
1724
 
1681
- # Optional: prefix padding before detected speech
1682
1725
  prefix_ms = getattr(opts, "vad_prefix_padding_ms", None)
1683
1726
  if isinstance(prefix_ms, (int, float)) and prefix_ms >= 0:
1684
1727
  td["prefix_padding_ms"] = int(prefix_ms)
@@ -1736,16 +1779,14 @@ class xAIIRealtimeClient:
1736
1779
 
1737
1780
  async with self._send_lock:
1738
1781
  try:
1739
- # Build base session.update; let helper set correct turn_detection shape
1740
1782
  payload: dict = {"type": "session.update", "session": {}}
1741
1783
  turn_mode = TurnMode.AUTO if enabled else TurnMode.MANUAL
1742
- apply_turn_mode_openai(payload, turn_mode) # sets session.turn_detection (AUTO) or None (MANUAL)
1784
+ apply_turn_mode_openai(payload, turn_mode)
1743
1785
 
1744
1786
  if enabled:
1745
1787
  sess = payload.get("session", {})
1746
1788
  td = sess.get("turn_detection")
1747
1789
 
1748
- # Optional VAD type override via opts.vad_type ("server_vad" | "semantic_vad")
1749
1790
  try:
1750
1791
  vad_type = getattr(self._last_opts, "vad_type", None)
1751
1792
  if isinstance(vad_type, str) and vad_type in ("server_vad", "semantic_vad"):
@@ -1754,7 +1795,6 @@ class xAIIRealtimeClient:
1754
1795
  except Exception:
1755
1796
  pass
1756
1797
 
1757
- # Optional threshold for server_vad
1758
1798
  try:
1759
1799
  thr = getattr(self._last_opts, "vad_threshold", None)
1760
1800
  if isinstance(thr, (int, float)) and isinstance(td, dict) and td.get("type") == "server_vad":
@@ -1762,17 +1802,14 @@ class xAIIRealtimeClient:
1762
1802
  except Exception:
1763
1803
  pass
1764
1804
 
1765
- # Apply defaults based on opts first
1766
1805
  self._tune_openai_vad(payload, self._last_opts)
1767
1806
 
1768
- # Then hard-override with explicit args (user provided values win)
1769
1807
  if isinstance(td, dict):
1770
1808
  if silence_ms is not None:
1771
1809
  td["silence_duration_ms"] = int(silence_ms)
1772
1810
  if prefix_ms is not None:
1773
1811
  td["prefix_padding_ms"] = int(prefix_ms)
1774
1812
 
1775
- # Optional flags from opts
1776
1813
  try:
1777
1814
  cr = getattr(self._last_opts, "vad_create_response", None)
1778
1815
  if isinstance(cr, bool):
@@ -1786,10 +1823,8 @@ class xAIIRealtimeClient:
1786
1823
  except Exception:
1787
1824
  pass
1788
1825
 
1789
- # Send the update
1790
1826
  await self.ws.send(json.dumps(payload))
1791
1827
 
1792
- # Update local opts snapshot so next calls keep the same settings
1793
1828
  try:
1794
1829
  if self._last_opts:
1795
1830
  setattr(self._last_opts, "auto_turn", bool(enabled))
@@ -1820,6 +1855,10 @@ class xAIIRealtimeClient:
1820
1855
  """Check if the WS session is currently open."""
1821
1856
  return self.ws is not None and self._running
1822
1857
 
1858
+ def is_session(self) -> bool:
1859
+ """Check if the WS session is currently open."""
1860
+ return self.ws is not None
1861
+
1823
1862
  def update_ctx(self, ctx: CtxItem):
1824
1863
  """Update the current CtxItem (for session handle persistence)."""
1825
1864
  self._ctx = ctx