solana-agent 31.2.6-py3-none-any.whl → 31.3.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- solana_agent/adapters/openai_realtime_ws.py +160 -31
- solana_agent/client/solana_agent.py +7 -1
- solana_agent/interfaces/client/client.py +3 -1
- solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent/interfaces/providers/realtime.py +113 -1
- solana_agent/interfaces/services/query.py +3 -1
- solana_agent/services/query.py +422 -107
- solana_agent/services/realtime.py +123 -17
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/METADATA +115 -9
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/RECORD +13 -12
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/LICENSE +0 -0
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/WHEEL +0 -0
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/entry_points.txt +0 -0
solana_agent/services/query.py
CHANGED
@@ -37,6 +37,8 @@ from solana_agent.interfaces.services.knowledge_base import (
 )
 from solana_agent.interfaces.guardrails.guardrails import InputGuardrail
 
+from solana_agent.interfaces.providers.realtime import RealtimeSessionOptions
+
 from solana_agent.services.agent import AgentService
 from solana_agent.services.routing import RoutingService
 
@@ -94,6 +96,7 @@ class QueryService(QueryServiceInterface):
         encode_out: bool,
         audio_input_format: str,
         audio_output_format: str,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
     ) -> Any:
         """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
 
@@ -148,6 +151,7 @@ class QueryService(QueryServiceInterface):
             output_rate_hz=24000,
             input_mime="audio/pcm",
             output_mime="audio/pcm",
+            output_modalities=rt_output_modalities,
             tools=initial_tools or None,
             tool_choice="auto",
         )
@@ -187,9 +191,7 @@ class QueryService(QueryServiceInterface):
     ) -> None:
         self._sticky_sessions[user_id] = {
             "agent": agent_name,
-            "started_at":
-                "started_at", time.time()
-            ),
+            "started_at": time.time(),
             "last_updated": time.time(),
             "required_complete": required_complete,
         }
@@ -201,6 +203,13 @@ class QueryService(QueryServiceInterface):
             self._sticky_sessions[user_id]["required_complete"] = required_complete
             self._sticky_sessions[user_id]["last_updated"] = time.time()
 
+    def _clear_sticky_agent(self, user_id: str) -> None:
+        if user_id in self._sticky_sessions:
+            try:
+                del self._sticky_sessions[user_id]
+            except Exception:
+                pass
+
     async def _build_combined_context(
         self,
         user_id: str,
@@ -514,6 +523,7 @@ class QueryService(QueryServiceInterface):
         vad: Optional[bool] = None,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",
@@ -526,6 +536,12 @@ class QueryService(QueryServiceInterface):
             "shimmer",
             "verse",
         ] = "marin",
+        # Realtime transcription configuration (new)
+        rt_transcription_model: Optional[str] = None,
+        rt_transcription_language: Optional[str] = None,
+        rt_transcription_prompt: Optional[str] = None,
+        rt_transcription_noise_reduction: Optional[bool] = None,
+        rt_transcription_include_logprobs: bool = False,
         audio_voice: Literal[
             "alloy",
             "ash",
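The new rt_transcription_* keyword arguments configure the realtime session's built-in transcription, replacing the HTTP STT pass removed later in this diff. A minimal caller sketch; the `query_service` object and the `process` method name are assumptions (the calling entry point is not shown in this diff), while the rt_* parameter names come from the hunk above:

```python
# Hypothetical usage sketch; only the rt_* parameter names are taken from the diff.
async def run_text_only_turn(query_service, audio_bytes: bytes) -> None:
    async for chunk in query_service.process(  # method name assumed
        user_id="user-1",
        query=audio_bytes,                      # raw compressed or PCM audio
        realtime=True,
        rt_output_modalities=["text"],          # text-only realtime turn
        rt_transcription_model="gpt-4o-mini-transcribe",
        rt_transcription_language="en",
        rt_transcription_noise_reduction=True,
    ):
        print(chunk)                            # incremental text chunks
```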
@@ -554,31 +570,13 @@ class QueryService(QueryServiceInterface):
         try:
             # Realtime request: HTTP STT for user + single WS for assistant audio
             if realtime:
-                # 1)
+                # 1) Determine if input is audio bytes. We now ALWAYS skip HTTP STT in realtime mode.
+                # The realtime websocket session (optionally with built-in transcription) is authoritative.
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
-                user_text = ""
-
-                if is_audio_bytes:
-
-                    async def _stt_consume():
-                        txt = ""
-                        try:
-                            logger.info(
-                                f"Realtime(HTTP STT): transcribing format: {audio_input_format}"
-                            )
-                            async for (
-                                t
-                            ) in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
-                                query, audio_input_format
-                            ):
-                                txt += t
-                        except Exception as e:
-                            logger.error(f"HTTP STT error: {e}")
-                        return txt
-
-                    stt_task = asyncio.create_task(_stt_consume())
-                else:
-                    user_text = str(query)
+                user_text = "" if is_audio_bytes else str(query)
+                # Provide a sensible default realtime transcription model when audio supplied
+                if is_audio_bytes and not rt_transcription_model:
+                    rt_transcription_model = "gpt-4o-mini-transcribe"
 
                 # 2) Single agent selection (no multi-agent routing in realtime path)
                 agent_name = self._get_sticky_agent(user_id)
@@ -693,6 +691,17 @@ class QueryService(QueryServiceInterface):
                 encode_out = bool(
                     rt_encode_output or (audio_output_format.lower() != "pcm")
                 )
+                # If caller explicitly requests text-only realtime, disable output encoding entirely
+                if (
+                    rt_output_modalities is not None
+                    and "audio" not in rt_output_modalities
+                ):
+                    if encode_out:
+                        logger.debug(
+                            "Realtime(QueryService): forcing encode_out False for text-only modalities=%s",
+                            rt_output_modalities,
+                        )
+                    encode_out = False
                 # Choose input transcoding when compressed input is provided (or explicitly requested)
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
                 encode_in = bool(
@@ -700,7 +709,8 @@ class QueryService(QueryServiceInterface):
                     or (is_audio_bytes and audio_input_format.lower() != "pcm")
                 )
 
-                # Allocate or reuse a realtime session for this specific request/user
+                # Allocate or reuse a realtime session for this specific request/user.
+                # (Transcription options may be applied below; if they change after allocate we will reconfigure.)
                 rt = await self._alloc_realtime_session(
                     user_id,
                     api_key=api_key,
@@ -711,9 +721,46 @@ class QueryService(QueryServiceInterface):
                     encode_out=encode_out,
                     audio_input_format=audio_input_format,
                     audio_output_format=audio_output_format,
+                    rt_output_modalities=rt_output_modalities,
                 )
                 # Ensure lock is released no matter what
                 try:
+                    # --- Apply realtime transcription config BEFORE connecting (new) ---
+                    if rt_transcription_model and hasattr(rt, "_options"):
+                        try:
+                            setattr(
+                                rt._options,
+                                "transcription_model",
+                                rt_transcription_model,
+                            )
+                            if rt_transcription_language is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_language",
+                                    rt_transcription_language,
+                                )
+                            if rt_transcription_prompt is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_prompt",
+                                    rt_transcription_prompt,
+                                )
+                            if rt_transcription_noise_reduction is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_noise_reduction",
+                                    rt_transcription_noise_reduction,
+                                )
+                            if rt_transcription_include_logprobs:
+                                setattr(
+                                    rt._options, "transcription_include_logprobs", True
+                                )
+                        except Exception:
+                            logger.debug(
+                                "Failed pre-connect transcription option assignment",
+                                exc_info=True,
+                            )
+
                     # Tool executor
                     async def _exec(
                         tool_name: str, args: Dict[str, Any]
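The block above assigns the transcription fields onto the session's private `_options` object before the websocket connects, so they ride along in the initial session configuration. A self-contained sketch of that pattern; the field names come from this diff, but the dataclass shape of `RealtimeSessionOptions` and the helper are assumptions:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RealtimeSessionOptions:  # shape assumed; field names from this diff
    transcription_model: Optional[str] = None
    transcription_language: Optional[str] = None
    transcription_prompt: Optional[str] = None
    transcription_noise_reduction: Optional[bool] = None
    transcription_include_logprobs: bool = False

def apply_transcription_options(opts: RealtimeSessionOptions, **overrides) -> None:
    # Assign only the fields the caller actually supplied (None means "keep default").
    for name, value in overrides.items():
        if value is not None and hasattr(opts, name):
            setattr(opts, name, value)

opts = RealtimeSessionOptions()
apply_transcription_options(
    opts,
    transcription_model="gpt-4o-mini-transcribe",
    transcription_language="en",
)
assert opts.transcription_model == "gpt-4o-mini-transcribe"
assert opts.transcription_noise_reduction is None  # untouched
```

Mutating options before connect avoids a second session.update round-trip after the socket opens.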
@@ -755,16 +802,47 @@ class QueryService(QueryServiceInterface):
                     except Exception:
                         pass
 
-                    #
+                    # Begin streaming turn (defer user transcript persistence until final to avoid duplicates)
                     turn_id = await self.realtime_begin_turn(user_id)
-
-
-
-
-
+                    # We'll buffer the full user transcript (text input or realtime audio transcription) and persist exactly once.
+                    # Initialize empty; we'll build it strictly from realtime transcript segments to avoid
+                    # accidental duplication with pre-supplied user_text or prior buffers.
+                    final_user_tr: str = ""
+                    user_persisted = False
+
+                    # Feed audio into WS if audio bytes provided and audio modality requested; else treat as text
+                    wants_audio = (
+                        (
+                            getattr(rt, "_options", None)
+                            and getattr(rt, "_options").output_modalities
+                        )
+                        and "audio" in getattr(rt, "_options").output_modalities  # type: ignore[attr-defined]
+                    ) or (
+                        rt_output_modalities is None
+                        or (rt_output_modalities and "audio" in rt_output_modalities)
+                    )
+                    # Determine if realtime transcription should be enabled (always skip HTTP STT regardless)
+                    # realtime_transcription_enabled now implicit (options set before connect)
 
-
-
+                    if is_audio_bytes and not wants_audio:
+                        # Feed audio solely for transcription (no audio output requested)
+                        bq = bytes(query)
+                        logger.info(
+                            "Realtime: appending input audio for transcription only, len=%d, fmt=%s",
+                            len(bq),
+                            audio_input_format,
+                        )
+                        await rt.append_audio(bq)
+                        vad_enabled_value = bool(vad) if vad is not None else False
+                        if not vad_enabled_value:
+                            await rt.commit_input()
+                            # Request only text response
+                            await rt.create_response({"modalities": ["text"]})
+                        else:
+                            logger.debug(
+                                "Realtime: VAD enabled (text-only output) — skipping manual response.create"
+                            )
+                    if is_audio_bytes and wants_audio:
                         bq = bytes(query)
                         logger.info(
                             "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
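The `wants_audio` expression above resolves the output modality from two sources: the session's configured `output_modalities` and the per-call `rt_output_modalities` override, defaulting to audio when neither excludes it. A reduced sketch of the same truth table; the standalone function is hypothetical, the precedence mirrors the diff:

```python
from typing import List, Optional

def wants_audio(session_modalities: Optional[List[str]],
                override: Optional[List[str]]) -> bool:
    # Session options naming "audio" win outright, mirroring the diff's logic.
    if session_modalities and "audio" in session_modalities:
        return True
    # No per-call override: realtime defaults to audio output.
    if override is None:
        return True
    return "audio" in override

assert wants_audio(None, None) is True                    # default is audio
assert wants_audio(None, ["text"]) is False               # explicit text-only
assert wants_audio(["audio", "text"], ["text"]) is True   # session config dominates
```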
@@ -782,64 +860,329 @@ class QueryService(QueryServiceInterface):
                            logger.debug(
                                "Realtime: VAD enabled — skipping manual response.create"
                            )
-                    else:
-                        #
-                        await rt.
+                    else:  # Text-only path OR caller excluded audio modality
+                        # For text input, create conversation item first, then response
+                        await rt.create_conversation_item(
                             {
-                                "
-                                "
+                                "type": "message",
+                                "role": "user",
+                                "content": [
                                     {"type": "input_text", "text": user_text or ""}
                                 ],
                             }
                         )
+                        # Determine effective modalities (fall back to provided override or text only)
+                        if rt_output_modalities is not None:
+                            modalities = rt_output_modalities or ["text"]
+                        else:
+                            mo = getattr(
+                                rt, "_options", RealtimeSessionOptions()
+                            ).output_modalities
+                            modalities = mo if mo else ["audio"]
+                        if "audio" not in modalities:
+                            # Ensure we do not accidentally request audio generation
+                            modalities = [m for m in modalities if m == "text"] or [
+                                "text"
+                            ]
+                        await rt.create_response(
+                            {
+                                "modalities": modalities,
+                            }
+                        )
 
                     # Collect audio and transcripts
-                    user_tr = ""
+                    user_tr = ""  # Accumulates realtime input transcript segments (audio path)
                     asst_tr = ""
 
+                    input_segments: List[str] = []
+
                     async def _drain_in_tr():
+                        """Accumulate realtime input transcript segments, de-duplicating cumulative repeats.
+
+                        Some realtime providers emit growing cumulative transcripts (e.g. "Hel", "Hello") or
+                        may occasionally resend the full final transcript. Previous logic naively concatenated
+                        every segment which could yield duplicated text ("HelloHello") if cumulative or repeated
+                        finals were received. This routine keeps a canonical buffer (user_tr) and only appends
+                        the non-overlapping suffix of each new segment.
+                        """
                         nonlocal user_tr
                         async for t in rt.iter_input_transcript():
-                            if t:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                            if not t:
+                                continue
+                            # Track raw segment for optional debugging
+                            input_segments.append(t)
+                            if not user_tr:
+                                user_tr = t
+                                continue
+                            if t == user_tr:
+                                # Exact duplicate of current buffer; skip
+                                continue
+                            if t.startswith(user_tr):
+                                # Cumulative growth; append only the new suffix
+                                user_tr += t[len(user_tr) :]
+                                continue
+                            # General case: find largest overlap between end of user_tr and start of t
+                            # to avoid duplicated middle content (e.g., user_tr="My name is", t="name is John")
+                            overlap = 0
+                            max_check = min(len(user_tr), len(t))
+                            for k in range(max_check, 0, -1):
+                                if user_tr.endswith(t[:k]):
+                                    overlap = k
+                                    break
+                            user_tr += t[overlap:]
+
+                    # Check if we need both audio and text modalities
+                    modalities = getattr(
+                        rt, "_options", RealtimeSessionOptions()
+                    ).output_modalities or ["audio"]
+                    use_combined_stream = "audio" in modalities and "text" in modalities
+
+                    if use_combined_stream and wants_audio:
+                        # Use combined stream for both modalities
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            # Check if the service has iter_output_combined method
+                            if hasattr(rt, "iter_output_combined"):
+                                async for chunk in rt.iter_output_combined():
+                                    # Adapt output based on caller's requested output_format
+                                    if output_format == "text":
+                                        # Only yield text modalities as plain strings
+                                        if getattr(chunk, "modality", None) == "text":
+                                            yield chunk.data  # type: ignore[attr-defined]
+                                        continue
+                                    # Audio streaming path
+                                    if getattr(chunk, "modality", None) == "audio":
+                                        # Yield raw bytes if data present
+                                        yield getattr(chunk, "data", b"")
+                                    elif (
+                                        getattr(chunk, "modality", None) == "text"
+                                        and output_format == "audio"
+                                    ):
+                                        # Optionally ignore or log text while audio requested
+                                        continue
+                                    else:
+                                        # Fallback: ignore unknown modalities for now
+                                        continue
+                            else:
+                                # Fallback: yield audio chunks as RealtimeChunk objects
+                                async for audio_chunk in rt.iter_output_audio_encoded():
+                                    if output_format == "text":
+                                        # Ignore audio when text requested
+                                        continue
+                                    # output_format audio: provide raw bytes
+                                    if hasattr(audio_chunk, "modality"):
+                                        if (
+                                            getattr(audio_chunk, "modality", None)
+                                            == "audio"
+                                        ):
+                                            yield getattr(audio_chunk, "data", b"")
+                                    else:
+                                        yield audio_chunk
+                        finally:
+                            # Allow transcript drain tasks to finish to capture user/asst text before persistence
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed: realtime audio input transcript (if any) is authoritative
+                        # Persist transcripts after combined streaming completes
+                        if turn_id:
                             try:
-
-
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
                             except Exception:
                                 pass
+                        if final_user_tr and not user_persisted:
+                            try:
+                                await self.realtime_update_user(
+                                    user_id, turn_id, final_user_tr
+                                )
+                                user_persisted = True
+                            except Exception:
+                                pass
+                    elif wants_audio:
+                        # Use separate streams (legacy behavior)
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            async for audio_chunk in rt.iter_output_audio_encoded():
+                                if output_format == "text":
+                                    # Skip audio when caller wants text only
+                                    continue
+                                # output_format audio: yield raw bytes
+                                if hasattr(audio_chunk, "modality"):
+                                    if (
+                                        getattr(audio_chunk, "modality", None)
+                                        == "audio"
+                                    ):
+                                        yield getattr(audio_chunk, "data", b"")
+                                else:
+                                    yield audio_chunk
+                        finally:
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed
+                        # Persist transcripts after audio-only streaming
                         if turn_id:
                             try:
-
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # Buffer final transcript for single persistence
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
                                     await self.realtime_update_user(
-                                        user_id, turn_id,
+                                        user_id, turn_id, final_user_tr
                                     )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
+                            except Exception:
+                                pass
+                        # If no WS input transcript was captured, fall back to HTTP STT result
+                    else:
+                        # Text-only: just stream assistant transcript if available (no audio iteration)
+                        # If original input was audio bytes but caller only wants text output (no audio modality),
+                        # we still need to drain the input transcript stream to build user_tr.
+                        in_task_audio_only = None
+                        if is_audio_bytes:
+                            in_task_audio_only = asyncio.create_task(_drain_in_tr())
+
+                        async def _drain_out_tr_text():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+                                    yield t  # Yield incremental text chunks directly
+
+                        async for t in _drain_out_tr_text():
+                            # Provide plain text to caller
+                            yield t
+                        # Wait for input transcript (if any) before persistence
+                        if "in_task_audio_only" in locals() and in_task_audio_only:
+                            try:
+                                await asyncio.wait_for(in_task_audio_only, timeout=0.1)
+                            except Exception:
+                                in_task_audio_only.cancel()
+                        # No HTTP STT fallback
+                        if turn_id:
+                            try:
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # For text-only modality but audio-origin (cumulative segments captured), persist user transcript
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
                                 if asst_tr:
                                     await self.realtime_update_assistant(
                                         user_id, turn_id, asst_tr
                                     )
                             except Exception:
                                 pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
                             try:
                                 await self.realtime_finalize_turn(user_id, turn_id)
                             except Exception:
                                 pass
+                        # Input transcript task already awaited above
                     # Clear input buffer for next turn reuse
                     try:
                         await rt.clear_input()
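The `_drain_in_tr` rewrite in this hunk is the core fix for duplicated user transcripts: instead of blind concatenation, each incoming segment contributes only its non-overlapping suffix. The same merge extracted as a standalone function for clarity; this is a sketch, and `merge_segment` is not part of the package:

```python
def merge_segment(buffer: str, segment: str) -> str:
    # Mirror of the de-duplication logic in _drain_in_tr above.
    if not segment or segment == buffer:
        return buffer                            # empty or exact repeat: nothing new
    if not buffer:
        return segment
    if segment.startswith(buffer):
        return buffer + segment[len(buffer):]    # cumulative growth: take the tail
    # General case: longest suffix of buffer that prefixes the new segment.
    for k in range(min(len(buffer), len(segment)), 0, -1):
        if buffer.endswith(segment[:k]):
            return buffer + segment[k:]
    return buffer + segment

buf = ""
for seg in ["Hel", "Hello", "Hello", "lo there"]:  # cumulative + repeated segments
    buf = merge_segment(buf, seg)
assert buf == "Hello there"
```

The linear scan over candidate overlaps is quadratic in the worst case, but transcript segments are short, so this stays cheap in practice.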
@@ -855,58 +1198,30 @@ class QueryService(QueryServiceInterface):
                    pass
                return
 
-            # 1)
+            # 1) Acquire user_text (transcribe audio or direct text) for non-realtime path
             user_text = ""
             if not isinstance(query, str):
-
-
-
-
-
-
-
-
-
-
+                try:
+                    logger.info(
+                        f"Received audio input, transcribing format: {audio_input_format}"
+                    )
+                    async for tpart in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
+                        query, audio_input_format
+                    ):
+                        user_text += tpart
+                except Exception:
+                    user_text = ""
             else:
                 user_text = query
-                logger.info(f"Received text input length: {len(user_text)}")
 
             # 2) Input guardrails
-            original_text = user_text
             for guardrail in self.input_guardrails:
                 try:
                     user_text = await guardrail.process(user_text)
                 except Exception as e:
                     logger.debug(f"Guardrail error: {e}")
-            if user_text != original_text:
-                logger.info(
-                    f"Input guardrails modified user text. Original length: {len(original_text)}, New length: {len(user_text)}"
-                )
-
-            # 3) Greetings shortcut
-            if not images and user_text.strip().lower() in {
-                "hi",
-                "hello",
-                "hey",
-                "ping",
-                "test",
-            }:
-                greeting = "Hello! How can I help you today?"
-                if output_format == "audio":
-                    async for chunk in self.agent_service.llm_provider.tts(
-                        text=greeting,
-                        voice=audio_voice,
-                        response_format=audio_output_format,
-                    ):
-                        yield chunk
-                else:
-                    yield greeting
-                if self.memory_provider:
-                    await self._store_conversation(user_id, original_text, greeting)
-                return
 
-            #
+            # 3) Memory context (conversation history)
             memory_context = ""
             if self.memory_provider:
                 try:
@@ -914,7 +1229,7 @@ class QueryService(QueryServiceInterface):
             except Exception:
                 memory_context = ""
 
-            #
+            # 4) Knowledge base context
             kb_context = ""
             if self.knowledge_base:
                 try:
@@ -934,7 +1249,7 @@ class QueryService(QueryServiceInterface):
             except Exception:
                 kb_context = ""
 
-            #
+            # 5) Determine agent (sticky session aware; allow explicit switch/new conversation)
             agent_name = "default"
             prev_assistant = ""
             routing_input = user_text