solana-agent 31.2.6__py3-none-any.whl → 31.3.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to its registry, and is provided for informational purposes only.
@@ -37,6 +37,8 @@ from solana_agent.interfaces.services.knowledge_base import (
 )
 from solana_agent.interfaces.guardrails.guardrails import InputGuardrail
 
+from solana_agent.interfaces.providers.realtime import RealtimeSessionOptions
+
 from solana_agent.services.agent import AgentService
 from solana_agent.services.routing import RoutingService
 
@@ -94,6 +96,7 @@ class QueryService(QueryServiceInterface):
         encode_out: bool,
         audio_input_format: str,
         audio_output_format: str,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
     ) -> Any:
         """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
 
@@ -148,6 +151,7 @@ class QueryService(QueryServiceInterface):
             output_rate_hz=24000,
             input_mime="audio/pcm",
             output_mime="audio/pcm",
+            output_modalities=rt_output_modalities,
             tools=initial_tools or None,
             tool_choice="auto",
         )
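
Aside (not part of the diff): the new output_modalities option above controls whether the realtime session emits audio, text, or both. A minimal sketch of the intent, assuming the keyword arguments in this hunk are RealtimeSessionOptions fields and that the type accepts them as constructor kwargs:

from solana_agent.interfaces.providers.realtime import RealtimeSessionOptions

# Text-only realtime: input audio can still be transcribed, but no speech is
# synthesized. Passing None keeps the provider's default modalities.
# (Assumption: output_modalities is accepted as a constructor kwarg.)
opts = RealtimeSessionOptions(output_modalities=["text"])
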
@@ -187,9 +191,7 @@ class QueryService(QueryServiceInterface):
     ) -> None:
         self._sticky_sessions[user_id] = {
             "agent": agent_name,
-            "started_at": self._sticky_sessions.get(user_id, {}).get(
-                "started_at", time.time()
-            ),
+            "started_at": time.time(),
             "last_updated": time.time(),
             "required_complete": required_complete,
         }
@@ -201,6 +203,13 @@ class QueryService(QueryServiceInterface):
             self._sticky_sessions[user_id]["required_complete"] = required_complete
             self._sticky_sessions[user_id]["last_updated"] = time.time()
 
+    def _clear_sticky_agent(self, user_id: str) -> None:
+        if user_id in self._sticky_sessions:
+            try:
+                del self._sticky_sessions[user_id]
+            except Exception:
+                pass
+
     async def _build_combined_context(
         self,
         user_id: str,
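
Aside (not part of the diff): together, these two hunks change the sticky-session lifecycle: _set_sticky_agent now resets started_at on every assignment (it previously preserved the original start time), and the new _clear_sticky_agent drops the record outright. A self-contained sketch of the record these helpers manage, with field names taken from the hunks; the module-level dict and functions are illustrative stand-ins for the service methods:

import time
from typing import Any, Dict

_sticky_sessions: Dict[str, Dict[str, Any]] = {}

def set_sticky(user_id: str, agent_name: str, required_complete: bool) -> None:
    # 31.3.0 behavior: started_at resets on every set.
    _sticky_sessions[user_id] = {
        "agent": agent_name,
        "started_at": time.time(),
        "last_updated": time.time(),
        "required_complete": required_complete,
    }

def clear_sticky(user_id: str) -> None:
    # Equivalent to the try/del in the hunk; pop never raises on a missing key.
    _sticky_sessions.pop(user_id, None)
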
@@ -514,6 +523,7 @@ class QueryService(QueryServiceInterface):
         vad: Optional[bool] = None,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",
@@ -526,6 +536,12 @@ class QueryService(QueryServiceInterface):
             "shimmer",
             "verse",
         ] = "marin",
+        # Realtime transcription configuration (new)
+        rt_transcription_model: Optional[str] = None,
+        rt_transcription_language: Optional[str] = None,
+        rt_transcription_prompt: Optional[str] = None,
+        rt_transcription_noise_reduction: Optional[bool] = None,
+        rt_transcription_include_logprobs: bool = False,
         audio_voice: Literal[
             "alloy",
             "ash",
@@ -554,31 +570,13 @@ class QueryService(QueryServiceInterface):
         try:
             # Realtime request: HTTP STT for user + single WS for assistant audio
             if realtime:
-                # 1) Launch HTTP STT in background when input is audio; don't block WS
+                # 1) Determine if input is audio bytes. We now ALWAYS skip HTTP STT in realtime mode.
+                # The realtime websocket session (optionally with built-in transcription) is authoritative.
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
-                user_text = ""
-                stt_task = None
-                if is_audio_bytes:
-
-                    async def _stt_consume():
-                        txt = ""
-                        try:
-                            logger.info(
-                                f"Realtime(HTTP STT): transcribing format: {audio_input_format}"
-                            )
-                            async for (
-                                t
-                            ) in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
-                                query, audio_input_format
-                            ):
-                                txt += t
-                        except Exception as e:
-                            logger.error(f"HTTP STT error: {e}")
-                        return txt
-
-                    stt_task = asyncio.create_task(_stt_consume())
-                else:
-                    user_text = str(query)
+                user_text = "" if is_audio_bytes else str(query)
+                # Provide a sensible default realtime transcription model when audio supplied
+                if is_audio_bytes and not rt_transcription_model:
+                    rt_transcription_model = "gpt-4o-mini-transcribe"
 
                 # 2) Single agent selection (no multi-agent routing in realtime path)
                 agent_name = self._get_sticky_agent(user_id)
@@ -693,6 +691,17 @@ class QueryService(QueryServiceInterface):
                 encode_out = bool(
                     rt_encode_output or (audio_output_format.lower() != "pcm")
                 )
+                # If caller explicitly requests text-only realtime, disable output encoding entirely
+                if (
+                    rt_output_modalities is not None
+                    and "audio" not in rt_output_modalities
+                ):
+                    if encode_out:
+                        logger.debug(
+                            "Realtime(QueryService): forcing encode_out False for text-only modalities=%s",
+                            rt_output_modalities,
+                        )
+                    encode_out = False
                 # Choose input transcoding when compressed input is provided (or explicitly requested)
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
                 encode_in = bool(
@@ -700,7 +709,8 @@ class QueryService(QueryServiceInterface):
                     or (is_audio_bytes and audio_input_format.lower() != "pcm")
                 )
 
-                # Allocate or reuse a realtime session for this specific request/user
+                # Allocate or reuse a realtime session for this specific request/user.
+                # (Transcription options may be applied below; if they change after allocate we will reconfigure.)
                 rt = await self._alloc_realtime_session(
                     user_id,
                     api_key=api_key,
@@ -711,9 +721,46 @@ class QueryService(QueryServiceInterface):
                     encode_out=encode_out,
                     audio_input_format=audio_input_format,
                     audio_output_format=audio_output_format,
+                    rt_output_modalities=rt_output_modalities,
                 )
                 # Ensure lock is released no matter what
                 try:
+                    # --- Apply realtime transcription config BEFORE connecting (new) ---
+                    if rt_transcription_model and hasattr(rt, "_options"):
+                        try:
+                            setattr(
+                                rt._options,
+                                "transcription_model",
+                                rt_transcription_model,
+                            )
+                            if rt_transcription_language is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_language",
+                                    rt_transcription_language,
+                                )
+                            if rt_transcription_prompt is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_prompt",
+                                    rt_transcription_prompt,
+                                )
+                            if rt_transcription_noise_reduction is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_noise_reduction",
+                                    rt_transcription_noise_reduction,
+                                )
+                            if rt_transcription_include_logprobs:
+                                setattr(
+                                    rt._options, "transcription_include_logprobs", True
+                                )
+                        except Exception:
+                            logger.debug(
+                                "Failed pre-connect transcription option assignment",
+                                exc_info=True,
+                            )
+
                     # Tool executor
                     async def _exec(
                         tool_name: str, args: Dict[str, Any]
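
Aside (not part of the diff): the pre-connect setattr block above is defensive attribute assignment on the session's private options object. The same logic written as plain assignments, with a hypothetical _Opts dataclass standing in for the real options type (the transcription_* field names come from the hunk; their types are assumed):

from dataclasses import dataclass
from typing import Optional

@dataclass
class _Opts:  # hypothetical stand-in for the session options object
    transcription_model: Optional[str] = None
    transcription_language: Optional[str] = None
    transcription_prompt: Optional[str] = None
    transcription_noise_reduction: Optional[bool] = None
    transcription_include_logprobs: bool = False

def apply_transcription_config(
    opts: _Opts,
    model: Optional[str],
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    noise_reduction: Optional[bool] = None,
    include_logprobs: bool = False,
) -> None:
    if not model:
        return  # mirror the hunk: nothing is touched unless a model is requested
    opts.transcription_model = model
    if language is not None:
        opts.transcription_language = language
    if prompt is not None:
        opts.transcription_prompt = prompt
    if noise_reduction is not None:
        opts.transcription_noise_reduction = noise_reduction
    if include_logprobs:
        opts.transcription_include_logprobs = True
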
@@ -755,16 +802,47 @@ class QueryService(QueryServiceInterface):
                    except Exception:
                        pass
 
-                    # Persist once per turn
+                    # Begin streaming turn (defer user transcript persistence until final to avoid duplicates)
                    turn_id = await self.realtime_begin_turn(user_id)
-                    if turn_id and user_text:
-                        try:
-                            await self.realtime_update_user(user_id, turn_id, user_text)
-                        except Exception:
-                            pass
+                    # We'll buffer the full user transcript (text input or realtime audio transcription) and persist exactly once.
+                    # Initialize empty; we'll build it strictly from realtime transcript segments to avoid
+                    # accidental duplication with pre-supplied user_text or prior buffers.
+                    final_user_tr: str = ""
+                    user_persisted = False
+
+                    # Feed audio into WS if audio bytes provided and audio modality requested; else treat as text
+                    wants_audio = (
+                        (
+                            getattr(rt, "_options", None)
+                            and getattr(rt, "_options").output_modalities
+                        )
+                        and "audio" in getattr(rt, "_options").output_modalities  # type: ignore[attr-defined]
+                    ) or (
+                        rt_output_modalities is None
+                        or (rt_output_modalities and "audio" in rt_output_modalities)
+                    )
+                    # Determine if realtime transcription should be enabled (always skip HTTP STT regardless)
+                    # realtime_transcription_enabled now implicit (options set before connect)
 
-                    # Feed audio into WS if audio bytes provided; else use input_text
-                    if is_audio_bytes:
+                    if is_audio_bytes and not wants_audio:
+                        # Feed audio solely for transcription (no audio output requested)
+                        bq = bytes(query)
+                        logger.info(
+                            "Realtime: appending input audio for transcription only, len=%d, fmt=%s",
+                            len(bq),
+                            audio_input_format,
+                        )
+                        await rt.append_audio(bq)
+                        vad_enabled_value = bool(vad) if vad is not None else False
+                        if not vad_enabled_value:
+                            await rt.commit_input()
+                            # Request only text response
+                            await rt.create_response({"modalities": ["text"]})
+                        else:
+                            logger.debug(
+                                "Realtime: VAD enabled (text-only output) — skipping manual response.create"
+                            )
+                    if is_audio_bytes and wants_audio:
                        bq = bytes(query)
                        logger.info(
                            "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
@@ -782,64 +860,329 @@ class QueryService(QueryServiceInterface):
                            logger.debug(
                                "Realtime: VAD enabled — skipping manual response.create"
                            )
-                    else:
-                        # Rely on configured session voice; attach input_text only
-                        await rt.create_response(
+                    else:  # Text-only path OR caller excluded audio modality
+                        # For text input, create conversation item first, then response
+                        await rt.create_conversation_item(
                            {
-                                "modalities": ["audio"],
-                                "input": [
+                                "type": "message",
+                                "role": "user",
+                                "content": [
                                    {"type": "input_text", "text": user_text or ""}
                                ],
                            }
                        )
+                        # Determine effective modalities (fall back to provided override or text only)
+                        if rt_output_modalities is not None:
+                            modalities = rt_output_modalities or ["text"]
+                        else:
+                            mo = getattr(
+                                rt, "_options", RealtimeSessionOptions()
+                            ).output_modalities
+                            modalities = mo if mo else ["audio"]
+                        if "audio" not in modalities:
+                            # Ensure we do not accidentally request audio generation
+                            modalities = [m for m in modalities if m == "text"] or [
+                                "text"
+                            ]
+                        await rt.create_response(
+                            {
+                                "modalities": modalities,
+                            }
+                        )
 
                    # Collect audio and transcripts
-                    user_tr = ""
+                    user_tr = ""  # Accumulates realtime input transcript segments (audio path)
                    asst_tr = ""
 
+                    input_segments: List[str] = []
+
                    async def _drain_in_tr():
+                        """Accumulate realtime input transcript segments, de-duplicating cumulative repeats.
+
+                        Some realtime providers emit growing cumulative transcripts (e.g. "Hel", "Hello") or
+                        may occasionally resend the full final transcript. Previous logic naively concatenated
+                        every segment which could yield duplicated text ("HelloHello") if cumulative or repeated
+                        finals were received. This routine keeps a canonical buffer (user_tr) and only appends
+                        the non-overlapping suffix of each new segment.
+                        """
                        nonlocal user_tr
                        async for t in rt.iter_input_transcript():
-                            if t:
-                                user_tr += t
-
-                    async def _drain_out_tr():
-                        nonlocal asst_tr
-                        async for t in rt.iter_output_transcript():
-                            if t:
-                                asst_tr += t
-
-                    in_task = asyncio.create_task(_drain_in_tr())
-                    out_task = asyncio.create_task(_drain_out_tr())
-                    try:
-                        async for audio_chunk in rt.iter_output_audio_encoded():
-                            yield audio_chunk
-                    finally:
-                        in_task.cancel()
-                        out_task.cancel()
-                    # If no WS input transcript was captured, fall back to HTTP STT result
-                    if not user_tr:
+                            if not t:
+                                continue
+                            # Track raw segment for optional debugging
+                            input_segments.append(t)
+                            if not user_tr:
+                                user_tr = t
+                                continue
+                            if t == user_tr:
+                                # Exact duplicate of current buffer; skip
+                                continue
+                            if t.startswith(user_tr):
+                                # Cumulative growth; append only the new suffix
+                                user_tr += t[len(user_tr) :]
+                                continue
+                            # General case: find largest overlap between end of user_tr and start of t
+                            # to avoid duplicated middle content (e.g., user_tr="My name is", t="name is John")
+                            overlap = 0
+                            max_check = min(len(user_tr), len(t))
+                            for k in range(max_check, 0, -1):
+                                if user_tr.endswith(t[:k]):
+                                    overlap = k
+                                    break
+                            user_tr += t[overlap:]
+
+                    # Check if we need both audio and text modalities
+                    modalities = getattr(
+                        rt, "_options", RealtimeSessionOptions()
+                    ).output_modalities or ["audio"]
+                    use_combined_stream = "audio" in modalities and "text" in modalities
+
+                    if use_combined_stream and wants_audio:
+                        # Use combined stream for both modalities
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            # Check if the service has iter_output_combined method
+                            if hasattr(rt, "iter_output_combined"):
+                                async for chunk in rt.iter_output_combined():
+                                    # Adapt output based on caller's requested output_format
+                                    if output_format == "text":
+                                        # Only yield text modalities as plain strings
+                                        if getattr(chunk, "modality", None) == "text":
+                                            yield chunk.data  # type: ignore[attr-defined]
+                                        continue
+                                    # Audio streaming path
+                                    if getattr(chunk, "modality", None) == "audio":
+                                        # Yield raw bytes if data present
+                                        yield getattr(chunk, "data", b"")
+                                    elif (
+                                        getattr(chunk, "modality", None) == "text"
+                                        and output_format == "audio"
+                                    ):
+                                        # Optionally ignore or log text while audio requested
+                                        continue
+                                    else:
+                                        # Fallback: ignore unknown modalities for now
+                                        continue
+                            else:
+                                # Fallback: yield audio chunks as RealtimeChunk objects
+                                async for audio_chunk in rt.iter_output_audio_encoded():
+                                    if output_format == "text":
+                                        # Ignore audio when text requested
+                                        continue
+                                    # output_format audio: provide raw bytes
+                                    if hasattr(audio_chunk, "modality"):
+                                        if (
+                                            getattr(audio_chunk, "modality", None)
+                                            == "audio"
+                                        ):
+                                            yield getattr(audio_chunk, "data", b"")
+                                    else:
+                                        yield audio_chunk
+                        finally:
+                            # Allow transcript drain tasks to finish to capture user/asst text before persistence
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed: realtime audio input transcript (if any) is authoritative
+                        # Persist transcripts after combined streaming completes
+                        if turn_id:
                            try:
-                                if "stt_task" in locals() and stt_task is not None:
-                                    user_tr = await stt_task
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
                            except Exception:
                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                    elif wants_audio:
+                        # Use separate streams (legacy behavior)
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            async for audio_chunk in rt.iter_output_audio_encoded():
+                                if output_format == "text":
+                                    # Skip audio when caller wants text only
+                                    continue
+                                # output_format audio: yield raw bytes
+                                if hasattr(audio_chunk, "modality"):
+                                    if (
+                                        getattr(audio_chunk, "modality", None)
+                                        == "audio"
+                                    ):
+                                        yield getattr(audio_chunk, "data", b"")
+                                else:
+                                    yield audio_chunk
+                        finally:
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed
+                        # Persist transcripts after audio-only streaming
                        if turn_id:
                            try:
-                                if user_tr:
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # Buffer final transcript for single persistence
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
                                    await self.realtime_update_user(
-                                        user_id, turn_id, user_tr
+                                        user_id, turn_id, final_user_tr
                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
+                            except Exception:
+                                pass
+                    # If no WS input transcript was captured, fall back to HTTP STT result
+                    else:
+                        # Text-only: just stream assistant transcript if available (no audio iteration)
+                        # If original input was audio bytes but caller only wants text output (no audio modality),
+                        # we still need to drain the input transcript stream to build user_tr.
+                        in_task_audio_only = None
+                        if is_audio_bytes:
+                            in_task_audio_only = asyncio.create_task(_drain_in_tr())
+
+                        async def _drain_out_tr_text():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+                                    yield t  # Yield incremental text chunks directly
+
+                        async for t in _drain_out_tr_text():
+                            # Provide plain text to caller
+                            yield t
+                        # Wait for input transcript (if any) before persistence
+                        if "in_task_audio_only" in locals() and in_task_audio_only:
+                            try:
+                                await asyncio.wait_for(in_task_audio_only, timeout=0.1)
+                            except Exception:
+                                in_task_audio_only.cancel()
+                        # No HTTP STT fallback
+                        if turn_id:
+                            try:
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # For text-only modality but audio-origin (cumulative segments captured), persist user transcript
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
                                if asst_tr:
                                    await self.realtime_update_assistant(
                                        user_id, turn_id, asst_tr
                                    )
                            except Exception:
                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
                            try:
                                await self.realtime_finalize_turn(user_id, turn_id)
                            except Exception:
                                pass
+                    # Input transcript task already awaited above
                    # Clear input buffer for next turn reuse
                    try:
                        await rt.clear_input()
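
Aside (not part of the diff): the heart of this hunk is the suffix-merging rule in _drain_in_tr. Extracted into a standalone function (same logic, renamed variables) so the de-duplication behavior is easy to verify; the examples reuse the cases cited in the docstring and comments:

def merge_transcript_segment(buffer: str, segment: str) -> str:
    """Append only the non-overlapping suffix of segment to buffer."""
    if not segment:
        return buffer
    if not buffer:
        return segment
    if segment == buffer:
        return buffer  # exact duplicate of the current buffer: skip
    if segment.startswith(buffer):
        return buffer + segment[len(buffer):]  # cumulative growth: new suffix only
    # General case: largest overlap between the end of buffer and the start of segment
    for k in range(min(len(buffer), len(segment)), 0, -1):
        if buffer.endswith(segment[:k]):
            return buffer + segment[k:]
    return buffer + segment  # no overlap: plain concatenation

assert merge_transcript_segment("Hel", "Hello") == "Hello"      # cumulative resend
assert merge_transcript_segment("Hello", "Hello") == "Hello"    # repeated final
assert merge_transcript_segment("My name is", "name is John") == "My name is John"
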
@@ -855,58 +1198,30 @@ class QueryService(QueryServiceInterface):
                    pass
                return
 
-            # 1) Transcribe audio or accept text
+            # 1) Acquire user_text (transcribe audio or direct text) for non-realtime path
            user_text = ""
            if not isinstance(query, str):
-                logger.info(
-                    f"Received audio input, transcribing format: {audio_input_format}"
-                )
-                async for (
-                    transcript
-                ) in self.agent_service.llm_provider.transcribe_audio(
-                    query, audio_input_format
-                ):
-                    user_text += transcript
-                logger.info(f"Transcription result length: {len(user_text)}")
+                try:
+                    logger.info(
+                        f"Received audio input, transcribing format: {audio_input_format}"
+                    )
+                    async for tpart in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
+                        query, audio_input_format
+                    ):
+                        user_text += tpart
+                except Exception:
+                    user_text = ""
            else:
                user_text = query
-                logger.info(f"Received text input length: {len(user_text)}")
 
            # 2) Input guardrails
-            original_text = user_text
            for guardrail in self.input_guardrails:
                try:
                    user_text = await guardrail.process(user_text)
                except Exception as e:
                    logger.debug(f"Guardrail error: {e}")
-            if user_text != original_text:
-                logger.info(
-                    f"Input guardrails modified user text. Original length: {len(original_text)}, New length: {len(user_text)}"
-                )
-
-            # 3) Greetings shortcut
-            if not images and user_text.strip().lower() in {
-                "hi",
-                "hello",
-                "hey",
-                "ping",
-                "test",
-            }:
-                greeting = "Hello! How can I help you today?"
-                if output_format == "audio":
-                    async for chunk in self.agent_service.llm_provider.tts(
-                        text=greeting,
-                        voice=audio_voice,
-                        response_format=audio_output_format,
-                    ):
-                        yield chunk
-                else:
-                    yield greeting
-                if self.memory_provider:
-                    await self._store_conversation(user_id, original_text, greeting)
-                return
 
-            # 4) Memory context (conversation history)
+            # 3) Memory context (conversation history)
            memory_context = ""
            if self.memory_provider:
                try:
@@ -914,7 +1229,7 @@ class QueryService(QueryServiceInterface):
            except Exception:
                memory_context = ""
 
-            # 5) Knowledge base context
+            # 4) Knowledge base context
            kb_context = ""
            if self.knowledge_base:
                try:
@@ -934,7 +1249,7 @@ class QueryService(QueryServiceInterface):
            except Exception:
                kb_context = ""
 
-            # 6) Determine agent (sticky session aware; allow explicit switch/new conversation)
+            # 5) Determine agent (sticky session aware; allow explicit switch/new conversation)
            agent_name = "default"
            prev_assistant = ""
            routing_input = user_text