dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/aws/nova_sonic/llm.py (new file)
@@ -0,0 +1,1265 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""AWS Nova Sonic LLM service implementation for Pipecat AI framework.
+
+This module provides a speech-to-speech LLM service using AWS Nova Sonic, which supports
+bidirectional audio streaming, text generation, and function calling capabilities.
+"""
+
+import asyncio
+import base64
+import json
+import time
+import uuid
+import wave
+from dataclasses import dataclass
+from enum import Enum
+from importlib.resources import files
+from typing import Any, List, Optional
+
+from loguru import logger
+from pydantic import BaseModel, Field
+
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter, Role
+from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
+    CancelFrame,
+    EndFrame,
+    Frame,
+    FunctionCallFromLLM,
+    InputAudioRawFrame,
+    InterruptionFrame,
+    LLMContextFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    StartFrame,
+    TranscriptionFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+    TTSTextFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantAggregatorParams,
+    LLMUserAggregatorParams,
+)
+from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import LLMService
+from pipecat.utils.time import time_now_iso8601
+
+try:
+    from aws_sdk_bedrock_runtime.client import (
+        BedrockRuntimeClient,
+        InvokeModelWithBidirectionalStreamOperationInput,
+    )
+    from aws_sdk_bedrock_runtime.config import Config
+    from aws_sdk_bedrock_runtime.models import (
+        BidirectionalInputPayloadPart,
+        InvokeModelWithBidirectionalStreamInput,
+        InvokeModelWithBidirectionalStreamInputChunk,
+        InvokeModelWithBidirectionalStreamOperationOutput,
+        InvokeModelWithBidirectionalStreamOutput,
+    )
+    from smithy_aws_core.auth.sigv4 import SigV4AuthScheme
+    from smithy_aws_core.identity.static import StaticCredentialsResolver
+    from smithy_core.aio.eventstream import DuplexEventStream
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws-nova-sonic]`."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class AWSNovaSonicUnhandledFunctionException(Exception):
+    """Exception raised when the LLM attempts to call an unregistered function."""
+
+    pass
+
+
+class ContentType(Enum):
+    """Content types supported by AWS Nova Sonic.
+
+    Parameters:
+        AUDIO: Audio content type.
+        TEXT: Text content type.
+        TOOL: Tool content type.
+    """
+
+    AUDIO = "AUDIO"
+    TEXT = "TEXT"
+    TOOL = "TOOL"
+
+
+class TextStage(Enum):
+    """Text generation stages in AWS Nova Sonic responses.
+
+    Parameters:
+        FINAL: Final text that has been fully generated.
+        SPECULATIVE: Speculative text that is still being generated.
+    """
+
+    FINAL = "FINAL"  # what has been said
+    SPECULATIVE = "SPECULATIVE"  # what's planned to be said
+
+
+@dataclass
+class CurrentContent:
+    """Represents content currently being received from AWS Nova Sonic.
+
+    Parameters:
+        type: The type of content (audio, text, or tool).
+        role: The role generating the content (user, assistant, etc.).
+        text_stage: The stage of text generation (final or speculative).
+        text_content: The actual text content if applicable.
+    """
+
+    type: ContentType
+    role: Role
+    text_stage: TextStage  # None if not text
+    text_content: str  # starts as None, then fills in if text
+
+    def __str__(self):
+        """String representation of the current content."""
+        return (
+            f"CurrentContent(\n"
+            f" type={self.type.name},\n"
+            f" role={self.role.name},\n"
+            f" text_stage={self.text_stage.name if self.text_stage else 'None'}\n"
+            f")"
+        )
+
+class Params(BaseModel):
+    """Configuration parameters for AWS Nova Sonic.
+
+    Parameters:
+        input_sample_rate: Audio input sample rate in Hz.
+        input_sample_size: Audio input sample size in bits.
+        input_channel_count: Number of input audio channels.
+        output_sample_rate: Audio output sample rate in Hz.
+        output_sample_size: Audio output sample size in bits.
+        output_channel_count: Number of output audio channels.
+        max_tokens: Maximum number of tokens to generate.
+        top_p: Nucleus sampling parameter.
+        temperature: Sampling temperature for text generation.
+    """
+
+    # Audio input
+    input_sample_rate: Optional[int] = Field(default=16000)
+    input_sample_size: Optional[int] = Field(default=16)
+    input_channel_count: Optional[int] = Field(default=1)
+
+    # Audio output
+    output_sample_rate: Optional[int] = Field(default=24000)
+    output_sample_size: Optional[int] = Field(default=16)
+    output_channel_count: Optional[int] = Field(default=1)
+
+    # Inference
+    max_tokens: Optional[int] = Field(default=1024)
+    top_p: Optional[float] = Field(default=0.9)
+    temperature: Optional[float] = Field(default=0.7)
+
+
+class AWSNovaSonicLLMService(LLMService):
+    """AWS Nova Sonic speech-to-speech LLM service.
+
+    Provides bidirectional audio streaming, real-time transcription, text generation,
+    and function calling capabilities using AWS Nova Sonic model.
+    """
+
+    # Override the default adapter to use the AWSNovaSonicLLMAdapter one
+    adapter_class = AWSNovaSonicLLMAdapter
+
+    def __init__(
+        self,
+        *,
+        secret_access_key: str,
+        access_key_id: str,
+        session_token: Optional[str] = None,
+        region: str,
+        model: str = "amazon.nova-sonic-v1:0",
+        voice_id: str = "matthew",  # matthew, tiffany, amy
+        params: Optional[Params] = None,
+        system_instruction: Optional[str] = None,
+        tools: Optional[ToolsSchema] = None,
+        send_transcription_frames: bool = True,
+        **kwargs,
+    ):
+        """Initializes the AWS Nova Sonic LLM service.
+
+        Args:
+            secret_access_key: AWS secret access key for authentication.
+            access_key_id: AWS access key ID for authentication.
+            session_token: AWS session token for authentication.
+            region: AWS region where the service is hosted.
+            model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
+            voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
+            params: Model parameters for audio configuration and inference.
+            system_instruction: System-level instruction for the model.
+            tools: Available tools/functions for the model to use.
+            send_transcription_frames: Whether to emit transcription frames.
+
+                .. deprecated:: 0.0.91
+                    This parameter is deprecated and will be removed in a future version.
+                    Transcription frames are always sent.
+
+            **kwargs: Additional arguments passed to the parent LLMService.
+        """
+        super().__init__(**kwargs)
+        self._secret_access_key = secret_access_key
+        self._access_key_id = access_key_id
+        self._session_token = session_token
+        self._region = region
+        self._model = model
+        self._client: Optional[BedrockRuntimeClient] = None
+        self._voice_id = voice_id
+        self._params = params or Params()
+        self._system_instruction = system_instruction
+        self._tools = tools
+
+        if not send_transcription_frames:
+            import warnings
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("always")
+                warnings.warn(
+                    "`send_transcription_frames` is deprecated and will be removed in a future version. "
+                    "Transcription frames are always sent.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
+        self._context: Optional[LLMContext] = None
+        self._stream: Optional[
+            DuplexEventStream[
+                InvokeModelWithBidirectionalStreamInput,
+                InvokeModelWithBidirectionalStreamOutput,
+                InvokeModelWithBidirectionalStreamOperationOutput,
+            ]
+        ] = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._prompt_name: Optional[str] = None
+        self._input_audio_content_name: Optional[str] = None
+        self._content_being_received: Optional[CurrentContent] = None
+        self._assistant_is_responding = False
+        self._may_need_repush_assistant_text = False
+        self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
+        self._triggering_assistant_response = False
+        self._waiting_for_trigger_transcription = False
+        self._disconnecting = False
+        self._connected_time: Optional[float] = None
+        self._wants_connection = False
+        self._user_text_buffer = ""
+        self._assistant_text_buffer = ""
+        self._completed_tool_calls = set()
+
+        file_path = files("pipecat.services.aws.nova_sonic").joinpath("ready.wav")
+        with wave.open(file_path.open("rb"), "rb") as wav_file:
+            self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes())
+
+    #
+    # standard AIService frame handling
+    #
+
+    async def start(self, frame: StartFrame):
+        """Start the service and initiate connection to AWS Nova Sonic.
+
+        Args:
+            frame: The start frame triggering service initialization.
+        """
+        await super().start(frame)
+        self._wants_connection = True
+        await self._start_connecting()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the service and close connections.
+
+        Args:
+            frame: The end frame triggering service shutdown.
+        """
+        await super().stop(frame)
+        self._wants_connection = False
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the service and close connections.
+
+        Args:
+            frame: The cancel frame triggering service cancellation.
+        """
+        await super().cancel(frame)
+        self._wants_connection = False
+        await self._disconnect()
+
+    #
+    # conversation resetting
+    #
+
+    async def reset_conversation(self):
+        """Reset the conversation state while preserving context.
+
+        Handles bot stopped speaking event, disconnects from the service,
+        and reconnects with the preserved context.
+        """
+        logger.debug("Resetting conversation")
+        await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
+
+        # Grab context to carry through disconnect/reconnect
+        context = self._context
+
+        await self._disconnect()
+        await self._start_connecting()
+        await self._handle_context(context)
+
+    #
+    # frame processing
+    #
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle service-specific logic.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction the frame is traveling.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
+            context = (
+                frame.context
+                if isinstance(frame, LLMContextFrame)
+                else LLMContext.from_openai_context(frame.context)
+            )
+            await self._handle_context(context)
+        elif isinstance(frame, InputAudioRawFrame):
+            await self._handle_input_audio_frame(frame)
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
+        elif isinstance(frame, InterruptionFrame):
+            await self._handle_interruption_frame()
+
+        await self.push_frame(frame, direction)
+
+    async def _handle_context(self, context: LLMContext):
+        if self._disconnecting:
+            return
+
+        if not self._context:
+            # We got our initial context
+            # Try to finish connecting
+            self._context = context
+            await self._finish_connecting_if_context_available()
+        else:
+            # We got an updated context
+            # Send results for any newly-completed function calls
+            await self._process_completed_function_calls(send_new_results=True)
+
+    async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
+        # Wait until we're done sending the assistant response trigger audio before sending audio
+        # from the user's mic
+        if self._triggering_assistant_response:
+            return
+
+        await self._send_user_audio_event(frame.audio)
+
+    async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
+        async def finalize_assistant_response():
+            if self._assistant_is_responding:
+                # Consider the assistant finished with their response (possibly after a short delay,
+                # to allow for any trailing FINAL assistant text block to come in that need to make
+                # it into context).
+                #
+                # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+                # figure out a reliable way to determine when we've gotten our last FINAL text block
+                # after the LLM is done talking.
+                #
+                # First I looked at stopReason, but it doesn't seem like the last FINAL text block
+                # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
+                # bug?)
+                #
+                # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+                # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+                # interruptions throw a wrench in these schemes: depending on the exact timing of
+                # the interruption, we should or shouldn't expect some FINAL blocks.
+                if delay_to_catch_trailing_assistant_text:
+                    # This delay length is a balancing act between "catching" trailing assistant
+                    # text that is quite delayed but not waiting so long that user text comes in
+                    # first and results in a bit of context message order scrambling.
+                    await asyncio.sleep(1.25)
+                self._assistant_is_responding = False
+                await self._report_assistant_response_ended()
+
+            self._handling_bot_stopped_speaking = False
+
+        # Finalize the assistant response, either now or after a delay
+        if delay_to_catch_trailing_assistant_text:
+            self.create_task(finalize_assistant_response())
+        else:
+            await finalize_assistant_response()
+
+    async def _handle_interruption_frame(self):
+        if self._assistant_is_responding:
+            self._may_need_repush_assistant_text = True
+
+    #
+    # LLM communication: lifecycle
+    #
+
+    async def _start_connecting(self):
+        try:
+            logger.info("Connecting...")
+
+            if self._client:
+                # Here we assume that if we have a client we are connected or connecting
+                return
+
+            # Set IDs for the connection
+            self._prompt_name = str(uuid.uuid4())
+            self._input_audio_content_name = str(uuid.uuid4())
+
+            # Create the client
+            self._client = self._create_client()
+
+            # Start the bidirectional stream
+            self._stream = await self._client.invoke_model_with_bidirectional_stream(
+                InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
+            )
+
+            # Send session start event
+            await self._send_session_start_event()
+
+            # Finish connecting
+            self._ready_to_send_context = True
+            await self._finish_connecting_if_context_available()
+        except Exception as e:
+            logger.error(f"{self} initialization error: {e}")
+            await self._disconnect()
+
+    async def _process_completed_function_calls(self, send_new_results: bool):
+        # Check for set of completed function calls in the context
+        for message in self._context.get_messages():
+            if message.get("role") and message.get("content") != "IN_PROGRESS":
+                tool_call_id = message.get("tool_call_id")
+                if tool_call_id and tool_call_id not in self._completed_tool_calls:
+                    # Found a newly-completed function call - send the result to the service
+                    if send_new_results:
+                        await self._send_tool_result(tool_call_id, message.get("content"))
+                    self._completed_tool_calls.add(tool_call_id)
+
+    async def _finish_connecting_if_context_available(self):
+        # We can only finish connecting once we've gotten our initial context and we're ready to
+        # send it
+        if not (self._context and self._ready_to_send_context):
+            return
+
+        logger.info("Finishing connecting (setting up session)...")
+
+        # Initialize our bookkeeping of already-completed tool calls in the
+        # context
+        await self._process_completed_function_calls(send_new_results=False)
+
+        # Read context
+        adapter: AWSNovaSonicLLMAdapter = self.get_llm_adapter()
+        llm_connection_params = adapter.get_llm_invocation_params(self._context)
+
+        # Send prompt start event, specifying tools.
+        # Tools from context take priority over self._tools.
+        tools = (
+            llm_connection_params["tools"]
+            if llm_connection_params["tools"]
+            else adapter.from_standard_tools(self._tools)
+        )
+        logger.debug(f"Using tools: {tools}")
+        await self._send_prompt_start_event(tools)
+
+        # Send system instruction.
+        # Instruction from context takes priority over self._system_instruction.
+        system_instruction = (
+            llm_connection_params["system_instruction"]
+            if llm_connection_params["system_instruction"]
+            else self._system_instruction
+        )
+        logger.debug(f"Using system instruction: {system_instruction}")
+        if system_instruction:
+            await self._send_text_event(text=system_instruction, role=Role.SYSTEM)
+
+        # Send conversation history
+        for message in llm_connection_params["messages"]:
+            # logger.debug(f"Seeding conversation history with message: {message}")
+            await self._send_text_event(text=message.text, role=message.role)
+
+        # Start audio input
+        await self._send_audio_input_start_event()
+
+        # Start receiving events
+        self._receive_task = self.create_task(self._receive_task_handler())
+
+        # Record finished connecting time (must be done before sending assistant response trigger)
+        self._connected_time = time.time()
+
+        logger.info("Finished connecting")
+
+        # If we need to, send assistant response trigger (depends on self._connected_time)
+        if self._triggering_assistant_response:
+            await self._send_assistant_response_trigger()
+
+    async def _disconnect(self):
+        try:
+            logger.info("Disconnecting...")
+
+            # NOTE: see explanation of HACK, below
+            self._disconnecting = True
+
+            # Clean up client
+            if self._client:
+                await self._send_session_end_events()
+                self._client = None
+
+            # Clean up context
+            self._context = None
+
+            # Clean up stream
+            if self._stream:
+                await self._stream.close()
+                self._stream = None
+
+            # NOTE: see explanation of HACK, below
+            await asyncio.sleep(1)
+
+            # Clean up receive task
+            # HACK: we should ideally be able to cancel the receive task before stopping the input
+            # stream, above (meaning we wouldn't need self._disconnecting). But for some reason if
+            # we don't close the input stream and wait a second first, we're getting an error a lot
+            # like this one: https://github.com/awslabs/amazon-transcribe-streaming-sdk/issues/61.
+            if self._receive_task:
+                await self.cancel_task(self._receive_task, timeout=1.0)
+                self._receive_task = None
+
+            # Reset remaining connection-specific state
+            # Should be all private state except:
+            # - _wants_connection
+            # - _assistant_response_trigger_audio
+            self._prompt_name = None
+            self._input_audio_content_name = None
+            self._content_being_received = None
+            self._assistant_is_responding = False
+            self._may_need_repush_assistant_text = False
+            self._ready_to_send_context = False
+            self._handling_bot_stopped_speaking = False
+            self._triggering_assistant_response = False
+            self._waiting_for_trigger_transcription = False
+            self._disconnecting = False
+            self._connected_time = None
+            self._user_text_buffer = ""
+            self._assistant_text_buffer = ""
+            self._completed_tool_calls = set()
+
+            logger.info("Finished disconnecting")
+        except Exception as e:
+            logger.error(f"{self} error disconnecting: {e}")
+
+    def _create_client(self) -> BedrockRuntimeClient:
+        config = Config(
+            endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com",
+            region=self._region,
+            aws_access_key_id=self._access_key_id,
+            aws_secret_access_key=self._secret_access_key,
+            aws_session_token=self._session_token,
+            aws_credentials_identity_resolver=StaticCredentialsResolver(),
+            auth_schemes={"aws.auth#sigv4": SigV4AuthScheme(service="bedrock")},
+        )
+        return BedrockRuntimeClient(config=config)
+
+    #
+    # LLM communication: input events (pipecat -> LLM)
+    #
+
+    async def _send_session_start_event(self):
+        session_start = f"""
+        {{
+            "event": {{
+                "sessionStart": {{
+                    "inferenceConfiguration": {{
+                        "maxTokens": {self._params.max_tokens},
+                        "topP": {self._params.top_p},
+                        "temperature": {self._params.temperature}
+                    }}
+                }}
+            }}
+        }}
+        """
+        await self._send_client_event(session_start)
+
+    async def _send_prompt_start_event(self, tools: List[Any]):
+        if not self._prompt_name:
+            return
+
+        tools_config = (
+            f""",
+            "toolUseOutputConfiguration": {{
+                "mediaType": "application/json"
+            }},
+            "toolConfiguration": {{
+                "tools": {json.dumps(tools)}
+            }}
+            """
+            if tools
+            else ""
+        )
+
+        prompt_start = f'''
+        {{
+            "event": {{
+                "promptStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "textOutputConfiguration": {{
+                        "mediaType": "text/plain"
+                    }},
+                    "audioOutputConfiguration": {{
+                        "mediaType": "audio/lpcm",
+                        "sampleRateHertz": {self._params.output_sample_rate},
+                        "sampleSizeBits": {self._params.output_sample_size},
+                        "channelCount": {self._params.output_channel_count},
+                        "voiceId": "{self._voice_id}",
+                        "encoding": "base64",
+                        "audioType": "SPEECH"
+                    }}{tools_config}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(prompt_start)
+
+    async def _send_audio_input_start_event(self):
+        if not self._prompt_name:
+            return
+
+        audio_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{self._input_audio_content_name}",
+                    "type": "AUDIO",
+                    "interactive": true,
+                    "role": "USER",
+                    "audioInputConfiguration": {{
+                        "mediaType": "audio/lpcm",
+                        "sampleRateHertz": {self._params.input_sample_rate},
+                        "sampleSizeBits": {self._params.input_sample_size},
+                        "channelCount": {self._params.input_channel_count},
+                        "audioType": "SPEECH",
+                        "encoding": "base64"
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(audio_content_start)
+
+    async def _send_text_event(self, text: str, role: Role):
+        if not self._stream or not self._prompt_name or not text:
+            return
+
+        content_name = str(uuid.uuid4())
+
+        text_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "type": "TEXT",
+                    "interactive": true,
+                    "role": "{role.value}",
+                    "textInputConfiguration": {{
+                        "mediaType": "text/plain"
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_content_start)
+
+        escaped_text = json.dumps(text)  # includes quotes
+        text_input = f'''
+        {{
+            "event": {{
+                "textInput": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "content": {escaped_text}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_input)
+
+        text_content_end = f'''
+        {{
+            "event": {{
+                "contentEnd": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_content_end)
+
+    async def _send_user_audio_event(self, audio: bytes):
+        if not self._stream:
+            return
+
+        blob = base64.b64encode(audio)
+        audio_event = f'''
+        {{
+            "event": {{
+                "audioInput": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{self._input_audio_content_name}",
+                    "content": "{blob.decode("utf-8")}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(audio_event)
+
+    async def _send_session_end_events(self):
+        if not self._stream or not self._prompt_name:
+            return
+
+        prompt_end = f'''
+        {{
+            "event": {{
+                "promptEnd": {{
+                    "promptName": "{self._prompt_name}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(prompt_end)
+
+        session_end = """
+        {
+            "event": {
+                "sessionEnd": {}
+            }
+        }
+        """
+        await self._send_client_event(session_end)
+
+    async def _send_tool_result(self, tool_call_id, result):
+        if not self._stream or not self._prompt_name:
+            return
+
+        content_name = str(uuid.uuid4())
+
+        result_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "interactive": false,
+                    "type": "TOOL",
+                    "role": "TOOL",
+                    "toolResultInputConfiguration": {{
+                        "toolUseId": "{tool_call_id}",
+                        "type": "TEXT",
+                        "textInputConfiguration": {{
+                            "mediaType": "text/plain"
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(result_content_start)
+
+        result_content = json.dumps(
+            {
+                "event": {
+                    "toolResult": {
+                        "promptName": self._prompt_name,
+                        "contentName": content_name,
+                        "content": json.dumps(result) if isinstance(result, dict) else result,
+                    }
+                }
+            }
+        )
+        await self._send_client_event(result_content)
+
+        result_content_end = f"""
+        {{
+            "event": {{
+                "contentEnd": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}"
+                }}
+            }}
+        }}
+        """
+        await self._send_client_event(result_content_end)
+
+    async def _send_client_event(self, event_json: str):
+        if not self._stream:  # should never happen
+            return
+
+        event = InvokeModelWithBidirectionalStreamInputChunk(
+            value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
+        )
+        await self._stream.input_stream.send(event)
+
+    #
+    # LLM communication: output events (LLM -> pipecat)
+    #
+
+    # Receive events for the session.
+    # A few different kinds of content can be delivered:
+    # - Transcription of user audio
+    # - Tool use
+    # - Text preview of planned response speech before audio delivered
+    # - User interruption notification
+    # - Text of response speech that whose audio was actually delivered
+    # - Audio of response speech
+    # Each piece of content is wrapped by "contentStart" and "contentEnd" events. The content is
+    # delivered sequentially: one piece of content will end before another starts.
+    # The overall completion is wrapped by "completionStart" and "completionEnd" events.
+    async def _receive_task_handler(self):
+        try:
+            while self._stream and not self._disconnecting:
+                output = await self._stream.await_output()
+                result = await output[1].receive()
+
+                if result.value and result.value.bytes_:
+                    response_data = result.value.bytes_.decode("utf-8")
+                    json_data = json.loads(response_data)
+
+                    if "event" in json_data:
+                        event_json = json_data["event"]
+                        if "completionStart" in event_json:
+                            # Handle the LLM completion starting
+                            await self._handle_completion_start_event(event_json)
+                        elif "contentStart" in event_json:
+                            # Handle a piece of content starting
+                            await self._handle_content_start_event(event_json)
+                        elif "textOutput" in event_json:
+                            # Handle text output content
+                            await self._handle_text_output_event(event_json)
+                        elif "audioOutput" in event_json:
+                            # Handle audio output content
+                            await self._handle_audio_output_event(event_json)
+                        elif "toolUse" in event_json:
+                            # Handle tool use
+                            await self._handle_tool_use_event(event_json)
+                        elif "contentEnd" in event_json:
+                            # Handle a piece of content ending
+                            await self._handle_content_end_event(event_json)
+                        elif "completionEnd" in event_json:
+                            # Handle the LLM completion ending
+                            await self._handle_completion_end_event(event_json)
+        except Exception as e:
+            if self._disconnecting:
+                # Errors are kind of expected while disconnecting, so just
+                # ignore them and do nothing
+                return
+            logger.error(f"{self} error processing responses: {e}")
+            if self._wants_connection:
+                await self.reset_conversation()
+
+    async def _handle_completion_start_event(self, event_json):
+        pass
+
+    async def _handle_content_start_event(self, event_json):
+        content_start = event_json["contentStart"]
+        type = content_start["type"]
+        role = content_start["role"]
+        generation_stage = None
+        if "additionalModelFields" in content_start:
+            additional_model_fields = json.loads(content_start["additionalModelFields"])
+            generation_stage = additional_model_fields.get("generationStage")
+
+        # Bookkeeping: track current content being received
+        content = CurrentContent(
+            type=ContentType(type),
+            role=Role(role),
+            text_stage=TextStage(generation_stage) if generation_stage else None,
+            text_content=None,
+        )
+        self._content_being_received = content
+
+        if content.role == Role.ASSISTANT:
+            if content.type == ContentType.AUDIO:
+                # Note that an assistant response can comprise of multiple audio blocks
+                if not self._assistant_is_responding:
+                    # The assistant has started responding.
+                    self._assistant_is_responding = True
+                    await self._report_user_transcription_ended()  # Consider user turn over
+                    await self._report_assistant_response_started()
+
+    async def _handle_text_output_event(self, event_json):
+        if not self._content_being_received:  # should never happen
+            return
+        content = self._content_being_received
+
+        text_content = event_json["textOutput"]["content"]
+
+        # Bookkeeping: augment the current content being received with text
+        # Assumption: only one text content per content block
+        content.text_content = text_content
+
+    async def _handle_audio_output_event(self, event_json):
+        if not self._content_being_received:  # should never happen
+            return
+
+        # Get audio
+        audio_content = event_json["audioOutput"]["content"]
+
+        # Push audio frame
+        audio = base64.b64decode(audio_content)
+        frame = TTSAudioRawFrame(
+            audio=audio,
+            sample_rate=self._params.output_sample_rate,
+            num_channels=self._params.output_channel_count,
+        )
+        await self.push_frame(frame)
+
+    async def _handle_tool_use_event(self, event_json):
+        if not self._content_being_received or not self._context:  # should never happen
+            return
+
+        # Consider user turn over
+        await self._report_user_transcription_ended()
+
+        # Get tool use details
+        tool_use = event_json["toolUse"]
+        function_name = tool_use["toolName"]
+        tool_call_id = tool_use["toolUseId"]
+        arguments = json.loads(tool_use["content"])
+
+        # Call tool function
+        if self.has_function(function_name):
+            if function_name in self._functions.keys() or None in self._functions.keys():
+                function_calls_llm = [
+                    FunctionCallFromLLM(
+                        context=self._context,
+                        tool_call_id=tool_call_id,
+                        function_name=function_name,
+                        arguments=arguments,
+                    )
+                ]
+
+                await self.run_function_calls(function_calls_llm)
+        else:
+            raise AWSNovaSonicUnhandledFunctionException(
+                f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
+            )
+
+    async def _handle_content_end_event(self, event_json):
+        if not self._content_being_received:  # should never happen
+            return
+        content = self._content_being_received
+
+        content_end = event_json["contentEnd"]
+        stop_reason = content_end["stopReason"]
+
+        # Bookkeeping: clear current content being received
+        self._content_being_received = None
+
+        if content.role == Role.ASSISTANT:
+            if content.type == ContentType.TEXT:
+                # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
+                if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
+                    if self._assistant_is_responding:
+                        # Text added to the ongoing assistant response
+                        await self._report_assistant_response_text_added(content.text_content)
+        elif content.role == Role.USER:
+            if content.type == ContentType.TEXT:
+                if content.text_stage == TextStage.FINAL:
+                    # User transcription text added
+                    await self._report_user_transcription_text_added(content.text_content)
+
+    async def _handle_completion_end_event(self, event_json):
+        pass
+
+    #
+    # assistant response reporting
+    #
+    # 1. Started
+    # 2. Text added
+    # 3. Ended
+    #
+
+    async def _report_assistant_response_started(self):
+        logger.debug("Assistant response started")
+
+        # Report the start of the assistant response.
+        await self.push_frame(LLMFullResponseStartFrame())
+
+        # Report that equivalent of TTS (this is a speech-to-speech model) started
+        await self.push_frame(TTSStartedFrame())
+
+    async def _report_assistant_response_text_added(self, text):
+        if not self._context:  # should never happen
+            return
+
+        logger.debug(f"Assistant response text added: {text}")
+
+        # Report the text of the assistant response.
+        await self.push_frame(TTSTextFrame(text))
+
+        # HACK: here we're also buffering the assistant text ourselves as a
+        # backup rather than relying solely on the assistant context aggregator
+        # to do it, because the text arrives from Nova Sonic only after all the
+        # assistant audio frames have been pushed, meaning that if an
+        # interruption frame were to arrive we would lose all of it (the text
+        # frames sitting in the queue would be wiped).
+        self._assistant_text_buffer += text
+
+    async def _report_assistant_response_ended(self):
+        if not self._context:  # should never happen
+            return
+
+        logger.debug("Assistant response ended")
+
+        # If an interruption frame arrived while the assistant was responding
+        # we may have lost all of the assistant text (see HACK, above), so
+        # re-push it downstream to the aggregator now.
+        if self._may_need_repush_assistant_text:
+            # Just in case, check that assistant text hasn't already made it
+            # into the context (sometimes it does, despite the interruption).
+            messages = self._context.get_messages()
+            last_message = messages[-1] if messages else None
+            if (
+                not last_message
+                or last_message.get("role") != "assistant"
+                or last_message.get("content") != self._assistant_text_buffer
+            ):
+                # We also need to re-push the LLMFullResponseStartFrame since the
+                # TTSTextFrame would be ignored otherwise (the interruption frame
+                # would have cleared the assistant aggregator state).
+                await self.push_frame(LLMFullResponseStartFrame())
+                await self.push_frame(TTSTextFrame(self._assistant_text_buffer))
+            self._may_need_repush_assistant_text = False
+
+        # Report the end of the assistant response.
+        await self.push_frame(LLMFullResponseEndFrame())
+
+        # Report that equivalent of TTS (this is a speech-to-speech model) stopped.
+        await self.push_frame(TTSStoppedFrame())
+
+        # Clear out the buffered assistant text
+        self._assistant_text_buffer = ""
+
+    #
+    # user transcription reporting
+    #
+    # 1. Text added
+    # 2. Ended
+    #
+    # Note: "started" does not need to be reported
+    #
+
+    async def _report_user_transcription_text_added(self, text):
+        if not self._context:  # should never happen
+            return
+
+        logger.debug(f"User transcription text added: {text}")
+
+        # HACK: here we're buffering the user text ourselves rather than
+        # relying on the upstream user context aggregator to do it, because the
+        # text arrives in fairly large chunks spaced fairly far apart in time.
+        # That means the user text would be split between different messages in
+        # context. Even if we sent placeholder InterimTranscriptionFrames in
+        # between each TranscriptionFrame to tell the aggregator to hold off on
+        # finalizing the user message, the aggregator would likely get the last
+        # chunk too late.
+        self._user_text_buffer += f" {text}" if self._user_text_buffer else text
+
+    async def _report_user_transcription_ended(self):
+        if not self._context:  # should never happen
+            return
+
+        logger.debug(f"User transcription ended")
+
+        # Report to the upstream user context aggregator that some new user
+        # transcription text is available.
+
+        # HACK: Check if this transcription was triggered by our own
+        # assistant response trigger. If so, we need to wrap it with
+        # UserStarted/StoppedSpeakingFrames; otherwise the user aggregator
+        # would fire an EmulatedUserStartedSpeakingFrame, which would
+        # trigger an interruption, which would prevent us from writing the
+        # assistant response to context.
+        #
+        # Sending an EmulateUserStartedSpeakingFrame ourselves doesn't
+        # work: it just causes the interruption we're trying to avoid.
+        #
+        # Setting enable_emulated_vad_interruptions also doesn't work: at
+        # the time the user aggregator receives the TranscriptionFrame, it
+        # doesn't yet know the assistant has started responding, so it
+        # doesn't know that emulating the user starting to speak would
+        # cause an interruption.
+        should_wrap_in_user_started_stopped_speaking_frames = (
+            self._waiting_for_trigger_transcription
+            and self._user_text_buffer.strip().lower() == "ready"
+        )
+
+        # Start wrapping the upstream transcription in UserStarted/StoppedSpeakingFrames if needed
+        if should_wrap_in_user_started_stopped_speaking_frames:
+            logger.debug(
+                "Wrapping assistant response trigger transcription with upstream UserStarted/StoppedSpeakingFrames"
+            )
+            await self.push_frame(UserStartedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
+
+        # Send the transcription upstream for the user context aggregator
+        frame = TranscriptionFrame(
+            text=self._user_text_buffer, user_id="", timestamp=time_now_iso8601()
+        )
+        await self.push_frame(frame, direction=FrameDirection.UPSTREAM)
+
+        # Finish wrapping the upstream transcription in UserStarted/StoppedSpeakingFrames if needed
+        if should_wrap_in_user_started_stopped_speaking_frames:
+            await self.push_frame(UserStoppedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
+
+        # Clear out the buffered user text
+        self._user_text_buffer = ""
+
+        # We're no longer waiting for a trigger transcription
+        self._waiting_for_trigger_transcription = False
+
+    #
+    # context
+    #
+
+    def create_context_aggregator(
+        self,
+        context: OpenAILLMContext,
+        *,
+        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
+        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
+    ) -> LLMContextAggregatorPair:
+        """Create context aggregator pair for managing conversation context.
+
+        NOTE: this method exists only for backward compatibility. New code
+        should instead do:
+            context = LLMContext(...)
+            context_aggregator = LLMContextAggregatorPair(context)
+
+        Args:
+            context: The OpenAI LLM context.
+            user_params: Parameters for the user context aggregator.
+            assistant_params: Parameters for the assistant context aggregator.
+
+        Returns:
+            A pair of user and assistant context aggregators.
+        """
+        context = LLMContext.from_openai_context(context)
+        return LLMContextAggregatorPair(
+            context, user_params=user_params, assistant_params=assistant_params
+        )
+
+    #
+    # assistant response trigger (HACK)
+    #
+
+    # Class variable
+    AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION = (
+        "Start speaking when you hear the user say 'ready', but don't consider that 'ready' to be "
+        "a meaningful part of the conversation other than as a trigger for you to start speaking."
+    )
+
+    async def trigger_assistant_response(self):
+        """Trigger an assistant response by sending audio cue.
+
+        Sends a pre-recorded "ready" audio trigger to prompt the assistant
+        to start speaking. This is useful for controlling conversation flow.
+
+        Returns:
+            False if already triggering a response, True otherwise.
+        """
+        if self._triggering_assistant_response:
+            return False
+
+        self._triggering_assistant_response = True
+
+        # Send the trigger audio, if we're fully connected and set up
+        if self._connected_time:
+            await self._send_assistant_response_trigger()
+
+    async def _send_assistant_response_trigger(self):
+        if not self._connected_time:
+            # should never happen
+            return
+
+        try:
+            logger.debug("Sending assistant response trigger...")
+
+            self._waiting_for_trigger_transcription = True
+
+            chunk_duration = 0.02  # what we might get from InputAudioRawFrame
+            chunk_size = int(
+                chunk_duration
+                * self._params.input_sample_rate
+                * self._params.input_channel_count
+                * (self._params.input_sample_size / 8)
+            )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
+
+            # Lead with a bit of blank audio, if needed.
+            # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
+            # connection.
+            current_time = time.time()
+            max_blank_audio_duration = 0.5
+            blank_audio_duration = (
+                max_blank_audio_duration - (current_time - self._connected_time)
+                if self._connected_time is not None
+                and (current_time - self._connected_time) < max_blank_audio_duration
+                else None
+            )
+            if blank_audio_duration:
+                logger.debug(
+                    f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+                )
+                blank_audio_chunk = b"\x00" * chunk_size
+                num_chunks = int(blank_audio_duration / chunk_duration)
+                for _ in range(num_chunks):
+                    await self._send_user_audio_event(blank_audio_chunk)
+                    await asyncio.sleep(chunk_duration)
+
+            # Send trigger audio
+            # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
+            # if we ever need to seed this service again with context it would make sense to include it
+            # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
+            # context as well.
+            audio_chunks = [
+                self._assistant_response_trigger_audio[i : i + chunk_size]
+                for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
+            ]
+            for chunk in audio_chunks:
+                await self._send_user_audio_event(chunk)
+                await asyncio.sleep(chunk_duration)
+        finally:
+            # We need to clean up in case sending the trigger was cancelled, e.g. in the case of a user interruption.
+            # (An asyncio.CancelledError would be raised in that case.)
+            self._triggering_assistant_response = False
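
For reviewers unfamiliar with the added module, here is a minimal, hypothetical sketch of how the new AWSNovaSonicLLMService might be instantiated, based only on the constructor and class attributes shown in the hunk above; the AWS credentials via environment variables and the surrounding pipeline wiring are assumptions, not part of this package diff.

    import os

    from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params

    # Hypothetical wiring: argument names match the __init__ signature added in
    # this diff; credentials are assumed to come from the environment.
    llm = AWSNovaSonicLLMService(
        secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        region="us-east-1",
        voice_id="matthew",  # or "tiffany", "amy"
        params=Params(input_sample_rate=16000, output_sample_rate=24000),
        system_instruction=AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION,
    )

The service would then be placed in a pipeline between audio input/output transports and a context aggregator pair, as with other pipecat LLM services.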