dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -1,5 +1,5 @@
1
1
  #
2
- # Copyright (c) 2024–2025, Daily
2
+ # Copyright (c) 2025, Daily
3
3
  #
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
@@ -10,1156 +10,16 @@ This module provides a speech-to-speech LLM service using AWS Nova Sonic, which
10
10
  bidirectional audio streaming, text generation, and function calling capabilities.
11
11
  """
12
12
 
13
- import asyncio
14
- import base64
15
- import json
16
- import time
17
- import uuid
18
- import wave
19
- from dataclasses import dataclass
20
- from enum import Enum
21
- from importlib.resources import files
22
- from typing import Any, List, Optional
13
+ import warnings
23
14
 
24
- from loguru import logger
25
- from pydantic import BaseModel, Field
15
+ from pipecat.services.aws.nova_sonic.llm import *
26
16
 
27
- from pipecat.adapters.schemas.tools_schema import ToolsSchema
28
- from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter
29
- from pipecat.frames.frames import (
30
- BotStoppedSpeakingFrame,
31
- CancelFrame,
32
- EndFrame,
33
- Frame,
34
- FunctionCallFromLLM,
35
- InputAudioRawFrame,
36
- InterimTranscriptionFrame,
37
- LLMContextFrame,
38
- LLMFullResponseEndFrame,
39
- LLMFullResponseStartFrame,
40
- LLMTextFrame,
41
- StartFrame,
42
- TranscriptionFrame,
43
- TTSAudioRawFrame,
44
- TTSStartedFrame,
45
- TTSStoppedFrame,
46
- TTSTextFrame,
47
- )
48
- from pipecat.processors.aggregators.llm_response import (
49
- LLMAssistantAggregatorParams,
50
- LLMUserAggregatorParams,
51
- )
52
- from pipecat.processors.aggregators.openai_llm_context import (
53
- OpenAILLMContext,
54
- OpenAILLMContextFrame,
55
- )
56
- from pipecat.processors.frame_processor import FrameDirection
57
- from pipecat.services.aws_nova_sonic.context import (
58
- AWSNovaSonicAssistantContextAggregator,
59
- AWSNovaSonicContextAggregatorPair,
60
- AWSNovaSonicLLMContext,
61
- AWSNovaSonicUserContextAggregator,
62
- Role,
63
- )
64
- from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
65
- from pipecat.services.llm_service import LLMService
66
- from pipecat.utils.time import time_now_iso8601
67
-
68
- try:
69
- from aws_sdk_bedrock_runtime.client import (
70
- BedrockRuntimeClient,
71
- InvokeModelWithBidirectionalStreamOperationInput,
72
- )
73
- from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
74
- from aws_sdk_bedrock_runtime.models import (
75
- BidirectionalInputPayloadPart,
76
- InvokeModelWithBidirectionalStreamInput,
77
- InvokeModelWithBidirectionalStreamInputChunk,
78
- InvokeModelWithBidirectionalStreamOperationOutput,
79
- InvokeModelWithBidirectionalStreamOutput,
80
- )
81
- from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
82
- from smithy_aws_core.identity import AWSCredentialsIdentity
83
- from smithy_core.aio.eventstream import DuplexEventStream
84
- except ModuleNotFoundError as e:
85
- logger.error(f"Exception: {e}")
86
- logger.error(
87
- "In order to use AWS services, you need to `pip install pipecat-ai[aws-nova-sonic]`."
88
- )
89
- raise Exception(f"Missing module: {e}")
90
-
91
-
92
- class AWSNovaSonicUnhandledFunctionException(Exception):
93
- """Exception raised when the LLM attempts to call an unregistered function."""
94
-
95
- pass
96
-
97
-
98
- class ContentType(Enum):
99
- """Content types supported by AWS Nova Sonic.
100
-
101
- Parameters:
102
- AUDIO: Audio content type.
103
- TEXT: Text content type.
104
- TOOL: Tool content type.
105
- """
106
-
107
- AUDIO = "AUDIO"
108
- TEXT = "TEXT"
109
- TOOL = "TOOL"
110
-
111
-
112
- class TextStage(Enum):
113
- """Text generation stages in AWS Nova Sonic responses.
114
-
115
- Parameters:
116
- FINAL: Final text that has been fully generated.
117
- SPECULATIVE: Speculative text that is still being generated.
118
- """
119
-
120
- FINAL = "FINAL" # what has been said
121
- SPECULATIVE = "SPECULATIVE" # what's planned to be said
122
-
123
-
124
- @dataclass
125
- class CurrentContent:
126
- """Represents content currently being received from AWS Nova Sonic.
127
-
128
- Parameters:
129
- type: The type of content (audio, text, or tool).
130
- role: The role generating the content (user, assistant, etc.).
131
- text_stage: The stage of text generation (final or speculative).
132
- text_content: The actual text content if applicable.
133
- """
134
-
135
- type: ContentType
136
- role: Role
137
- text_stage: TextStage # None if not text
138
- text_content: str # starts as None, then fills in if text
139
-
140
- def __str__(self):
141
- """String representation of the current content."""
142
- return (
143
- f"CurrentContent(\n"
144
- f" type={self.type.name},\n"
145
- f" role={self.role.name},\n"
146
- f" text_stage={self.text_stage.name if self.text_stage else 'None'}\n"
147
- f")"
148
- )
149
-
150
-
151
- class Params(BaseModel):
152
- """Configuration parameters for AWS Nova Sonic.
153
-
154
- Parameters:
155
- input_sample_rate: Audio input sample rate in Hz.
156
- input_sample_size: Audio input sample size in bits.
157
- input_channel_count: Number of input audio channels.
158
- output_sample_rate: Audio output sample rate in Hz.
159
- output_sample_size: Audio output sample size in bits.
160
- output_channel_count: Number of output audio channels.
161
- max_tokens: Maximum number of tokens to generate.
162
- top_p: Nucleus sampling parameter.
163
- temperature: Sampling temperature for text generation.
164
- """
165
-
166
- # Audio input
167
- input_sample_rate: Optional[int] = Field(default=16000)
168
- input_sample_size: Optional[int] = Field(default=16)
169
- input_channel_count: Optional[int] = Field(default=1)
170
-
171
- # Audio output
172
- output_sample_rate: Optional[int] = Field(default=24000)
173
- output_sample_size: Optional[int] = Field(default=16)
174
- output_channel_count: Optional[int] = Field(default=1)
175
-
176
- # Inference
177
- max_tokens: Optional[int] = Field(default=1024)
178
- top_p: Optional[float] = Field(default=0.9)
179
- temperature: Optional[float] = Field(default=0.7)
180
-
181
-
182
- class AWSNovaSonicLLMService(LLMService):
183
- """AWS Nova Sonic speech-to-speech LLM service.
184
-
185
- Provides bidirectional audio streaming, real-time transcription, text generation,
186
- and function calling capabilities using AWS Nova Sonic model.
187
- """
188
-
189
- # Override the default adapter to use the AWSNovaSonicLLMAdapter one
190
- adapter_class = AWSNovaSonicLLMAdapter
191
-
192
- def __init__(
193
- self,
194
- *,
195
- secret_access_key: str,
196
- access_key_id: str,
197
- session_token: Optional[str] = None,
198
- region: str,
199
- model: str = "amazon.nova-sonic-v1:0",
200
- voice_id: str = "matthew", # matthew, tiffany, amy
201
- params: Optional[Params] = None,
202
- system_instruction: Optional[str] = None,
203
- tools: Optional[ToolsSchema] = None,
204
- send_transcription_frames: bool = True,
205
- **kwargs,
206
- ):
207
- """Initializes the AWS Nova Sonic LLM service.
208
-
209
- Args:
210
- secret_access_key: AWS secret access key for authentication.
211
- access_key_id: AWS access key ID for authentication.
212
- session_token: AWS session token for authentication.
213
- region: AWS region where the service is hosted.
214
- model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
215
- voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
216
- params: Model parameters for audio configuration and inference.
217
- system_instruction: System-level instruction for the model.
218
- tools: Available tools/functions for the model to use.
219
- send_transcription_frames: Whether to emit transcription frames.
220
- **kwargs: Additional arguments passed to the parent LLMService.
221
- """
222
- super().__init__(**kwargs)
223
- self._secret_access_key = secret_access_key
224
- self._access_key_id = access_key_id
225
- self._session_token = session_token
226
- self._region = region
227
- self._model = model
228
- self._client: Optional[BedrockRuntimeClient] = None
229
- self._voice_id = voice_id
230
- self._params = params or Params()
231
- self._system_instruction = system_instruction
232
- self._tools = tools
233
- self._send_transcription_frames = send_transcription_frames
234
- self._context: Optional[AWSNovaSonicLLMContext] = None
235
- self._stream: Optional[
236
- DuplexEventStream[
237
- InvokeModelWithBidirectionalStreamInput,
238
- InvokeModelWithBidirectionalStreamOutput,
239
- InvokeModelWithBidirectionalStreamOperationOutput,
240
- ]
241
- ] = None
242
- self._receive_task: Optional[asyncio.Task] = None
243
- self._prompt_name: Optional[str] = None
244
- self._input_audio_content_name: Optional[str] = None
245
- self._content_being_received: Optional[CurrentContent] = None
246
- self._assistant_is_responding = False
247
- self._ready_to_send_context = False
248
- self._handling_bot_stopped_speaking = False
249
- self._triggering_assistant_response = False
250
- self._assistant_response_trigger_audio: Optional[bytes] = (
251
- None # Not cleared on _disconnect()
252
- )
253
- self._disconnecting = False
254
- self._connected_time: Optional[float] = None
255
- self._wants_connection = False
256
-
257
- #
258
- # standard AIService frame handling
259
- #
260
-
261
- async def start(self, frame: StartFrame):
262
- """Start the service and initiate connection to AWS Nova Sonic.
263
-
264
- Args:
265
- frame: The start frame triggering service initialization.
266
- """
267
- await super().start(frame)
268
- self._wants_connection = True
269
- await self._start_connecting()
270
-
271
- async def stop(self, frame: EndFrame):
272
- """Stop the service and close connections.
273
-
274
- Args:
275
- frame: The end frame triggering service shutdown.
276
- """
277
- await super().stop(frame)
278
- self._wants_connection = False
279
- await self._disconnect()
280
-
281
- async def cancel(self, frame: CancelFrame):
282
- """Cancel the service and close connections.
283
-
284
- Args:
285
- frame: The cancel frame triggering service cancellation.
286
- """
287
- await super().cancel(frame)
288
- self._wants_connection = False
289
- await self._disconnect()
290
-
291
- #
292
- # conversation resetting
293
- #
294
-
295
- async def reset_conversation(self):
296
- """Reset the conversation state while preserving context.
297
-
298
- Handles bot stopped speaking event, disconnects from the service,
299
- and reconnects with the preserved context.
300
- """
301
- logger.debug("Resetting conversation")
302
- await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
303
-
304
- # Carry over previous context through disconnect
305
- context = self._context
306
- await self._disconnect()
307
- self._context = context
308
-
309
- await self._start_connecting()
310
-
311
- #
312
- # frame processing
313
- #
314
-
315
- async def process_frame(self, frame: Frame, direction: FrameDirection):
316
- """Process incoming frames and handle service-specific logic.
317
-
318
- Args:
319
- frame: The frame to process.
320
- direction: The direction the frame is traveling.
321
- """
322
- await super().process_frame(frame, direction)
323
-
324
- if isinstance(frame, OpenAILLMContextFrame):
325
- await self._handle_context(frame.context)
326
- elif isinstance(frame, LLMContextFrame):
327
- raise NotImplementedError(
328
- "Universal LLMContext is not yet supported for AWS Nova Sonic."
329
- )
330
- elif isinstance(frame, InputAudioRawFrame):
331
- await self._handle_input_audio_frame(frame)
332
- elif isinstance(frame, BotStoppedSpeakingFrame):
333
- await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
334
- elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
335
- await self._handle_function_call_result(frame)
336
-
337
- await self.push_frame(frame, direction)
338
-
339
- async def _handle_context(self, context: OpenAILLMContext):
340
- if not self._context:
341
- # We got our initial context - try to finish connecting
342
- self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(
343
- context, self._system_instruction
344
- )
345
- await self._finish_connecting_if_context_available()
346
-
347
- async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
348
- # Wait until we're done sending the assistant response trigger audio before sending audio
349
- # from the user's mic
350
- if self._triggering_assistant_response:
351
- return
352
-
353
- await self._send_user_audio_event(frame.audio)
354
-
355
- async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
356
- # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
357
- if self._handling_bot_stopped_speaking:
358
- return
359
- self._handling_bot_stopped_speaking = True
360
-
361
- async def finalize_assistant_response():
362
- if self._assistant_is_responding:
363
- # Consider the assistant finished with their response (possibly after a short delay,
364
- # to allow for any trailing FINAL assistant text block to come in that need to make
365
- # it into context).
366
- #
367
- # TODO: ideally we could base this solely on the LLM output events, but I couldn't
368
- # figure out a reliable way to determine when we've gotten our last FINAL text block
369
- # after the LLM is done talking.
370
- #
371
- # First I looked at stopReason, but it doesn't seem like the last FINAL text block
372
- # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
373
- # bug?)
374
- #
375
- # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
376
- # FINAL text blocks to know how many or which FINAL blocks to expect, but user
377
- # interruptions throw a wrench in these schemes: depending on the exact timing of
378
- # the interruption, we should or shouldn't expect some FINAL blocks.
379
- if delay_to_catch_trailing_assistant_text:
380
- # This delay length is a balancing act between "catching" trailing assistant
381
- # text that is quite delayed but not waiting so long that user text comes in
382
- # first and results in a bit of context message order scrambling.
383
- await asyncio.sleep(1.25)
384
- self._assistant_is_responding = False
385
- await self._report_assistant_response_ended()
386
-
387
- self._handling_bot_stopped_speaking = False
388
-
389
- # Finalize the assistant response, either now or after a delay
390
- if delay_to_catch_trailing_assistant_text:
391
- self.create_task(finalize_assistant_response())
392
- else:
393
- await finalize_assistant_response()
394
-
395
- async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
396
- result = frame.result_frame
397
- await self._send_tool_result(tool_call_id=result.tool_call_id, result=result.result)
398
-
399
- #
400
- # LLM communication: lifecycle
401
- #
402
-
403
- async def _start_connecting(self):
404
- try:
405
- logger.info("Connecting...")
406
-
407
- if self._client:
408
- # Here we assume that if we have a client we are connected or connecting
409
- return
410
-
411
- # Set IDs for the connection
412
- self._prompt_name = str(uuid.uuid4())
413
- self._input_audio_content_name = str(uuid.uuid4())
414
-
415
- # Create the client
416
- self._client = self._create_client()
417
-
418
- # Start the bidirectional stream
419
- self._stream = await self._client.invoke_model_with_bidirectional_stream(
420
- InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
421
- )
422
-
423
- # Send session start event
424
- await self._send_session_start_event()
425
-
426
- # Finish connecting
427
- self._ready_to_send_context = True
428
- await self._finish_connecting_if_context_available()
429
- except Exception as e:
430
- logger.error(f"{self} initialization error: {e}")
431
- self._disconnect()
432
-
433
- async def _finish_connecting_if_context_available(self):
434
- # We can only finish connecting once we've gotten our initial context and we're ready to
435
- # send it
436
- if not (self._context and self._ready_to_send_context):
437
- return
438
-
439
- logger.info("Finishing connecting (setting up session)...")
440
-
441
- # Read context
442
- history = self._context.get_messages_for_initializing_history()
443
-
444
- # Send prompt start event, specifying tools.
445
- # Tools from context take priority over self._tools.
446
- tools = (
447
- self._context.tools
448
- if self._context.tools
449
- else self.get_llm_adapter().from_standard_tools(self._tools)
450
- )
451
- logger.debug(f"Using tools: {tools}")
452
- await self._send_prompt_start_event(tools)
453
-
454
- # Send system instruction.
455
- # Instruction from context takes priority over self._system_instruction.
456
- # (NOTE: this prioritizing occurred automatically behind the scenes: the context was
457
- # initialized with self._system_instruction and then updated itself from its messages when
458
- # get_messages_for_initializing_history() was called).
459
- logger.debug(f"Using system instruction: {history.system_instruction}")
460
- if history.system_instruction:
461
- await self._send_text_event(text=history.system_instruction, role=Role.SYSTEM)
462
-
463
- # Send conversation history
464
- for message in history.messages:
465
- await self._send_text_event(text=message.text, role=message.role)
466
-
467
- # Start audio input
468
- await self._send_audio_input_start_event()
469
-
470
- # Start receiving events
471
- self._receive_task = self.create_task(self._receive_task_handler())
472
-
473
- # Record finished connecting time (must be done before sending assistant response trigger)
474
- self._connected_time = time.time()
475
-
476
- logger.info("Finished connecting")
477
-
478
- # If we need to, send assistant response trigger (depends on self._connected_time)
479
- if self._triggering_assistant_response:
480
- await self._send_assistant_response_trigger()
481
-
482
- async def _disconnect(self):
483
- try:
484
- logger.info("Disconnecting...")
485
-
486
- # NOTE: see explanation of HACK, below
487
- self._disconnecting = True
488
-
489
- # Clean up client
490
- if self._client:
491
- await self._send_session_end_events()
492
- self._client = None
493
-
494
- # Clean up stream
495
- if self._stream:
496
- await self._stream.input_stream.close()
497
- self._stream = None
498
-
499
- # NOTE: see explanation of HACK, below
500
- await asyncio.sleep(1)
501
-
502
- # Clean up receive task
503
- # HACK: we should ideally be able to cancel the receive task before stopping the input
504
- # stream, above (meaning we wouldn't need self._disconnecting). But for some reason if
505
- # we don't close the input stream and wait a second first, we're getting an error a lot
506
- # like this one: https://github.com/awslabs/amazon-transcribe-streaming-sdk/issues/61.
507
- if self._receive_task:
508
- await self.cancel_task(self._receive_task, timeout=1.0)
509
- self._receive_task = None
510
-
511
- # Reset remaining connection-specific state
512
- self._prompt_name = None
513
- self._input_audio_content_name = None
514
- self._content_being_received = None
515
- self._assistant_is_responding = False
516
- self._ready_to_send_context = False
517
- self._handling_bot_stopped_speaking = False
518
- self._triggering_assistant_response = False
519
- self._disconnecting = False
520
- self._connected_time = None
521
-
522
- logger.info("Finished disconnecting")
523
- except Exception as e:
524
- logger.error(f"{self} error disconnecting: {e}")
525
-
526
- def _create_client(self) -> BedrockRuntimeClient:
527
- config = Config(
528
- endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com",
529
- region=self._region,
530
- aws_credentials_identity_resolver=StaticCredentialsResolver(
531
- credentials=AWSCredentialsIdentity(
532
- access_key_id=self._access_key_id,
533
- secret_access_key=self._secret_access_key,
534
- session_token=self._session_token,
535
- )
536
- ),
537
- http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
538
- http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()},
539
- )
540
- return BedrockRuntimeClient(config=config)
541
-
542
- #
543
- # LLM communication: input events (pipecat -> LLM)
544
- #
545
-
546
- async def _send_session_start_event(self):
547
- session_start = f"""
548
- {{
549
- "event": {{
550
- "sessionStart": {{
551
- "inferenceConfiguration": {{
552
- "maxTokens": {self._params.max_tokens},
553
- "topP": {self._params.top_p},
554
- "temperature": {self._params.temperature}
555
- }}
556
- }}
557
- }}
558
- }}
559
- """
560
- await self._send_client_event(session_start)
561
-
562
- async def _send_prompt_start_event(self, tools: List[Any]):
563
- if not self._prompt_name:
564
- return
565
-
566
- tools_config = (
567
- f""",
568
- "toolUseOutputConfiguration": {{
569
- "mediaType": "application/json"
570
- }},
571
- "toolConfiguration": {{
572
- "tools": {json.dumps(tools)}
573
- }}
574
- """
575
- if tools
576
- else ""
577
- )
578
-
579
- prompt_start = f'''
580
- {{
581
- "event": {{
582
- "promptStart": {{
583
- "promptName": "{self._prompt_name}",
584
- "textOutputConfiguration": {{
585
- "mediaType": "text/plain"
586
- }},
587
- "audioOutputConfiguration": {{
588
- "mediaType": "audio/lpcm",
589
- "sampleRateHertz": {self._params.output_sample_rate},
590
- "sampleSizeBits": {self._params.output_sample_size},
591
- "channelCount": {self._params.output_channel_count},
592
- "voiceId": "{self._voice_id}",
593
- "encoding": "base64",
594
- "audioType": "SPEECH"
595
- }}{tools_config}
596
- }}
597
- }}
598
- }}
599
- '''
600
- await self._send_client_event(prompt_start)
601
-
602
- async def _send_audio_input_start_event(self):
603
- if not self._prompt_name:
604
- return
605
-
606
- audio_content_start = f'''
607
- {{
608
- "event": {{
609
- "contentStart": {{
610
- "promptName": "{self._prompt_name}",
611
- "contentName": "{self._input_audio_content_name}",
612
- "type": "AUDIO",
613
- "interactive": true,
614
- "role": "USER",
615
- "audioInputConfiguration": {{
616
- "mediaType": "audio/lpcm",
617
- "sampleRateHertz": {self._params.input_sample_rate},
618
- "sampleSizeBits": {self._params.input_sample_size},
619
- "channelCount": {self._params.input_channel_count},
620
- "audioType": "SPEECH",
621
- "encoding": "base64"
622
- }}
623
- }}
624
- }}
625
- }}
626
- '''
627
- await self._send_client_event(audio_content_start)
628
-
629
- async def _send_text_event(self, text: str, role: Role):
630
- if not self._stream or not self._prompt_name or not text:
631
- return
632
-
633
- content_name = str(uuid.uuid4())
634
-
635
- text_content_start = f'''
636
- {{
637
- "event": {{
638
- "contentStart": {{
639
- "promptName": "{self._prompt_name}",
640
- "contentName": "{content_name}",
641
- "type": "TEXT",
642
- "interactive": true,
643
- "role": "{role.value}",
644
- "textInputConfiguration": {{
645
- "mediaType": "text/plain"
646
- }}
647
- }}
648
- }}
649
- }}
650
- '''
651
- await self._send_client_event(text_content_start)
652
-
653
- escaped_text = json.dumps(text) # includes quotes
654
- text_input = f'''
655
- {{
656
- "event": {{
657
- "textInput": {{
658
- "promptName": "{self._prompt_name}",
659
- "contentName": "{content_name}",
660
- "content": {escaped_text}
661
- }}
662
- }}
663
- }}
664
- '''
665
- await self._send_client_event(text_input)
666
-
667
- text_content_end = f'''
668
- {{
669
- "event": {{
670
- "contentEnd": {{
671
- "promptName": "{self._prompt_name}",
672
- "contentName": "{content_name}"
673
- }}
674
- }}
675
- }}
676
- '''
677
- await self._send_client_event(text_content_end)
678
-
679
- async def _send_user_audio_event(self, audio: bytes):
680
- if not self._stream:
681
- return
682
-
683
- blob = base64.b64encode(audio)
684
- audio_event = f'''
685
- {{
686
- "event": {{
687
- "audioInput": {{
688
- "promptName": "{self._prompt_name}",
689
- "contentName": "{self._input_audio_content_name}",
690
- "content": "{blob.decode("utf-8")}"
691
- }}
692
- }}
693
- }}
694
- '''
695
- await self._send_client_event(audio_event)
696
-
697
- async def _send_session_end_events(self):
698
- if not self._stream or not self._prompt_name:
699
- return
700
-
701
- prompt_end = f'''
702
- {{
703
- "event": {{
704
- "promptEnd": {{
705
- "promptName": "{self._prompt_name}"
706
- }}
707
- }}
708
- }}
709
- '''
710
- await self._send_client_event(prompt_end)
711
-
712
- session_end = """
713
- {
714
- "event": {
715
- "sessionEnd": {}
716
- }
717
- }
718
- """
719
- await self._send_client_event(session_end)
720
-
721
- async def _send_tool_result(self, tool_call_id, result):
722
- if not self._stream or not self._prompt_name:
723
- return
724
-
725
- content_name = str(uuid.uuid4())
726
-
727
- result_content_start = f'''
728
- {{
729
- "event": {{
730
- "contentStart": {{
731
- "promptName": "{self._prompt_name}",
732
- "contentName": "{content_name}",
733
- "interactive": false,
734
- "type": "TOOL",
735
- "role": "TOOL",
736
- "toolResultInputConfiguration": {{
737
- "toolUseId": "{tool_call_id}",
738
- "type": "TEXT",
739
- "textInputConfiguration": {{
740
- "mediaType": "text/plain"
741
- }}
742
- }}
743
- }}
744
- }}
745
- }}
746
- '''
747
- await self._send_client_event(result_content_start)
748
-
749
- result_content = json.dumps(
750
- {
751
- "event": {
752
- "toolResult": {
753
- "promptName": self._prompt_name,
754
- "contentName": content_name,
755
- "content": json.dumps(result) if isinstance(result, dict) else result,
756
- }
757
- }
758
- }
759
- )
760
- await self._send_client_event(result_content)
761
-
762
- result_content_end = f"""
763
- {{
764
- "event": {{
765
- "contentEnd": {{
766
- "promptName": "{self._prompt_name}",
767
- "contentName": "{content_name}"
768
- }}
769
- }}
770
- }}
771
- """
772
- await self._send_client_event(result_content_end)
773
-
774
- async def _send_client_event(self, event_json: str):
775
- if not self._stream: # should never happen
776
- return
777
-
778
- event = InvokeModelWithBidirectionalStreamInputChunk(
779
- value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
780
- )
781
- await self._stream.input_stream.send(event)
782
-
783
- #
784
- # LLM communication: output events (LLM -> pipecat)
785
- #
786
-
787
- # Receive events for the session.
788
- # A few different kinds of content can be delivered:
789
- # - Transcription of user audio
790
- # - Tool use
791
- # - Text preview of planned response speech before audio delivered
792
- # - User interruption notification
793
- # - Text of response speech that whose audio was actually delivered
794
- # - Audio of response speech
795
- # Each piece of content is wrapped by "contentStart" and "contentEnd" events. The content is
796
- # delivered sequentially: one piece of content will end before another starts.
797
- # The overall completion is wrapped by "completionStart" and "completionEnd" events.
798
- async def _receive_task_handler(self):
799
- try:
800
- while self._stream and not self._disconnecting:
801
- output = await self._stream.await_output()
802
- result = await output[1].receive()
803
-
804
- if result.value and result.value.bytes_:
805
- response_data = result.value.bytes_.decode("utf-8")
806
- json_data = json.loads(response_data)
807
-
808
- if "event" in json_data:
809
- event_json = json_data["event"]
810
- if "completionStart" in event_json:
811
- # Handle the LLM completion starting
812
- await self._handle_completion_start_event(event_json)
813
- elif "contentStart" in event_json:
814
- # Handle a piece of content starting
815
- await self._handle_content_start_event(event_json)
816
- elif "textOutput" in event_json:
817
- # Handle text output content
818
- await self._handle_text_output_event(event_json)
819
- elif "audioOutput" in event_json:
820
- # Handle audio output content
821
- await self._handle_audio_output_event(event_json)
822
- elif "toolUse" in event_json:
823
- # Handle tool use
824
- await self._handle_tool_use_event(event_json)
825
- elif "contentEnd" in event_json:
826
- # Handle a piece of content ending
827
- await self._handle_content_end_event(event_json)
828
- elif "completionEnd" in event_json:
829
- # Handle the LLM completion ending
830
- await self._handle_completion_end_event(event_json)
831
- except Exception as e:
832
- logger.error(f"{self} error processing responses: {e}")
833
- if self._wants_connection:
834
- await self.reset_conversation()
835
-
836
- async def _handle_completion_start_event(self, event_json):
837
- pass
838
-
839
- async def _handle_content_start_event(self, event_json):
840
- content_start = event_json["contentStart"]
841
- type = content_start["type"]
842
- role = content_start["role"]
843
- generation_stage = None
844
- if "additionalModelFields" in content_start:
845
- additional_model_fields = json.loads(content_start["additionalModelFields"])
846
- generation_stage = additional_model_fields.get("generationStage")
847
-
848
- # Bookkeeping: track current content being received
849
- content = CurrentContent(
850
- type=ContentType(type),
851
- role=Role(role),
852
- text_stage=TextStage(generation_stage) if generation_stage else None,
853
- text_content=None,
854
- )
855
- self._content_being_received = content
856
-
857
- if content.role == Role.ASSISTANT:
858
- if content.type == ContentType.AUDIO:
859
- # Note that an assistant response can comprise of multiple audio blocks
860
- if not self._assistant_is_responding:
861
- # The assistant has started responding.
862
- self._assistant_is_responding = True
863
- await self._report_user_transcription_ended() # Consider user turn over
864
- await self._report_assistant_response_started()
865
-
866
- async def _handle_text_output_event(self, event_json):
867
- if not self._content_being_received: # should never happen
868
- return
869
- content = self._content_being_received
870
-
871
- text_content = event_json["textOutput"]["content"]
872
-
873
- # Bookkeeping: augment the current content being received with text
874
- # Assumption: only one text content per content block
875
- content.text_content = text_content
876
-
877
- async def _handle_audio_output_event(self, event_json):
878
- if not self._content_being_received: # should never happen
879
- return
880
-
881
- # Get audio
882
- audio_content = event_json["audioOutput"]["content"]
883
-
884
- # Push audio frame
885
- audio = base64.b64decode(audio_content)
886
- frame = TTSAudioRawFrame(
887
- audio=audio,
888
- sample_rate=self._params.output_sample_rate,
889
- num_channels=self._params.output_channel_count,
890
- )
891
- await self.push_frame(frame)
892
-
893
- async def _handle_tool_use_event(self, event_json):
894
- if not self._content_being_received or not self._context: # should never happen
895
- return
896
-
897
- # Consider user turn over
898
- await self._report_user_transcription_ended()
899
-
900
- # Get tool use details
901
- tool_use = event_json["toolUse"]
902
- function_name = tool_use["toolName"]
903
- tool_call_id = tool_use["toolUseId"]
904
- arguments = json.loads(tool_use["content"])
905
-
906
- # Call tool function
907
- if self.has_function(function_name):
908
- if function_name in self._functions.keys() or None in self._functions.keys():
909
- function_calls_llm = [
910
- FunctionCallFromLLM(
911
- context=self._context,
912
- tool_call_id=tool_call_id,
913
- function_name=function_name,
914
- arguments=arguments,
915
- )
916
- ]
917
-
918
- await self.run_function_calls(function_calls_llm)
919
- else:
920
- raise AWSNovaSonicUnhandledFunctionException(
921
- f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
922
- )
923
-
924
- async def _handle_content_end_event(self, event_json):
925
- if not self._content_being_received: # should never happen
926
- return
927
- content = self._content_being_received
928
-
929
- content_end = event_json["contentEnd"]
930
- stop_reason = content_end["stopReason"]
931
-
932
- # Bookkeeping: clear current content being received
933
- self._content_being_received = None
934
-
935
- if content.role == Role.ASSISTANT:
936
- if content.type == ContentType.TEXT:
937
- # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
938
- if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
939
- if self._assistant_is_responding:
940
- # Text added to the ongoing assistant response
941
- await self._report_assistant_response_text_added(content.text_content)
942
- elif content.role == Role.USER:
943
- if content.type == ContentType.TEXT:
944
- if content.text_stage == TextStage.FINAL:
945
- # User transcription text added
946
- await self._report_user_transcription_text_added(content.text_content)
947
-
948
- async def _handle_completion_end_event(self, event_json):
949
- pass
950
-
951
- #
952
- # assistant response reporting
953
- #
954
- # 1. Started
955
- # 2. Text added
956
- # 3. Ended
957
- #
958
-
959
- async def _report_assistant_response_started(self):
960
- logger.debug("Assistant response started")
961
-
962
- # Report that the assistant has started their response.
963
- await self.push_frame(LLMFullResponseStartFrame())
964
-
965
- # Report that equivalent of TTS (this is a speech-to-speech model) started
966
- await self.push_frame(TTSStartedFrame())
967
-
968
- async def _report_assistant_response_text_added(self, text):
969
- if not self._context: # should never happen
970
- return
971
-
972
- logger.debug(f"Assistant response text added: {text}")
973
-
974
- # Report some text added to the ongoing assistant response
975
- await self.push_frame(LLMTextFrame(text))
976
-
977
- # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model)
978
- await self.push_frame(TTSTextFrame(text))
979
-
980
- # TODO: this is a (hopefully temporary) HACK. Here we directly manipulate the context rather
981
- # than relying on the frames pushed to the assistant context aggregator. The pattern of
982
- # receiving full-sentence text after the assistant has spoken does not easily fit with the
983
- # Pipecat expectation of chunks of text streaming in while the assistant is speaking.
984
- # Interruption handling was especially challenging. Rather than spend days trying to fit a
985
- # square peg in a round hole, I decided on this hack for the time being. We can most cleanly
986
- # abandon this hack if/when AWS Nova Sonic implements streaming smaller text chunks
987
- # interspersed with audio. Note that when we move away from this hack, we need to make sure
988
- # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the
989
- # LLMAssistantContextAggregator into a bad state.
990
- self._context.buffer_assistant_text(text)
991
-
992
- async def _report_assistant_response_ended(self):
993
- if not self._context: # should never happen
994
- return
995
-
996
- logger.debug("Assistant response ended")
997
-
998
- # Report that the assistant has finished their response.
999
- await self.push_frame(LLMFullResponseEndFrame())
1000
-
1001
- # Report that equivalent of TTS (this is a speech-to-speech model) stopped.
1002
- await self.push_frame(TTSStoppedFrame())
1003
-
1004
- # For an explanation of this hack, see _report_assistant_response_text_added.
1005
- self._context.flush_aggregated_assistant_text()
1006
-
1007
- #
1008
- # user transcription reporting
1009
- #
1010
- # 1. Text added
1011
- # 2. Ended
1012
- #
1013
- # Note: "started" does not need to be reported
1014
- #
1015
-
1016
- async def _report_user_transcription_text_added(self, text):
1017
- if not self._context: # should never happen
1018
- return
1019
-
1020
- logger.debug(f"User transcription text added: {text}")
1021
-
1022
- # Manually add new user transcription text to context.
1023
- # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
1024
- self._context.buffer_user_text(text)
1025
-
1026
- # Report that some new user transcription text is available.
1027
- if self._send_transcription_frames:
1028
- await self.push_frame(
1029
- InterimTranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
1030
- )
1031
-
1032
- async def _report_user_transcription_ended(self):
1033
- if not self._context: # should never happen
1034
- return
1035
-
1036
- # Manually add user transcription to context (if any has been buffered).
1037
- # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
1038
- transcription = self._context.flush_aggregated_user_text()
1039
-
1040
- if not transcription:
1041
- return
1042
-
1043
- logger.debug(f"User transcription ended")
1044
-
1045
- if self._send_transcription_frames:
1046
- await self.push_frame(
1047
- TranscriptionFrame(text=transcription, user_id="", timestamp=time_now_iso8601())
1048
- )
1049
-
1050
- #
1051
- # context
1052
- #
1053
-
1054
- def create_context_aggregator(
1055
- self,
1056
- context: OpenAILLMContext,
1057
- *,
1058
- user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
1059
- assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
1060
- ) -> AWSNovaSonicContextAggregatorPair:
1061
- """Create context aggregator pair for managing conversation context.
1062
-
1063
- Args:
1064
- context: The OpenAI LLM context to upgrade.
1065
- user_params: Parameters for the user context aggregator.
1066
- assistant_params: Parameters for the assistant context aggregator.
1067
-
1068
- Returns:
1069
- A pair of user and assistant context aggregators.
1070
- """
1071
- context.set_llm_adapter(self.get_llm_adapter())
1072
-
1073
- user = AWSNovaSonicUserContextAggregator(context=context, params=user_params)
1074
- assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params)
1075
-
1076
- return AWSNovaSonicContextAggregatorPair(user, assistant)
1077
-
1078
- #
1079
- # assistant response trigger (HACK)
1080
- #
1081
-
1082
- # Class variable
1083
- AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION = (
1084
- "Start speaking when you hear the user say 'ready', but don't consider that 'ready' to be "
1085
- "a meaningful part of the conversation other than as a trigger for you to start speaking."
17
+ with warnings.catch_warnings():
18
+ warnings.simplefilter("always")
19
+ warnings.warn(
20
+ "Types in pipecat.services.aws_nova_sonic.aws are deprecated. "
21
+ "Please use the equivalent types from "
22
+ "pipecat.services.aws.nova_sonic.llm instead.",
23
+ DeprecationWarning,
24
+ stacklevel=2,
1086
25
  )
1087
-
1088
- async def trigger_assistant_response(self):
1089
- """Trigger an assistant response by sending audio cue.
1090
-
1091
- Sends a pre-recorded "ready" audio trigger to prompt the assistant
1092
- to start speaking. This is useful for controlling conversation flow.
1093
-
1094
- Returns:
1095
- False if already triggering a response, True otherwise.
1096
- """
1097
- if self._triggering_assistant_response:
1098
- return False
1099
-
1100
- self._triggering_assistant_response = True
1101
-
1102
- # Read audio bytes, if we don't already have them cached
1103
- if not self._assistant_response_trigger_audio:
1104
- file_path = files("pipecat.services.aws_nova_sonic").joinpath("ready.wav")
1105
- with wave.open(file_path.open("rb"), "rb") as wav_file:
1106
- self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes())
1107
-
1108
- # Send the trigger audio, if we're fully connected and set up
1109
- if self._connected_time is not None:
1110
- await self._send_assistant_response_trigger()
1111
-
1112
- async def _send_assistant_response_trigger(self):
1113
- if (
1114
- not self._assistant_response_trigger_audio or self._connected_time is None
1115
- ): # should never happen
1116
- return
1117
-
1118
- try:
1119
- logger.debug("Sending assistant response trigger...")
1120
-
1121
- chunk_duration = 0.02 # what we might get from InputAudioRawFrame
1122
- chunk_size = int(
1123
- chunk_duration
1124
- * self._params.input_sample_rate
1125
- * self._params.input_channel_count
1126
- * (self._params.input_sample_size / 8)
1127
- ) # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
1128
-
1129
- # Lead with a bit of blank audio, if needed.
1130
- # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
1131
- # connection.
1132
- current_time = time.time()
1133
- max_blank_audio_duration = 0.5
1134
- blank_audio_duration = (
1135
- max_blank_audio_duration - (current_time - self._connected_time)
1136
- if self._connected_time is not None
1137
- and (current_time - self._connected_time) < max_blank_audio_duration
1138
- else None
1139
- )
1140
- if blank_audio_duration:
1141
- logger.debug(
1142
- f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
1143
- )
1144
- blank_audio_chunk = b"\x00" * chunk_size
1145
- num_chunks = int(blank_audio_duration / chunk_duration)
1146
- for _ in range(num_chunks):
1147
- await self._send_user_audio_event(blank_audio_chunk)
1148
- await asyncio.sleep(chunk_duration)
1149
-
1150
- # Send trigger audio
1151
- # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
1152
- # if we ever need to seed this service again with context it would make sense to include it
1153
- # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
1154
- # context as well.
1155
- audio_chunks = [
1156
- self._assistant_response_trigger_audio[i : i + chunk_size]
1157
- for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
1158
- ]
1159
- for chunk in audio_chunks:
1160
- await self._send_user_audio_event(chunk)
1161
- await asyncio.sleep(chunk_duration)
1162
- finally:
1163
- # We need to clean up in case sending the trigger was cancelled, e.g. in the case of a user interruption.
1164
- # (An asyncio.CancelledError would be raised in that case.)
1165
- self._triggering_assistant_response = False