dv-pipecat-ai 0.0.85.dev699__py3-none-any.whl → 0.0.85.dev816__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev816.dist-info}/METADATA +23 -18
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev816.dist-info}/RECORD +44 -44
- pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
- pipecat/frames/frames.py +96 -0
- pipecat/pipeline/runner.py +6 -2
- pipecat/pipeline/task.py +40 -55
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/frameworks/rtvi.py +1 -0
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +149 -67
- pipecat/runner/types.py +5 -5
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +3 -0
- pipecat/services/aws/llm.py +33 -16
- pipecat/services/aws/nova_sonic/context.py +69 -0
- pipecat/services/aws/nova_sonic/llm.py +199 -89
- pipecat/services/aws/stt.py +2 -0
- pipecat/services/aws_nova_sonic/context.py +8 -12
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +3 -1
- pipecat/services/deepgram/flux/stt.py +4 -0
- pipecat/services/elevenlabs/tts.py +82 -41
- pipecat/services/fish/tts.py +3 -0
- pipecat/services/google/stt.py +4 -0
- pipecat/services/lmnt/tts.py +2 -0
- pipecat/services/neuphonic/tts.py +3 -0
- pipecat/services/openai/tts.py +37 -6
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +3 -0
- pipecat/services/rime/tts.py +9 -8
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/sarvam/tts.py +87 -10
- pipecat/services/speechmatics/stt.py +3 -1
- pipecat/services/stt_service.py +23 -10
- pipecat/services/tts_service.py +64 -13
- pipecat/transports/base_input.py +3 -0
- pipecat/transports/base_output.py +71 -77
- pipecat/transports/smallwebrtc/connection.py +5 -0
- pipecat/transports/smallwebrtc/request_handler.py +42 -0
- pipecat/utils/string.py +1 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev816.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev816.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev816.dist-info}/top_level.txt +0 -0
pipecat/services/aws/nova_sonic/llm.py
CHANGED

@@ -25,7 +25,7 @@ from loguru import logger
 from pydantic import BaseModel, Field
 
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
-from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter
+from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter, Role
 from pipecat.frames.frames import (
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -33,35 +33,30 @@ from pipecat.frames.frames import (
     Frame,
     FunctionCallFromLLM,
     InputAudioRawFrame,
-
+    InterruptionFrame,
     LLMContextFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
-    LLMTextFrame,
     StartFrame,
     TranscriptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TTSTextFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
 )
+from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.aggregators.llm_response import (
     LLMAssistantAggregatorParams,
     LLMUserAggregatorParams,
 )
+from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
 from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContext,
     OpenAILLMContextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.aws.nova_sonic.context import (
-    AWSNovaSonicAssistantContextAggregator,
-    AWSNovaSonicContextAggregatorPair,
-    AWSNovaSonicLLMContext,
-    AWSNovaSonicUserContextAggregator,
-    Role,
-)
-from pipecat.services.aws.nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
 from pipecat.services.llm_service import LLMService
 from pipecat.utils.time import time_now_iso8601
 
@@ -217,6 +212,11 @@ class AWSNovaSonicLLMService(LLMService):
             system_instruction: System-level instruction for the model.
             tools: Available tools/functions for the model to use.
             send_transcription_frames: Whether to emit transcription frames.
+
+                .. deprecated:: 0.0.91
+                    This parameter is deprecated and will be removed in a future version.
+                    Transcription frames are always sent.
+
             **kwargs: Additional arguments passed to the parent LLMService.
         """
         super().__init__(**kwargs)
@@ -230,8 +230,20 @@ class AWSNovaSonicLLMService(LLMService):
         self._params = params or Params()
         self._system_instruction = system_instruction
         self._tools = tools
-
-
+
+        if not send_transcription_frames:
+            import warnings
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("always")
+                warnings.warn(
+                    "`send_transcription_frames` is deprecated and will be removed in a future version. "
+                    "Transcription frames are always sent.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
+        self._context: Optional[LLMContext] = None
         self._stream: Optional[
             DuplexEventStream[
                 InvokeModelWithBidirectionalStreamInput,
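The constructor hunk above keeps accepting the deprecated send_transcription_frames argument and warns callers instead of breaking them. A minimal standalone sketch of the same stdlib warning pattern; the configure() wrapper below is hypothetical and not part of pipecat:

import warnings


def configure(send_transcription_frames: bool = True) -> None:
    """Hypothetical wrapper illustrating the deprecation pattern used in the hunk above."""
    if not send_transcription_frames:
        # simplefilter("always") makes the warning visible even when
        # DeprecationWarnings are filtered out by default.
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            warnings.warn(
                "`send_transcription_frames` is deprecated; transcription frames are always sent.",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the caller, not this wrapper
            )
    # Proceed as if transcription frames are always enabled.


configure(send_transcription_frames=False)  # emits the DeprecationWarning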
@@ -244,12 +256,17 @@ class AWSNovaSonicLLMService(LLMService):
         self._input_audio_content_name: Optional[str] = None
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
+        self._may_need_repush_assistant_text = False
         self._ready_to_send_context = False
         self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
+        self._waiting_for_trigger_transcription = False
         self._disconnecting = False
         self._connected_time: Optional[float] = None
         self._wants_connection = False
+        self._user_text_buffer = ""
+        self._assistant_text_buffer = ""
+        self._completed_tool_calls = set()
 
         file_path = files("pipecat.services.aws.nova_sonic").joinpath("ready.wav")
         with wave.open(file_path.open("rb"), "rb") as wav_file:
@@ -302,12 +319,12 @@ class AWSNovaSonicLLMService(LLMService):
         logger.debug("Resetting conversation")
         await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
 
-        #
+        # Grab context to carry through disconnect/reconnect
         context = self._context
-        await self._disconnect()
-        self._context = context
 
+        await self._disconnect()
         await self._start_connecting()
+        await self._handle_context(context)
 
     #
     # frame processing
@@ -322,28 +339,35 @@ class AWSNovaSonicLLMService(LLMService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, OpenAILLMContextFrame):
-
-
-
-
+        if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
+            context = (
+                frame.context
+                if isinstance(frame, LLMContextFrame)
+                else LLMContext.from_openai_context(frame.context)
             )
+            await self._handle_context(context)
         elif isinstance(frame, InputAudioRawFrame):
             await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
             await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
-        elif isinstance(frame,
-            await self.
+        elif isinstance(frame, InterruptionFrame):
+            await self._handle_interruption_frame()
 
         await self.push_frame(frame, direction)
 
-    async def _handle_context(self, context:
+    async def _handle_context(self, context: LLMContext):
+        if self._disconnecting:
+            return
+
         if not self._context:
-            # We got our initial context
-
-
-            )
+            # We got our initial context
+            # Try to finish connecting
+            self._context = context
             await self._finish_connecting_if_context_available()
+        else:
+            # We got an updated context
+            # Send results for any newly-completed function calls
+            await self._process_completed_function_calls(send_new_results=True)
 
     async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
         # Wait until we're done sending the assistant response trigger audio before sending audio
@@ -393,9 +417,9 @@ class AWSNovaSonicLLMService(LLMService):
         else:
             await finalize_assistant_response()
 
-    async def
-
-
+    async def _handle_interruption_frame(self):
+        if self._assistant_is_responding:
+            self._may_need_repush_assistant_text = True
 
     #
     # LLM communication: lifecycle
@@ -431,6 +455,17 @@ class AWSNovaSonicLLMService(LLMService):
             logger.error(f"{self} initialization error: {e}")
             await self._disconnect()
 
+    async def _process_completed_function_calls(self, send_new_results: bool):
+        # Check for set of completed function calls in the context
+        for message in self._context.get_messages():
+            if message.get("role") and message.get("content") != "IN_PROGRESS":
+                tool_call_id = message.get("tool_call_id")
+                if tool_call_id and tool_call_id not in self._completed_tool_calls:
+                    # Found a newly-completed function call - send the result to the service
+                    if send_new_results:
+                        await self._send_tool_result(tool_call_id, message.get("content"))
+                    self._completed_tool_calls.add(tool_call_id)
+
     async def _finish_connecting_if_context_available(self):
         # We can only finish connecting once we've gotten our initial context and we're ready to
         # send it
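The new _process_completed_function_calls helper scans the context for tool messages and uses a set of seen tool_call_ids so each result is forwarded at most once, even when the whole context is re-handled. A self-contained sketch of that dedup logic over hypothetical message dicts shaped like the ones read above (role, content, tool_call_id):

completed_tool_calls: set[str] = set()


def newly_completed(messages: list[dict]) -> list[dict]:
    """Return tool-result messages that haven't been processed yet."""
    fresh = []
    for message in messages:
        # Skip messages with no role or whose function call is still running.
        if not message.get("role") or message.get("content") == "IN_PROGRESS":
            continue
        tool_call_id = message.get("tool_call_id")
        if tool_call_id and tool_call_id not in completed_tool_calls:
            completed_tool_calls.add(tool_call_id)
            fresh.append(message)
    return fresh


messages = [
    {"role": "tool", "tool_call_id": "call_1", "content": "IN_PROGRESS"},
    {"role": "tool", "tool_call_id": "call_2", "content": '{"temperature": 21}'},
]
print(newly_completed(messages))  # only call_2; call_1 is still in progress
print(newly_completed(messages))  # [] -- call_2 was already handled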
@@ -439,30 +474,38 @@ class AWSNovaSonicLLMService(LLMService):
 
         logger.info("Finishing connecting (setting up session)...")
 
+        # Initialize our bookkeeping of already-completed tool calls in the
+        # context
+        await self._process_completed_function_calls(send_new_results=False)
+
         # Read context
-
+        adapter: AWSNovaSonicLLMAdapter = self.get_llm_adapter()
+        llm_connection_params = adapter.get_llm_invocation_params(self._context)
 
         # Send prompt start event, specifying tools.
         # Tools from context take priority over self._tools.
         tools = (
-
-            if
-            else
+            llm_connection_params["tools"]
+            if llm_connection_params["tools"]
+            else adapter.from_standard_tools(self._tools)
         )
         logger.debug(f"Using tools: {tools}")
         await self._send_prompt_start_event(tools)
 
         # Send system instruction.
         # Instruction from context takes priority over self._system_instruction.
-
-
-
-
-
-
+        system_instruction = (
+            llm_connection_params["system_instruction"]
+            if llm_connection_params["system_instruction"]
+            else self._system_instruction
+        )
+        logger.debug(f"Using system instruction: {system_instruction}")
+        if system_instruction:
+            await self._send_text_event(text=system_instruction, role=Role.SYSTEM)
 
         # Send conversation history
-        for message in
+        for message in llm_connection_params["messages"]:
+            # logger.debug(f"Seeding conversation history with message: {message}")
             await self._send_text_event(text=message.text, role=message.role)
 
         # Start audio input
@@ -492,9 +535,12 @@ class AWSNovaSonicLLMService(LLMService):
             await self._send_session_end_events()
             self._client = None
 
+            # Clean up context
+            self._context = None
+
             # Clean up stream
             if self._stream:
-                await self._stream.
+                await self._stream.close()
                 self._stream = None
 
             # NOTE: see explanation of HACK, below
@@ -510,15 +556,23 @@ class AWSNovaSonicLLMService(LLMService):
                 self._receive_task = None
 
             # Reset remaining connection-specific state
+            # Should be all private state except:
+            # - _wants_connection
+            # - _assistant_response_trigger_audio
             self._prompt_name = None
             self._input_audio_content_name = None
             self._content_being_received = None
             self._assistant_is_responding = False
+            self._may_need_repush_assistant_text = False
             self._ready_to_send_context = False
             self._handling_bot_stopped_speaking = False
             self._triggering_assistant_response = False
+            self._waiting_for_trigger_transcription = False
             self._disconnecting = False
             self._connected_time = None
+            self._user_text_buffer = ""
+            self._assistant_text_buffer = ""
+            self._completed_tool_calls = set()
 
             logger.info("Finished disconnecting")
         except Exception as e:
@@ -826,6 +880,10 @@ class AWSNovaSonicLLMService(LLMService):
                     # Handle the LLM completion ending
                     await self._handle_completion_end_event(event_json)
         except Exception as e:
+            if self._disconnecting:
+                # Errors are kind of expected while disconnecting, so just
+                # ignore them and do nothing
+                return
             logger.error(f"{self} error processing responses: {e}")
             if self._wants_connection:
                 await self.reset_conversation()
@@ -956,7 +1014,7 @@ class AWSNovaSonicLLMService(LLMService):
     async def _report_assistant_response_started(self):
         logger.debug("Assistant response started")
 
-        # Report
+        # Report the start of the assistant response.
         await self.push_frame(LLMFullResponseStartFrame())
 
         # Report that equivalent of TTS (this is a speech-to-speech model) started
@@ -968,23 +1026,16 @@ class AWSNovaSonicLLMService(LLMService):
 
         logger.debug(f"Assistant response text added: {text}")
 
-        # Report
-        await self.push_frame(LLMTextFrame(text))
-
-        # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model)
+        # Report the text of the assistant response.
         await self.push_frame(TTSTextFrame(text))
 
-        #
-        # than relying on the
-        #
-        #
-        #
-        #
-
-        # interspersed with audio. Note that when we move away from this hack, we need to make sure
-        # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the
-        # LLMAssistantContextAggregator into a bad state.
-        self._context.buffer_assistant_text(text)
+        # HACK: here we're also buffering the assistant text ourselves as a
+        # backup rather than relying solely on the assistant context aggregator
+        # to do it, because the text arrives from Nova Sonic only after all the
+        # assistant audio frames have been pushed, meaning that if an
+        # interruption frame were to arrive we would lose all of it (the text
+        # frames sitting in the queue would be wiped).
+        self._assistant_text_buffer += text
 
     async def _report_assistant_response_ended(self):
         if not self._context:  # should never happen
@@ -992,14 +1043,34 @@ class AWSNovaSonicLLMService(LLMService):
 
         logger.debug("Assistant response ended")
 
-        #
+        # If an interruption frame arrived while the assistant was responding
+        # we may have lost all of the assistant text (see HACK, above), so
+        # re-push it downstream to the aggregator now.
+        if self._may_need_repush_assistant_text:
+            # Just in case, check that assistant text hasn't already made it
+            # into the context (sometimes it does, despite the interruption).
+            messages = self._context.get_messages()
+            last_message = messages[-1] if messages else None
+            if (
+                not last_message
+                or last_message.get("role") != "assistant"
+                or last_message.get("content") != self._assistant_text_buffer
+            ):
+                # We also need to re-push the LLMFullResponseStartFrame since the
+                # TTSTextFrame would be ignored otherwise (the interruption frame
+                # would have cleared the assistant aggregator state).
+                await self.push_frame(LLMFullResponseStartFrame())
+                await self.push_frame(TTSTextFrame(self._assistant_text_buffer))
+            self._may_need_repush_assistant_text = False
+
+        # Report the end of the assistant response.
         await self.push_frame(LLMFullResponseEndFrame())
 
         # Report that equivalent of TTS (this is a speech-to-speech model) stopped.
         await self.push_frame(TTSStoppedFrame())
 
-        #
-        self.
+        # Clear out the buffered assistant text
+        self._assistant_text_buffer = ""
 
         #
         # user transcription reporting
@@ -1016,33 +1087,67 @@ class AWSNovaSonicLLMService(LLMService):
 
         logger.debug(f"User transcription text added: {text}")
 
-        #
-        #
-
-
-        #
-
-
-
-
+        # HACK: here we're buffering the user text ourselves rather than
+        # relying on the upstream user context aggregator to do it, because the
+        # text arrives in fairly large chunks spaced fairly far apart in time.
+        # That means the user text would be split between different messages in
+        # context. Even if we sent placeholder InterimTranscriptionFrames in
+        # between each TranscriptionFrame to tell the aggregator to hold off on
+        # finalizing the user message, the aggregator would likely get the last
+        # chunk too late.
+        self._user_text_buffer += f" {text}" if self._user_text_buffer else text
 
     async def _report_user_transcription_ended(self):
         if not self._context:  # should never happen
             return
 
-        # Manually add user transcription to context (if any has been buffered).
-        # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
-        transcription = self._context.flush_aggregated_user_text()
-
-        if not transcription:
-            return
-
         logger.debug(f"User transcription ended")
 
-
-
-
+        # Report to the upstream user context aggregator that some new user
+        # transcription text is available.
+
+        # HACK: Check if this transcription was triggered by our own
+        # assistant response trigger. If so, we need to wrap it with
+        # UserStarted/StoppedSpeakingFrames; otherwise the user aggregator
+        # would fire an EmulatedUserStartedSpeakingFrame, which would
+        # trigger an interruption, which would prevent us from writing the
+        # assistant response to context.
+        #
+        # Sending an EmulateUserStartedSpeakingFrame ourselves doesn't
+        # work: it just causes the interruption we're trying to avoid.
+        #
+        # Setting enable_emulated_vad_interruptions also doesn't work: at
+        # the time the user aggregator receives the TranscriptionFrame, it
+        # doesn't yet know the assistant has started responding, so it
+        # doesn't know that emulating the user starting to speak would
+        # cause an interruption.
+        should_wrap_in_user_started_stopped_speaking_frames = (
+            self._waiting_for_trigger_transcription
+            and self._user_text_buffer.strip().lower() == "ready"
+        )
+
+        # Start wrapping the upstream transcription in UserStarted/StoppedSpeakingFrames if needed
+        if should_wrap_in_user_started_stopped_speaking_frames:
+            logger.debug(
+                "Wrapping assistant response trigger transcription with upstream UserStarted/StoppedSpeakingFrames"
            )
+            await self.push_frame(UserStartedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
+
+        # Send the transcription upstream for the user context aggregator
+        frame = TranscriptionFrame(
+            text=self._user_text_buffer, user_id="", timestamp=time_now_iso8601()
+        )
+        await self.push_frame(frame, direction=FrameDirection.UPSTREAM)
+
+        # Finish wrapping the upstream transcription in UserStarted/StoppedSpeakingFrames if needed
+        if should_wrap_in_user_started_stopped_speaking_frames:
+            await self.push_frame(UserStoppedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
+
+        # Clear out the buffered user text
+        self._user_text_buffer = ""
+
+        # We're no longer waiting for a trigger transcription
+        self._waiting_for_trigger_transcription = False
 
         #
         # context
@@ -1054,23 +1159,26 @@ class AWSNovaSonicLLMService(LLMService):
         *,
         user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
         assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
-    ) ->
+    ) -> LLMContextAggregatorPair:
         """Create context aggregator pair for managing conversation context.
 
+        NOTE: this method exists only for backward compatibility. New code
+        should instead do:
+            context = LLMContext(...)
+            context_aggregator = LLMContextAggregatorPair(context)
+
         Args:
-            context: The OpenAI LLM context
+            context: The OpenAI LLM context.
             user_params: Parameters for the user context aggregator.
             assistant_params: Parameters for the assistant context aggregator.
 
         Returns:
             A pair of user and assistant context aggregators.
         """
-        context.
-
-
-
-
-        return AWSNovaSonicContextAggregatorPair(user, assistant)
+        context = LLMContext.from_openai_context(context)
+        return LLMContextAggregatorPair(
+            context, user_params=user_params, assistant_params=assistant_params
+        )
 
     #
     # assistant response trigger (HACK)
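As the updated docstring notes, create_context_aggregator is now just a backward-compatibility shim over the universal context types. A hedged sketch of the recommended setup, assuming LLMContext accepts an initial messages list and that the pair exposes user()/assistant() processors like other pipecat aggregator pairs:

from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair

# One shared context drives both sides of the conversation.
context = LLMContext(messages=[{"role": "system", "content": "You are a helpful assistant."}])
context_aggregator = LLMContextAggregatorPair(context)

# The user aggregator sits upstream of the LLM service and the assistant
# aggregator downstream of it, e.g. in a pipeline:
#   [transport.input(), context_aggregator.user(), llm, transport.output(),
#    context_aggregator.assistant()]
user_aggregator = context_aggregator.user()
assistant_aggregator = context_aggregator.assistant()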
@@ -1108,6 +1216,8 @@ class AWSNovaSonicLLMService(LLMService):
         try:
             logger.debug("Sending assistant response trigger...")
 
+            self._waiting_for_trigger_transcription = True
+
             chunk_duration = 0.02  # what we might get from InputAudioRawFrame
             chunk_size = int(
                 chunk_duration
pipecat/services/aws/stt.py
CHANGED
@@ -286,6 +286,7 @@ class AWSTranscribeSTTService(STTService):
 
             logger.info(f"{self} Successfully connected to AWS Transcribe")
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
             await self._disconnect()
@@ -310,6 +311,7 @@ class AWSTranscribeSTTService(STTService):
                 logger.warning(f"{self} Error closing WebSocket connection: {e}")
             finally:
                 self._ws_client = None
+                await self._call_event_handler("on_disconnected")
 
     def language_to_service_language(self, language: Language) -> str | None:
         """Convert internal language enum to AWS Transcribe language code.
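With the two _call_event_handler calls added above, the Transcribe service reports its connection lifecycle. A sketch of subscribing to those events, assuming the service registers on_connected/on_disconnected and supports pipecat's usual event_handler decorator (constructor arguments below are illustrative, not taken from this diff):

from pipecat.services.aws.stt import AWSTranscribeSTTService

stt = AWSTranscribeSTTService(region="us-east-1")  # credentials/params omitted for brevity


@stt.event_handler("on_connected")
async def on_connected(service):
    # Fired right after the WebSocket to AWS Transcribe is established.
    print("Transcribe connected")


@stt.event_handler("on_disconnected")
async def on_disconnected(service):
    # Fired once the WebSocket client has been torn down.
    print("Transcribe disconnected")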
pipecat/services/aws_nova_sonic/context.py
CHANGED

@@ -8,18 +8,14 @@
 
 This module provides specialized context aggregators and message handling for AWS Nova Sonic,
 including conversation history management and role-specific message processing.
-"""
 
-
+.. deprecated:: 0.0.91
+    AWS Nova Sonic no longer uses types from this module under the hood.
+    It now uses `LLMContext` and `LLMContextAggregatorPair`.
+    Using the new patterns should allow you to not need types from this module.
 
-
+    See deprecation warning in pipecat.services.aws.nova_sonic.context for more
+    details.
+"""
 
-
-    warnings.simplefilter("always")
-    warnings.warn(
-        "Types in pipecat.services.aws_nova_sonic.context are deprecated. "
-        "Please use the equivalent types from "
-        "pipecat.services.aws.nova_sonic.context instead.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
+from pipecat.services.aws.nova_sonic.context import *