dv_pipecat_ai-0.0.85.dev830-py3-none-any.whl → dv_pipecat_ai-0.0.85.dev837-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev830
+Version: 0.0.85.dev837
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev830.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev837.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -107,7 +107,7 @@ pipecat/pipeline/to_be_updated/merge_pipeline.py,sha256=jLEWdufIW3z1xZhdoLowdJ_S
 pipecat/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/processors/async_generator.py,sha256=qPOZxk5eOad_NrF_Z06vWZ6deXIxb9AKZKYO2e5pkJs,2385
 pipecat/processors/consumer_processor.py,sha256=DrWCKnfblknZJ0bLmR_unIeJ1axQw4IPUn2IB3KLGGA,3228
-pipecat/processors/dtmf_aggregator.py,sha256=mo_IXUlsnVl-_Xn8sbTGnRF4Lkts0h6E3uauGbeFyWs,10204
+pipecat/processors/dtmf_aggregator.py,sha256=k3xYncUr_8y5lrYfeX8PxqlF7jqFLshg_HB6HiFg7TA,10193
 pipecat/processors/frame_processor.py,sha256=uBu6Waa0_diMXdQXMZ5V5a_KwaaPzcieyuv5gO9u-ME,33841
 pipecat/processors/idle_frame_processor.py,sha256=z8AuhGap61lA5K35P6XCaOpn4kkmK_9NZNppbpQxheU,3124
 pipecat/processors/logger.py,sha256=8xa4KKekXQIETlQR7zoGnwUpLNo8CeDVm7YjyXePN-w,2385
@@ -122,7 +122,7 @@ pipecat/processors/aggregators/gated.py,sha256=tii0sRrBkRW6y9Xq5iTWPnqlOEejU4VqP
 pipecat/processors/aggregators/gated_llm_context.py,sha256=CPv6sMA8irD1zZ3fU1gSv6D7qcPvCA0MdpFhBtJ_ekI,3007
 pipecat/processors/aggregators/gated_open_ai_llm_context.py,sha256=DgqmdPj1u3fP_SVmxtfP7NjHqnyhN_RVVTDfmjbkxAs,361
 pipecat/processors/aggregators/llm_context.py,sha256=wNbZA0Vt0FzNc5cu06xiv1z7DIClIlfqR1ZD8EusbVw,11085
-pipecat/processors/aggregators/llm_response.py,sha256=V6wBTzfUGLJfMuI34fkf5VTR0I66AWIW8btxKI8_3IM,48795
+pipecat/processors/aggregators/llm_response.py,sha256=--6D736k5mNnIhmauRbA7ZG7H9tBR16okniz3Mpypns,48573
 pipecat/processors/aggregators/llm_response_universal.py,sha256=5PqmpATpekD8BVWyBExZgatKHsNbZem8M-A7_VwTbiQ,34334
 pipecat/processors/aggregators/openai_llm_context.py,sha256=cC8DXdVPERRN04i0i-1Ys6kusvnbMALeH-Z8Pu5K684,12999
 pipecat/processors/aggregators/sentence.py,sha256=E7e3knfQl6HEGpYMKPklF1aO_gOn-rr7SnynErwfkQk,2235
@@ -153,17 +153,18 @@ pipecat/runner/livekit.py,sha256=in-2Io3FUZV-VcZZ-gQCx9L1WnKp5sHqmm7tDYlFNl4,458
 pipecat/runner/run.py,sha256=McalzMoFYEJJRXyoD5PBAyUhHCdsEeeZJk8lBvplRck,30054
 pipecat/runner/types.py,sha256=zHjbAiU17fG0ypLXCEzPu7bpDOutAg-4gE7TESvK8n0,1761
 pipecat/runner/utils.py,sha256=Ve9rjRvbt1o8e9by0nIrCJzUDGcuJUeYYhkqycmgHXc,18682
-pipecat/serializers/__init__.py,sha256=xcmbbR7YYU5C4HPbo2WVgPij-Bl_qlrLcnunCdpcZkg,804
-pipecat/serializers/asterisk.py,sha256=bPuGuLiCf04_H0d9Gc-5BpEtqD9BRNWnpZZq5MZ1fDY,6091
+pipecat/serializers/__init__.py,sha256=z0V5GflCoPt4k2Yqm4ivuzKDh9VsYYAgK2UXZTw10aU,863
+pipecat/serializers/asterisk.py,sha256=QLJMXkU3DZ0sgFw3Vq2Zf8PHKkQQguL_v-l2Io4lZ_M,6729
 pipecat/serializers/base_serializer.py,sha256=OyBUZccs2ZT9mfkBbq2tGsUJMvci6o-j90Cl1sicPaI,2030
-pipecat/serializers/convox.py,sha256=Irby_iZywgBtevlxiC8nE2GY3eh4yNNRi2YC-0vnNTY,11155
-pipecat/serializers/custom.py,sha256=O0gHTyoSb1AZ_tEmE9VgRViYckmsNzjwCAqt-Xc2CaM,9081
+pipecat/serializers/convox.py,sha256=fj9NkFTB74B9k8qWEuICQNGUQtEV0DusaHohkOqNLa8,11145
+pipecat/serializers/custom.py,sha256=clUEqOazGe3B2XoUFRN9zkFpMd6aIZeVRTqBRHAzavM,9071
 pipecat/serializers/exotel.py,sha256=B04LtNnRMzKmaS61gPZbUjc2nbki3FmpCfUMww6cOe4,5953
 pipecat/serializers/livekit.py,sha256=OMaM7yUiHfeTPbpNxE2TrmIzjmbNQIjNvlujt81dsRI,3285
 pipecat/serializers/plivo.py,sha256=ie6VUhZDTJ7KlAuJyHNeIeMtJ3ScDq_2js1SZtz7jLI,9256
 pipecat/serializers/protobuf.py,sha256=L0jSqvgTdkfxsu6JWjYK8QSTVji9nhzmgRsEEbGU7xY,5223
 pipecat/serializers/telnyx.py,sha256=eFkC7dExDFildYLR8DPvgfHbgXlCwdSPd1vc11yxyok,10847
 pipecat/serializers/twilio.py,sha256=0emSzXVw8DU_N5RPruMekbBKku9Q429-0z1PMuYejSk,10823
+pipecat/serializers/vi.py,sha256=Q7kMXvKM493RIuOUc99LKZWgVmvd8_owAzIK_oEktfw,11150
 pipecat/services/__init__.py,sha256=8e3Ta-8_BOPozhDB3l0GJkNXs5PWhib6yqZQUof2Kvw,1209
 pipecat/services/ai_service.py,sha256=yE386fm2Id-yD4fCNfkmEMtg0lTA7PB17n2x_A_jwTg,5896
 pipecat/services/ai_services.py,sha256=_RrDWfM8adV17atzY9RxK0nXRVM5kbUkKrvN90GAWYM,795
@@ -205,7 +206,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
 pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
 pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
-pipecat/services/cartesia/tts.py,sha256=EdpVJoDhZn7N5hj-VDsCaO-W2MsA78UzOdrHR4G7w08,24355
+pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_obtc,27008
 pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
 pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
 pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
@@ -324,7 +325,7 @@ pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZ
 pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
 pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
 pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
-pipecat/services/sarvam/tts.py,sha256=wzfa0vvmd0wtuzqFSjRbTmHHS8H0L8nP9jkXwqFUJ3A,27638
+pipecat/services/sarvam/tts.py,sha256=lrwfdC53kZ7f2QPgNRxzryISNkrJCvNtlZ-19-iXg94,27610
 pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
 pipecat/services/simli/video.py,sha256=Zu2XLvl2Y6VHaWzT9wEdzW9d0EYoZyzYLxjQFyV8vho,8320
 pipecat/services/soniox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -338,7 +339,7 @@ pipecat/services/together/llm.py,sha256=VSayO-U6g9Ld0xK9CXRQPUsd5gWJKtiA8qDAyXgs
 pipecat/services/ultravox/__init__.py,sha256=EoHCSXI2o0DFQslELgkhAGZtxDj63gZi-9ZEhXljaKE,259
 pipecat/services/ultravox/stt.py,sha256=uCQm_-LbycXdXRV6IE1a6Mymis6tyww7V8PnPzAQtx8,16586
 pipecat/services/vistaar/__init__.py,sha256=UFfSWFN5rbzl6NN-E_OH_MFaSYodZWNlenAU0wk-rAI,110
-pipecat/services/vistaar/llm.py,sha256=8jp9BxGYOysmD6CFyof7m2AJRbTDx4KT4kFuUc95wcc,19335
+pipecat/services/vistaar/llm.py,sha256=GNVKaelbpNH7NW7iOpBj2rJjmhMVUsPqfnBI-YgIjjw,19326
 pipecat/services/whisper/__init__.py,sha256=smADmw0Fv98k7cGRuHTEcljKTO2WdZqLpJd0qsTCwH8,281
 pipecat/services/whisper/base_stt.py,sha256=VhslESPnYIeVbmnQTzmlZPV35TH49duxYTvJe0epNnE,7850
 pipecat/services/whisper/stt.py,sha256=9Qd56vWMzg3LtHikQnfgyMtl4odE6BCHDbpAn3HSWjw,17480
@@ -353,7 +354,7 @@ pipecat/transcriptions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 pipecat/transcriptions/language.py,sha256=-mWI1MiZbasuoqZTOBH69dAmoM7-UJzWq9rSCcrnmh4,8228
 pipecat/transports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/transports/base_input.py,sha256=WGtoXXlF3GIjYgjtYnAgi8nZozd5abNlGNjwRnz8FRs,20138
-pipecat/transports/base_output.py,sha256=mNlIOo7tETlbYPbDyOtA2H-TkBGFKmjuCMDzQUtiwmk,35423
+pipecat/transports/base_output.py,sha256=7WoXtAQAi-3OC9PC_zk61lCWlBTk5-NuTLUbsQUAI_U,36723
 pipecat/transports/base_transport.py,sha256=JlNiH0DysTfr6azwHauJqY_Z9HJC702O29Q0qrsLrg4,7530
 pipecat/transports/daily/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/transports/daily/transport.py,sha256=VanO33ff9g6px-vwGgT6M7cMVg786pOGfMU7Okm7a78,91917
@@ -415,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev830.dist-info/METADATA,sha256=wPJAPffJo_L5wKNWKbIxlaBG09JAGKUTFl_qkLwmoPw,32924
-dv_pipecat_ai-0.0.85.dev830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev830.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev830.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev837.dist-info/METADATA,sha256=dQC8Y4gHZ3jPBKpybN1R9aKRUbb9mQpb0cPuLQo5KUc,32924
+dv_pipecat_ai-0.0.85.dev837.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev837.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev837.dist-info/RECORD,,
@@ -49,7 +49,6 @@ from pipecat.frames.frames import (
     OpenAILLMContextAssistantTimestampFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptDropFrame,
     TranscriptionFrame,
@@ -473,8 +472,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             frame: The frame to process.
             direction: The direction of frame flow in the pipeline.
         """
-        if isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame")
+        if isinstance(frame, InterruptionFrame):
+            self.logger.debug("Received InterruptionFrame")
         await super().process_frame(frame, direction)

         if isinstance(frame, StartFrame):
@@ -560,7 +559,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
                 "Triggering interruption - pushing BotInterruptionFrame and aggregation"
             )
             # await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
-            await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+            await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
             self.logger.debug("Pushed BotInterruptionFrame")
         # No interruption config - normal behavior (always push aggregation)
         await self._process_aggregation()
@@ -596,12 +595,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         """Notify upstream processors that pending transcripts should be dropped."""
         if self._pending_transcription_ids:
             drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
-            self.logger.debug(
-                f"Dropping {len(self._pending_transcription_ids)} transcript chunk(s) due to {reason}"
-            )
             await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
             self._pending_transcription_ids.clear()
-        self._aggregation = ""

     async def _start(self, frame: StartFrame):
         self._create_aggregation_task()
@@ -1,3 +1,5 @@
+"""DTMF aggregator processor for collecting and flushing DTMF input digits."""
+
 import asyncio

 from pipecat.frames.frames import (
@@ -8,8 +10,8 @@ from pipecat.frames.frames import (
     EndFrame,
     Frame,
     InputDTMFFrame,
+    InterruptionFrame,
     StartDTMFCaptureFrame,
-    StartInterruptionFrame,
     TranscriptionFrame,
     WaitForDTMFFrame,
 )
@@ -19,10 +21,11 @@ from pipecat.utils.time import time_now_iso8601

 class DTMFAggregator(FrameProcessor):
     """Aggregates DTMF frames using idle wait logic.
+
     The aggregator accumulates digits from incoming InputDTMFFrame instances.
     It flushes the aggregated digits by emitting a TranscriptionFrame when:
     - No new digit arrives within the specified timeout period,
-    - The termination digit (“#”) is received, or
+    - The termination digit ("#") is received, or
     - The number of digits aggregated equals the configured 'digits' value.
     """
@@ -34,7 +37,9 @@ class DTMFAggregator(FrameProcessor):
         digits: int = None,
         **kwargs,
     ):
-        """:param timeout: Idle timeout in seconds before flushing the aggregated digits.
+        """Initialize the DTMF aggregator.
+
+        :param timeout: Idle timeout in seconds before flushing the aggregated digits.
         :param digits: Number of digits to aggregate before flushing.
         """
         super().__init__(**kwargs)
@@ -48,6 +53,7 @@ class DTMFAggregator(FrameProcessor):
         self._dtmf_capture_active = False

     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Process incoming frames and handle DTMF input aggregation."""
         # Handle DTMF frames.
         await super().process_frame(frame, direction)
@@ -69,8 +75,8 @@ class DTMFAggregator(FrameProcessor):
             self._digit_event.set()  # Trigger the timeout handler
             await self._start_dtmf_capture()
             await self.push_frame(frame, direction)
-        elif isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame")
+        elif isinstance(frame, InterruptionFrame):
+            self.logger.debug("Received InterruptionFrame")
             if self._aggregation:
                 await self.flush_aggregation()
             await self._end_dtmf_capture()
@@ -108,9 +114,7 @@ class DTMFAggregator(FrameProcessor):
         if "digits" in settings:
             new_digits = settings["digits"]
             if new_digits != self._digits:
-                self.logger.debug(
-                    f"Updating DTMF digits from {self._digits} to {new_digits}"
-                )
+                self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
                 self._digits = new_digits
                 settings_changed = True
@@ -125,9 +129,7 @@ class DTMFAggregator(FrameProcessor):
                 new_end_on = set(end_value)

             if new_end_on != self._end_on:
-                self.logger.debug(
-                    f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
-                )
+                self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
                 self._end_on = new_end_on
                 settings_changed = True
@@ -142,9 +144,7 @@ class DTMFAggregator(FrameProcessor):
                 new_reset_on = set(reset_value)

             if new_reset_on != self._reset_on:
-                self.logger.debug(
-                    f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
-                )
+                self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
                 self._reset_on = new_reset_on
                 settings_changed = True
@@ -183,9 +183,7 @@ class DTMFAggregator(FrameProcessor):
     def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
         """Creates the aggregation task if it hasn't been created yet."""
         if not self._aggregation_task:
-            self._aggregation_task = self.create_task(
-                self._aggregation_task_handler(raise_timeout)
-            )
+            self._aggregation_task = self.create_task(self._aggregation_task_handler(raise_timeout))

     async def _stop_aggregation_task(self) -> None:
         """Stops the aggregation task."""
@@ -198,9 +196,7 @@ class DTMFAggregator(FrameProcessor):
         while True:
             try:
                 # Wait for a new digit signal with a timeout.
-                await asyncio.wait_for(
-                    self._digit_event.wait(), timeout=self._idle_timeout
-                )
+                await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
                 self._digit_event.clear()
             except asyncio.TimeoutError:
                 # No new digit arrived within the timeout period; flush if needed
@@ -216,7 +212,7 @@ class DTMFAggregator(FrameProcessor):
         aggregated_frame.metadata["push_aggregation"] = True

         # Send interruption frame (as per original design)
-        await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+        await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)

         # Push the transcription frame
         await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
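The flush conditions in the docstring above (idle timeout, terminator digit, digit count) all reduce to the asyncio.Event-plus-wait_for pattern visible in _aggregation_task_handler. A minimal standalone sketch of that pattern follows; the DigitAggregator class, its constructor defaults, and the print-based flush are illustrative, not pipecat APIs.

import asyncio


class DigitAggregator:
    """Minimal sketch of the idle-flush pattern used by DTMFAggregator."""

    def __init__(self, timeout: float = 2.0, end_digit: str = "#", max_digits: int = 4):
        self._timeout = timeout
        self._end_digit = end_digit
        self._max_digits = max_digits
        self._aggregation = ""
        self._event = asyncio.Event()

    async def add_digit(self, digit: str) -> None:
        if digit == self._end_digit:
            await self._flush("terminator")  # flush immediately on "#"
            return
        self._aggregation += digit
        if len(self._aggregation) >= self._max_digits:
            await self._flush("max digits")  # flush when the buffer is full
        else:
            self._event.set()  # signal a new digit, resetting the idle timer

    async def run(self) -> None:
        while True:
            try:
                # Each new digit sets the event; if none arrives in time, flush.
                await asyncio.wait_for(self._event.wait(), timeout=self._timeout)
                self._event.clear()
            except asyncio.TimeoutError:
                if self._aggregation:
                    await self._flush("idle timeout")

    async def _flush(self, reason: str) -> None:
        print(f"flush ({reason}): {self._aggregation!r}")
        self._aggregation = ""


async def main() -> None:
    agg = DigitAggregator()
    runner = asyncio.create_task(agg.run())
    for d in "12":
        await agg.add_digit(d)
    await asyncio.sleep(2.5)  # idle timeout fires, flushing "12"
    runner.cancel()


asyncio.run(main())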
@@ -5,6 +5,7 @@ from .exotel import ExotelFrameSerializer
 from .plivo import PlivoFrameSerializer
 from .telnyx import TelnyxFrameSerializer
 from .twilio import TwilioFrameSerializer
+from .vi import VIFrameSerializer

 __all__ = [
     "FrameSerializer",
@@ -15,6 +16,7 @@ __all__ = [
     "PlivoFrameSerializer",
     "TelnyxFrameSerializer",
     "TwilioFrameSerializer",
+    "VIFrameSerializer",
 ]

 # Optional imports
@@ -1,4 +1,6 @@
 # asterisk_ws_serializer.py
+"""Frame serializer for Asterisk WebSocket communication."""
+
 import base64
 import json
 from typing import Literal, Optional
@@ -12,8 +14,8 @@ from pipecat.frames.frames import (
     EndFrame,
     Frame,
     InputAudioRawFrame,
+    InterruptionFrame,
     StartFrame,
-    StartInterruptionFrame,
     TransportMessageFrame,
     TransportMessageUrgentFrame,
 )
@@ -21,6 +23,8 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer


 class AsteriskFrameSerializer(FrameSerializer):
+    """Serializes Pipecat frames to/from Asterisk WebSocket JSON messages."""
+
     class InputParams(BaseModel):
         """Configuration parameters for AsteriskFrameSerializer.

@@ -39,6 +43,12 @@ class AsteriskFrameSerializer(FrameSerializer):
         auto_hang_up: bool = False  # no-op here; adapter handles hangup

     def __init__(self, stream_id: str, params: Optional[InputParams] = None):
+        """Initialize the Asterisk frame serializer.
+
+        Args:
+            stream_id: Unique identifier for the media stream.
+            params: Configuration parameters for the serializer.
+        """
         self._stream_id = stream_id
         self._params = params or AsteriskFrameSerializer.InputParams()
         self._tel_rate = self._params.telephony_sample_rate
@@ -49,13 +59,16 @@ class AsteriskFrameSerializer(FrameSerializer):

     @property
     def type(self) -> FrameSerializerType:
+        """Return the serializer type (TEXT for JSON messages)."""
         return FrameSerializerType.TEXT  # we send/recv JSON strings

     async def setup(self, frame: StartFrame):
+        """Setup the serializer with audio parameters from the StartFrame."""
         self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate

     # Pipecat -> Adapter (play to caller)
     async def serialize(self, frame: Frame) -> str | bytes | None:
+        """Serialize Pipecat frames to Asterisk WebSocket JSON messages."""
         # On pipeline end, ask bridge to hang up
         if (
             self._params.auto_hang_up
@@ -64,7 +77,7 @@ class AsteriskFrameSerializer(FrameSerializer):
         ):
             self._hangup_sent = True
             return json.dumps({"event": "hangup"})
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             return json.dumps({"event": "clear", "streamId": self._stream_id})
         if isinstance(frame, AudioRawFrame):
             pcm = frame.audio
@@ -114,6 +127,7 @@ class AsteriskFrameSerializer(FrameSerializer):

     # Adapter -> Pipecat (audio from caller)
     async def deserialize(self, data: str | bytes) -> Frame | None:
+        """Deserialize Asterisk WebSocket JSON messages to Pipecat frames."""
         try:
             msg = json.loads(data)
         except Exception:
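The serialize path above defines the control messages the Asterisk bridge receives: a hangup event when the pipeline ends with auto_hang_up enabled, and a clear event on interruption. A tiny sketch of the wire format; the stream ID value is illustrative.

import json

stream_id = "stream-1234"  # illustrative value

# Sent when the pipeline ends and auto_hang_up is enabled.
hangup = json.dumps({"event": "hangup"})

# Sent on an InterruptionFrame to drop any audio the bridge has buffered.
clear = json.dumps({"event": "clear", "streamId": stream_id})

print(hangup)  # {"event": "hangup"}
print(clear)   # {"event": "clear", "streamId": "stream-1234"}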
@@ -22,9 +22,9 @@ from pipecat.frames.frames import (
     Frame,
     InputAudioRawFrame,
     InputDTMFFrame,
+    InterruptionFrame,
     KeypadEntry,
     StartFrame,
-    StartInterruptionFrame,
     TransportMessageFrame,
     TransportMessageUrgentFrame,
 )
@@ -117,7 +117,7 @@ class ConVoxFrameSerializer(FrameSerializer):
             self._call_ended = True
             # Return the callEnd event to be sent via the WebSocket
             return await self._send_call_end_event()
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             # Clear/interrupt command for ConVox
             message = {
                 "event": "clear",
@@ -28,8 +28,8 @@ from pipecat.frames.frames import (
     EndFrame,
     Frame,
     InputAudioRawFrame,
+    InterruptionFrame,
     StartFrame,
-    StartInterruptionFrame,
     TransportMessageFrame,
     TransportMessageUrgentFrame,
 )
@@ -121,7 +121,7 @@ class CustomFrameSerializer(FrameSerializer):
         Returns:
             Serialized data as JSON string, or None if the frame isn't handled.
         """
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             # Send clear event to instruct client to discard buffered audio
             answer = {"event": "clear", "stream_sid": self._stream_sid}
             return json.dumps(answer)
@@ -0,0 +1,324 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Vodafone Idea (VI) WebSocket frame serializer for audio streaming and call management."""
+
+import base64
+import json
+from datetime import datetime, timezone
+from typing import Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.audio.utils import create_default_resampler
+from pipecat.frames.frames import (
+    AudioRawFrame,
+    CancelFrame,
+    EndFrame,
+    Frame,
+    InputAudioRawFrame,
+    InputDTMFFrame,
+    KeypadEntry,
+    StartFrame,
+    StartInterruptionFrame,
+    TransportMessageFrame,
+    TransportMessageUrgentFrame,
+)
+from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
+
+
+class VIFrameSerializer(FrameSerializer):
+    """Serializer for Vodafone Idea (VI) WebSocket protocol.
+
+    This serializer handles converting between Pipecat frames and VI's WebSocket
+    protocol for bidirectional audio streaming. It supports audio conversion, DTMF events,
+    and real-time communication with VI telephony systems.
+
+    VI WebSocket protocol requirements:
+    - PCM audio format at 8kHz sample rate
+    - 16-bit Linear PCM encoding
+    - Base64 encoded audio payloads
+    - JSON message format for control and media events
+    - Bitrate: 128 Kbps
+
+    Events (VI → Endpoint):
+    - connected: WebSocket connection established
+    - start: Stream session started with call/stream IDs
+    - media: Audio data in Base64-encoded PCM
+    - dtmf: Keypad digit pressed
+    - stop: Stream ended
+    - mark: Audio playback checkpoint confirmation
+
+    Events (Endpoint → VI):
+    - media: Send audio back to VI
+    - mark: Request acknowledgment for audio playback
+    - clear: Clear queued audio (interruption)
+    - exit: Terminate session gracefully
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for VIFrameSerializer.
+
+        Attributes:
+            vi_sample_rate: Sample rate used by VI, defaults to 8000 Hz (telephony standard).
+            sample_rate: Optional override for pipeline input sample rate.
+            auto_hang_up: Whether to automatically terminate call on EndFrame.
+        """
+
+        vi_sample_rate: int = 8000
+        sample_rate: Optional[int] = None
+        auto_hang_up: bool = False
+
+    def __init__(
+        self,
+        stream_id: str,
+        call_id: Optional[str] = None,
+        params: Optional[InputParams] = None,
+    ):
+        """Initialize the VIFrameSerializer.
+
+        Args:
+            stream_id: The VI stream identifier.
+            call_id: The associated VI call identifier.
+            params: Configuration parameters.
+        """
+        self._stream_id = stream_id
+        self._call_id = call_id
+        self._params = params or VIFrameSerializer.InputParams()
+
+        self._vi_sample_rate = self._params.vi_sample_rate
+        self._sample_rate = 0  # Pipeline input rate
+        self._call_ended = False
+
+        self._resampler = create_default_resampler()
+
+    @property
+    def type(self) -> FrameSerializerType:
+        """Gets the serializer type.
+
+        Returns:
+            The serializer type as TEXT for JSON WebSocket messages.
+        """
+        return FrameSerializerType.TEXT
+
+    async def setup(self, frame: StartFrame):
+        """Sets up the serializer with pipeline configuration.
+
+        Args:
+            frame: The StartFrame containing pipeline configuration.
+        """
+        self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
+
+    async def serialize(self, frame: Frame) -> str | bytes | None:
+        """Serializes a Pipecat frame to VI WebSocket format.
+
+        Handles conversion of various frame types to VI WebSocket messages.
+        For EndFrames, initiates call termination if auto_hang_up is enabled.
+
+        Args:
+            frame: The Pipecat frame to serialize.
+
+        Returns:
+            Serialized data as JSON string, or None if the frame isn't handled.
+        """
+        if (
+            self._params.auto_hang_up
+            and not self._call_ended
+            and isinstance(frame, (EndFrame, CancelFrame))
+        ):
+            self._call_ended = True
+            # Return the exit event to terminate the VI session
+            return await self._send_exit_event()
+
+        elif isinstance(frame, StartInterruptionFrame):
+            # Clear/interrupt command for VI - clears queued audio
+            message = {
+                "event": "clear",
+                "stream_id": self._stream_id,
+                "call_id": self._call_id,
+            }
+            logger.debug(f"VI: Sending clear event for stream_id: {self._stream_id}")
+            return json.dumps(message)
+
+        elif isinstance(frame, AudioRawFrame):
+            if self._call_ended:
+                logger.debug("VI SERIALIZE: Skipping audio - call has ended")
+                return None
+
+            # Convert PCM audio to VI format
+            data = frame.audio
+
+            # Resample to VI sample rate (8kHz)
+            serialized_data = await self._resampler.resample(
+                data, frame.sample_rate, self._vi_sample_rate
+            )
+
+            # Encode as base64 for transmission
+            payload = base64.b64encode(serialized_data).decode("ascii")
+
+            # VI expects media event format with Base64-encoded PCM audio
+            timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+
+            message = {
+                "event": "media",
+                "stream_id": self._stream_id,
+                "media": {
+                    "timestamp": timestamp,
+                    "chunk": len(serialized_data),  # Chunk size in bytes
+                    "payload": payload,
+                },
+            }
+
+            return json.dumps(message)
+
+        elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
+            # Pass through transport messages (for mark events, etc.)
+            return json.dumps(frame.message)
+
+        return None
+
+    async def _send_exit_event(self):
+        """Send an exit event to VI to terminate the session gracefully.
+
+        This method is called when auto_hang_up is enabled and an EndFrame or
+        CancelFrame is received. The exit event allows IVR logic to continue
+        after the WebSocket session ends.
+        """
+        try:
+            exit_event = {
+                "event": "exit",
+                "stream_id": self._stream_id,
+                "call_id": self._call_id,
+                "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+            }
+
+            logger.info(
+                f"VI auto_hang_up: Sending exit event for stream_id: {self._stream_id}, call_id: {self._call_id}"
+            )
+            return json.dumps(exit_event)
+        except Exception as e:
+            logger.error(f"VI auto_hang_up: Failed to create exit event: {e}")
+            return None
+
+    async def deserialize(self, data: str | bytes) -> Frame | None:
+        """Deserializes VI WebSocket data to Pipecat frames.
+
+        Handles conversion of VI media events to appropriate Pipecat frames.
+
+        Args:
+            data: The raw WebSocket data from VI.
+
+        Returns:
+            A Pipecat frame corresponding to the VI event, or None if unhandled.
+        """
+        try:
+            message = json.loads(data)
+        except json.JSONDecodeError:
+            logger.error(f"Invalid JSON received from VI: {data}")
+            return None
+
+        # Log all incoming events for debugging and monitoring
+        event = message.get("event")
+        logger.debug(
+            f"VI INCOMING EVENT: {event} - stream_id: {self._stream_id}, call_id: {self._call_id}"
+        )
+
+        if event == "media":
+            # Handle incoming audio data from VI
+            media = message.get("media", {})
+            payload_base64 = media.get("payload")
+
+            if not payload_base64:
+                logger.warning("VI DESERIALIZE: No payload in VI media message")
+                return None
+
+            try:
+                payload = base64.b64decode(payload_base64)
+                chunk_size = len(payload)
+
+                # Log chunk info (optional)
+                logger.debug(
+                    f"VI DESERIALIZE: Received audio from VI - {chunk_size} bytes at {self._vi_sample_rate}Hz"
+                )
+
+            except Exception as e:
+                logger.error(f"VI DESERIALIZE: Error decoding VI audio payload: {e}")
+                return None
+
+            # Convert from VI sample rate (8kHz) to pipeline sample rate
+            deserialized_data = await self._resampler.resample(
+                payload,
+                self._vi_sample_rate,
+                self._sample_rate,
+            )
+
+            audio_frame = InputAudioRawFrame(
+                audio=deserialized_data,
+                num_channels=1,  # VI uses mono audio
+                sample_rate=self._sample_rate,
+            )
+            return audio_frame
+
+        elif event == "dtmf":
+            # Handle DTMF events
+            dtmf_data = message.get("dtmf", {})
+            digit = dtmf_data.get("digit")
+
+            if digit:
+                try:
+                    logger.info(f"VI: Received DTMF digit: {digit}")
+                    return InputDTMFFrame(KeypadEntry(digit))
+                except ValueError:
+                    logger.warning(f"Invalid DTMF digit from VI: {digit}")
+            return None
+
+        elif event == "connected":
+            # Handle connection event
+            logger.info(f"VI connection established: {message}")
+            return None
+
+        elif event == "start":
+            # Handle stream start event
+            logger.info(f"VI stream started: {message}")
+            return None
+
+        elif event == "stop":
+            # Handle stream stop event
+            logger.info(f"VI stream stopped: {message}")
+            # Don't end the call here, wait for explicit exit or call end
+            return None
+
+        elif event == "mark":
+            # Handle mark event - checkpoint confirming audio playback completion
+            mark_data = message.get("mark", {})
+            mark_name = mark_data.get("name", "unknown")
+            logger.info(f"VI mark event received: {mark_name}")
+            # Mark events are informational, no frame to return
+            return None
+
+        elif event == "error":
+            # Handle error events
+            error_msg = message.get("error", "Unknown error")
+            logger.error(f"VI error: {error_msg}")
+            return None
+
+        elif event == "exit":
+            # Handle exit event from VI
+            logger.info("VI exit event received - terminating session")
+            self._call_ended = True
+            return CancelFrame()
+
+        elif event == "call_end" or event == "callEnd":
+            # Handle call end event (if VI sends this)
+            logger.info("VI call end event received")
+            self._call_ended = True
+            return CancelFrame()
+
+        else:
+            logger.debug(f"VI UNHANDLED EVENT: {event}")
+
+        return None
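Per the class docstring and serialize/deserialize paths above, VI media events are JSON envelopes around Base64-encoded 8 kHz PCM. A standalone sketch of building and parsing one such message; the stream ID and PCM bytes are illustrative, and this uses only the standard library rather than the serializer itself.

import base64
import json
from datetime import datetime, timezone

# Build a "media" event the way VIFrameSerializer.serialize() does
# (stream_id and the PCM bytes are illustrative values).
pcm_8k = b"\x00\x01" * 160  # 20 ms of 16-bit mono PCM at 8 kHz
message = {
    "event": "media",
    "stream_id": "stream-abc",
    "media": {
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "chunk": len(pcm_8k),  # chunk size in bytes
        "payload": base64.b64encode(pcm_8k).decode("ascii"),
    },
}
wire = json.dumps(message)

# Parse it back the way deserialize() does for incoming audio.
parsed = json.loads(wire)
assert parsed["event"] == "media"
audio = base64.b64decode(parsed["media"]["payload"])
assert audio == pcm_8k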
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
 from loguru import logger
 from pydantic import BaseModel, Field

-
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")


+class GenerationConfig(BaseModel):
+    """Configuration for Cartesia Sonic-3 generation parameters.
+
+    Sonic-3 interprets these parameters as guidance to ensure natural speech.
+    Test against your content for best results.
+
+    Parameters:
+        volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
+        speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
+        emotion: Single emotion string to guide the emotional tone. Examples include neutral,
+            angry, excited, content, sad, scared. Over 60 emotions are supported. For best
+            results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
+            and Marian.
+    """
+
+    volume: Optional[float] = None
+    speed: Optional[float] = None
+    emotion: Optional[str] = None
+
+
 def language_to_cartesia_language(language: Language) -> Optional[str]:
     """Convert a Language enum to Cartesia language code.

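GenerationConfig is a plain pydantic model, and the service code added later in this diff serializes it with model_dump(exclude_none=True), so only explicitly set fields reach the request payload. A minimal sketch of that behavior using pydantic alone; the class is re-declared here so the example is self-contained.

from typing import Optional

from pydantic import BaseModel


class GenerationConfig(BaseModel):
    """Mirror of the Sonic-3 generation parameters shown in the diff."""

    volume: Optional[float] = None
    speed: Optional[float] = None
    emotion: Optional[str] = None


config = GenerationConfig(speed=1.2, emotion="excited")

# Unset fields are dropped, so the websocket/HTTP payload stays minimal.
print(config.model_dump(exclude_none=True))  # {'speed': 1.2, 'emotion': 'excited'}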
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
         Language.SV: "sv",
         Language.TR: "tr",
         Language.ZH: "zh",
+        Language.TL: "tl",
+        Language.BG: "bg",
+        Language.RO: "ro",
+        Language.AR: "ar",
+        Language.CS: "cs",
+        Language.EL: "el",
+        Language.FI: "fi",
+        Language.HR: "hr",
+        Language.MS: "ms",
+        Language.SK: "sk",
+        Language.DA: "da",
+        Language.TA: "ta",
+        Language.UK: "uk",
+        Language.HU: "hu",
+        Language.NO: "no",
+        Language.VI: "vi",
+        Language.BN: "bn",
+        Language.TH: "th",
+        Language.HE: "he",
+        Language.KA: "ka",
+        Language.ID: "id",
+        Language.TE: "te",
+        Language.GU: "gu",
+        Language.KN: "kn",
+        Language.ML: "ml",
+        Language.MR: "mr",
+        Language.PA: "pa",
     }

     result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):

        Parameters:
            language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.

            .. deprecated:: 0.0.68
                The `emotion` parameter is deprecated and will be removed in a future version.
+
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
        """

        language: Optional[Language] = Language.EN
        speed: Optional[Literal["slow", "normal", "fast"]] = None
        emotion: Optional[List[str]] = []
+        generation_config: Optional[GenerationConfig] = None

    def __init__(
        self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         voice_id: str,
         cartesia_version: str = "2025-04-16",
         url: str = "wss://api.cartesia.ai/tts/websocket",
-        model: str = "sonic-2",
+        model: str = "sonic-3",
         sample_rate: Optional[int] = None,
         encoding: str = "pcm_s16le",
         container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             voice_id: ID of the voice to use for synthesis.
             cartesia_version: API version string for Cartesia service.
             url: WebSocket URL for Cartesia TTS API.
-            model: TTS model to use (e.g., "sonic-2").
+            model: TTS model to use (e.g., "sonic-3").
             sample_rate: Audio sample rate. If None, uses default.
             encoding: Audio encoding format.
             container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
         if self._settings["speed"]:
             msg["speed"] = self._settings["speed"]

+        if self._settings["generation_config"]:
+            msg["generation_config"] = self._settings["generation_config"].model_dump(
+                exclude_none=True
+            )
+
         return json.dumps(msg)

     async def start(self, frame: StartFrame):
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
             logger.error(f"{self} error: {msg}")
             await self.push_frame(TTSStoppedFrame())
             await self.stop_all_metrics()
-
             await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
             self._context_id = None
         else:
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):

        Parameters:
            language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.

            .. deprecated:: 0.0.68
                The `emotion` parameter is deprecated and will be removed in a future version.
+
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
        """

        language: Optional[Language] = Language.EN
        speed: Optional[Literal["slow", "normal", "fast"]] = None
        emotion: Optional[List[str]] = Field(default_factory=list)
+        generation_config: Optional[GenerationConfig] = None

    def __init__(
        self,
        *,
        api_key: str,
        voice_id: str,
-        model: str = "sonic-2",
+        model: str = "sonic-3",
        base_url: str = "https://api.cartesia.ai",
        cartesia_version: str = "2024-11-13",
        sample_rate: Optional[int] = None,
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
         Args:
             api_key: Cartesia API key for authentication.
             voice_id: ID of the voice to use for synthesis.
-            model: TTS model to use (e.g., "sonic-2").
+            model: TTS model to use (e.g., "sonic-3").
             base_url: Base URL for Cartesia HTTP API.
             cartesia_version: API version string for Cartesia service.
             sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_voice(voice_id)
         self.set_model_name(model)
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
         if self._settings["speed"]:
             payload["speed"] = self._settings["speed"]

+        if self._settings["generation_config"]:
+            payload["generation_config"] = self._settings["generation_config"].model_dump(
+                exclude_none=True
+            )
+
         yield TTSStartedFrame()

         session = await self._client._get_session()
@@ -23,7 +23,6 @@ from pipecat.frames.frames import (
     InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -14,15 +14,15 @@ from loguru import logger
 from pydantic import BaseModel, Field

 from pipecat.frames.frames import (
-    EndFrame,
     CancelFrame,
+    EndFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
-    StartInterruptionFrame,
 )
 from pipecat.processors.aggregators.llm_response import (
     LLMAssistantAggregatorParams,
@@ -391,7 +391,7 @@ class VistaarLLMService(LLMService):
             )
             await self.push_frame(frame, direction)
             return
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self._handle_interruption()
             await self.push_frame(frame, direction)
             return
@@ -467,4 +467,4 @@ class VistaarLLMService(LLMService):

     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics."""
-        return True
+        return True
@@ -50,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds

 # TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
 BOT_VAD_STOP_SECS = 0.30
+# For the very first bot utterance (e.g., intro), we can safely
+# detect end-of-speech sooner to improve responsiveness for the
+# user’s first short reply. Keep conservative to avoid mid-utterance
+# false stops when TTS streams quickly.
+FIRST_BOT_VAD_STOP_SECS = 0.08


 class BaseOutputTransport(FrameProcessor):
@@ -406,6 +411,9 @@ class BaseOutputTransport(FrameProcessor):
         self._bot_speaking_frame_period = 0.2
         # Last time the bot actually spoke.
         self._bot_speech_last_time = 0
+        # Before the first stop event, we use a shorter silence
+        # threshold to make the first turn more responsive.
+        self._first_stop_pending = True

         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -631,6 +639,10 @@ class BaseOutputTransport(FrameProcessor):

         self._bot_speaking = False

+        # Mark that the first stop has been completed so subsequent
+        # stops use the regular (longer) VAD stop threshold.
+        self._first_stop_pending = False
+
         # Clean audio buffer (there could be tiny left overs if not multiple
         # to our output chunk size).
         self._audio_buffer = bytearray()
@@ -690,9 +702,14 @@ class BaseOutputTransport(FrameProcessor):
         async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
             while True:
                 try:
-                    frame = await asyncio.wait_for(
-                        self._audio_queue.get(), timeout=vad_stop_secs
+                    # Use a shorter timeout only for the first bot stop to
+                    # accelerate the initial turn handoff right after the intro.
+                    timeout = (
+                        FIRST_BOT_VAD_STOP_SECS
+                        if getattr(self, "_first_stop_pending", True)
+                        else BOT_VAD_STOP_SECS
                     )
+                    frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
                     yield frame
                     self._audio_queue.task_done()
                 except asyncio.TimeoutError:
@@ -713,7 +730,13 @@ class BaseOutputTransport(FrameProcessor):
                 except asyncio.QueueEmpty:
                     # Notify the bot stopped speaking upstream if necessary.
                     diff_time = time.time() - last_frame_time
-                    if diff_time > vad_stop_secs:
+                    # Use a shorter threshold for the first stop only.
+                    current_stop_secs = (
+                        FIRST_BOT_VAD_STOP_SECS
+                        if getattr(self, "_first_stop_pending", True)
+                        else BOT_VAD_STOP_SECS
+                    )
+                    if diff_time > current_stop_secs:
                         await self._bot_stopped_speaking()
                     # Generate an audio frame with only the mixer's part.
                     frame = OutputAudioRawFrame(
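Both hunks above select the silence threshold the same way: the short FIRST_BOT_VAD_STOP_SECS until the first bot-stopped-speaking event, then the regular BOT_VAD_STOP_SECS. A standalone sketch of that selection; the _TransportStub class and wait_for_audio helper are illustrative stand-ins for the transport, with the constants copied from the diff.

import asyncio

BOT_VAD_STOP_SECS = 0.30  # regular end-of-speech silence threshold
FIRST_BOT_VAD_STOP_SECS = 0.08  # shorter threshold for the first bot stop


class _TransportStub:
    """Stub holding only the flag the threshold selection depends on."""

    def __init__(self) -> None:
        self._first_stop_pending = True


async def wait_for_audio(transport: _TransportStub, queue: asyncio.Queue) -> None:
    # Same selection the diff applies in both hunks: short timeout until
    # the first bot-stopped-speaking event, regular timeout afterwards.
    timeout = (
        FIRST_BOT_VAD_STOP_SECS
        if getattr(transport, "_first_stop_pending", True)
        else BOT_VAD_STOP_SECS
    )
    try:
        await asyncio.wait_for(queue.get(), timeout=timeout)
    except asyncio.TimeoutError:
        # Silence exceeded the threshold: treat it as the bot stopping.
        transport._first_stop_pending = False


async def main() -> None:
    transport = _TransportStub()
    queue: asyncio.Queue = asyncio.Queue()
    await wait_for_audio(transport, queue)  # times out after 0.08 s
    print(transport._first_stop_pending)  # False: later stops use 0.30 s


asyncio.run(main())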