dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +16 -15
- pipecat/processors/aggregators/llm_response.py +3 -8
- pipecat/processors/dtmf_aggregator.py +17 -21
- pipecat/serializers/__init__.py +2 -0
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +2 -2
- pipecat/serializers/custom.py +2 -2
- pipecat/serializers/vi.py +324 -0
- pipecat/services/cartesia/tts.py +75 -10
- pipecat/services/sarvam/tts.py +0 -1
- pipecat/services/vistaar/llm.py +4 -4
- pipecat/transports/base_output.py +26 -3
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
dv_pipecat_ai-0.0.85.dev830.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
|
|
1
|
+
dv_pipecat_ai-0.0.85.dev837.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
|
|
2
2
|
pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
|
|
3
3
|
pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -107,7 +107,7 @@ pipecat/pipeline/to_be_updated/merge_pipeline.py,sha256=jLEWdufIW3z1xZhdoLowdJ_S
|
|
|
107
107
|
pipecat/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
108
|
pipecat/processors/async_generator.py,sha256=qPOZxk5eOad_NrF_Z06vWZ6deXIxb9AKZKYO2e5pkJs,2385
|
|
109
109
|
pipecat/processors/consumer_processor.py,sha256=DrWCKnfblknZJ0bLmR_unIeJ1axQw4IPUn2IB3KLGGA,3228
|
|
110
|
-
pipecat/processors/dtmf_aggregator.py,sha256=
|
|
110
|
+
pipecat/processors/dtmf_aggregator.py,sha256=k3xYncUr_8y5lrYfeX8PxqlF7jqFLshg_HB6HiFg7TA,10193
|
|
111
111
|
pipecat/processors/frame_processor.py,sha256=uBu6Waa0_diMXdQXMZ5V5a_KwaaPzcieyuv5gO9u-ME,33841
|
|
112
112
|
pipecat/processors/idle_frame_processor.py,sha256=z8AuhGap61lA5K35P6XCaOpn4kkmK_9NZNppbpQxheU,3124
|
|
113
113
|
pipecat/processors/logger.py,sha256=8xa4KKekXQIETlQR7zoGnwUpLNo8CeDVm7YjyXePN-w,2385
|
|
@@ -122,7 +122,7 @@ pipecat/processors/aggregators/gated.py,sha256=tii0sRrBkRW6y9Xq5iTWPnqlOEejU4VqP
|
|
|
122
122
|
pipecat/processors/aggregators/gated_llm_context.py,sha256=CPv6sMA8irD1zZ3fU1gSv6D7qcPvCA0MdpFhBtJ_ekI,3007
|
|
123
123
|
pipecat/processors/aggregators/gated_open_ai_llm_context.py,sha256=DgqmdPj1u3fP_SVmxtfP7NjHqnyhN_RVVTDfmjbkxAs,361
|
|
124
124
|
pipecat/processors/aggregators/llm_context.py,sha256=wNbZA0Vt0FzNc5cu06xiv1z7DIClIlfqR1ZD8EusbVw,11085
|
|
125
|
-
pipecat/processors/aggregators/llm_response.py,sha256
|
|
125
|
+
pipecat/processors/aggregators/llm_response.py,sha256=--6D736k5mNnIhmauRbA7ZG7H9tBR16okniz3Mpypns,48573
|
|
126
126
|
pipecat/processors/aggregators/llm_response_universal.py,sha256=5PqmpATpekD8BVWyBExZgatKHsNbZem8M-A7_VwTbiQ,34334
|
|
127
127
|
pipecat/processors/aggregators/openai_llm_context.py,sha256=cC8DXdVPERRN04i0i-1Ys6kusvnbMALeH-Z8Pu5K684,12999
|
|
128
128
|
pipecat/processors/aggregators/sentence.py,sha256=E7e3knfQl6HEGpYMKPklF1aO_gOn-rr7SnynErwfkQk,2235
|
|
@@ -153,17 +153,18 @@ pipecat/runner/livekit.py,sha256=in-2Io3FUZV-VcZZ-gQCx9L1WnKp5sHqmm7tDYlFNl4,458
|
|
|
153
153
|
pipecat/runner/run.py,sha256=McalzMoFYEJJRXyoD5PBAyUhHCdsEeeZJk8lBvplRck,30054
|
|
154
154
|
pipecat/runner/types.py,sha256=zHjbAiU17fG0ypLXCEzPu7bpDOutAg-4gE7TESvK8n0,1761
|
|
155
155
|
pipecat/runner/utils.py,sha256=Ve9rjRvbt1o8e9by0nIrCJzUDGcuJUeYYhkqycmgHXc,18682
|
|
156
|
-
pipecat/serializers/__init__.py,sha256=
|
|
157
|
-
pipecat/serializers/asterisk.py,sha256=
|
|
156
|
+
pipecat/serializers/__init__.py,sha256=z0V5GflCoPt4k2Yqm4ivuzKDh9VsYYAgK2UXZTw10aU,863
|
|
157
|
+
pipecat/serializers/asterisk.py,sha256=QLJMXkU3DZ0sgFw3Vq2Zf8PHKkQQguL_v-l2Io4lZ_M,6729
|
|
158
158
|
pipecat/serializers/base_serializer.py,sha256=OyBUZccs2ZT9mfkBbq2tGsUJMvci6o-j90Cl1sicPaI,2030
|
|
159
|
-
pipecat/serializers/convox.py,sha256=
|
|
160
|
-
pipecat/serializers/custom.py,sha256=
|
|
159
|
+
pipecat/serializers/convox.py,sha256=fj9NkFTB74B9k8qWEuICQNGUQtEV0DusaHohkOqNLa8,11145
|
|
160
|
+
pipecat/serializers/custom.py,sha256=clUEqOazGe3B2XoUFRN9zkFpMd6aIZeVRTqBRHAzavM,9071
|
|
161
161
|
pipecat/serializers/exotel.py,sha256=B04LtNnRMzKmaS61gPZbUjc2nbki3FmpCfUMww6cOe4,5953
|
|
162
162
|
pipecat/serializers/livekit.py,sha256=OMaM7yUiHfeTPbpNxE2TrmIzjmbNQIjNvlujt81dsRI,3285
|
|
163
163
|
pipecat/serializers/plivo.py,sha256=ie6VUhZDTJ7KlAuJyHNeIeMtJ3ScDq_2js1SZtz7jLI,9256
|
|
164
164
|
pipecat/serializers/protobuf.py,sha256=L0jSqvgTdkfxsu6JWjYK8QSTVji9nhzmgRsEEbGU7xY,5223
|
|
165
165
|
pipecat/serializers/telnyx.py,sha256=eFkC7dExDFildYLR8DPvgfHbgXlCwdSPd1vc11yxyok,10847
|
|
166
166
|
pipecat/serializers/twilio.py,sha256=0emSzXVw8DU_N5RPruMekbBKku9Q429-0z1PMuYejSk,10823
|
|
167
|
+
pipecat/serializers/vi.py,sha256=Q7kMXvKM493RIuOUc99LKZWgVmvd8_owAzIK_oEktfw,11150
|
|
167
168
|
pipecat/services/__init__.py,sha256=8e3Ta-8_BOPozhDB3l0GJkNXs5PWhib6yqZQUof2Kvw,1209
|
|
168
169
|
pipecat/services/ai_service.py,sha256=yE386fm2Id-yD4fCNfkmEMtg0lTA7PB17n2x_A_jwTg,5896
|
|
169
170
|
pipecat/services/ai_services.py,sha256=_RrDWfM8adV17atzY9RxK0nXRVM5kbUkKrvN90GAWYM,795
|
|
@@ -205,7 +206,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
205
206
|
pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
|
|
206
207
|
pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
|
|
207
208
|
pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
|
|
208
|
-
pipecat/services/cartesia/tts.py,sha256=
|
|
209
|
+
pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_obtc,27008
|
|
209
210
|
pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
|
|
210
211
|
pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
|
|
211
212
|
pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
|
|
@@ -324,7 +325,7 @@ pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZ
|
|
|
324
325
|
pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
|
|
325
326
|
pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
|
|
326
327
|
pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
|
|
327
|
-
pipecat/services/sarvam/tts.py,sha256=
|
|
328
|
+
pipecat/services/sarvam/tts.py,sha256=lrwfdC53kZ7f2QPgNRxzryISNkrJCvNtlZ-19-iXg94,27610
|
|
328
329
|
pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
|
|
329
330
|
pipecat/services/simli/video.py,sha256=Zu2XLvl2Y6VHaWzT9wEdzW9d0EYoZyzYLxjQFyV8vho,8320
|
|
330
331
|
pipecat/services/soniox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -338,7 +339,7 @@ pipecat/services/together/llm.py,sha256=VSayO-U6g9Ld0xK9CXRQPUsd5gWJKtiA8qDAyXgs
|
|
|
338
339
|
pipecat/services/ultravox/__init__.py,sha256=EoHCSXI2o0DFQslELgkhAGZtxDj63gZi-9ZEhXljaKE,259
|
|
339
340
|
pipecat/services/ultravox/stt.py,sha256=uCQm_-LbycXdXRV6IE1a6Mymis6tyww7V8PnPzAQtx8,16586
|
|
340
341
|
pipecat/services/vistaar/__init__.py,sha256=UFfSWFN5rbzl6NN-E_OH_MFaSYodZWNlenAU0wk-rAI,110
|
|
341
|
-
pipecat/services/vistaar/llm.py,sha256=
|
|
342
|
+
pipecat/services/vistaar/llm.py,sha256=GNVKaelbpNH7NW7iOpBj2rJjmhMVUsPqfnBI-YgIjjw,19326
|
|
342
343
|
pipecat/services/whisper/__init__.py,sha256=smADmw0Fv98k7cGRuHTEcljKTO2WdZqLpJd0qsTCwH8,281
|
|
343
344
|
pipecat/services/whisper/base_stt.py,sha256=VhslESPnYIeVbmnQTzmlZPV35TH49duxYTvJe0epNnE,7850
|
|
344
345
|
pipecat/services/whisper/stt.py,sha256=9Qd56vWMzg3LtHikQnfgyMtl4odE6BCHDbpAn3HSWjw,17480
|
|
@@ -353,7 +354,7 @@ pipecat/transcriptions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
|
|
|
353
354
|
pipecat/transcriptions/language.py,sha256=-mWI1MiZbasuoqZTOBH69dAmoM7-UJzWq9rSCcrnmh4,8228
|
|
354
355
|
pipecat/transports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
355
356
|
pipecat/transports/base_input.py,sha256=WGtoXXlF3GIjYgjtYnAgi8nZozd5abNlGNjwRnz8FRs,20138
|
|
356
|
-
pipecat/transports/base_output.py,sha256=
|
|
357
|
+
pipecat/transports/base_output.py,sha256=7WoXtAQAi-3OC9PC_zk61lCWlBTk5-NuTLUbsQUAI_U,36723
|
|
357
358
|
pipecat/transports/base_transport.py,sha256=JlNiH0DysTfr6azwHauJqY_Z9HJC702O29Q0qrsLrg4,7530
|
|
358
359
|
pipecat/transports/daily/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
359
360
|
pipecat/transports/daily/transport.py,sha256=VanO33ff9g6px-vwGgT6M7cMVg786pOGfMU7Okm7a78,91917
|
|
@@ -415,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
|
|
|
415
416
|
pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
|
|
416
417
|
pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
|
|
417
418
|
pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
|
|
418
|
-
dv_pipecat_ai-0.0.85.
|
|
419
|
-
dv_pipecat_ai-0.0.85.dev830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
420
|
-
dv_pipecat_ai-0.0.85.dev830.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
|
|
421
|
-
dv_pipecat_ai-0.0.85.dev830.dist-info/RECORD,,
|
|
419
|
+
dv_pipecat_ai-0.0.85.dev837.dist-info/METADATA,sha256=dQC8Y4gHZ3jPBKpybN1R9aKRUbb9mQpb0cPuLQo5KUc,32924
|
|
420
|
+
dv_pipecat_ai-0.0.85.dev837.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
421
|
+
dv_pipecat_ai-0.0.85.dev837.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
|
|
422
|
+
dv_pipecat_ai-0.0.85.dev837.dist-info/RECORD,,
|
|
@@ -49,7 +49,6 @@ from pipecat.frames.frames import (
|
|
|
49
49
|
OpenAILLMContextAssistantTimestampFrame,
|
|
50
50
|
SpeechControlParamsFrame,
|
|
51
51
|
StartFrame,
|
|
52
|
-
StartInterruptionFrame,
|
|
53
52
|
TextFrame,
|
|
54
53
|
TranscriptDropFrame,
|
|
55
54
|
TranscriptionFrame,
|
|
@@ -473,8 +472,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
|
|
473
472
|
frame: The frame to process.
|
|
474
473
|
direction: The direction of frame flow in the pipeline.
|
|
475
474
|
"""
|
|
476
|
-
if isinstance(frame, StartInterruptionFrame):
|
|
477
|
-
self.logger.debug("Received StartInterruptionFrame")
|
|
475
|
+
if isinstance(frame, InterruptionFrame):
|
|
476
|
+
self.logger.debug("Received InterruptionFrame")
|
|
478
477
|
await super().process_frame(frame, direction)
|
|
479
478
|
|
|
480
479
|
if isinstance(frame, StartFrame):
|
|
@@ -560,7 +559,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
|
|
560
559
|
"Triggering interruption - pushing BotInterruptionFrame and aggregation"
|
|
561
560
|
)
|
|
562
561
|
# await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
|
|
563
|
-
await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
|
|
562
|
+
await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
|
|
564
563
|
self.logger.debug("Pushed BotInterruptionFrame")
|
|
565
564
|
# No interruption config - normal behavior (always push aggregation)
|
|
566
565
|
await self._process_aggregation()
|
|
@@ -596,12 +595,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
|
|
596
595
|
"""Notify upstream processors that pending transcripts should be dropped."""
|
|
597
596
|
if self._pending_transcription_ids:
|
|
598
597
|
drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
|
|
599
|
-
self.logger.debug(
|
|
600
|
-
f"Dropping {len(self._pending_transcription_ids)} transcript chunk(s) due to {reason}"
|
|
601
|
-
)
|
|
602
598
|
await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
|
|
603
599
|
self._pending_transcription_ids.clear()
|
|
604
|
-
self._aggregation = ""
|
|
605
600
|
|
|
606
601
|
async def _start(self, frame: StartFrame):
|
|
607
602
|
self._create_aggregation_task()
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""DTMF aggregator processor for collecting and flushing DTMF input digits."""
|
|
2
|
+
|
|
1
3
|
import asyncio
|
|
2
4
|
|
|
3
5
|
from pipecat.frames.frames import (
|
|
@@ -8,8 +10,8 @@ from pipecat.frames.frames import (
|
|
|
8
10
|
EndFrame,
|
|
9
11
|
Frame,
|
|
10
12
|
InputDTMFFrame,
|
|
13
|
+
InterruptionFrame,
|
|
11
14
|
StartDTMFCaptureFrame,
|
|
12
|
-
StartInterruptionFrame,
|
|
13
15
|
TranscriptionFrame,
|
|
14
16
|
WaitForDTMFFrame,
|
|
15
17
|
)
|
|
@@ -19,10 +21,11 @@ from pipecat.utils.time import time_now_iso8601
|
|
|
19
21
|
|
|
20
22
|
class DTMFAggregator(FrameProcessor):
|
|
21
23
|
"""Aggregates DTMF frames using idle wait logic.
|
|
24
|
+
|
|
22
25
|
The aggregator accumulates digits from incoming InputDTMFFrame instances.
|
|
23
26
|
It flushes the aggregated digits by emitting a TranscriptionFrame when:
|
|
24
27
|
- No new digit arrives within the specified timeout period,
|
|
25
|
-
- The termination digit (
|
|
28
|
+
- The termination digit ("#") is received, or
|
|
26
29
|
- The number of digits aggregated equals the configured 'digits' value.
|
|
27
30
|
"""
|
|
28
31
|
|
|
@@ -34,7 +37,9 @@ class DTMFAggregator(FrameProcessor):
|
|
|
34
37
|
digits: int = None,
|
|
35
38
|
**kwargs,
|
|
36
39
|
):
|
|
37
|
-
"""
|
|
40
|
+
"""Initialize the DTMF aggregator.
|
|
41
|
+
|
|
42
|
+
:param timeout: Idle timeout in seconds before flushing the aggregated digits.
|
|
38
43
|
:param digits: Number of digits to aggregate before flushing.
|
|
39
44
|
"""
|
|
40
45
|
super().__init__(**kwargs)
|
|
@@ -48,6 +53,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
48
53
|
self._dtmf_capture_active = False
|
|
49
54
|
|
|
50
55
|
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
|
56
|
+
"""Process incoming frames and handle DTMF input aggregation."""
|
|
51
57
|
# Handle DTMF frames.
|
|
52
58
|
await super().process_frame(frame, direction)
|
|
53
59
|
|
|
@@ -69,8 +75,8 @@ class DTMFAggregator(FrameProcessor):
|
|
|
69
75
|
self._digit_event.set() # Trigger the timeout handler
|
|
70
76
|
await self._start_dtmf_capture()
|
|
71
77
|
await self.push_frame(frame, direction)
|
|
72
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
73
|
-
self.logger.debug("Received StartInterruptionFrame")
|
|
78
|
+
elif isinstance(frame, InterruptionFrame):
|
|
79
|
+
self.logger.debug("Received InterruptionFrame")
|
|
74
80
|
if self._aggregation:
|
|
75
81
|
await self.flush_aggregation()
|
|
76
82
|
await self._end_dtmf_capture()
|
|
@@ -108,9 +114,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
108
114
|
if "digits" in settings:
|
|
109
115
|
new_digits = settings["digits"]
|
|
110
116
|
if new_digits != self._digits:
|
|
111
|
-
self.logger.debug(
|
|
112
|
-
f"Updating DTMF digits from {self._digits} to {new_digits}"
|
|
113
|
-
)
|
|
117
|
+
self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
|
|
114
118
|
self._digits = new_digits
|
|
115
119
|
settings_changed = True
|
|
116
120
|
|
|
@@ -125,9 +129,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
125
129
|
new_end_on = set(end_value)
|
|
126
130
|
|
|
127
131
|
if new_end_on != self._end_on:
|
|
128
|
-
self.logger.debug(
|
|
129
|
-
f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
|
|
130
|
-
)
|
|
132
|
+
self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
|
|
131
133
|
self._end_on = new_end_on
|
|
132
134
|
settings_changed = True
|
|
133
135
|
|
|
@@ -142,9 +144,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
142
144
|
new_reset_on = set(reset_value)
|
|
143
145
|
|
|
144
146
|
if new_reset_on != self._reset_on:
|
|
145
|
-
self.logger.debug(
|
|
146
|
-
f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
|
|
147
|
-
)
|
|
147
|
+
self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
|
|
148
148
|
self._reset_on = new_reset_on
|
|
149
149
|
settings_changed = True
|
|
150
150
|
|
|
@@ -183,9 +183,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
183
183
|
def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
|
|
184
184
|
"""Creates the aggregation task if it hasn't been created yet."""
|
|
185
185
|
if not self._aggregation_task:
|
|
186
|
-
self._aggregation_task = self.create_task(
|
|
187
|
-
self._aggregation_task_handler(raise_timeout)
|
|
188
|
-
)
|
|
186
|
+
self._aggregation_task = self.create_task(self._aggregation_task_handler(raise_timeout))
|
|
189
187
|
|
|
190
188
|
async def _stop_aggregation_task(self) -> None:
|
|
191
189
|
"""Stops the aggregation task."""
|
|
@@ -198,9 +196,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
198
196
|
while True:
|
|
199
197
|
try:
|
|
200
198
|
# Wait for a new digit signal with a timeout.
|
|
201
|
-
await asyncio.wait_for(
|
|
202
|
-
self._digit_event.wait(), timeout=self._idle_timeout
|
|
203
|
-
)
|
|
199
|
+
await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
|
|
204
200
|
self._digit_event.clear()
|
|
205
201
|
except asyncio.TimeoutError:
|
|
206
202
|
# No new digit arrived within the timeout period; flush if needed
|
|
@@ -216,7 +212,7 @@ class DTMFAggregator(FrameProcessor):
|
|
|
216
212
|
aggregated_frame.metadata["push_aggregation"] = True
|
|
217
213
|
|
|
218
214
|
# Send interruption frame (as per original design)
|
|
219
|
-
await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
|
|
215
|
+
await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
|
|
220
216
|
|
|
221
217
|
# Push the transcription frame
|
|
222
218
|
await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
|
pipecat/serializers/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ from .exotel import ExotelFrameSerializer
|
|
|
5
5
|
from .plivo import PlivoFrameSerializer
|
|
6
6
|
from .telnyx import TelnyxFrameSerializer
|
|
7
7
|
from .twilio import TwilioFrameSerializer
|
|
8
|
+
from .vi import VIFrameSerializer
|
|
8
9
|
|
|
9
10
|
__all__ = [
|
|
10
11
|
"FrameSerializer",
|
|
@@ -15,6 +16,7 @@ __all__ = [
|
|
|
15
16
|
"PlivoFrameSerializer",
|
|
16
17
|
"TelnyxFrameSerializer",
|
|
17
18
|
"TwilioFrameSerializer",
|
|
19
|
+
"VIFrameSerializer",
|
|
18
20
|
]
|
|
19
21
|
|
|
20
22
|
# Optional imports
|
pipecat/serializers/asterisk.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
# asterisk_ws_serializer.py
|
|
2
|
+
"""Frame serializer for Asterisk WebSocket communication."""
|
|
3
|
+
|
|
2
4
|
import base64
|
|
3
5
|
import json
|
|
4
6
|
from typing import Literal, Optional
|
|
@@ -12,8 +14,8 @@ from pipecat.frames.frames import (
|
|
|
12
14
|
EndFrame,
|
|
13
15
|
Frame,
|
|
14
16
|
InputAudioRawFrame,
|
|
17
|
+
InterruptionFrame,
|
|
15
18
|
StartFrame,
|
|
16
|
-
StartInterruptionFrame,
|
|
17
19
|
TransportMessageFrame,
|
|
18
20
|
TransportMessageUrgentFrame,
|
|
19
21
|
)
|
|
@@ -21,6 +23,8 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer
|
|
|
21
23
|
|
|
22
24
|
|
|
23
25
|
class AsteriskFrameSerializer(FrameSerializer):
|
|
26
|
+
"""Serializes Pipecat frames to/from Asterisk WebSocket JSON messages."""
|
|
27
|
+
|
|
24
28
|
class InputParams(BaseModel):
|
|
25
29
|
"""Configuration parameters for AsteriskFrameSerializer.
|
|
26
30
|
|
|
@@ -39,6 +43,12 @@ class AsteriskFrameSerializer(FrameSerializer):
|
|
|
39
43
|
auto_hang_up: bool = False # no-op here; adapter handles hangup
|
|
40
44
|
|
|
41
45
|
def __init__(self, stream_id: str, params: Optional[InputParams] = None):
|
|
46
|
+
"""Initialize the Asterisk frame serializer.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
stream_id: Unique identifier for the media stream.
|
|
50
|
+
params: Configuration parameters for the serializer.
|
|
51
|
+
"""
|
|
42
52
|
self._stream_id = stream_id
|
|
43
53
|
self._params = params or AsteriskFrameSerializer.InputParams()
|
|
44
54
|
self._tel_rate = self._params.telephony_sample_rate
|
|
@@ -49,13 +59,16 @@ class AsteriskFrameSerializer(FrameSerializer):
|
|
|
49
59
|
|
|
50
60
|
@property
|
|
51
61
|
def type(self) -> FrameSerializerType:
|
|
62
|
+
"""Return the serializer type (TEXT for JSON messages)."""
|
|
52
63
|
return FrameSerializerType.TEXT # we send/recv JSON strings
|
|
53
64
|
|
|
54
65
|
async def setup(self, frame: StartFrame):
|
|
66
|
+
"""Setup the serializer with audio parameters from the StartFrame."""
|
|
55
67
|
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
|
|
56
68
|
|
|
57
69
|
# Pipecat -> Adapter (play to caller)
|
|
58
70
|
async def serialize(self, frame: Frame) -> str | bytes | None:
|
|
71
|
+
"""Serialize Pipecat frames to Asterisk WebSocket JSON messages."""
|
|
59
72
|
# On pipeline end, ask bridge to hang up
|
|
60
73
|
if (
|
|
61
74
|
self._params.auto_hang_up
|
|
@@ -64,7 +77,7 @@ class AsteriskFrameSerializer(FrameSerializer):
|
|
|
64
77
|
):
|
|
65
78
|
self._hangup_sent = True
|
|
66
79
|
return json.dumps({"event": "hangup"})
|
|
67
|
-
if isinstance(frame, StartInterruptionFrame):
|
|
80
|
+
if isinstance(frame, InterruptionFrame):
|
|
68
81
|
return json.dumps({"event": "clear", "streamId": self._stream_id})
|
|
69
82
|
if isinstance(frame, AudioRawFrame):
|
|
70
83
|
pcm = frame.audio
|
|
@@ -114,6 +127,7 @@ class AsteriskFrameSerializer(FrameSerializer):
|
|
|
114
127
|
|
|
115
128
|
# Adapter -> Pipecat (audio from caller)
|
|
116
129
|
async def deserialize(self, data: str | bytes) -> Frame | None:
|
|
130
|
+
"""Deserialize Asterisk WebSocket JSON messages to Pipecat frames."""
|
|
117
131
|
try:
|
|
118
132
|
msg = json.loads(data)
|
|
119
133
|
except Exception:
|
pipecat/serializers/convox.py
CHANGED
|
@@ -22,9 +22,9 @@ from pipecat.frames.frames import (
|
|
|
22
22
|
Frame,
|
|
23
23
|
InputAudioRawFrame,
|
|
24
24
|
InputDTMFFrame,
|
|
25
|
+
InterruptionFrame,
|
|
25
26
|
KeypadEntry,
|
|
26
27
|
StartFrame,
|
|
27
|
-
StartInterruptionFrame,
|
|
28
28
|
TransportMessageFrame,
|
|
29
29
|
TransportMessageUrgentFrame,
|
|
30
30
|
)
|
|
@@ -117,7 +117,7 @@ class ConVoxFrameSerializer(FrameSerializer):
|
|
|
117
117
|
self._call_ended = True
|
|
118
118
|
# Return the callEnd event to be sent via the WebSocket
|
|
119
119
|
return await self._send_call_end_event()
|
|
120
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
120
|
+
elif isinstance(frame, InterruptionFrame):
|
|
121
121
|
# Clear/interrupt command for ConVox
|
|
122
122
|
message = {
|
|
123
123
|
"event": "clear",
|
pipecat/serializers/custom.py
CHANGED
|
@@ -28,8 +28,8 @@ from pipecat.frames.frames import (
|
|
|
28
28
|
EndFrame,
|
|
29
29
|
Frame,
|
|
30
30
|
InputAudioRawFrame,
|
|
31
|
+
InterruptionFrame,
|
|
31
32
|
StartFrame,
|
|
32
|
-
StartInterruptionFrame,
|
|
33
33
|
TransportMessageFrame,
|
|
34
34
|
TransportMessageUrgentFrame,
|
|
35
35
|
)
|
|
@@ -121,7 +121,7 @@ class CustomFrameSerializer(FrameSerializer):
|
|
|
121
121
|
Returns:
|
|
122
122
|
Serialized data as JSON string, or None if the frame isn't handled.
|
|
123
123
|
"""
|
|
124
|
-
if isinstance(frame, StartInterruptionFrame):
|
|
124
|
+
if isinstance(frame, InterruptionFrame):
|
|
125
125
|
# Send clear event to instruct client to discard buffered audio
|
|
126
126
|
answer = {"event": "clear", "stream_sid": self._stream_sid}
|
|
127
127
|
return json.dumps(answer)
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Vodafone Idea (VI) WebSocket frame serializer for audio streaming and call management."""
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import json
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from loguru import logger
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
|
|
17
|
+
from pipecat.audio.utils import create_default_resampler
|
|
18
|
+
from pipecat.frames.frames import (
|
|
19
|
+
AudioRawFrame,
|
|
20
|
+
CancelFrame,
|
|
21
|
+
EndFrame,
|
|
22
|
+
Frame,
|
|
23
|
+
InputAudioRawFrame,
|
|
24
|
+
InputDTMFFrame,
|
|
25
|
+
KeypadEntry,
|
|
26
|
+
StartFrame,
|
|
27
|
+
StartInterruptionFrame,
|
|
28
|
+
TransportMessageFrame,
|
|
29
|
+
TransportMessageUrgentFrame,
|
|
30
|
+
)
|
|
31
|
+
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class VIFrameSerializer(FrameSerializer):
|
|
35
|
+
"""Serializer for Vodafone Idea (VI) WebSocket protocol.
|
|
36
|
+
|
|
37
|
+
This serializer handles converting between Pipecat frames and VI's WebSocket
|
|
38
|
+
protocol for bidirectional audio streaming. It supports audio conversion, DTMF events,
|
|
39
|
+
and real-time communication with VI telephony systems.
|
|
40
|
+
|
|
41
|
+
VI WebSocket protocol requirements:
|
|
42
|
+
- PCM audio format at 8kHz sample rate
|
|
43
|
+
- 16-bit Linear PCM encoding
|
|
44
|
+
- Base64 encoded audio payloads
|
|
45
|
+
- JSON message format for control and media events
|
|
46
|
+
- Bitrate: 128 Kbps
|
|
47
|
+
|
|
48
|
+
Events (VI → Endpoint):
|
|
49
|
+
- connected: WebSocket connection established
|
|
50
|
+
- start: Stream session started with call/stream IDs
|
|
51
|
+
- media: Audio data in Base64-encoded PCM
|
|
52
|
+
- dtmf: Keypad digit pressed
|
|
53
|
+
- stop: Stream ended
|
|
54
|
+
- mark: Audio playback checkpoint confirmation
|
|
55
|
+
|
|
56
|
+
Events (Endpoint → VI):
|
|
57
|
+
- media: Send audio back to VI
|
|
58
|
+
- mark: Request acknowledgment for audio playback
|
|
59
|
+
- clear: Clear queued audio (interruption)
|
|
60
|
+
- exit: Terminate session gracefully
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
class InputParams(BaseModel):
|
|
64
|
+
"""Configuration parameters for VIFrameSerializer.
|
|
65
|
+
|
|
66
|
+
Attributes:
|
|
67
|
+
vi_sample_rate: Sample rate used by VI, defaults to 8000 Hz (telephony standard).
|
|
68
|
+
sample_rate: Optional override for pipeline input sample rate.
|
|
69
|
+
auto_hang_up: Whether to automatically terminate call on EndFrame.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
vi_sample_rate: int = 8000
|
|
73
|
+
sample_rate: Optional[int] = None
|
|
74
|
+
auto_hang_up: bool = False
|
|
75
|
+
|
|
76
|
+
def __init__(
|
|
77
|
+
self,
|
|
78
|
+
stream_id: str,
|
|
79
|
+
call_id: Optional[str] = None,
|
|
80
|
+
params: Optional[InputParams] = None,
|
|
81
|
+
):
|
|
82
|
+
"""Initialize the VIFrameSerializer.
|
|
83
|
+
|
|
84
|
+
Args:
|
|
85
|
+
stream_id: The VI stream identifier.
|
|
86
|
+
call_id: The associated VI call identifier.
|
|
87
|
+
params: Configuration parameters.
|
|
88
|
+
"""
|
|
89
|
+
self._stream_id = stream_id
|
|
90
|
+
self._call_id = call_id
|
|
91
|
+
self._params = params or VIFrameSerializer.InputParams()
|
|
92
|
+
|
|
93
|
+
self._vi_sample_rate = self._params.vi_sample_rate
|
|
94
|
+
self._sample_rate = 0 # Pipeline input rate
|
|
95
|
+
self._call_ended = False
|
|
96
|
+
|
|
97
|
+
self._resampler = create_default_resampler()
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
def type(self) -> FrameSerializerType:
|
|
101
|
+
"""Gets the serializer type.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
The serializer type as TEXT for JSON WebSocket messages.
|
|
105
|
+
"""
|
|
106
|
+
return FrameSerializerType.TEXT
|
|
107
|
+
|
|
108
|
+
async def setup(self, frame: StartFrame):
|
|
109
|
+
"""Sets up the serializer with pipeline configuration.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
frame: The StartFrame containing pipeline configuration.
|
|
113
|
+
"""
|
|
114
|
+
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
|
|
115
|
+
|
|
116
|
+
async def serialize(self, frame: Frame) -> str | bytes | None:
|
|
117
|
+
"""Serializes a Pipecat frame to VI WebSocket format.
|
|
118
|
+
|
|
119
|
+
Handles conversion of various frame types to VI WebSocket messages.
|
|
120
|
+
For EndFrames, initiates call termination if auto_hang_up is enabled.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
frame: The Pipecat frame to serialize.
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
Serialized data as JSON string, or None if the frame isn't handled.
|
|
127
|
+
"""
|
|
128
|
+
if (
|
|
129
|
+
self._params.auto_hang_up
|
|
130
|
+
and not self._call_ended
|
|
131
|
+
and isinstance(frame, (EndFrame, CancelFrame))
|
|
132
|
+
):
|
|
133
|
+
self._call_ended = True
|
|
134
|
+
# Return the exit event to terminate the VI session
|
|
135
|
+
return await self._send_exit_event()
|
|
136
|
+
|
|
137
|
+
elif isinstance(frame, StartInterruptionFrame):
|
|
138
|
+
# Clear/interrupt command for VI - clears queued audio
|
|
139
|
+
message = {
|
|
140
|
+
"event": "clear",
|
|
141
|
+
"stream_id": self._stream_id,
|
|
142
|
+
"call_id": self._call_id,
|
|
143
|
+
}
|
|
144
|
+
logger.debug(f"VI: Sending clear event for stream_id: {self._stream_id}")
|
|
145
|
+
return json.dumps(message)
|
|
146
|
+
|
|
147
|
+
elif isinstance(frame, AudioRawFrame):
|
|
148
|
+
if self._call_ended:
|
|
149
|
+
logger.debug("VI SERIALIZE: Skipping audio - call has ended")
|
|
150
|
+
return None
|
|
151
|
+
|
|
152
|
+
# Convert PCM audio to VI format
|
|
153
|
+
data = frame.audio
|
|
154
|
+
|
|
155
|
+
# Resample to VI sample rate (8kHz)
|
|
156
|
+
serialized_data = await self._resampler.resample(
|
|
157
|
+
data, frame.sample_rate, self._vi_sample_rate
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
# Encode as base64 for transmission
|
|
161
|
+
payload = base64.b64encode(serialized_data).decode("ascii")
|
|
162
|
+
|
|
163
|
+
# VI expects media event format with Base64-encoded PCM audio
|
|
164
|
+
timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
|
165
|
+
|
|
166
|
+
message = {
|
|
167
|
+
"event": "media",
|
|
168
|
+
"stream_id": self._stream_id,
|
|
169
|
+
"media": {
|
|
170
|
+
"timestamp": timestamp,
|
|
171
|
+
"chunk": len(serialized_data), # Chunk size in bytes
|
|
172
|
+
"payload": payload,
|
|
173
|
+
},
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
return json.dumps(message)
|
|
177
|
+
|
|
178
|
+
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
|
179
|
+
# Pass through transport messages (for mark events, etc.)
|
|
180
|
+
return json.dumps(frame.message)
|
|
181
|
+
|
|
182
|
+
return None
|
|
183
|
+
|
|
184
|
+
async def _send_exit_event(self):
|
|
185
|
+
"""Send an exit event to VI to terminate the session gracefully.
|
|
186
|
+
|
|
187
|
+
This method is called when auto_hang_up is enabled and an EndFrame or
|
|
188
|
+
CancelFrame is received. The exit event allows IVR logic to continue
|
|
189
|
+
after the WebSocket session ends.
|
|
190
|
+
"""
|
|
191
|
+
try:
|
|
192
|
+
exit_event = {
|
|
193
|
+
"event": "exit",
|
|
194
|
+
"stream_id": self._stream_id,
|
|
195
|
+
"call_id": self._call_id,
|
|
196
|
+
"timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
logger.info(
|
|
200
|
+
f"VI auto_hang_up: Sending exit event for stream_id: {self._stream_id}, call_id: {self._call_id}"
|
|
201
|
+
)
|
|
202
|
+
return json.dumps(exit_event)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
logger.error(f"VI auto_hang_up: Failed to create exit event: {e}")
|
|
205
|
+
return None
|
|
206
|
+
|
|
207
|
+
async def deserialize(self, data: str | bytes) -> Frame | None:
|
|
208
|
+
"""Deserializes VI WebSocket data to Pipecat frames.
|
|
209
|
+
|
|
210
|
+
Handles conversion of VI media events to appropriate Pipecat frames.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
data: The raw WebSocket data from VI.
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
A Pipecat frame corresponding to the VI event, or None if unhandled.
|
|
217
|
+
"""
|
|
218
|
+
try:
|
|
219
|
+
message = json.loads(data)
|
|
220
|
+
except json.JSONDecodeError:
|
|
221
|
+
logger.error(f"Invalid JSON received from VI: {data}")
|
|
222
|
+
return None
|
|
223
|
+
|
|
224
|
+
# Log all incoming events for debugging and monitoring
|
|
225
|
+
event = message.get("event")
|
|
226
|
+
logger.debug(
|
|
227
|
+
f"VI INCOMING EVENT: {event} - stream_id: {self._stream_id}, call_id: {self._call_id}"
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
if event == "media":
|
|
231
|
+
# Handle incoming audio data from VI
|
|
232
|
+
media = message.get("media", {})
|
|
233
|
+
payload_base64 = media.get("payload")
|
|
234
|
+
|
|
235
|
+
if not payload_base64:
|
|
236
|
+
logger.warning("VI DESERIALIZE: No payload in VI media message")
|
|
237
|
+
return None
|
|
238
|
+
|
|
239
|
+
try:
|
|
240
|
+
payload = base64.b64decode(payload_base64)
|
|
241
|
+
chunk_size = len(payload)
|
|
242
|
+
|
|
243
|
+
# Log chunk info (optional)
|
|
244
|
+
logger.debug(
|
|
245
|
+
f"VI DESERIALIZE: Received audio from VI - {chunk_size} bytes at {self._vi_sample_rate}Hz"
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
except Exception as e:
|
|
249
|
+
logger.error(f"VI DESERIALIZE: Error decoding VI audio payload: {e}")
|
|
250
|
+
return None
|
|
251
|
+
|
|
252
|
+
# Convert from VI sample rate (8kHz) to pipeline sample rate
|
|
253
|
+
deserialized_data = await self._resampler.resample(
|
|
254
|
+
payload,
|
|
255
|
+
self._vi_sample_rate,
|
|
256
|
+
self._sample_rate,
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
audio_frame = InputAudioRawFrame(
|
|
260
|
+
audio=deserialized_data,
|
|
261
|
+
num_channels=1, # VI uses mono audio
|
|
262
|
+
sample_rate=self._sample_rate,
|
|
263
|
+
)
|
|
264
|
+
return audio_frame
|
|
265
|
+
|
|
266
|
+
elif event == "dtmf":
|
|
267
|
+
# Handle DTMF events
|
|
268
|
+
dtmf_data = message.get("dtmf", {})
|
|
269
|
+
digit = dtmf_data.get("digit")
|
|
270
|
+
|
|
271
|
+
if digit:
|
|
272
|
+
try:
|
|
273
|
+
logger.info(f"VI: Received DTMF digit: {digit}")
|
|
274
|
+
return InputDTMFFrame(KeypadEntry(digit))
|
|
275
|
+
except ValueError:
|
|
276
|
+
logger.warning(f"Invalid DTMF digit from VI: {digit}")
|
|
277
|
+
return None
|
|
278
|
+
|
|
279
|
+
elif event == "connected":
|
|
280
|
+
# Handle connection event
|
|
281
|
+
logger.info(f"VI connection established: {message}")
|
|
282
|
+
return None
|
|
283
|
+
|
|
284
|
+
elif event == "start":
|
|
285
|
+
# Handle stream start event
|
|
286
|
+
logger.info(f"VI stream started: {message}")
|
|
287
|
+
return None
|
|
288
|
+
|
|
289
|
+
elif event == "stop":
|
|
290
|
+
# Handle stream stop event
|
|
291
|
+
logger.info(f"VI stream stopped: {message}")
|
|
292
|
+
# Don't end the call here, wait for explicit exit or call end
|
|
293
|
+
return None
|
|
294
|
+
|
|
295
|
+
elif event == "mark":
|
|
296
|
+
# Handle mark event - checkpoint confirming audio playback completion
|
|
297
|
+
mark_data = message.get("mark", {})
|
|
298
|
+
mark_name = mark_data.get("name", "unknown")
|
|
299
|
+
logger.info(f"VI mark event received: {mark_name}")
|
|
300
|
+
# Mark events are informational, no frame to return
|
|
301
|
+
return None
|
|
302
|
+
|
|
303
|
+
elif event == "error":
|
|
304
|
+
# Handle error events
|
|
305
|
+
error_msg = message.get("error", "Unknown error")
|
|
306
|
+
logger.error(f"VI error: {error_msg}")
|
|
307
|
+
return None
|
|
308
|
+
|
|
309
|
+
elif event == "exit":
|
|
310
|
+
# Handle exit event from VI
|
|
311
|
+
logger.info("VI exit event received - terminating session")
|
|
312
|
+
self._call_ended = True
|
|
313
|
+
return CancelFrame()
|
|
314
|
+
|
|
315
|
+
elif event == "call_end" or event == "callEnd":
|
|
316
|
+
# Handle call end event (if VI sends this)
|
|
317
|
+
logger.info("VI call end event received")
|
|
318
|
+
self._call_ended = True
|
|
319
|
+
return CancelFrame()
|
|
320
|
+
|
|
321
|
+
else:
|
|
322
|
+
logger.debug(f"VI UNHANDLED EVENT: {event}")
|
|
323
|
+
|
|
324
|
+
return None
|
pipecat/services/cartesia/tts.py
CHANGED
|
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
|
|
|
15
15
|
from loguru import logger
|
|
16
16
|
from pydantic import BaseModel, Field
|
|
17
17
|
|
|
18
|
-
|
|
19
18
|
from pipecat.frames.frames import (
|
|
20
19
|
CancelFrame,
|
|
21
20
|
EndFrame,
|
|
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
|
|
|
49
48
|
raise Exception(f"Missing module: {e}")
|
|
50
49
|
|
|
51
50
|
|
|
51
|
+
class GenerationConfig(BaseModel):
|
|
52
|
+
"""Configuration for Cartesia Sonic-3 generation parameters.
|
|
53
|
+
|
|
54
|
+
Sonic-3 interprets these parameters as guidance to ensure natural speech.
|
|
55
|
+
Test against your content for best results.
|
|
56
|
+
|
|
57
|
+
Parameters:
|
|
58
|
+
volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
|
|
59
|
+
speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
|
|
60
|
+
emotion: Single emotion string to guide the emotional tone. Examples include neutral,
|
|
61
|
+
angry, excited, content, sad, scared. Over 60 emotions are supported. For best
|
|
62
|
+
results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
|
|
63
|
+
and Marian.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
volume: Optional[float] = None
|
|
67
|
+
speed: Optional[float] = None
|
|
68
|
+
emotion: Optional[str] = None
|
|
69
|
+
|
|
70
|
+
|
|
52
71
|
def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
53
72
|
"""Convert a Language enum to Cartesia language code.
|
|
54
73
|
|
|
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
|
74
93
|
Language.SV: "sv",
|
|
75
94
|
Language.TR: "tr",
|
|
76
95
|
Language.ZH: "zh",
|
|
96
|
+
Language.TL: "tl",
|
|
97
|
+
Language.BG: "bg",
|
|
98
|
+
Language.RO: "ro",
|
|
99
|
+
Language.AR: "ar",
|
|
100
|
+
Language.CS: "cs",
|
|
101
|
+
Language.EL: "el",
|
|
102
|
+
Language.FI: "fi",
|
|
103
|
+
Language.HR: "hr",
|
|
104
|
+
Language.MS: "ms",
|
|
105
|
+
Language.SK: "sk",
|
|
106
|
+
Language.DA: "da",
|
|
107
|
+
Language.TA: "ta",
|
|
108
|
+
Language.UK: "uk",
|
|
109
|
+
Language.HU: "hu",
|
|
110
|
+
Language.NO: "no",
|
|
111
|
+
Language.VI: "vi",
|
|
112
|
+
Language.BN: "bn",
|
|
113
|
+
Language.TH: "th",
|
|
114
|
+
Language.HE: "he",
|
|
115
|
+
Language.KA: "ka",
|
|
116
|
+
Language.ID: "id",
|
|
117
|
+
Language.TE: "te",
|
|
118
|
+
Language.GU: "gu",
|
|
119
|
+
Language.KN: "kn",
|
|
120
|
+
Language.ML: "ml",
|
|
121
|
+
Language.MR: "mr",
|
|
122
|
+
Language.PA: "pa",
|
|
77
123
|
}
|
|
78
124
|
|
|
79
125
|
result = BASE_LANGUAGES.get(language)
|
|
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
102
148
|
|
|
103
149
|
Parameters:
|
|
104
150
|
language: Language to use for synthesis.
|
|
105
|
-
speed: Voice speed control.
|
|
106
|
-
emotion: List of emotion controls.
|
|
151
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
152
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
107
153
|
|
|
108
154
|
.. deprecated:: 0.0.68
|
|
109
155
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
156
|
+
|
|
157
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
158
|
+
speed (numeric), and emotion (string) parameters.
|
|
110
159
|
"""
|
|
111
160
|
|
|
112
161
|
language: Optional[Language] = Language.EN
|
|
113
162
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
114
163
|
emotion: Optional[List[str]] = []
|
|
164
|
+
generation_config: Optional[GenerationConfig] = None
|
|
115
165
|
|
|
116
166
|
def __init__(
|
|
117
167
|
self,
|
|
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
120
170
|
voice_id: str,
|
|
121
171
|
cartesia_version: str = "2025-04-16",
|
|
122
172
|
url: str = "wss://api.cartesia.ai/tts/websocket",
|
|
123
|
-
model: str = "sonic-2",
|
|
173
|
+
model: str = "sonic-3",
|
|
124
174
|
sample_rate: Optional[int] = None,
|
|
125
175
|
encoding: str = "pcm_s16le",
|
|
126
176
|
container: str = "raw",
|
|
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
136
186
|
voice_id: ID of the voice to use for synthesis.
|
|
137
187
|
cartesia_version: API version string for Cartesia service.
|
|
138
188
|
url: WebSocket URL for Cartesia TTS API.
|
|
139
|
-
model: TTS model to use (e.g., "sonic-2").
|
|
189
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
140
190
|
sample_rate: Audio sample rate. If None, uses default.
|
|
141
191
|
encoding: Audio encoding format.
|
|
142
192
|
container: Audio container format.
|
|
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
180
230
|
else "en",
|
|
181
231
|
"speed": params.speed,
|
|
182
232
|
"emotion": params.emotion,
|
|
233
|
+
"generation_config": params.generation_config,
|
|
183
234
|
}
|
|
184
235
|
self.set_model_name(model)
|
|
185
236
|
self.set_voice(voice_id)
|
|
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
298
349
|
if self._settings["speed"]:
|
|
299
350
|
msg["speed"] = self._settings["speed"]
|
|
300
351
|
|
|
352
|
+
if self._settings["generation_config"]:
|
|
353
|
+
msg["generation_config"] = self._settings["generation_config"].model_dump(
|
|
354
|
+
exclude_none=True
|
|
355
|
+
)
|
|
356
|
+
|
|
301
357
|
return json.dumps(msg)
|
|
302
358
|
|
|
303
359
|
async def start(self, frame: StartFrame):
|
|
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
419
475
|
logger.error(f"{self} error: {msg}")
|
|
420
476
|
await self.push_frame(TTSStoppedFrame())
|
|
421
477
|
await self.stop_all_metrics()
|
|
422
|
-
|
|
423
478
|
await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
|
|
424
479
|
self._context_id = None
|
|
425
480
|
else:
|
|
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
484
539
|
|
|
485
540
|
Parameters:
|
|
486
541
|
language: Language to use for synthesis.
|
|
487
|
-
speed: Voice speed control.
|
|
488
|
-
emotion: List of emotion controls.
|
|
542
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
543
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
489
544
|
|
|
490
545
|
.. deprecated:: 0.0.68
|
|
491
546
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
547
|
+
|
|
548
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
549
|
+
speed (numeric), and emotion (string) parameters.
|
|
492
550
|
"""
|
|
493
551
|
|
|
494
552
|
language: Optional[Language] = Language.EN
|
|
495
553
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
496
554
|
emotion: Optional[List[str]] = Field(default_factory=list)
|
|
555
|
+
generation_config: Optional[GenerationConfig] = None
|
|
497
556
|
|
|
498
557
|
def __init__(
|
|
499
558
|
self,
|
|
500
559
|
*,
|
|
501
560
|
api_key: str,
|
|
502
561
|
voice_id: str,
|
|
503
|
-
model: str = "sonic-2",
|
|
562
|
+
model: str = "sonic-3",
|
|
504
563
|
base_url: str = "https://api.cartesia.ai",
|
|
505
564
|
cartesia_version: str = "2024-11-13",
|
|
506
565
|
sample_rate: Optional[int] = None,
|
|
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
514
573
|
Args:
|
|
515
574
|
api_key: Cartesia API key for authentication.
|
|
516
575
|
voice_id: ID of the voice to use for synthesis.
|
|
517
|
-
model: TTS model to use (e.g., "sonic-2").
|
|
576
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
518
577
|
base_url: Base URL for Cartesia HTTP API.
|
|
519
578
|
cartesia_version: API version string for Cartesia service.
|
|
520
579
|
sample_rate: Audio sample rate. If None, uses default.
|
|
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
541
600
|
else "en",
|
|
542
601
|
"speed": params.speed,
|
|
543
602
|
"emotion": params.emotion,
|
|
603
|
+
"generation_config": params.generation_config,
|
|
544
604
|
}
|
|
545
605
|
self.set_voice(voice_id)
|
|
546
606
|
self.set_model_name(model)
|
|
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
634
694
|
if self._settings["speed"]:
|
|
635
695
|
payload["speed"] = self._settings["speed"]
|
|
636
696
|
|
|
697
|
+
if self._settings["generation_config"]:
|
|
698
|
+
payload["generation_config"] = self._settings["generation_config"].model_dump(
|
|
699
|
+
exclude_none=True
|
|
700
|
+
)
|
|
701
|
+
|
|
637
702
|
yield TTSStartedFrame()
|
|
638
703
|
|
|
639
704
|
session = await self._client._get_session()
|
pipecat/services/sarvam/tts.py
CHANGED
pipecat/services/vistaar/llm.py
CHANGED
|
@@ -14,15 +14,15 @@ from loguru import logger
|
|
|
14
14
|
from pydantic import BaseModel, Field
|
|
15
15
|
|
|
16
16
|
from pipecat.frames.frames import (
|
|
17
|
-
EndFrame,
|
|
18
17
|
CancelFrame,
|
|
18
|
+
EndFrame,
|
|
19
19
|
Frame,
|
|
20
|
+
InterruptionFrame,
|
|
20
21
|
LLMFullResponseEndFrame,
|
|
21
22
|
LLMFullResponseStartFrame,
|
|
22
23
|
LLMMessagesFrame,
|
|
23
24
|
LLMTextFrame,
|
|
24
25
|
LLMUpdateSettingsFrame,
|
|
25
|
-
StartInterruptionFrame,
|
|
26
26
|
)
|
|
27
27
|
from pipecat.processors.aggregators.llm_response import (
|
|
28
28
|
LLMAssistantAggregatorParams,
|
|
@@ -391,7 +391,7 @@ class VistaarLLMService(LLMService):
|
|
|
391
391
|
)
|
|
392
392
|
await self.push_frame(frame, direction)
|
|
393
393
|
return
|
|
394
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
394
|
+
elif isinstance(frame, InterruptionFrame):
|
|
395
395
|
await self._handle_interruption()
|
|
396
396
|
await self.push_frame(frame, direction)
|
|
397
397
|
return
|
|
@@ -467,4 +467,4 @@ class VistaarLLMService(LLMService):
|
|
|
467
467
|
|
|
468
468
|
def can_generate_metrics(self) -> bool:
|
|
469
469
|
"""Check if this service can generate processing metrics."""
|
|
470
|
-
return True
|
|
470
|
+
return True
|
|
pipecat/transports/base_output.py
CHANGED
|
@@ -50,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds
|
|
|
50
50
|
|
|
51
51
|
# TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
|
|
52
52
|
BOT_VAD_STOP_SECS = 0.30
|
|
53
|
+
# For the very first bot utterance (e.g., intro), we can safely
|
|
54
|
+
# detect end-of-speech sooner to improve responsiveness for the
|
|
55
|
+
# user’s first short reply. Keep conservative to avoid mid-utterance
|
|
56
|
+
# false stops when TTS streams quickly.
|
|
57
|
+
FIRST_BOT_VAD_STOP_SECS = 0.08
|
|
53
58
|
|
|
54
59
|
|
|
55
60
|
class BaseOutputTransport(FrameProcessor):
|
|
@@ -406,6 +411,9 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
406
411
|
self._bot_speaking_frame_period = 0.2
|
|
407
412
|
# Last time the bot actually spoke.
|
|
408
413
|
self._bot_speech_last_time = 0
|
|
414
|
+
# Before the first stop event, we use a shorter silence
|
|
415
|
+
# threshold to make the first turn more responsive.
|
|
416
|
+
self._first_stop_pending = True
|
|
409
417
|
|
|
410
418
|
self._audio_task: Optional[asyncio.Task] = None
|
|
411
419
|
self._video_task: Optional[asyncio.Task] = None
|
|
@@ -631,6 +639,10 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
631
639
|
|
|
632
640
|
self._bot_speaking = False
|
|
633
641
|
|
|
642
|
+
# Mark that the first stop has been completed so subsequent
|
|
643
|
+
# stops use the regular (longer) VAD stop threshold.
|
|
644
|
+
self._first_stop_pending = False
|
|
645
|
+
|
|
634
646
|
# Clean audio buffer (there could be tiny left overs if not multiple
|
|
635
647
|
# to our output chunk size).
|
|
636
648
|
self._audio_buffer = bytearray()
|
|
@@ -690,9 +702,14 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
690
702
|
async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
|
|
691
703
|
while True:
|
|
692
704
|
try:
|
|
693
|
-
|
|
694
|
-
|
|
705
|
+
# Use a shorter timeout only for the first bot stop to
|
|
706
|
+
# accelerate the initial turn handoff right after the intro.
|
|
707
|
+
timeout = (
|
|
708
|
+
FIRST_BOT_VAD_STOP_SECS
|
|
709
|
+
if getattr(self, "_first_stop_pending", True)
|
|
710
|
+
else BOT_VAD_STOP_SECS
|
|
695
711
|
)
|
|
712
|
+
frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
|
|
696
713
|
yield frame
|
|
697
714
|
self._audio_queue.task_done()
|
|
698
715
|
except asyncio.TimeoutError:
|
|
@@ -713,7 +730,13 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
713
730
|
except asyncio.QueueEmpty:
|
|
714
731
|
# Notify the bot stopped speaking upstream if necessary.
|
|
715
732
|
diff_time = time.time() - last_frame_time
|
|
716
|
-
|
|
733
|
+
# Use a shorter threshold for the first stop only.
|
|
734
|
+
current_stop_secs = (
|
|
735
|
+
FIRST_BOT_VAD_STOP_SECS
|
|
736
|
+
if getattr(self, "_first_stop_pending", True)
|
|
737
|
+
else BOT_VAD_STOP_SECS
|
|
738
|
+
)
|
|
739
|
+
if diff_time > current_stop_secs:
|
|
717
740
|
await self._bot_stopped_speaking()
|
|
718
741
|
# Generate an audio frame with only the mixer's part.
|
|
719
742
|
frame = OutputAudioRawFrame(
|
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt
RENAMED
|
File without changes
|