dv-pipecat-ai 0.0.85.dev698__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +45 -43
- pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
- pipecat/pipeline/runner.py +6 -2
- pipecat/pipeline/task.py +40 -55
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/frameworks/rtvi.py +1 -0
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +149 -67
- pipecat/runner/types.py +5 -5
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +3 -0
- pipecat/services/aws/llm.py +33 -16
- pipecat/services/aws/nova_sonic/context.py +69 -0
- pipecat/services/aws/nova_sonic/llm.py +199 -89
- pipecat/services/aws/stt.py +2 -0
- pipecat/services/aws_nova_sonic/context.py +8 -12
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +3 -1
- pipecat/services/deepgram/flux/stt.py +4 -0
- pipecat/services/elevenlabs/tts.py +82 -41
- pipecat/services/fish/tts.py +3 -0
- pipecat/services/google/stt.py +4 -0
- pipecat/services/lmnt/tts.py +2 -0
- pipecat/services/neuphonic/tts.py +3 -0
- pipecat/services/openai/tts.py +37 -6
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +3 -0
- pipecat/services/rime/tts.py +9 -8
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +465 -0
- pipecat/services/sarvam/tts.py +87 -10
- pipecat/services/speechmatics/stt.py +3 -1
- pipecat/services/stt_service.py +23 -10
- pipecat/services/tts_service.py +64 -13
- pipecat/transports/base_input.py +3 -0
- pipecat/transports/base_output.py +71 -77
- pipecat/transports/smallwebrtc/connection.py +5 -0
- pipecat/transports/smallwebrtc/request_handler.py +42 -0
- pipecat/utils/string.py +1 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/services/cartesia/stt.py
CHANGED
|
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
|
|
|
28
28
|
UserStoppedSpeakingFrame,
|
|
29
29
|
)
|
|
30
30
|
from pipecat.processors.frame_processor import FrameDirection
|
|
31
|
-
from pipecat.services.stt_service import STTService
|
|
31
|
+
from pipecat.services.stt_service import WebsocketSTTService
|
|
32
32
|
from pipecat.transcriptions.language import Language
|
|
33
33
|
from pipecat.utils.time import time_now_iso8601
|
|
34
34
|
from pipecat.utils.tracing.service_decorators import traced_stt
|
|
35
35
|
|
|
36
36
|
try:
|
|
37
|
-
import websockets
|
|
38
37
|
from websockets.asyncio.client import connect as websocket_connect
|
|
39
38
|
from websockets.protocol import State
|
|
40
39
|
except ModuleNotFoundError as e:
|
|
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
|
|
|
124
123
|
return cls(**json.loads(json_str))
|
|
125
124
|
|
|
126
125
|
|
|
127
|
-
class CartesiaSTTService(STTService):
|
|
126
|
+
class CartesiaSTTService(WebsocketSTTService):
|
|
128
127
|
"""Speech-to-text service using Cartesia Live API.
|
|
129
128
|
|
|
130
129
|
Provides real-time speech transcription through WebSocket connection
|
|
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
|
|
|
176
175
|
self.set_model_name(merged_options.model)
|
|
177
176
|
self._api_key = api_key
|
|
178
177
|
self._base_url = base_url or "api.cartesia.ai"
|
|
179
|
-
self.
|
|
180
|
-
self._receiver_task = None
|
|
178
|
+
self._receive_task = None
|
|
181
179
|
|
|
182
180
|
def can_generate_metrics(self) -> bool:
|
|
183
181
|
"""Check if the service can generate processing metrics.
|
|
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
|
|
|
214
212
|
await super().cancel(frame)
|
|
215
213
|
await self._disconnect()
|
|
216
214
|
|
|
215
|
+
async def start_metrics(self):
|
|
216
|
+
"""Start performance metrics collection for transcription processing."""
|
|
217
|
+
await self.start_ttfb_metrics()
|
|
218
|
+
await self.start_processing_metrics()
|
|
219
|
+
|
|
220
|
+
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
221
|
+
"""Process incoming frames and handle speech events.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
frame: The frame to process.
|
|
225
|
+
direction: Direction of frame flow in the pipeline.
|
|
226
|
+
"""
|
|
227
|
+
await super().process_frame(frame, direction)
|
|
228
|
+
|
|
229
|
+
if isinstance(frame, UserStartedSpeakingFrame):
|
|
230
|
+
await self.start_metrics()
|
|
231
|
+
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
232
|
+
# Send finalize command to flush the transcription session
|
|
233
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
234
|
+
await self._websocket.send("finalize")
|
|
235
|
+
|
|
217
236
|
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
|
218
237
|
"""Process audio data for speech-to-text transcription.
|
|
219
238
|
|
|
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
|
|
|
224
243
|
None - transcription results are handled via WebSocket responses.
|
|
225
244
|
"""
|
|
226
245
|
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
|
|
227
|
-
if not self.
|
|
246
|
+
if not self._websocket or self._websocket.state is State.CLOSED:
|
|
228
247
|
await self._connect()
|
|
229
248
|
|
|
230
|
-
await self.
|
|
249
|
+
await self._websocket.send(audio)
|
|
231
250
|
yield None
|
|
232
251
|
|
|
233
252
|
async def _connect(self):
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
253
|
+
await self._connect_websocket()
|
|
254
|
+
|
|
255
|
+
if self._websocket and not self._receive_task:
|
|
256
|
+
self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
|
|
238
257
|
|
|
258
|
+
async def _disconnect(self):
|
|
259
|
+
if self._receive_task:
|
|
260
|
+
await self.cancel_task(self._receive_task)
|
|
261
|
+
self._receive_task = None
|
|
262
|
+
|
|
263
|
+
await self._disconnect_websocket()
|
|
264
|
+
|
|
265
|
+
async def _connect_websocket(self):
|
|
239
266
|
try:
|
|
240
|
-
self.
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
267
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
268
|
+
return
|
|
269
|
+
logger.debug("Connecting to Cartesia STT")
|
|
270
|
+
|
|
271
|
+
params = self._settings.to_dict()
|
|
272
|
+
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
|
273
|
+
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
|
274
|
+
|
|
275
|
+
self._websocket = await websocket_connect(ws_url, additional_headers=headers)
|
|
276
|
+
await self._call_event_handler("on_connected")
|
|
245
277
|
except Exception as e:
|
|
246
278
|
logger.error(f"{self}: unable to connect to Cartesia: {e}")
|
|
247
279
|
|
|
248
|
-
async def
|
|
280
|
+
async def _disconnect_websocket(self):
|
|
249
281
|
try:
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
message = await self._connection.recv()
|
|
255
|
-
try:
|
|
256
|
-
data = json.loads(message)
|
|
257
|
-
await self._process_response(data)
|
|
258
|
-
except json.JSONDecodeError:
|
|
259
|
-
logger.warning(f"Received non-JSON message: {message}")
|
|
260
|
-
except asyncio.CancelledError:
|
|
261
|
-
pass
|
|
262
|
-
except websockets.exceptions.ConnectionClosed as e:
|
|
263
|
-
logger.debug(f"WebSocket connection closed: {e}")
|
|
282
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
283
|
+
logger.debug("Disconnecting from Cartesia STT")
|
|
284
|
+
await self._websocket.close()
|
|
264
285
|
except Exception as e:
|
|
265
|
-
logger.error(f"
|
|
286
|
+
logger.error(f"{self} error closing websocket: {e}")
|
|
287
|
+
finally:
|
|
288
|
+
self._websocket = None
|
|
289
|
+
await self._call_event_handler("on_disconnected")
|
|
290
|
+
|
|
291
|
+
def _get_websocket(self):
|
|
292
|
+
if self._websocket:
|
|
293
|
+
return self._websocket
|
|
294
|
+
raise Exception("Websocket not connected")
|
|
295
|
+
|
|
296
|
+
async def _process_messages(self):
|
|
297
|
+
async for message in self._get_websocket():
|
|
298
|
+
try:
|
|
299
|
+
data = json.loads(message)
|
|
300
|
+
await self._process_response(data)
|
|
301
|
+
except json.JSONDecodeError:
|
|
302
|
+
logger.warning(f"Received non-JSON message: {message}")
|
|
303
|
+
|
|
304
|
+
async def _receive_messages(self):
|
|
305
|
+
while True:
|
|
306
|
+
await self._process_messages()
|
|
307
|
+
# Cartesia times out after 5 minutes of innactivity (no keepalive
|
|
308
|
+
# mechanism is available). So, we try to reconnect.
|
|
309
|
+
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
|
|
310
|
+
await self._connect_websocket()
|
|
266
311
|
|
|
267
312
|
async def _process_response(self, data):
|
|
268
313
|
if "type" in data:
|
|
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
|
|
|
316
361
|
language,
|
|
317
362
|
)
|
|
318
363
|
)
|
|
319
|
-
|
|
320
|
-
async def _disconnect(self):
|
|
321
|
-
if self._receiver_task:
|
|
322
|
-
self._receiver_task.cancel()
|
|
323
|
-
try:
|
|
324
|
-
await self._receiver_task
|
|
325
|
-
except asyncio.CancelledError:
|
|
326
|
-
pass
|
|
327
|
-
except Exception as e:
|
|
328
|
-
logger.exception(f"Unexpected exception while cancelling task: {e}")
|
|
329
|
-
self._receiver_task = None
|
|
330
|
-
|
|
331
|
-
if self._connection and self._connection.state is State.OPEN:
|
|
332
|
-
logger.debug("Disconnecting from Cartesia")
|
|
333
|
-
|
|
334
|
-
await self._connection.close()
|
|
335
|
-
self._connection = None
|
|
336
|
-
|
|
337
|
-
async def start_metrics(self):
|
|
338
|
-
"""Start performance metrics collection for transcription processing."""
|
|
339
|
-
await self.start_ttfb_metrics()
|
|
340
|
-
await self.start_processing_metrics()
|
|
341
|
-
|
|
342
|
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
343
|
-
"""Process incoming frames and handle speech events.
|
|
344
|
-
|
|
345
|
-
Args:
|
|
346
|
-
frame: The frame to process.
|
|
347
|
-
direction: Direction of frame flow in the pipeline.
|
|
348
|
-
"""
|
|
349
|
-
await super().process_frame(frame, direction)
|
|
350
|
-
|
|
351
|
-
if isinstance(frame, UserStartedSpeakingFrame):
|
|
352
|
-
await self.start_metrics()
|
|
353
|
-
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
354
|
-
# Send finalize command to flush the transcription session
|
|
355
|
-
if self._connection and self._connection.state is State.OPEN:
|
|
356
|
-
await self._connection.send("finalize")
|
pipecat/services/cartesia/tts.py
CHANGED
|
@@ -345,10 +345,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
345
345
|
try:
|
|
346
346
|
if self._websocket and self._websocket.state is State.OPEN:
|
|
347
347
|
return
|
|
348
|
-
logger.debug("Connecting to Cartesia")
|
|
348
|
+
logger.debug("Connecting to Cartesia TTS")
|
|
349
349
|
self._websocket = await websocket_connect(
|
|
350
350
|
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
|
|
351
351
|
)
|
|
352
|
+
await self._call_event_handler("on_connected")
|
|
352
353
|
except Exception as e:
|
|
353
354
|
logger.error(f"{self} initialization error: {e}")
|
|
354
355
|
self._websocket = None
|
|
@@ -366,6 +367,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
366
367
|
finally:
|
|
367
368
|
self._context_id = None
|
|
368
369
|
self._websocket = None
|
|
370
|
+
await self._call_event_handler("on_disconnected")
|
|
369
371
|
|
|
370
372
|
def _get_websocket(self):
|
|
371
373
|
if self._websocket:
|
pipecat/services/deepgram/flux/stt.py
CHANGED
|
@@ -205,6 +205,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
|
|
205
205
|
additional_headers={"Authorization": f"Token {self._api_key}"},
|
|
206
206
|
)
|
|
207
207
|
logger.debug("Connected to Deepgram Flux Websocket")
|
|
208
|
+
await self._call_event_handler("on_connected")
|
|
208
209
|
except Exception as e:
|
|
209
210
|
logger.error(f"{self} initialization error: {e}")
|
|
210
211
|
self._websocket = None
|
|
@@ -225,6 +226,9 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
|
|
225
226
|
await self._websocket.close()
|
|
226
227
|
except Exception as e:
|
|
227
228
|
logger.error(f"{self} error closing websocket: {e}")
|
|
229
|
+
finally:
|
|
230
|
+
self._websocket = None
|
|
231
|
+
await self._call_event_handler("on_disconnected")
|
|
228
232
|
|
|
229
233
|
async def _send_close_stream(self) -> None:
|
|
230
234
|
"""Sends a CloseStream control message to the Deepgram Flux WebSocket API.
|
pipecat/services/elevenlabs/tts.py
CHANGED
|
@@ -172,16 +172,24 @@ def build_elevenlabs_voice_settings(
|
|
|
172
172
|
|
|
173
173
|
|
|
174
174
|
def calculate_word_times(
|
|
175
|
-
alignment_info: Mapping[str, Any],
|
|
176
|
-
|
|
175
|
+
alignment_info: Mapping[str, Any],
|
|
176
|
+
cumulative_time: float,
|
|
177
|
+
partial_word: str = "",
|
|
178
|
+
partial_word_start_time: float = 0.0,
|
|
179
|
+
) -> tuple[List[Tuple[str, float]], str, float]:
|
|
177
180
|
"""Calculate word timestamps from character alignment information.
|
|
178
181
|
|
|
179
182
|
Args:
|
|
180
183
|
alignment_info: Character alignment data from ElevenLabs API.
|
|
181
184
|
cumulative_time: Base time offset for this chunk.
|
|
185
|
+
partial_word: Partial word carried over from previous chunk.
|
|
186
|
+
partial_word_start_time: Start time of the partial word.
|
|
182
187
|
|
|
183
188
|
Returns:
|
|
184
|
-
|
|
189
|
+
Tuple of (word_times, new_partial_word, new_partial_word_start_time):
|
|
190
|
+
- word_times: List of (word, timestamp) tuples for complete words
|
|
191
|
+
- new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
|
|
192
|
+
- new_partial_word_start_time: Start time of the incomplete word
|
|
185
193
|
"""
|
|
186
194
|
chars = alignment_info["chars"]
|
|
187
195
|
char_start_times_ms = alignment_info["charStartTimesMs"]
|
|
@@ -190,41 +198,37 @@ def calculate_word_times(
|
|
|
190
198
|
logger.error(
|
|
191
199
|
f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
|
|
192
200
|
)
|
|
193
|
-
return []
|
|
201
|
+
return ([], partial_word, partial_word_start_time)
|
|
194
202
|
|
|
195
203
|
# Build words and track their start positions
|
|
196
204
|
words = []
|
|
197
|
-
|
|
198
|
-
current_word =
|
|
199
|
-
|
|
205
|
+
word_start_times = []
|
|
206
|
+
current_word = partial_word # Start with any partial word from previous chunk
|
|
207
|
+
word_start_time = partial_word_start_time if partial_word else None
|
|
200
208
|
|
|
201
209
|
for i, char in enumerate(chars):
|
|
202
210
|
if char == " ":
|
|
203
211
|
# End of current word
|
|
204
212
|
if current_word: # Only add non-empty words
|
|
205
213
|
words.append(current_word)
|
|
206
|
-
|
|
214
|
+
word_start_times.append(word_start_time)
|
|
207
215
|
current_word = ""
|
|
208
|
-
|
|
216
|
+
word_start_time = None
|
|
209
217
|
else:
|
|
210
218
|
# Building a word
|
|
211
|
-
if
|
|
212
|
-
|
|
219
|
+
if word_start_time is None: # First character of new word
|
|
220
|
+
# Convert from milliseconds to seconds and add cumulative offset
|
|
221
|
+
word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
|
|
213
222
|
current_word += char
|
|
214
223
|
|
|
215
|
-
#
|
|
216
|
-
|
|
217
|
-
words.append(current_word)
|
|
218
|
-
word_start_indices.append(word_start_index)
|
|
224
|
+
# Build result for complete words
|
|
225
|
+
word_times = list(zip(words, word_start_times))
|
|
219
226
|
|
|
220
|
-
#
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
# Convert from milliseconds to seconds and add cumulative offset
|
|
224
|
-
start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
|
|
225
|
-
word_times.append((word, start_time_seconds))
|
|
227
|
+
# Return any incomplete word at the end of this chunk
|
|
228
|
+
new_partial_word = current_word if current_word else ""
|
|
229
|
+
new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0
|
|
226
230
|
|
|
227
|
-
return word_times
|
|
231
|
+
return (word_times, new_partial_word, new_partial_word_start_time)
|
|
228
232
|
|
|
229
233
|
|
|
230
234
|
class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
@@ -336,6 +340,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
336
340
|
# there's an interruption or TTSStoppedFrame.
|
|
337
341
|
self._started = False
|
|
338
342
|
self._cumulative_time = 0
|
|
343
|
+
# Track partial words that span across alignment chunks
|
|
344
|
+
self._partial_word = ""
|
|
345
|
+
self._partial_word_start_time = 0.0
|
|
339
346
|
|
|
340
347
|
# Context management for v1 multi API
|
|
341
348
|
self._context_id = None
|
|
@@ -526,6 +533,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
526
533
|
url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
|
|
527
534
|
)
|
|
528
535
|
|
|
536
|
+
await self._call_event_handler("on_connected")
|
|
529
537
|
except Exception as e:
|
|
530
538
|
self.logger.error(f"{self} initialization error: {e}")
|
|
531
539
|
self._websocket = None
|
|
@@ -544,6 +552,11 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
544
552
|
logger.debug("Disconnected from ElevenLabs")
|
|
545
553
|
except Exception as e:
|
|
546
554
|
self.logger.error(f"{self} error closing websocket: {e}")
|
|
555
|
+
finally:
|
|
556
|
+
self._started = False
|
|
557
|
+
self._context_id = None
|
|
558
|
+
self._websocket = None
|
|
559
|
+
await self._call_event_handler("on_disconnected")
|
|
547
560
|
|
|
548
561
|
def _get_websocket(self):
|
|
549
562
|
if self._websocket:
|
|
@@ -571,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
571
584
|
logger.error(f"Error closing context on interruption: {e}")
|
|
572
585
|
self._context_id = None
|
|
573
586
|
self._started = False
|
|
587
|
+
self._partial_word = ""
|
|
588
|
+
self._partial_word_start_time = 0.0
|
|
574
589
|
|
|
575
590
|
async def _receive_messages(self):
|
|
576
591
|
"""Handle incoming WebSocket messages from ElevenLabs."""
|
|
@@ -610,7 +625,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
610
625
|
|
|
611
626
|
if msg.get("alignment"):
|
|
612
627
|
alignment = msg["alignment"]
|
|
613
|
-
word_times
|
|
628
|
+
word_times, self._partial_word, self._partial_word_start_time = (
|
|
629
|
+
calculate_word_times(
|
|
630
|
+
alignment,
|
|
631
|
+
self._cumulative_time,
|
|
632
|
+
self._partial_word,
|
|
633
|
+
self._partial_word_start_time,
|
|
634
|
+
)
|
|
635
|
+
)
|
|
614
636
|
|
|
615
637
|
if word_times:
|
|
616
638
|
await self.add_word_timestamps(word_times)
|
|
@@ -685,6 +707,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
685
707
|
yield TTSStartedFrame()
|
|
686
708
|
self._started = True
|
|
687
709
|
self._cumulative_time = 0
|
|
710
|
+
self._partial_word = ""
|
|
711
|
+
self._partial_word_start_time = 0.0
|
|
688
712
|
# If a context ID does not exist, create a new one and
|
|
689
713
|
# register it. If an ID exists, that means the Pipeline is
|
|
690
714
|
# configured for allow_interruptions=False, so continue
|
|
@@ -758,6 +782,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
758
782
|
base_url: str = "https://api.elevenlabs.io",
|
|
759
783
|
sample_rate: Optional[int] = None,
|
|
760
784
|
params: Optional[InputParams] = None,
|
|
785
|
+
aggregate_sentences: Optional[bool] = True,
|
|
761
786
|
**kwargs,
|
|
762
787
|
):
|
|
763
788
|
"""Initialize the ElevenLabs HTTP TTS service.
|
|
@@ -770,10 +795,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
770
795
|
base_url: Base URL for ElevenLabs HTTP API.
|
|
771
796
|
sample_rate: Audio sample rate. If None, uses default.
|
|
772
797
|
params: Additional input parameters for voice customization.
|
|
798
|
+
aggregate_sentences: Whether to aggregate sentences within the TTSService.
|
|
773
799
|
**kwargs: Additional arguments passed to the parent service.
|
|
774
800
|
"""
|
|
775
801
|
super().__init__(
|
|
776
|
-
aggregate_sentences=
|
|
802
|
+
aggregate_sentences=aggregate_sentences,
|
|
777
803
|
push_text_frames=False,
|
|
778
804
|
push_stop_frames=True,
|
|
779
805
|
sample_rate=sample_rate,
|
|
@@ -811,6 +837,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
811
837
|
# Store previous text for context within a turn
|
|
812
838
|
self._previous_text = ""
|
|
813
839
|
|
|
840
|
+
# Track partial words that span across alignment chunks
|
|
841
|
+
self._partial_word = ""
|
|
842
|
+
self._partial_word_start_time = 0.0
|
|
843
|
+
|
|
814
844
|
def language_to_service_language(self, language: Language) -> Optional[str]:
|
|
815
845
|
"""Convert pipecat Language to ElevenLabs language code.
|
|
816
846
|
|
|
@@ -838,6 +868,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
838
868
|
self._cumulative_time = 0
|
|
839
869
|
self._started = False
|
|
840
870
|
self._previous_text = ""
|
|
871
|
+
self._partial_word = ""
|
|
872
|
+
self._partial_word_start_time = 0.0
|
|
841
873
|
logger.debug(f"{self}: Reset internal state")
|
|
842
874
|
|
|
843
875
|
async def start(self, frame: StartFrame):
|
|
@@ -872,11 +904,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
872
904
|
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
|
|
873
905
|
"""Calculate word timing from character alignment data.
|
|
874
906
|
|
|
907
|
+
This method handles partial words that may span across multiple alignment chunks.
|
|
908
|
+
|
|
875
909
|
Args:
|
|
876
910
|
alignment_info: Character timing data from ElevenLabs.
|
|
877
911
|
|
|
878
912
|
Returns:
|
|
879
|
-
List of (word, timestamp) pairs.
|
|
913
|
+
List of (word, timestamp) pairs for complete words in this chunk.
|
|
880
914
|
|
|
881
915
|
Example input data::
|
|
882
916
|
|
|
@@ -902,30 +936,28 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
902
936
|
# Build the words and find their start times
|
|
903
937
|
words = []
|
|
904
938
|
word_start_times = []
|
|
905
|
-
|
|
906
|
-
|
|
939
|
+
# Start with any partial word from previous chunk
|
|
940
|
+
current_word = self._partial_word
|
|
941
|
+
word_start_time = self._partial_word_start_time if self._partial_word else None
|
|
907
942
|
|
|
908
943
|
for i, char in enumerate(chars):
|
|
909
944
|
if char == " ":
|
|
910
945
|
if current_word: # Only add non-empty words
|
|
911
946
|
words.append(current_word)
|
|
912
|
-
|
|
913
|
-
word_start_times.append(
|
|
914
|
-
self._cumulative_time + char_start_times[first_char_idx]
|
|
915
|
-
)
|
|
947
|
+
word_start_times.append(word_start_time)
|
|
916
948
|
current_word = ""
|
|
917
|
-
|
|
949
|
+
word_start_time = None
|
|
918
950
|
else:
|
|
919
|
-
if
|
|
920
|
-
|
|
951
|
+
if word_start_time is None: # First character of a new word
|
|
952
|
+
# Use time of the first character of the word, offset by cumulative time
|
|
953
|
+
word_start_time = self._cumulative_time + char_start_times[i]
|
|
921
954
|
current_word += char
|
|
922
955
|
|
|
923
|
-
#
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
|
|
956
|
+
# Store any incomplete word at the end of this chunk
|
|
957
|
+
self._partial_word = current_word if current_word else ""
|
|
958
|
+
self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0
|
|
927
959
|
|
|
928
|
-
# Create word-time pairs
|
|
960
|
+
# Create word-time pairs for complete words only
|
|
929
961
|
word_times = list(zip(words, word_start_times))
|
|
930
962
|
|
|
931
963
|
return word_times
|
|
@@ -961,6 +993,9 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
961
993
|
if self._voice_settings:
|
|
962
994
|
payload["voice_settings"] = self._voice_settings
|
|
963
995
|
|
|
996
|
+
if self._settings["apply_text_normalization"] is not None:
|
|
997
|
+
payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
|
|
998
|
+
|
|
964
999
|
language = self._settings["language"]
|
|
965
1000
|
if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
|
|
966
1001
|
payload["language_code"] = language
|
|
@@ -981,8 +1016,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
981
1016
|
}
|
|
982
1017
|
if self._settings["optimize_streaming_latency"] is not None:
|
|
983
1018
|
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
|
|
984
|
-
if self._settings["apply_text_normalization"] is not None:
|
|
985
|
-
params["apply_text_normalization"] = self._settings["apply_text_normalization"]
|
|
986
1019
|
|
|
987
1020
|
self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")
|
|
988
1021
|
|
|
@@ -1045,6 +1078,14 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
1045
1078
|
logger.error(f"Error processing response: {e}", exc_info=True)
|
|
1046
1079
|
continue
|
|
1047
1080
|
|
|
1081
|
+
# After processing all chunks, emit any remaining partial word
|
|
1082
|
+
# since this is the end of the utterance
|
|
1083
|
+
if self._partial_word:
|
|
1084
|
+
final_word_time = [(self._partial_word, self._partial_word_start_time)]
|
|
1085
|
+
await self.add_word_timestamps(final_word_time)
|
|
1086
|
+
self._partial_word = ""
|
|
1087
|
+
self._partial_word_start_time = 0.0
|
|
1088
|
+
|
|
1048
1089
|
# After processing all chunks, add the total utterance duration
|
|
1049
1090
|
# to the cumulative time to ensure next utterance starts after this one
|
|
1050
1091
|
if utterance_duration > 0:
|
pipecat/services/fish/tts.py
CHANGED
|
@@ -225,6 +225,8 @@ class FishAudioTTSService(InterruptibleTTSService):
|
|
|
225
225
|
start_message = {"event": "start", "request": {"text": "", **self._settings}}
|
|
226
226
|
await self._websocket.send(ormsgpack.packb(start_message))
|
|
227
227
|
logger.debug("Sent start message to Fish Audio")
|
|
228
|
+
|
|
229
|
+
await self._call_event_handler("on_connected")
|
|
228
230
|
except Exception as e:
|
|
229
231
|
logger.error(f"Fish Audio initialization error: {e}")
|
|
230
232
|
self._websocket = None
|
|
@@ -245,6 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
|
|
|
245
247
|
self._request_id = None
|
|
246
248
|
self._started = False
|
|
247
249
|
self._websocket = None
|
|
250
|
+
await self._call_event_handler("on_disconnected")
|
|
248
251
|
|
|
249
252
|
async def flush_audio(self):
|
|
250
253
|
"""Flush any buffered audio by sending a flush event to Fish Audio."""
|
pipecat/services/google/stt.py
CHANGED
|
@@ -730,6 +730,8 @@ class GoogleSTTService(STTService):
|
|
|
730
730
|
self._request_queue = asyncio.Queue()
|
|
731
731
|
self._streaming_task = self.create_task(self._stream_audio())
|
|
732
732
|
|
|
733
|
+
await self._call_event_handler("on_connected")
|
|
734
|
+
|
|
733
735
|
async def _disconnect(self):
|
|
734
736
|
"""Clean up streaming recognition resources."""
|
|
735
737
|
if self._streaming_task:
|
|
@@ -737,6 +739,8 @@ class GoogleSTTService(STTService):
|
|
|
737
739
|
await self.cancel_task(self._streaming_task)
|
|
738
740
|
self._streaming_task = None
|
|
739
741
|
|
|
742
|
+
await self._call_event_handler("on_disconnected")
|
|
743
|
+
|
|
740
744
|
async def _request_generator(self):
|
|
741
745
|
"""Generates requests for the streaming recognize method."""
|
|
742
746
|
recognizer_path = f"projects/{self._project_id}/locations/{self._location}/recognizers/_"
|
pipecat/services/lmnt/tts.py
CHANGED
|
@@ -222,6 +222,7 @@ class LmntTTSService(InterruptibleTTSService):
|
|
|
222
222
|
# Send initialization message
|
|
223
223
|
await self._websocket.send(json.dumps(init_msg))
|
|
224
224
|
|
|
225
|
+
await self._call_event_handler("on_connected")
|
|
225
226
|
except Exception as e:
|
|
226
227
|
logger.error(f"{self} initialization error: {e}")
|
|
227
228
|
self._websocket = None
|
|
@@ -243,6 +244,7 @@ class LmntTTSService(InterruptibleTTSService):
|
|
|
243
244
|
finally:
|
|
244
245
|
self._started = False
|
|
245
246
|
self._websocket = None
|
|
247
|
+
await self._call_event_handler("on_disconnected")
|
|
246
248
|
|
|
247
249
|
def _get_websocket(self):
|
|
248
250
|
"""Get the WebSocket connection if available."""
|
pipecat/services/neuphonic/tts.py
CHANGED
|
@@ -293,6 +293,8 @@ class NeuphonicTTSService(InterruptibleTTSService):
|
|
|
293
293
|
headers = {"x-api-key": self._api_key}
|
|
294
294
|
|
|
295
295
|
self._websocket = await websocket_connect(url, additional_headers=headers)
|
|
296
|
+
|
|
297
|
+
await self._call_event_handler("on_connected")
|
|
296
298
|
except Exception as e:
|
|
297
299
|
logger.error(f"{self} initialization error: {e}")
|
|
298
300
|
self._websocket = None
|
|
@@ -311,6 +313,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
|
|
|
311
313
|
finally:
|
|
312
314
|
self._started = False
|
|
313
315
|
self._websocket = None
|
|
316
|
+
await self._call_event_handler("on_disconnected")
|
|
314
317
|
|
|
315
318
|
async def _receive_messages(self):
|
|
316
319
|
"""Receive and process messages from Neuphonic WebSocket."""
|