dv-pipecat-ai 0.0.85.dev698__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (45)
  1. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
  2. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +45 -43
  3. pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
  4. pipecat/pipeline/runner.py +6 -2
  5. pipecat/pipeline/task.py +40 -55
  6. pipecat/processors/aggregators/llm_context.py +40 -2
  7. pipecat/processors/frameworks/rtvi.py +1 -0
  8. pipecat/runner/daily.py +59 -20
  9. pipecat/runner/run.py +149 -67
  10. pipecat/runner/types.py +5 -5
  11. pipecat/services/assemblyai/models.py +6 -0
  12. pipecat/services/assemblyai/stt.py +13 -5
  13. pipecat/services/asyncai/tts.py +3 -0
  14. pipecat/services/aws/llm.py +33 -16
  15. pipecat/services/aws/nova_sonic/context.py +69 -0
  16. pipecat/services/aws/nova_sonic/llm.py +199 -89
  17. pipecat/services/aws/stt.py +2 -0
  18. pipecat/services/aws_nova_sonic/context.py +8 -12
  19. pipecat/services/cartesia/stt.py +77 -70
  20. pipecat/services/cartesia/tts.py +3 -1
  21. pipecat/services/deepgram/flux/stt.py +4 -0
  22. pipecat/services/elevenlabs/tts.py +82 -41
  23. pipecat/services/fish/tts.py +3 -0
  24. pipecat/services/google/stt.py +4 -0
  25. pipecat/services/lmnt/tts.py +2 -0
  26. pipecat/services/neuphonic/tts.py +3 -0
  27. pipecat/services/openai/tts.py +37 -6
  28. pipecat/services/piper/tts.py +7 -9
  29. pipecat/services/playht/tts.py +3 -0
  30. pipecat/services/rime/tts.py +9 -8
  31. pipecat/services/riva/stt.py +3 -1
  32. pipecat/services/salesforce/__init__.py +9 -0
  33. pipecat/services/salesforce/llm.py +465 -0
  34. pipecat/services/sarvam/tts.py +87 -10
  35. pipecat/services/speechmatics/stt.py +3 -1
  36. pipecat/services/stt_service.py +23 -10
  37. pipecat/services/tts_service.py +64 -13
  38. pipecat/transports/base_input.py +3 -0
  39. pipecat/transports/base_output.py +71 -77
  40. pipecat/transports/smallwebrtc/connection.py +5 -0
  41. pipecat/transports/smallwebrtc/request_handler.py +42 -0
  42. pipecat/utils/string.py +1 -0
  43. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
  44. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
  45. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
28
28
  UserStoppedSpeakingFrame,
29
29
  )
30
30
  from pipecat.processors.frame_processor import FrameDirection
31
- from pipecat.services.stt_service import STTService
31
+ from pipecat.services.stt_service import WebsocketSTTService
32
32
  from pipecat.transcriptions.language import Language
33
33
  from pipecat.utils.time import time_now_iso8601
34
34
  from pipecat.utils.tracing.service_decorators import traced_stt
35
35
 
36
36
  try:
37
- import websockets
38
37
  from websockets.asyncio.client import connect as websocket_connect
39
38
  from websockets.protocol import State
40
39
  except ModuleNotFoundError as e:
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
124
123
  return cls(**json.loads(json_str))
125
124
 
126
125
 
127
- class CartesiaSTTService(STTService):
126
+ class CartesiaSTTService(WebsocketSTTService):
128
127
  """Speech-to-text service using Cartesia Live API.
129
128
 
130
129
  Provides real-time speech transcription through WebSocket connection
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
176
175
  self.set_model_name(merged_options.model)
177
176
  self._api_key = api_key
178
177
  self._base_url = base_url or "api.cartesia.ai"
179
- self._connection = None
180
- self._receiver_task = None
178
+ self._receive_task = None
181
179
 
182
180
  def can_generate_metrics(self) -> bool:
183
181
  """Check if the service can generate processing metrics.
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
214
212
  await super().cancel(frame)
215
213
  await self._disconnect()
216
214
 
215
+ async def start_metrics(self):
216
+ """Start performance metrics collection for transcription processing."""
217
+ await self.start_ttfb_metrics()
218
+ await self.start_processing_metrics()
219
+
220
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
221
+ """Process incoming frames and handle speech events.
222
+
223
+ Args:
224
+ frame: The frame to process.
225
+ direction: Direction of frame flow in the pipeline.
226
+ """
227
+ await super().process_frame(frame, direction)
228
+
229
+ if isinstance(frame, UserStartedSpeakingFrame):
230
+ await self.start_metrics()
231
+ elif isinstance(frame, UserStoppedSpeakingFrame):
232
+ # Send finalize command to flush the transcription session
233
+ if self._websocket and self._websocket.state is State.OPEN:
234
+ await self._websocket.send("finalize")
235
+
217
236
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
218
237
  """Process audio data for speech-to-text transcription.
219
238
 
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
224
243
  None - transcription results are handled via WebSocket responses.
225
244
  """
226
245
  # If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
227
- if not self._connection or self._connection.state is State.CLOSED:
246
+ if not self._websocket or self._websocket.state is State.CLOSED:
228
247
  await self._connect()
229
248
 
230
- await self._connection.send(audio)
249
+ await self._websocket.send(audio)
231
250
  yield None
232
251
 
233
252
  async def _connect(self):
234
- params = self._settings.to_dict()
235
- ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
236
- logger.debug(f"Connecting to Cartesia: {ws_url}")
237
- headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
253
+ await self._connect_websocket()
254
+
255
+ if self._websocket and not self._receive_task:
256
+ self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
238
257
 
258
+ async def _disconnect(self):
259
+ if self._receive_task:
260
+ await self.cancel_task(self._receive_task)
261
+ self._receive_task = None
262
+
263
+ await self._disconnect_websocket()
264
+
265
+ async def _connect_websocket(self):
239
266
  try:
240
- self._connection = await websocket_connect(ws_url, additional_headers=headers)
241
- # Setup the receiver task to handle the incoming messages from the Cartesia server
242
- if self._receiver_task is None or self._receiver_task.done():
243
- self._receiver_task = asyncio.create_task(self._receive_messages())
244
- logger.debug(f"Connected to Cartesia")
267
+ if self._websocket and self._websocket.state is State.OPEN:
268
+ return
269
+ logger.debug("Connecting to Cartesia STT")
270
+
271
+ params = self._settings.to_dict()
272
+ ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
273
+ headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
274
+
275
+ self._websocket = await websocket_connect(ws_url, additional_headers=headers)
276
+ await self._call_event_handler("on_connected")
245
277
  except Exception as e:
246
278
  logger.error(f"{self}: unable to connect to Cartesia: {e}")
247
279
 
248
- async def _receive_messages(self):
280
+ async def _disconnect_websocket(self):
249
281
  try:
250
- while True:
251
- if not self._connection or self._connection.state is State.CLOSED:
252
- break
253
-
254
- message = await self._connection.recv()
255
- try:
256
- data = json.loads(message)
257
- await self._process_response(data)
258
- except json.JSONDecodeError:
259
- logger.warning(f"Received non-JSON message: {message}")
260
- except asyncio.CancelledError:
261
- pass
262
- except websockets.exceptions.ConnectionClosed as e:
263
- logger.debug(f"WebSocket connection closed: {e}")
282
+ if self._websocket and self._websocket.state is State.OPEN:
283
+ logger.debug("Disconnecting from Cartesia STT")
284
+ await self._websocket.close()
264
285
  except Exception as e:
265
- logger.error(f"Error in message receiver: {e}")
286
+ logger.error(f"{self} error closing websocket: {e}")
287
+ finally:
288
+ self._websocket = None
289
+ await self._call_event_handler("on_disconnected")
290
+
291
+ def _get_websocket(self):
292
+ if self._websocket:
293
+ return self._websocket
294
+ raise Exception("Websocket not connected")
295
+
296
+ async def _process_messages(self):
297
+ async for message in self._get_websocket():
298
+ try:
299
+ data = json.loads(message)
300
+ await self._process_response(data)
301
+ except json.JSONDecodeError:
302
+ logger.warning(f"Received non-JSON message: {message}")
303
+
304
+ async def _receive_messages(self):
305
+ while True:
306
+ await self._process_messages()
307
+ # Cartesia times out after 5 minutes of inactivity (no keepalive
308
+ # mechanism is available). So, we try to reconnect.
309
+ logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
310
+ await self._connect_websocket()
266
311
 
267
312
  async def _process_response(self, data):
268
313
  if "type" in data:
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
316
361
  language,
317
362
  )
318
363
  )
319
-
320
- async def _disconnect(self):
321
- if self._receiver_task:
322
- self._receiver_task.cancel()
323
- try:
324
- await self._receiver_task
325
- except asyncio.CancelledError:
326
- pass
327
- except Exception as e:
328
- logger.exception(f"Unexpected exception while cancelling task: {e}")
329
- self._receiver_task = None
330
-
331
- if self._connection and self._connection.state is State.OPEN:
332
- logger.debug("Disconnecting from Cartesia")
333
-
334
- await self._connection.close()
335
- self._connection = None
336
-
337
- async def start_metrics(self):
338
- """Start performance metrics collection for transcription processing."""
339
- await self.start_ttfb_metrics()
340
- await self.start_processing_metrics()
341
-
342
- async def process_frame(self, frame: Frame, direction: FrameDirection):
343
- """Process incoming frames and handle speech events.
344
-
345
- Args:
346
- frame: The frame to process.
347
- direction: Direction of frame flow in the pipeline.
348
- """
349
- await super().process_frame(frame, direction)
350
-
351
- if isinstance(frame, UserStartedSpeakingFrame):
352
- await self.start_metrics()
353
- elif isinstance(frame, UserStoppedSpeakingFrame):
354
- # Send finalize command to flush the transcription session
355
- if self._connection and self._connection.state is State.OPEN:
356
- await self._connection.send("finalize")
@@ -345,10 +345,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
345
345
  try:
346
346
  if self._websocket and self._websocket.state is State.OPEN:
347
347
  return
348
- logger.debug("Connecting to Cartesia")
348
+ logger.debug("Connecting to Cartesia TTS")
349
349
  self._websocket = await websocket_connect(
350
350
  f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
351
351
  )
352
+ await self._call_event_handler("on_connected")
352
353
  except Exception as e:
353
354
  logger.error(f"{self} initialization error: {e}")
354
355
  self._websocket = None
@@ -366,6 +367,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
366
367
  finally:
367
368
  self._context_id = None
368
369
  self._websocket = None
370
+ await self._call_event_handler("on_disconnected")
369
371
 
370
372
  def _get_websocket(self):
371
373
  if self._websocket:
@@ -205,6 +205,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
205
205
  additional_headers={"Authorization": f"Token {self._api_key}"},
206
206
  )
207
207
  logger.debug("Connected to Deepgram Flux Websocket")
208
+ await self._call_event_handler("on_connected")
208
209
  except Exception as e:
209
210
  logger.error(f"{self} initialization error: {e}")
210
211
  self._websocket = None
@@ -225,6 +226,9 @@ class DeepgramFluxSTTService(WebsocketSTTService):
225
226
  await self._websocket.close()
226
227
  except Exception as e:
227
228
  logger.error(f"{self} error closing websocket: {e}")
229
+ finally:
230
+ self._websocket = None
231
+ await self._call_event_handler("on_disconnected")
228
232
 
229
233
  async def _send_close_stream(self) -> None:
230
234
  """Sends a CloseStream control message to the Deepgram Flux WebSocket API.
@@ -172,16 +172,24 @@ def build_elevenlabs_voice_settings(
172
172
 
173
173
 
174
174
  def calculate_word_times(
175
- alignment_info: Mapping[str, Any], cumulative_time: float
176
- ) -> List[Tuple[str, float]]:
175
+ alignment_info: Mapping[str, Any],
176
+ cumulative_time: float,
177
+ partial_word: str = "",
178
+ partial_word_start_time: float = 0.0,
179
+ ) -> tuple[List[Tuple[str, float]], str, float]:
177
180
  """Calculate word timestamps from character alignment information.
178
181
 
179
182
  Args:
180
183
  alignment_info: Character alignment data from ElevenLabs API.
181
184
  cumulative_time: Base time offset for this chunk.
185
+ partial_word: Partial word carried over from previous chunk.
186
+ partial_word_start_time: Start time of the partial word.
182
187
 
183
188
  Returns:
184
- List of (word, timestamp) tuples.
189
+ Tuple of (word_times, new_partial_word, new_partial_word_start_time):
190
+ - word_times: List of (word, timestamp) tuples for complete words
191
+ - new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
192
+ - new_partial_word_start_time: Start time of the incomplete word
185
193
  """
186
194
  chars = alignment_info["chars"]
187
195
  char_start_times_ms = alignment_info["charStartTimesMs"]
@@ -190,41 +198,37 @@ def calculate_word_times(
190
198
  logger.error(
191
199
  f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
192
200
  )
193
- return []
201
+ return ([], partial_word, partial_word_start_time)
194
202
 
195
203
  # Build words and track their start positions
196
204
  words = []
197
- word_start_indices = []
198
- current_word = ""
199
- word_start_index = None
205
+ word_start_times = []
206
+ current_word = partial_word # Start with any partial word from previous chunk
207
+ word_start_time = partial_word_start_time if partial_word else None
200
208
 
201
209
  for i, char in enumerate(chars):
202
210
  if char == " ":
203
211
  # End of current word
204
212
  if current_word: # Only add non-empty words
205
213
  words.append(current_word)
206
- word_start_indices.append(word_start_index)
214
+ word_start_times.append(word_start_time)
207
215
  current_word = ""
208
- word_start_index = None
216
+ word_start_time = None
209
217
  else:
210
218
  # Building a word
211
- if word_start_index is None: # First character of new word
212
- word_start_index = i
219
+ if word_start_time is None: # First character of new word
220
+ # Convert from milliseconds to seconds and add cumulative offset
221
+ word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
213
222
  current_word += char
214
223
 
215
- # Handle the last word if there's no trailing space
216
- if current_word and word_start_index is not None:
217
- words.append(current_word)
218
- word_start_indices.append(word_start_index)
224
+ # Build result for complete words
225
+ word_times = list(zip(words, word_start_times))
219
226
 
220
- # Calculate timestamps for each word
221
- word_times = []
222
- for word, start_idx in zip(words, word_start_indices):
223
- # Convert from milliseconds to seconds and add cumulative offset
224
- start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
225
- word_times.append((word, start_time_seconds))
227
+ # Return any incomplete word at the end of this chunk
228
+ new_partial_word = current_word if current_word else ""
229
+ new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0
226
230
 
227
- return word_times
231
+ return (word_times, new_partial_word, new_partial_word_start_time)
228
232
 
229
233
 
230
234
  class ElevenLabsTTSService(AudioContextWordTTSService):
@@ -336,6 +340,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
336
340
  # there's an interruption or TTSStoppedFrame.
337
341
  self._started = False
338
342
  self._cumulative_time = 0
343
+ # Track partial words that span across alignment chunks
344
+ self._partial_word = ""
345
+ self._partial_word_start_time = 0.0
339
346
 
340
347
  # Context management for v1 multi API
341
348
  self._context_id = None
@@ -526,6 +533,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
526
533
  url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
527
534
  )
528
535
 
536
+ await self._call_event_handler("on_connected")
529
537
  except Exception as e:
530
538
  self.logger.error(f"{self} initialization error: {e}")
531
539
  self._websocket = None
@@ -544,6 +552,11 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
544
552
  logger.debug("Disconnected from ElevenLabs")
545
553
  except Exception as e:
546
554
  self.logger.error(f"{self} error closing websocket: {e}")
555
+ finally:
556
+ self._started = False
557
+ self._context_id = None
558
+ self._websocket = None
559
+ await self._call_event_handler("on_disconnected")
547
560
 
548
561
  def _get_websocket(self):
549
562
  if self._websocket:
@@ -571,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
571
584
  logger.error(f"Error closing context on interruption: {e}")
572
585
  self._context_id = None
573
586
  self._started = False
587
+ self._partial_word = ""
588
+ self._partial_word_start_time = 0.0
574
589
 
575
590
  async def _receive_messages(self):
576
591
  """Handle incoming WebSocket messages from ElevenLabs."""
@@ -610,7 +625,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
610
625
 
611
626
  if msg.get("alignment"):
612
627
  alignment = msg["alignment"]
613
- word_times = calculate_word_times(alignment, self._cumulative_time)
628
+ word_times, self._partial_word, self._partial_word_start_time = (
629
+ calculate_word_times(
630
+ alignment,
631
+ self._cumulative_time,
632
+ self._partial_word,
633
+ self._partial_word_start_time,
634
+ )
635
+ )
614
636
 
615
637
  if word_times:
616
638
  await self.add_word_timestamps(word_times)
@@ -685,6 +707,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
685
707
  yield TTSStartedFrame()
686
708
  self._started = True
687
709
  self._cumulative_time = 0
710
+ self._partial_word = ""
711
+ self._partial_word_start_time = 0.0
688
712
  # If a context ID does not exist, create a new one and
689
713
  # register it. If an ID exists, that means the Pipeline is
690
714
  # configured for allow_interruptions=False, so continue
@@ -758,6 +782,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
758
782
  base_url: str = "https://api.elevenlabs.io",
759
783
  sample_rate: Optional[int] = None,
760
784
  params: Optional[InputParams] = None,
785
+ aggregate_sentences: Optional[bool] = True,
761
786
  **kwargs,
762
787
  ):
763
788
  """Initialize the ElevenLabs HTTP TTS service.
@@ -770,10 +795,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
770
795
  base_url: Base URL for ElevenLabs HTTP API.
771
796
  sample_rate: Audio sample rate. If None, uses default.
772
797
  params: Additional input parameters for voice customization.
798
+ aggregate_sentences: Whether to aggregate sentences within the TTSService.
773
799
  **kwargs: Additional arguments passed to the parent service.
774
800
  """
775
801
  super().__init__(
776
- aggregate_sentences=True,
802
+ aggregate_sentences=aggregate_sentences,
777
803
  push_text_frames=False,
778
804
  push_stop_frames=True,
779
805
  sample_rate=sample_rate,
@@ -811,6 +837,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
811
837
  # Store previous text for context within a turn
812
838
  self._previous_text = ""
813
839
 
840
+ # Track partial words that span across alignment chunks
841
+ self._partial_word = ""
842
+ self._partial_word_start_time = 0.0
843
+
814
844
  def language_to_service_language(self, language: Language) -> Optional[str]:
815
845
  """Convert pipecat Language to ElevenLabs language code.
816
846
 
@@ -838,6 +868,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
838
868
  self._cumulative_time = 0
839
869
  self._started = False
840
870
  self._previous_text = ""
871
+ self._partial_word = ""
872
+ self._partial_word_start_time = 0.0
841
873
  logger.debug(f"{self}: Reset internal state")
842
874
 
843
875
  async def start(self, frame: StartFrame):
@@ -872,11 +904,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
872
904
  def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
873
905
  """Calculate word timing from character alignment data.
874
906
 
907
+ This method handles partial words that may span across multiple alignment chunks.
908
+
875
909
  Args:
876
910
  alignment_info: Character timing data from ElevenLabs.
877
911
 
878
912
  Returns:
879
- List of (word, timestamp) pairs.
913
+ List of (word, timestamp) pairs for complete words in this chunk.
880
914
 
881
915
  Example input data::
882
916
 
@@ -902,30 +936,28 @@ class ElevenLabsHttpTTSService(WordTTSService):
902
936
  # Build the words and find their start times
903
937
  words = []
904
938
  word_start_times = []
905
- current_word = ""
906
- first_char_idx = -1
939
+ # Start with any partial word from previous chunk
940
+ current_word = self._partial_word
941
+ word_start_time = self._partial_word_start_time if self._partial_word else None
907
942
 
908
943
  for i, char in enumerate(chars):
909
944
  if char == " ":
910
945
  if current_word: # Only add non-empty words
911
946
  words.append(current_word)
912
- # Use time of the first character of the word, offset by cumulative time
913
- word_start_times.append(
914
- self._cumulative_time + char_start_times[first_char_idx]
915
- )
947
+ word_start_times.append(word_start_time)
916
948
  current_word = ""
917
- first_char_idx = -1
949
+ word_start_time = None
918
950
  else:
919
- if not current_word: # This is the first character of a new word
920
- first_char_idx = i
951
+ if word_start_time is None: # First character of a new word
952
+ # Use time of the first character of the word, offset by cumulative time
953
+ word_start_time = self._cumulative_time + char_start_times[i]
921
954
  current_word += char
922
955
 
923
- # Don't forget the last word if there's no trailing space
924
- if current_word and first_char_idx >= 0:
925
- words.append(current_word)
926
- word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
956
+ # Store any incomplete word at the end of this chunk
957
+ self._partial_word = current_word if current_word else ""
958
+ self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0
927
959
 
928
- # Create word-time pairs
960
+ # Create word-time pairs for complete words only
929
961
  word_times = list(zip(words, word_start_times))
930
962
 
931
963
  return word_times
@@ -961,6 +993,9 @@ class ElevenLabsHttpTTSService(WordTTSService):
961
993
  if self._voice_settings:
962
994
  payload["voice_settings"] = self._voice_settings
963
995
 
996
+ if self._settings["apply_text_normalization"] is not None:
997
+ payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
998
+
964
999
  language = self._settings["language"]
965
1000
  if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
966
1001
  payload["language_code"] = language
@@ -981,8 +1016,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
981
1016
  }
982
1017
  if self._settings["optimize_streaming_latency"] is not None:
983
1018
  params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
984
- if self._settings["apply_text_normalization"] is not None:
985
- params["apply_text_normalization"] = self._settings["apply_text_normalization"]
986
1019
 
987
1020
  self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")
988
1021
 
@@ -1045,6 +1078,14 @@ class ElevenLabsHttpTTSService(WordTTSService):
1045
1078
  logger.error(f"Error processing response: {e}", exc_info=True)
1046
1079
  continue
1047
1080
 
1081
+ # After processing all chunks, emit any remaining partial word
1082
+ # since this is the end of the utterance
1083
+ if self._partial_word:
1084
+ final_word_time = [(self._partial_word, self._partial_word_start_time)]
1085
+ await self.add_word_timestamps(final_word_time)
1086
+ self._partial_word = ""
1087
+ self._partial_word_start_time = 0.0
1088
+
1048
1089
  # After processing all chunks, add the total utterance duration
1049
1090
  # to the cumulative time to ensure next utterance starts after this one
1050
1091
  if utterance_duration > 0:
@@ -225,6 +225,8 @@ class FishAudioTTSService(InterruptibleTTSService):
225
225
  start_message = {"event": "start", "request": {"text": "", **self._settings}}
226
226
  await self._websocket.send(ormsgpack.packb(start_message))
227
227
  logger.debug("Sent start message to Fish Audio")
228
+
229
+ await self._call_event_handler("on_connected")
228
230
  except Exception as e:
229
231
  logger.error(f"Fish Audio initialization error: {e}")
230
232
  self._websocket = None
@@ -245,6 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
245
247
  self._request_id = None
246
248
  self._started = False
247
249
  self._websocket = None
250
+ await self._call_event_handler("on_disconnected")
248
251
 
249
252
  async def flush_audio(self):
250
253
  """Flush any buffered audio by sending a flush event to Fish Audio."""
@@ -730,6 +730,8 @@ class GoogleSTTService(STTService):
730
730
  self._request_queue = asyncio.Queue()
731
731
  self._streaming_task = self.create_task(self._stream_audio())
732
732
 
733
+ await self._call_event_handler("on_connected")
734
+
733
735
  async def _disconnect(self):
734
736
  """Clean up streaming recognition resources."""
735
737
  if self._streaming_task:
@@ -737,6 +739,8 @@ class GoogleSTTService(STTService):
737
739
  await self.cancel_task(self._streaming_task)
738
740
  self._streaming_task = None
739
741
 
742
+ await self._call_event_handler("on_disconnected")
743
+
740
744
  async def _request_generator(self):
741
745
  """Generates requests for the streaming recognize method."""
742
746
  recognizer_path = f"projects/{self._project_id}/locations/{self._location}/recognizers/_"
@@ -222,6 +222,7 @@ class LmntTTSService(InterruptibleTTSService):
222
222
  # Send initialization message
223
223
  await self._websocket.send(json.dumps(init_msg))
224
224
 
225
+ await self._call_event_handler("on_connected")
225
226
  except Exception as e:
226
227
  logger.error(f"{self} initialization error: {e}")
227
228
  self._websocket = None
@@ -243,6 +244,7 @@ class LmntTTSService(InterruptibleTTSService):
243
244
  finally:
244
245
  self._started = False
245
246
  self._websocket = None
247
+ await self._call_event_handler("on_disconnected")
246
248
 
247
249
  def _get_websocket(self):
248
250
  """Get the WebSocket connection if available."""
@@ -293,6 +293,8 @@ class NeuphonicTTSService(InterruptibleTTSService):
293
293
  headers = {"x-api-key": self._api_key}
294
294
 
295
295
  self._websocket = await websocket_connect(url, additional_headers=headers)
296
+
297
+ await self._call_event_handler("on_connected")
296
298
  except Exception as e:
297
299
  logger.error(f"{self} initialization error: {e}")
298
300
  self._websocket = None
@@ -311,6 +313,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
311
313
  finally:
312
314
  self._started = False
313
315
  self._websocket = None
316
+ await self._call_event_handler("on_disconnected")
314
317
 
315
318
  async def _receive_messages(self):
316
319
  """Receive and process messages from Neuphonic WebSocket."""