dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/inworld/tts.py (added)

@@ -0,0 +1,592 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Inworld AI Text-to-Speech Service Implementation.

This module provides integration with Inworld AI's HTTP-based TTS API, enabling
both streaming and non-streaming text-to-speech synthesis with high-quality,
natural-sounding voices.

Key Features:

- HTTP streaming and non-streaming API support for flexible audio generation
- Multiple voice options (Ashley, Hades, etc.)
- Automatic language detection from input text (no manual language setting required)
- Real-time audio chunk processing with proper buffering
- WAV header handling and audio format conversion
- Comprehensive error handling and metrics tracking

Technical Implementation:

- Uses aiohttp for HTTP connections
- Implements both JSON line-by-line parsing (streaming) and complete response (non-streaming)
- Handles base64-encoded audio data with proper decoding
- Manages audio continuity to prevent clicks and artifacts
- Integrates with Pipecat's frame-based pipeline architecture

Examples::

    async with aiohttp.ClientSession() as session:
        # Streaming mode (default) - real-time audio generation
        tts = InworldTTSService(
            api_key=os.getenv("INWORLD_API_KEY"),
            aiohttp_session=session,
            voice_id="Ashley",
            model="inworld-tts-1",
            streaming=True,  # Default
            params=InworldTTSService.InputParams(
                temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
            ),
        )

        # Non-streaming mode - complete audio generation then playback
        tts = InworldTTSService(
            api_key=os.getenv("INWORLD_API_KEY"),
            aiohttp_session=session,
            voice_id="Ashley",
            model="inworld-tts-1",
            streaming=False,
            params=InworldTTSService.InputParams(
                temperature=0.8,
            ),
        )
"""

import base64
import json
from typing import AsyncGenerator, Optional

import aiohttp
from loguru import logger
from pydantic import BaseModel

from pipecat.frames.frames import (
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    StartFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
)
from pipecat.services.tts_service import TTSService
from pipecat.utils.tracing.service_decorators import traced_tts


class InworldTTSService(TTSService):
    """Inworld AI HTTP-based Text-to-Speech Service.

    This unified service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
    architecture. It supports both streaming and non-streaming modes, providing flexible
    speech synthesis with natural-sounding voices.

    Key Features:

    - **Streaming Mode**: Real-time HTTP streaming for minimal latency
    - **Non-Streaming Mode**: Complete audio synthesis then chunked playback
    - Multiple voice options (Ashley, Hades, etc.)
    - High-quality audio output (48kHz LINEAR16 PCM)
    - Automatic audio format handling and header stripping
    - Comprehensive error handling and recovery
    - Built-in performance metrics and monitoring
    - Unified interface for both modes

    Technical Architecture:

    - Uses aiohttp for non-blocking HTTP requests
    - **Streaming**: Implements JSON line-by-line streaming protocol
    - **Non-Streaming**: Single HTTP POST with complete response
    - Processes base64-encoded audio chunks in real-time or batch
    - Manages audio continuity to prevent artifacts
    - Integrates with Pipecat's frame-based pipeline system

    Supported Configuration:

    - Voice Selection: Ashley, Hades, and other Inworld voices
    - Models: inworld-tts-1 and other available models
    - Audio Formats: LINEAR16 PCM at various sample rates
    - Language Detection: Automatically inferred from input text (no explicit language setting required)
    - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis

    Examples::

        async with aiohttp.ClientSession() as session:
            # Streaming mode (default) - Real-time audio generation
            tts_streaming = InworldTTSService(
                api_key=os.getenv("INWORLD_API_KEY"),
                aiohttp_session=session,
                voice_id="Ashley",
                model="inworld-tts-1",
                streaming=True,  # Default behavior
                params=InworldTTSService.InputParams(
                    temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
                ),
            )

            # Non-streaming mode - Complete audio then playback
            tts_complete = InworldTTSService(
                api_key=os.getenv("INWORLD_API_KEY"),
                aiohttp_session=session,
                voice_id="Hades",
                model="inworld-tts-1-max",
                streaming=False,
                params=InworldTTSService.InputParams(
                    temperature=0.8,
                ),
            )
    """

    class InputParams(BaseModel):
        """Optional input parameters for Inworld TTS configuration.

        Parameters:
            temperature: Voice temperature control for synthesis variability (e.g., 0.8).
                Valid range: [0, 2]. Higher values increase variability.

        Note:
            Language is automatically inferred from the input text by Inworld's TTS models,
            so no explicit language parameter is required.
        """

        temperature: Optional[float] = None  # Optional temperature control (range: [0, 2])

    def __init__(
        self,
        *,
        api_key: str,
        aiohttp_session: aiohttp.ClientSession,
        voice_id: str = "Ashley",
        model: str = "inworld-tts-1",
        streaming: bool = True,
        sample_rate: Optional[int] = None,
        encoding: str = "LINEAR16",
        params: Optional[InputParams] = None,
        **kwargs,
    ):
        """Initialize the Inworld TTS service.

        Sets up the TTS service with Inworld AI's API configuration.
        This constructor prepares all necessary parameters for speech synthesis.

        Args:
            api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
                Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
            aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
                for proper connection pooling and resource management.
            voice_id: Voice selection for speech synthesis. Common options include:
                - "Ashley": Clear, professional female voice (default)
                - "Hades": Deep, authoritative male voice
                - And many more available in your Inworld account
            model: TTS model to use for speech synthesis:
                - "inworld-tts-1": Standard quality model (default)
                - "inworld-tts-1-max": Higher quality model
                - Other models as available in your Inworld account
            streaming: Whether to use streaming mode (True) or non-streaming mode (False).
                - True: Real-time audio chunks as they're generated (lower latency)
                - False: Complete audio file generated first, then chunked for playback (simpler)
                The base URL is automatically selected based on this mode:
                - Streaming: "https://api.inworld.ai/tts/v1/voice:stream"
                - Non-streaming: "https://api.inworld.ai/tts/v1/voice"
            sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
                Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
            encoding: Audio encoding format. Supported options:
                - "LINEAR16" (default) - Uncompressed PCM, best quality
                - Other formats as supported by Inworld API
            params: Optional input parameters for additional configuration. Use this to specify:
                - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
                Language is automatically inferred from input text.
            **kwargs: Additional arguments passed to the parent TTSService class.

        Note:
            The aiohttp_session parameter is required because Inworld's HTTP API
            benefits from connection reuse and proper async session management.
        """
        # Initialize parent TTSService with audio configuration
        super().__init__(sample_rate=sample_rate, **kwargs)

        # Use provided params or create default configuration
        params = params or InworldTTSService.InputParams()

        # Store core configuration for API requests
        self._api_key = api_key  # Authentication credentials
        self._session = aiohttp_session  # HTTP session for requests
        self._streaming = streaming  # Streaming mode selection

        # Set base URL based on streaming mode
        if streaming:
            self._base_url = "https://api.inworld.ai/tts/v1/voice:stream"  # Streaming endpoint
        else:
            self._base_url = "https://api.inworld.ai/tts/v1/voice"  # Non-streaming endpoint

        # Build settings dictionary that matches Inworld's API expectations.
        # This will be sent as the JSON payload in each TTS request.
        # Note: Language is automatically inferred from text by Inworld's models.
        self._settings = {
            "voiceId": voice_id,  # Voice selection from direct parameter
            "modelId": model,  # TTS model selection from direct parameter
            "audio_config": {  # Audio format configuration
                "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
                "sample_rate_hertz": 0,  # Will be set in start() from parent service
            },
        }

        # Add optional temperature parameter if provided (valid range: [0, 2])
        if params and params.temperature is not None:
            self._settings["temperature"] = params.temperature

        # Register voice and model with parent service for metrics and tracking
        self.set_voice(voice_id)  # Used for logging and metrics
        self.set_model_name(model)  # Used for performance tracking

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Inworld TTS service supports metrics generation.
        """
        return True

    async def start(self, frame: StartFrame):
        """Start the Inworld TTS service.

        Args:
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
        self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate

    async def stop(self, frame: EndFrame):
        """Stop the Inworld TTS service.

        Args:
            frame: The end frame.
        """
        await super().stop(frame)

    async def cancel(self, frame: CancelFrame):
        """Cancel the Inworld TTS service.

        Args:
            frame: The cancel frame.
        """
        await super().cancel(frame)

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Inworld's HTTP API.

        This is the core TTS processing function that adapts its behavior based
        on the streaming mode:

        **Streaming Mode (streaming=True)**:

        1. Sends text to Inworld's streaming TTS endpoint
        2. Receives JSON-streamed audio chunks in real-time
        3. Processes and cleans audio data (removes WAV headers, validates content)
        4. Yields audio frames for immediate playback in the pipeline

        **Non-Streaming Mode (streaming=False)**:

        1. Sends text to Inworld's non-streaming TTS endpoint
        2. Receives complete audio file as base64-encoded response
        3. Processes entire audio and chunks for playback
        4. Yields audio frames in manageable pieces

        Technical Details:

        - **Streaming**: Uses HTTP streaming with JSON line-by-line responses
        - **Non-Streaming**: Single HTTP POST with complete JSON response
        - Each audio chunk contains base64-encoded audio data
        - Implements buffering to handle partial data (streaming mode)
        - Strips WAV headers to prevent audio artifacts/clicks
        - Provides optimized audio delivery for each mode

        Args:
            text: The text to synthesize into speech.

        Yields:
            Frame: Audio frames containing the synthesized speech, plus control frames.

        Raises:
            ErrorFrame: If API errors occur or audio processing fails.
        """
        logger.debug(f"{self}: Generating TTS [{text}] (streaming={self._streaming})")

        # ==== STEP 1: PREPARE API REQUEST ====
        # Build the JSON payload according to Inworld's API specification.
        # Note: Language is automatically inferred from the input text by Inworld's models.
        payload = {
            "text": text,  # Text to synthesize
            "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
            "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
            "audio_config": self._settings["audio_config"],  # Audio format settings (LINEAR16, 48kHz)
        }

        # Add optional temperature parameter if configured (valid range: [0, 2])
        if "temperature" in self._settings:
            payload["temperature"] = self._settings["temperature"]

        # Set up HTTP headers for authentication and content type.
        # Inworld requires Basic auth with a base64-encoded API key.
        headers = {
            "Authorization": f"Basic {self._api_key}",  # Base64 API key from Inworld Portal
            "Content-Type": "application/json",  # JSON request body
        }

        try:
            # ==== STEP 2: INITIALIZE METRICS AND PROCESSING ====
            # Start measuring Time To First Byte (TTFB) for performance tracking.
            await self.start_ttfb_metrics()

            # Signal to the pipeline that TTS generation has started so that
            # downstream processors can prepare for incoming audio.
            yield TTSStartedFrame()

            # ==== STEP 3: MAKE HTTP REQUEST (MODE-SPECIFIC) ====
            async with self._session.post(
                self._base_url, json=payload, headers=headers
            ) as response:
                # ==== STEP 4: HANDLE HTTP ERRORS ====
                # Check for API errors (expired keys, invalid requests, etc.)
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"Inworld API error: {error_text}")
                    await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
                    return

                # ==== STEP 5: PROCESS RESPONSE (MODE-SPECIFIC) ====
                if self._streaming:
                    # Stream processing: JSON line-by-line with real-time audio
                    async for frame in self._process_streaming_response(response):
                        yield frame
                else:
                    # Non-stream processing: complete JSON response with batch audio
                    async for frame in self._process_non_streaming_response(response):
                        yield frame

            # ==== STEP 6: FINALIZE METRICS ====
            # Start usage metrics tracking after successful completion.
            await self.start_tts_usage_metrics(text)

        except Exception as e:
            # ==== STEP 7: ERROR HANDLING ====
            # Log any unexpected errors and notify the pipeline.
            logger.error(f"{self} exception: {e}")
            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
        finally:
            # ==== STEP 8: CLEANUP AND COMPLETION ====
            # Always stop metrics tracking, even if errors occurred.
            await self.stop_all_metrics()

            # Signal to the pipeline that TTS generation is complete so that
            # downstream processors can finalize audio processing.
            yield TTSStoppedFrame()

    async def _process_streaming_response(
        self, response: aiohttp.ClientResponse
    ) -> AsyncGenerator[Frame, None]:
        """Process streaming JSON response with real-time audio chunks.

        This method handles Inworld's streaming endpoint response format:

        - JSON lines containing base64-encoded audio chunks
        - Real-time processing as data arrives
        - Line buffering to handle partial JSON data

        Args:
            response: The aiohttp response object from the streaming endpoint.

        Yields:
            Frame: Audio frames as they're processed from the stream.
        """
        # Inworld streams JSON lines where each line contains audio data, so we
        # accumulate incoming text in a buffer and process complete lines. The
        # buffer handles JSON lines that are split across HTTP chunks.
        buffer = ""

        # Read the HTTP response in manageable chunks (1KB each) to avoid
        # memory issues with large responses.
        async for chunk in response.content.iter_chunked(1024):
            if not chunk:
                continue

            # Decode the binary chunk to text and add it to the line buffer;
            # each chunk may contain partial JSON lines.
            buffer += chunk.decode("utf-8")

            # Process all complete lines in the buffer (lines ending with \n),
            # leaving any partial line in the buffer for the next iteration.
            while "\n" in buffer:
                # Split on the first newline, keeping the remainder in the buffer
                line, buffer = buffer.split("\n", 1)
                line_str = line.strip()

                # Skip empty lines (common in streaming responses)
                if not line_str:
                    continue

                try:
                    # Parse the JSON line - it should contain audio data
                    chunk_data = json.loads(line_str)

                    # Check if this line contains audio content.
                    # Inworld's response format: {"result": {"audioContent": "base64data"}}
                    if "result" in chunk_data and "audioContent" in chunk_data["result"]:
                        # Process the audio chunk
                        await self.stop_ttfb_metrics()
                        async for frame in self._process_audio_chunk(
                            base64.b64decode(chunk_data["result"]["audioContent"])
                        ):
                            yield frame

                except json.JSONDecodeError:
                    # Ignore malformed JSON lines - partial data is normal in
                    # HTTP streaming scenarios.
                    continue

    async def _process_non_streaming_response(
        self, response: aiohttp.ClientResponse
    ) -> AsyncGenerator[Frame, None]:
        """Process complete JSON response with full audio content.

        This method handles Inworld's non-streaming endpoint response format:

        - Single JSON response with complete base64-encoded audio
        - Full audio download then chunked playback
        - Simpler processing without line buffering

        Args:
            response: The aiohttp response object from the non-streaming endpoint.

        Yields:
            Frame: Audio frames chunked from the complete audio.
        """
        # Parse the complete JSON response containing base64 audio data
        response_data = await response.json()

        # Extract and validate the base64-encoded audio content
        if "audioContent" not in response_data:
            logger.error("No audioContent in Inworld API response")
            await self.push_error(ErrorFrame("No audioContent in response"))
            return

        # Decode the base64 audio data to binary
        audio_data = base64.b64decode(response_data["audioContent"])

        # Strip the WAV header if present (Inworld may include one). This
        # prevents audio clicks and ensures clean playback.
        if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
            audio_data = audio_data[44:]

        # Chunk the complete audio for streaming playback so the pipeline can
        # process it in manageable pieces.
        CHUNK_SIZE = self.chunk_size

        for i in range(0, len(audio_data), CHUNK_SIZE):
            chunk = audio_data[i : i + CHUNK_SIZE]
            if len(chunk) > 0:
                await self.stop_ttfb_metrics()
                yield TTSAudioRawFrame(
                    audio=chunk,
                    sample_rate=self.sample_rate,
                    num_channels=1,
                )

    async def _process_audio_chunk(self, audio_chunk: bytes) -> AsyncGenerator[Frame, None]:
        """Process a single audio chunk (common logic for both modes).

        This method handles audio chunk processing that's common to both streaming
        and non-streaming modes:

        - WAV header removal
        - Audio validation
        - Frame creation and yielding

        Args:
            audio_chunk: Raw audio data bytes to process.

        Yields:
            Frame: Audio frame if the chunk contains valid audio data.
        """
        # Skip empty audio chunks, which could create gaps or clicks in playback
        if not audio_chunk:
            return

        # Start with the raw audio data
        audio_data = audio_chunk

        # WAV header removal (critical for audio quality): each chunk may carry
        # its own 44-byte WAV header, whose metadata sounds like a click if
        # played, so we strip it from EVERY chunk, not just the first one.
        if (
            len(audio_chunk) > 44  # Ensure the chunk is large enough
            and audio_chunk.startswith(b"RIFF")  # Check for WAV header magic bytes
        ):
            # Remove the 44-byte WAV header to get pure audio data
            audio_data = audio_chunk[44:]

        # Only yield frames with actual audio content; empty frames can cause
        # pipeline issues.
        if len(audio_data) > 0:
            # Create a Pipecat audio frame with the processed audio data
            yield TTSAudioRawFrame(
                audio=audio_data,  # Clean audio without headers
                sample_rate=self.sample_rate,  # Configured sample rate (e.g., 48kHz)
                num_channels=1,  # Mono audio
            )