roomkit 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- roomkit/__init__.py +45 -0
- roomkit/_version.py +1 -1
- roomkit/channels/voice.py +728 -0
- roomkit/core/_channel_ops.py +7 -0
- roomkit/core/_inbound.py +4 -0
- roomkit/core/framework.py +177 -1
- roomkit/core/hooks.py +32 -6
- roomkit/models/enums.py +12 -0
- roomkit/sources/__init__.py +4 -4
- roomkit/sources/sse.py +226 -0
- roomkit/voice/__init__.py +99 -0
- roomkit/voice/backends/__init__.py +1 -0
- roomkit/voice/backends/base.py +264 -0
- roomkit/voice/backends/fastrtc.py +467 -0
- roomkit/voice/backends/mock.py +302 -0
- roomkit/voice/base.py +115 -0
- roomkit/voice/events.py +140 -0
- roomkit/voice/stt/__init__.py +1 -0
- roomkit/voice/stt/base.py +58 -0
- roomkit/voice/stt/deepgram.py +214 -0
- roomkit/voice/stt/mock.py +40 -0
- roomkit/voice/tts/__init__.py +1 -0
- roomkit/voice/tts/base.py +58 -0
- roomkit/voice/tts/elevenlabs.py +329 -0
- roomkit/voice/tts/mock.py +51 -0
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/METADATA +11 -2
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/RECORD +29 -12
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/WHEEL +1 -1
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/licenses/LICENSE +0 -0
roomkit/voice/stt/deepgram.py (new file)
@@ -0,0 +1,214 @@
```python
"""Deepgram speech-to-text provider."""

from __future__ import annotations

import asyncio
import logging
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import httpx

from roomkit.voice.base import AudioChunk, TranscriptionResult
from roomkit.voice.stt.base import STTProvider

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent

logger = logging.getLogger(__name__)


@dataclass
class DeepgramConfig:
    """Configuration for Deepgram STT provider."""

    api_key: str
    model: str = "nova-2"
    language: str = "en"
    punctuate: bool = True
    diarize: bool = False
    smart_format: bool = True
    filler_words: bool = False
    # Real-time streaming options
    interim_results: bool = True
    endpointing: int = 300  # ms of silence to end utterance
    vad_events: bool = True


class DeepgramSTTProvider(STTProvider):
    """Deepgram speech-to-text provider with streaming support."""

    def __init__(self, config: DeepgramConfig) -> None:
        self._config = config
        self._client: httpx.AsyncClient | None = None

    @property
    def name(self) -> str:
        return "DeepgramSTT"

    def _get_client(self) -> httpx.AsyncClient:
        if self._client is None:
            self._client = httpx.AsyncClient(
                base_url="https://api.deepgram.com/v1",
                headers={
                    "Authorization": f"Token {self._config.api_key}",
                    "Content-Type": "audio/wav",
                },
                timeout=60.0,
            )
        return self._client

    def _build_query_params(self) -> dict[str, Any]:
        """Build query parameters for Deepgram API."""
        params: dict[str, Any] = {
            "model": self._config.model,
            "language": self._config.language,
            "punctuate": self._config.punctuate,
            "diarize": self._config.diarize,
            "smart_format": self._config.smart_format,
            "filler_words": self._config.filler_words,
        }
        return {k: str(v).lower() if isinstance(v, bool) else v for k, v in params.items()}

    async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
        """Transcribe complete audio to text.

        Args:
            audio: Audio content (URL) or raw audio chunk.

        Returns:
            Transcribed text.
        """
        client = self._get_client()
        params = self._build_query_params()

        # Handle AudioContent (URL-based)
        if hasattr(audio, "url"):
            # Fetch audio from URL
            async with httpx.AsyncClient() as fetch_client:
                resp = await fetch_client.get(audio.url)
                resp.raise_for_status()
                audio_data = resp.content
                content_type = resp.headers.get("content-type", "audio/wav")
        else:
            # Handle AudioChunk (raw bytes)
            audio_data = audio.data
            audio_format = getattr(audio, "format", "wav")

            # For raw PCM formats, set encoding params for Deepgram
            if audio_format in ("pcm_s16le", "linear16", "raw"):
                content_type = "audio/raw"
                params["encoding"] = "linear16"
                params["sample_rate"] = getattr(audio, "sample_rate", 16000)
                params["channels"] = getattr(audio, "channels", 1)
            else:
                content_type = f"audio/{audio_format}"

        # Call Deepgram API
        response = await client.post(
            "/listen",
            params=params,
            content=audio_data,
            headers={"Content-Type": content_type},
        )
        response.raise_for_status()
        result = response.json()

        # Extract transcript
        try:
            transcript: str = result["results"]["channels"][0]["alternatives"][0]["transcript"]
            return transcript.strip()
        except (KeyError, IndexError):
            logger.warning("No transcript in Deepgram response: %s", result)
            return ""

    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptionResult]:
        """Stream transcription with partial results using WebSocket.

        Args:
            audio_stream: Async iterator of audio chunks.

        Yields:
            TranscriptionResult with partial and final transcripts.
        """
        import websockets

        # Build WebSocket URL with query params
        params = self._build_query_params()
        params["interim_results"] = str(self._config.interim_results).lower()
        params["endpointing"] = self._config.endpointing
        params["vad_events"] = str(self._config.vad_events).lower()
        params["encoding"] = "linear16"
        params["sample_rate"] = "16000"

        query_string = "&".join(f"{k}={v}" for k, v in params.items())
        ws_url = f"wss://api.deepgram.com/v1/listen?{query_string}"

        headers = [("Authorization", f"Token {self._config.api_key}")]

        async with websockets.connect(ws_url, additional_headers=headers) as ws:
            # Start sender task
            async def send_audio() -> None:
                try:
                    async for chunk in audio_stream:
                        if chunk.data:
                            await ws.send(chunk.data)
                        if chunk.is_final:
                            # Send close frame to signal end of audio
                            await ws.send(b"")
                            break
                except Exception as e:
                    logger.error("Error sending audio to Deepgram: %s", e)

            sender_task = asyncio.create_task(send_audio())

            try:
                async for message in ws:
                    if isinstance(message, bytes):
                        continue

                    import json

                    data = json.loads(message)

                    # Handle transcription results
                    if data.get("type") == "Results":
                        channel = data.get("channel", {})
                        alternatives = channel.get("alternatives", [])
                        if alternatives:
                            alt = alternatives[0]
                            transcript = alt.get("transcript", "")
                            confidence = alt.get("confidence")
                            words = alt.get("words", [])
                            is_final = data.get("is_final", False)

                            if transcript:
                                yield TranscriptionResult(
                                    text=transcript,
                                    is_final=is_final,
                                    confidence=confidence,
                                    language=data.get("channel", {}).get("detected_language"),
                                    words=words,
                                )

                    # Handle speech events
                    elif data.get("type") == "SpeechStarted":
                        logger.debug("Speech started")
                    elif data.get("type") == "UtteranceEnd":
                        logger.debug("Utterance ended")

            finally:
                sender_task.cancel()
                import contextlib

                with contextlib.suppress(asyncio.CancelledError):
                    await sender_task

    async def close(self) -> None:  # noqa: B027
        """Release resources."""
        if self._client:
            await self._client.aclose()
            self._client = None
```
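A minimal usage sketch for the new provider (not part of the diff): the API key is a placeholder, the silent PCM payload is illustrative only, and it assumes `AudioChunk` exposes the `data`/`sample_rate`/`format`/`is_final` fields used by the code above.

```python
import asyncio

from roomkit.voice.base import AudioChunk
from roomkit.voice.stt.deepgram import DeepgramConfig, DeepgramSTTProvider


async def main() -> None:
    # Placeholder key: a real Deepgram API key is required for the HTTP call.
    provider = DeepgramSTTProvider(DeepgramConfig(api_key="YOUR_DEEPGRAM_KEY"))
    try:
        # One second of silent 16 kHz mono PCM (s16le): 16000 samples x 2 bytes.
        chunk = AudioChunk(
            data=b"\x00" * 32000, sample_rate=16000, format="pcm_s16le", is_final=True
        )
        print(await provider.transcribe(chunk))
    finally:
        await provider.close()


asyncio.run(main())
```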
roomkit/voice/stt/mock.py (new file)
@@ -0,0 +1,40 @@
```python
"""Mock speech-to-text provider for testing."""

from __future__ import annotations

from collections.abc import AsyncIterator
from typing import TYPE_CHECKING

from roomkit.voice.base import AudioChunk, TranscriptionResult
from roomkit.voice.stt.base import STTProvider

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent


class MockSTTProvider(STTProvider):
    """Mock speech-to-text for testing."""

    def __init__(self, transcripts: list[str] | None = None) -> None:
        self.transcripts = transcripts or ["Hello", "How can I help you?"]
        self.calls: list[AudioContent | AudioChunk] = []
        self._index = 0

    async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
        self.calls.append(audio)
        result = self.transcripts[self._index % len(self.transcripts)]
        self._index += 1
        return result

    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptionResult]:
        chunks = []
        async for chunk in audio_stream:
            chunks.append(chunk)

        text = self.transcripts[self._index % len(self.transcripts)]
        self._index += 1
        self.calls.extend(chunks)

        yield TranscriptionResult(text=text, is_final=True)
```
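A sketch of how the mock might be driven in a test (not part of the diff); the `AudioChunk` field defaults are assumed from their use elsewhere in this release.

```python
import asyncio
from collections.abc import AsyncIterator

from roomkit.voice.base import AudioChunk
from roomkit.voice.stt.mock import MockSTTProvider


async def one_chunk() -> AsyncIterator[AudioChunk]:
    yield AudioChunk(data=b"\x00\x00", sample_rate=16000, is_final=True)


async def demo() -> None:
    stt = MockSTTProvider(transcripts=["turn on the lights"])
    results = [r async for r in stt.transcribe_stream(one_chunk())]
    assert results[0].text == "turn on the lights"
    assert results[0].is_final
    assert len(stt.calls) == 1  # the consumed chunk was recorded


asyncio.run(demo())
```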
roomkit/voice/tts/__init__.py (new file)
@@ -0,0 +1 @@
```python
"""Text-to-speech providers."""
```
roomkit/voice/tts/base.py (new file)
@@ -0,0 +1,58 @@
```python
"""Text-to-speech provider ABC."""

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent
    from roomkit.voice.base import AudioChunk


class TTSProvider(ABC):
    """Text-to-speech provider."""

    @property
    def name(self) -> str:
        """Provider name (e.g. 'elevenlabs', 'openai')."""
        return self.__class__.__name__

    @property
    def default_voice(self) -> str | None:
        """Default voice ID. Override in subclasses."""
        return None

    @abstractmethod
    async def synthesize(
        self, text: str, *, voice: str | None = None
    ) -> AudioContent:
        """Synthesize text to audio.

        Args:
            text: Text to synthesize.
            voice: Voice ID (uses default_voice if not specified).

        Returns:
            AudioContent with URL to generated audio.
        """
        ...

    async def synthesize_stream(
        self, text: str, *, voice: str | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream audio chunks as they're generated.

        Override in providers that support streaming; the base
        implementation raises NotImplementedError.
        """
        raise NotImplementedError(
            f"{self.name} does not support streaming synthesis. "
            "Use synthesize() instead."
        )
        # Make this an async generator (unreachable, but required for type)
        yield  # pragma: no cover

    async def close(self) -> None:  # noqa: B027
        """Release resources. Override in subclasses if needed."""
```
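For illustration, a toy subclass satisfying the ABC (not part of the diff); it assumes `AudioContent` accepts the four fields the concrete providers construct it with.

```python
from __future__ import annotations

from roomkit.models.event import AudioContent
from roomkit.voice.tts.base import TTSProvider


class StaticTTSProvider(TTSProvider):
    """Toy provider returning an empty audio payload (illustration only)."""

    @property
    def default_voice(self) -> str:
        return "static"

    async def synthesize(self, text: str, *, voice: str | None = None) -> AudioContent:
        # An empty data URL stands in for real synthesized audio.
        return AudioContent(
            url="data:audio/wav;base64,",
            mime_type="audio/wav",
            transcript=text,
            duration_seconds=0.0,
        )
```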
roomkit/voice/tts/elevenlabs.py (new file)
@@ -0,0 +1,329 @@
```python
"""ElevenLabs text-to-speech provider."""

from __future__ import annotations

import logging
from collections.abc import AsyncIterator
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

import httpx

from roomkit.voice.base import AudioChunk
from roomkit.voice.tts.base import TTSProvider

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent

logger = logging.getLogger(__name__)


@dataclass
class ElevenLabsConfig:
    """Configuration for ElevenLabs TTS provider."""

    api_key: str
    voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel (default)
    model_id: str = "eleven_multilingual_v2"
    stability: float = 0.5
    similarity_boost: float = 0.75
    style: float = 0.0
    use_speaker_boost: bool = True
    output_format: str = "mp3_44100_128"  # mp3, pcm_16000, pcm_22050, etc.
    # Streaming options
    optimize_streaming_latency: int = 3  # 0-4, higher = lower latency


@dataclass
class ElevenLabsVoice:
    """Voice metadata from ElevenLabs."""

    voice_id: str
    name: str
    category: str = "premade"
    labels: dict[str, str] = field(default_factory=dict)


class ElevenLabsTTSProvider(TTSProvider):
    """ElevenLabs text-to-speech provider with streaming support."""

    def __init__(self, config: ElevenLabsConfig) -> None:
        self._config = config
        self._client: httpx.AsyncClient | None = None
        self._voices_cache: dict[str, ElevenLabsVoice] | None = None

    @property
    def name(self) -> str:
        return "ElevenLabsTTS"

    @property
    def default_voice(self) -> str:
        return self._config.voice_id

    def _get_client(self) -> httpx.AsyncClient:
        if self._client is None:
            self._client = httpx.AsyncClient(
                base_url="https://api.elevenlabs.io/v1",
                headers={
                    "xi-api-key": self._config.api_key,
                    "Content-Type": "application/json",
                },
                timeout=60.0,
            )
        return self._client

    def _build_voice_settings(self) -> dict[str, float | bool]:
        """Build voice settings for synthesis."""
        return {
            "stability": self._config.stability,
            "similarity_boost": self._config.similarity_boost,
            "style": self._config.style,
            "use_speaker_boost": self._config.use_speaker_boost,
        }

    async def list_voices(self) -> list[ElevenLabsVoice]:
        """List available voices from ElevenLabs."""
        if self._voices_cache is not None:
            return list(self._voices_cache.values())

        client = self._get_client()
        response = await client.get("/voices")
        response.raise_for_status()
        data = response.json()

        self._voices_cache = {}
        for voice in data.get("voices", []):
            v = ElevenLabsVoice(
                voice_id=voice["voice_id"],
                name=voice["name"],
                category=voice.get("category", "premade"),
                labels=voice.get("labels", {}),
            )
            self._voices_cache[v.voice_id] = v

        return list(self._voices_cache.values())

    async def synthesize(
        self, text: str, *, voice: str | None = None
    ) -> AudioContent:
        """Synthesize text to audio.

        Args:
            text: Text to synthesize.
            voice: Voice ID (uses default_voice if not specified).

        Returns:
            AudioContent with URL to generated audio.
        """
        from roomkit.models.event import AudioContent as AudioContentModel

        voice_id = voice or self._config.voice_id
        client = self._get_client()

        response = await client.post(
            f"/text-to-speech/{voice_id}",
            json={
                "text": text,
                "model_id": self._config.model_id,
                "voice_settings": self._build_voice_settings(),
            },
            params={"output_format": self._config.output_format},
        )
        response.raise_for_status()

        # ElevenLabs returns raw audio bytes
        # We need to save/upload this somewhere to get a URL
        # For now, return a data URL (base64 encoded)
        import base64

        audio_bytes = response.content
        mime_type = self._get_mime_type()
        data_url = f"data:{mime_type};base64,{base64.b64encode(audio_bytes).decode()}"

        # Estimate duration (rough: ~150 words/minute, ~5 chars/word)
        words = len(text.split())
        duration = words / 150 * 60  # seconds

        return AudioContentModel(
            url=data_url,
            mime_type=mime_type,
            transcript=text,
            duration_seconds=duration,
        )

    async def synthesize_stream(
        self, text: str, *, voice: str | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream audio chunks as they're generated.

        Uses ElevenLabs streaming API for low-latency synthesis.

        Args:
            text: Text to synthesize.
            voice: Voice ID (uses default_voice if not specified).

        Yields:
            AudioChunk with raw audio data.
        """
        voice_id = voice or self._config.voice_id
        client = self._get_client()

        # Use streaming endpoint
        async with client.stream(
            "POST",
            f"/text-to-speech/{voice_id}/stream",
            json={
                "text": text,
                "model_id": self._config.model_id,
                "voice_settings": self._build_voice_settings(),
            },
            params={
                "output_format": self._config.output_format,
                "optimize_streaming_latency": self._config.optimize_streaming_latency,
            },
        ) as response:
            response.raise_for_status()

            chunk_index = 0
            async for chunk in response.aiter_bytes(chunk_size=4096):
                if chunk:
                    yield AudioChunk(
                        data=chunk,
                        sample_rate=self._get_sample_rate(),
                        format=self._get_audio_format(),
                        is_final=False,
                    )
                    chunk_index += 1

        # Send final chunk marker
        yield AudioChunk(
            data=b"",
            sample_rate=self._get_sample_rate(),
            format=self._get_audio_format(),
            is_final=True,
        )

    async def synthesize_stream_input(
        self, text_stream: AsyncIterator[str], *, voice: str | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream audio from streaming text input.

        Uses ElevenLabs WebSocket API for real-time text-to-speech.

        Args:
            text_stream: Async iterator of text chunks.
            voice: Voice ID (uses default_voice if not specified).

        Yields:
            AudioChunk with raw audio data.
        """
        import asyncio
        import json

        import websockets

        voice_id = voice or self._config.voice_id
        model_id = self._config.model_id

        ws_url = (
            f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
            f"?model_id={model_id}"
            f"&output_format={self._config.output_format}"
            f"&optimize_streaming_latency={self._config.optimize_streaming_latency}"
        )

        async with websockets.connect(
            ws_url,
            additional_headers=[("xi-api-key", self._config.api_key)],
        ) as ws:
            # Send initial BOS (beginning of stream) message
            await ws.send(
                json.dumps(
                    {
                        "text": " ",
                        "voice_settings": self._build_voice_settings(),
                        "xi_api_key": self._config.api_key,
                    }
                )
            )

            # Start text sender task
            async def send_text() -> None:
                try:
                    async for text_chunk in text_stream:
                        if text_chunk:
                            await ws.send(json.dumps({"text": text_chunk}))
                    # Send EOS (end of stream) message
                    await ws.send(json.dumps({"text": ""}))
                except Exception as e:
                    logger.error("Error sending text to ElevenLabs: %s", e)

            sender_task = asyncio.create_task(send_text())

            try:
                async for message in ws:
                    if isinstance(message, str):
                        data = json.loads(message)
                        if "audio" in data:
                            import base64

                            audio_bytes = base64.b64decode(data["audio"])
                            yield AudioChunk(
                                data=audio_bytes,
                                sample_rate=self._get_sample_rate(),
                                format=self._get_audio_format(),
                                is_final=data.get("isFinal", False),
                            )
                        elif data.get("isFinal"):
                            yield AudioChunk(
                                data=b"",
                                sample_rate=self._get_sample_rate(),
                                format=self._get_audio_format(),
                                is_final=True,
                            )
                            break
            finally:
                sender_task.cancel()
                import contextlib

                with contextlib.suppress(asyncio.CancelledError):
                    await sender_task

    def _get_mime_type(self) -> str:
        """Get MIME type from output format."""
        fmt = self._config.output_format
        if fmt.startswith("mp3"):
            return "audio/mpeg"
        elif fmt.startswith("pcm"):
            return "audio/pcm"
        elif fmt.startswith("ulaw"):
            return "audio/basic"
        return "audio/mpeg"

    def _get_sample_rate(self) -> int:
        """Get sample rate from output format."""
        fmt = self._config.output_format
        if "44100" in fmt:
            return 44100
        elif "22050" in fmt:
            return 22050
        elif "16000" in fmt:
            return 16000
        return 44100

    def _get_audio_format(self) -> str:
        """Get audio format string."""
        fmt = self._config.output_format
        if fmt.startswith("mp3"):
            return "mp3"
        elif fmt.startswith("pcm"):
            return "pcm_s16le"
        elif fmt.startswith("ulaw"):
            return "ulaw"
        return "mp3"

    async def close(self) -> None:  # noqa: B027
        """Release resources."""
        if self._client:
            await self._client.aclose()
            self._client = None
```
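A usage sketch for the streaming path (not part of the diff): the key is a placeholder, and the call performs a real network request against the ElevenLabs API.

```python
import asyncio

from roomkit.voice.tts.elevenlabs import ElevenLabsConfig, ElevenLabsTTSProvider


async def main() -> None:
    # Placeholder key: a real ElevenLabs API key is required.
    tts = ElevenLabsTTSProvider(ElevenLabsConfig(api_key="YOUR_XI_API_KEY"))
    try:
        audio = bytearray()
        async for chunk in tts.synthesize_stream("Hello from roomkit."):
            audio.extend(chunk.data)  # default output_format is mp3_44100_128
        with open("hello.mp3", "wb") as f:
            f.write(audio)
    finally:
        await tts.close()


asyncio.run(main())
```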
roomkit/voice/tts/mock.py (new file)
@@ -0,0 +1,51 @@
```python
"""Mock text-to-speech provider for testing."""

from __future__ import annotations

from collections.abc import AsyncIterator
from typing import TYPE_CHECKING
from uuid import uuid4

from roomkit.voice.base import AudioChunk
from roomkit.voice.tts.base import TTSProvider

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent


class MockTTSProvider(TTSProvider):
    """Mock text-to-speech for testing."""

    def __init__(self, voice: str = "mock-voice") -> None:
        self._default_voice = voice
        self.calls: list[dict[str, str | None]] = []

    @property
    def default_voice(self) -> str:
        return self._default_voice

    async def synthesize(
        self, text: str, *, voice: str | None = None
    ) -> AudioContent:
        from roomkit.models.event import AudioContent as AudioContentModel

        self.calls.append({"text": text, "voice": voice or self._default_voice})
        return AudioContentModel(
            url=f"https://mock.test/audio/{uuid4().hex}.mp3",
            mime_type="audio/mpeg",
            transcript=text,
            duration_seconds=len(text) * 0.05,  # ~50ms per char
        )

    async def synthesize_stream(
        self, text: str, *, voice: str | None = None
    ) -> AsyncIterator[AudioChunk]:
        self.calls.append({"text": text, "voice": voice or self._default_voice})
        # Simulate streaming with small chunks
        words = text.split()
        for i, word in enumerate(words):
            yield AudioChunk(
                data=f"mock-audio-{word}".encode(),
                sample_rate=16000,
                is_final=(i == len(words) - 1),
            )
```
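And a short sketch exercising the mock (not part of the diff), relying only on behavior visible above: one chunk per word, with the last chunk marked final.

```python
import asyncio

from roomkit.voice.tts.mock import MockTTSProvider


async def demo() -> None:
    tts = MockTTSProvider(voice="narrator")

    audio = await tts.synthesize("hello world")
    assert audio.transcript == "hello world"
    assert tts.calls == [{"text": "hello world", "voice": "narrator"}]

    chunks = [c async for c in tts.synthesize_stream("one two three")]
    assert len(chunks) == 3 and chunks[-1].is_final


asyncio.run(demo())
```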