roomkit 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
+ """Deepgram speech-to-text provider."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import json
+ import logging
+ from collections.abc import AsyncIterator
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any
+
+ import httpx
+
+ from roomkit.voice.base import AudioChunk, TranscriptionResult
+ from roomkit.voice.stt.base import STTProvider
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class DeepgramConfig:
+     """Configuration for the Deepgram STT provider."""
+
+     api_key: str
+     model: str = "nova-2"
+     language: str = "en"
+     punctuate: bool = True
+     diarize: bool = False
+     smart_format: bool = True
+     filler_words: bool = False
+     # Real-time streaming options
+     interim_results: bool = True
+     endpointing: int = 300  # ms of silence that ends an utterance
+     vad_events: bool = True
+
+
+ class DeepgramSTTProvider(STTProvider):
+     """Deepgram speech-to-text provider with streaming support."""
+
+     def __init__(self, config: DeepgramConfig) -> None:
+         self._config = config
+         self._client: httpx.AsyncClient | None = None
+
+     @property
+     def name(self) -> str:
+         return "DeepgramSTT"
+
+     def _get_client(self) -> httpx.AsyncClient:
+         if self._client is None:
+             self._client = httpx.AsyncClient(
+                 base_url="https://api.deepgram.com/v1",
+                 headers={
+                     "Authorization": f"Token {self._config.api_key}",
+                     "Content-Type": "audio/wav",
+                 },
+                 timeout=60.0,
+             )
+         return self._client
+
+     def _build_query_params(self) -> dict[str, Any]:
+         """Build query parameters for the Deepgram API."""
+         params: dict[str, Any] = {
+             "model": self._config.model,
+             "language": self._config.language,
+             "punctuate": self._config.punctuate,
+             "diarize": self._config.diarize,
+             "smart_format": self._config.smart_format,
+             "filler_words": self._config.filler_words,
+         }
+         return {k: str(v).lower() if isinstance(v, bool) else v for k, v in params.items()}
+
+     async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
+         """Transcribe complete audio to text.
+
+         Args:
+             audio: Audio content (URL) or raw audio chunk.
+
+         Returns:
+             Transcribed text.
+         """
+         client = self._get_client()
+         params = self._build_query_params()
+
+         # Handle AudioContent (URL-based)
+         if hasattr(audio, "url"):
+             # Fetch the audio from its URL
+             async with httpx.AsyncClient() as fetch_client:
+                 resp = await fetch_client.get(audio.url)
+                 resp.raise_for_status()
+                 audio_data = resp.content
+                 content_type = resp.headers.get("content-type", "audio/wav")
+         else:
+             # Handle AudioChunk (raw bytes)
+             audio_data = audio.data
+             audio_format = getattr(audio, "format", "wav")
+
+             # For raw PCM formats, set encoding params for Deepgram
+             if audio_format in ("pcm_s16le", "linear16", "raw"):
+                 content_type = "audio/raw"
+                 params["encoding"] = "linear16"
+                 params["sample_rate"] = getattr(audio, "sample_rate", 16000)
+                 params["channels"] = getattr(audio, "channels", 1)
+             else:
+                 content_type = f"audio/{audio_format}"
+
+         # Call the Deepgram API
+         response = await client.post(
+             "/listen",
+             params=params,
+             content=audio_data,
+             headers={"Content-Type": content_type},
+         )
+         response.raise_for_status()
+         result = response.json()
+
+         # Extract the transcript
+         try:
+             transcript: str = result["results"]["channels"][0]["alternatives"][0]["transcript"]
+             return transcript.strip()
+         except (KeyError, IndexError):
+             logger.warning("No transcript in Deepgram response: %s", result)
+             return ""
+
+     async def transcribe_stream(
+         self, audio_stream: AsyncIterator[AudioChunk]
+     ) -> AsyncIterator[TranscriptionResult]:
+         """Stream transcription with partial results over a WebSocket.
+
+         Args:
+             audio_stream: Async iterator of audio chunks.
+
+         Yields:
+             TranscriptionResult with partial and final transcripts.
+         """
+         import websockets
+
+         # Build the WebSocket URL with query params
+         params = self._build_query_params()
+         params["interim_results"] = str(self._config.interim_results).lower()
+         params["endpointing"] = self._config.endpointing
+         params["vad_events"] = str(self._config.vad_events).lower()
+         params["encoding"] = "linear16"
+         params["sample_rate"] = "16000"
+
+         query_string = "&".join(f"{k}={v}" for k, v in params.items())
+         ws_url = f"wss://api.deepgram.com/v1/listen?{query_string}"
+
+         headers = [("Authorization", f"Token {self._config.api_key}")]
+
+         async with websockets.connect(ws_url, additional_headers=headers) as ws:
+             # Start the sender task
+             async def send_audio() -> None:
+                 try:
+                     async for chunk in audio_stream:
+                         if chunk.data:
+                             await ws.send(chunk.data)
+                         if chunk.is_final:
+                             # An empty binary message signals end of audio
+                             await ws.send(b"")
+                             break
+                 except Exception as e:
+                     logger.error("Error sending audio to Deepgram: %s", e)
+
+             sender_task = asyncio.create_task(send_audio())
+
+             try:
+                 async for message in ws:
+                     if isinstance(message, bytes):
+                         continue
+
+                     data = json.loads(message)
+
+                     # Handle transcription results
+                     if data.get("type") == "Results":
+                         channel = data.get("channel", {})
+                         alternatives = channel.get("alternatives", [])
+                         if alternatives:
+                             alt = alternatives[0]
+                             transcript = alt.get("transcript", "")
+                             confidence = alt.get("confidence")
+                             words = alt.get("words", [])
+                             is_final = data.get("is_final", False)
+
+                             if transcript:
+                                 yield TranscriptionResult(
+                                     text=transcript,
+                                     is_final=is_final,
+                                     confidence=confidence,
+                                     language=channel.get("detected_language"),
+                                     words=words,
+                                 )
+
+                     # Handle speech events
+                     elif data.get("type") == "SpeechStarted":
+                         logger.debug("Speech started")
+                     elif data.get("type") == "UtteranceEnd":
+                         logger.debug("Utterance ended")
+             finally:
+                 sender_task.cancel()
+                 with contextlib.suppress(asyncio.CancelledError):
+                     await sender_task
+
+     async def close(self) -> None:
+         """Release resources."""
+         if self._client:
+             await self._client.aclose()
+             self._client = None
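
Not part of the diff, but for orientation: a minimal sketch of driving the new provider end to end. The import path `roomkit.voice.stt.deepgram` is inferred from the package layout, and the `AudioChunk` constructor fields are assumed from how the diff uses them.

```python
import asyncio

from roomkit.voice.base import AudioChunk
from roomkit.voice.stt.deepgram import DeepgramConfig, DeepgramSTTProvider  # assumed path


async def main() -> None:
    provider = DeepgramSTTProvider(DeepgramConfig(api_key="DEEPGRAM_API_KEY"))
    try:
        # Placeholder bytes; in practice this is 16 kHz mono PCM captured elsewhere
        chunk = AudioChunk(data=b"\x00\x00", format="pcm_s16le", sample_rate=16000)
        print(await provider.transcribe(chunk))
    finally:
        await provider.close()


asyncio.run(main())
```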
@@ -0,0 +1,40 @@
+ """Mock speech-to-text provider for testing."""
+
+ from __future__ import annotations
+
+ from collections.abc import AsyncIterator
+ from typing import TYPE_CHECKING
+
+ from roomkit.voice.base import AudioChunk, TranscriptionResult
+ from roomkit.voice.stt.base import STTProvider
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+
+
+ class MockSTTProvider(STTProvider):
+     """Mock speech-to-text for testing."""
+
+     def __init__(self, transcripts: list[str] | None = None) -> None:
+         self.transcripts = transcripts or ["Hello", "How can I help you?"]
+         self.calls: list[AudioContent | AudioChunk] = []
+         self._index = 0
+
+     async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
+         self.calls.append(audio)
+         result = self.transcripts[self._index % len(self.transcripts)]
+         self._index += 1
+         return result
+
+     async def transcribe_stream(
+         self, audio_stream: AsyncIterator[AudioChunk]
+     ) -> AsyncIterator[TranscriptionResult]:
+         chunks: list[AudioChunk] = []
+         async for chunk in audio_stream:
+             chunks.append(chunk)
+
+         text = self.transcripts[self._index % len(self.transcripts)]
+         self._index += 1
+         self.calls.extend(chunks)
+
+         yield TranscriptionResult(text=text, is_final=True)
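
The mock cycles through its canned transcripts, which makes call-order assertions straightforward. A sketch of the kind of test it enables, assuming pytest-asyncio and the inferred path `roomkit.voice.stt.mock`:

```python
import pytest

from roomkit.voice.base import AudioChunk
from roomkit.voice.stt.mock import MockSTTProvider  # assumed path


@pytest.mark.asyncio
async def test_mock_stt_cycles_transcripts() -> None:
    stt = MockSTTProvider(transcripts=["one", "two"])
    chunk = AudioChunk(data=b"\x00\x00", sample_rate=16000)
    assert await stt.transcribe(chunk) == "one"
    assert await stt.transcribe(chunk) == "two"
    assert await stt.transcribe(chunk) == "one"  # wraps around
    assert len(stt.calls) == 3
```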
@@ -0,0 +1 @@
+ """Text-to-speech providers."""
@@ -0,0 +1,58 @@
+ """Text-to-speech provider ABC."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from collections.abc import AsyncIterator
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+     from roomkit.voice.base import AudioChunk
+
+
+ class TTSProvider(ABC):
+     """Text-to-speech provider."""
+
+     @property
+     def name(self) -> str:
+         """Provider name (e.g. 'elevenlabs', 'openai')."""
+         return self.__class__.__name__
+
+     @property
+     def default_voice(self) -> str | None:
+         """Default voice ID. Override in subclasses."""
+         return None
+
+     @abstractmethod
+     async def synthesize(
+         self, text: str, *, voice: str | None = None
+     ) -> AudioContent:
+         """Synthesize text to audio.
+
+         Args:
+             text: Text to synthesize.
+             voice: Voice ID (uses default_voice if not specified).
+
+         Returns:
+             AudioContent with a URL to the generated audio.
+         """
+         ...
+
+     async def synthesize_stream(
+         self, text: str, *, voice: str | None = None
+     ) -> AsyncIterator[AudioChunk]:
+         """Stream audio chunks as they're generated.
+
+         Override in providers that support streaming; the default
+         implementation raises NotImplementedError.
+         """
+         raise NotImplementedError(
+             f"{self.name} does not support streaming synthesis. "
+             "Use synthesize() instead."
+         )
+         # Make this an async generator (unreachable, but required for typing)
+         yield  # pragma: no cover
+
+     async def close(self) -> None:  # noqa: B027
+         """Release resources. Override in subclasses if needed."""
@@ -0,0 +1,329 @@
+ """ElevenLabs text-to-speech provider."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import base64
+ import contextlib
+ import json
+ import logging
+ from collections.abc import AsyncIterator
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING
+
+ import httpx
+
+ from roomkit.voice.base import AudioChunk
+ from roomkit.voice.tts.base import TTSProvider
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ElevenLabsConfig:
+     """Configuration for the ElevenLabs TTS provider."""
+
+     api_key: str
+     voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel (default)
+     model_id: str = "eleven_multilingual_v2"
+     stability: float = 0.5
+     similarity_boost: float = 0.75
+     style: float = 0.0
+     use_speaker_boost: bool = True
+     output_format: str = "mp3_44100_128"  # mp3, pcm_16000, pcm_22050, etc.
+     # Streaming options
+     optimize_streaming_latency: int = 3  # 0-4, higher = lower latency
+
+
+ @dataclass
+ class ElevenLabsVoice:
+     """Voice metadata from ElevenLabs."""
+
+     voice_id: str
+     name: str
+     category: str = "premade"
+     labels: dict[str, str] = field(default_factory=dict)
+
+
+ class ElevenLabsTTSProvider(TTSProvider):
+     """ElevenLabs text-to-speech provider with streaming support."""
+
+     def __init__(self, config: ElevenLabsConfig) -> None:
+         self._config = config
+         self._client: httpx.AsyncClient | None = None
+         self._voices_cache: dict[str, ElevenLabsVoice] | None = None
+
+     @property
+     def name(self) -> str:
+         return "ElevenLabsTTS"
+
+     @property
+     def default_voice(self) -> str:
+         return self._config.voice_id
+
+     def _get_client(self) -> httpx.AsyncClient:
+         if self._client is None:
+             self._client = httpx.AsyncClient(
+                 base_url="https://api.elevenlabs.io/v1",
+                 headers={
+                     "xi-api-key": self._config.api_key,
+                     "Content-Type": "application/json",
+                 },
+                 timeout=60.0,
+             )
+         return self._client
+
+     def _build_voice_settings(self) -> dict[str, float | bool]:
+         """Build voice settings for synthesis."""
+         return {
+             "stability": self._config.stability,
+             "similarity_boost": self._config.similarity_boost,
+             "style": self._config.style,
+             "use_speaker_boost": self._config.use_speaker_boost,
+         }
+
+     async def list_voices(self) -> list[ElevenLabsVoice]:
+         """List available voices from ElevenLabs."""
+         if self._voices_cache is not None:
+             return list(self._voices_cache.values())
+
+         client = self._get_client()
+         response = await client.get("/voices")
+         response.raise_for_status()
+         data = response.json()
+
+         self._voices_cache = {}
+         for voice in data.get("voices", []):
+             v = ElevenLabsVoice(
+                 voice_id=voice["voice_id"],
+                 name=voice["name"],
+                 category=voice.get("category", "premade"),
+                 labels=voice.get("labels", {}),
+             )
+             self._voices_cache[v.voice_id] = v
+
+         return list(self._voices_cache.values())
+
+     async def synthesize(
+         self, text: str, *, voice: str | None = None
+     ) -> AudioContent:
+         """Synthesize text to audio.
+
+         Args:
+             text: Text to synthesize.
+             voice: Voice ID (uses default_voice if not specified).
+
+         Returns:
+             AudioContent with a URL to the generated audio.
+         """
+         from roomkit.models.event import AudioContent as AudioContentModel
+
+         voice_id = voice or self._config.voice_id
+         client = self._get_client()
+
+         response = await client.post(
+             f"/text-to-speech/{voice_id}",
+             json={
+                 "text": text,
+                 "model_id": self._config.model_id,
+                 "voice_settings": self._build_voice_settings(),
+             },
+             params={"output_format": self._config.output_format},
+         )
+         response.raise_for_status()
+
+         # ElevenLabs returns raw audio bytes. Until there is somewhere to
+         # upload them for a real URL, return a base64 data URL instead.
+         audio_bytes = response.content
+         mime_type = self._get_mime_type()
+         data_url = f"data:{mime_type};base64,{base64.b64encode(audio_bytes).decode()}"
+
+         # Estimate duration (rough: ~150 words/minute)
+         words = len(text.split())
+         duration = words / 150 * 60  # seconds
+
+         return AudioContentModel(
+             url=data_url,
+             mime_type=mime_type,
+             transcript=text,
+             duration_seconds=duration,
+         )
+
+     async def synthesize_stream(
+         self, text: str, *, voice: str | None = None
+     ) -> AsyncIterator[AudioChunk]:
+         """Stream audio chunks as they're generated.
+
+         Uses the ElevenLabs streaming API for low-latency synthesis.
+
+         Args:
+             text: Text to synthesize.
+             voice: Voice ID (uses default_voice if not specified).
+
+         Yields:
+             AudioChunk with raw audio data.
+         """
+         voice_id = voice or self._config.voice_id
+         client = self._get_client()
+
+         # Use the streaming endpoint
+         async with client.stream(
+             "POST",
+             f"/text-to-speech/{voice_id}/stream",
+             json={
+                 "text": text,
+                 "model_id": self._config.model_id,
+                 "voice_settings": self._build_voice_settings(),
+             },
+             params={
+                 "output_format": self._config.output_format,
+                 "optimize_streaming_latency": self._config.optimize_streaming_latency,
+             },
+         ) as response:
+             response.raise_for_status()
+
+             async for chunk in response.aiter_bytes(chunk_size=4096):
+                 if chunk:
+                     yield AudioChunk(
+                         data=chunk,
+                         sample_rate=self._get_sample_rate(),
+                         format=self._get_audio_format(),
+                         is_final=False,
+                     )
+
+             # Send a final chunk marker
+             yield AudioChunk(
+                 data=b"",
+                 sample_rate=self._get_sample_rate(),
+                 format=self._get_audio_format(),
+                 is_final=True,
+             )
+
+     async def synthesize_stream_input(
+         self, text_stream: AsyncIterator[str], *, voice: str | None = None
+     ) -> AsyncIterator[AudioChunk]:
+         """Stream audio from streaming text input.
+
+         Uses the ElevenLabs WebSocket API for real-time text-to-speech.
+
+         Args:
+             text_stream: Async iterator of text chunks.
+             voice: Voice ID (uses default_voice if not specified).
+
+         Yields:
+             AudioChunk with raw audio data.
+         """
+         import websockets
+
+         voice_id = voice or self._config.voice_id
+         model_id = self._config.model_id
+
+         ws_url = (
+             f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
+             f"?model_id={model_id}"
+             f"&output_format={self._config.output_format}"
+             f"&optimize_streaming_latency={self._config.optimize_streaming_latency}"
+         )
+
+         async with websockets.connect(
+             ws_url,
+             additional_headers=[("xi-api-key", self._config.api_key)],
+         ) as ws:
+             # Send the initial BOS (beginning of stream) message
+             await ws.send(
+                 json.dumps(
+                     {
+                         "text": " ",
+                         "voice_settings": self._build_voice_settings(),
+                         "xi_api_key": self._config.api_key,
+                     }
+                 )
+             )
+
+             # Start the text sender task
+             async def send_text() -> None:
+                 try:
+                     async for text_chunk in text_stream:
+                         if text_chunk:
+                             await ws.send(json.dumps({"text": text_chunk}))
+                     # Send the EOS (end of stream) message
+                     await ws.send(json.dumps({"text": ""}))
+                 except Exception as e:
+                     logger.error("Error sending text to ElevenLabs: %s", e)
+
+             sender_task = asyncio.create_task(send_text())
+
+             try:
+                 async for message in ws:
+                     if isinstance(message, str):
+                         data = json.loads(message)
+                         if "audio" in data:
+                             audio_bytes = base64.b64decode(data["audio"])
+                             yield AudioChunk(
+                                 data=audio_bytes,
+                                 sample_rate=self._get_sample_rate(),
+                                 format=self._get_audio_format(),
+                                 is_final=data.get("isFinal", False),
+                             )
+                         elif data.get("isFinal"):
+                             yield AudioChunk(
+                                 data=b"",
+                                 sample_rate=self._get_sample_rate(),
+                                 format=self._get_audio_format(),
+                                 is_final=True,
+                             )
+                             break
+             finally:
+                 sender_task.cancel()
+                 with contextlib.suppress(asyncio.CancelledError):
+                     await sender_task
+
+     def _get_mime_type(self) -> str:
+         """Get the MIME type from the output format."""
+         fmt = self._config.output_format
+         if fmt.startswith("mp3"):
+             return "audio/mpeg"
+         elif fmt.startswith("pcm"):
+             return "audio/pcm"
+         elif fmt.startswith("ulaw"):
+             return "audio/basic"
+         return "audio/mpeg"
+
+     def _get_sample_rate(self) -> int:
+         """Get the sample rate from the output format."""
+         fmt = self._config.output_format
+         if "44100" in fmt:
+             return 44100
+         elif "22050" in fmt:
+             return 22050
+         elif "16000" in fmt:
+             return 16000
+         return 44100
+
+     def _get_audio_format(self) -> str:
+         """Get the audio format string."""
+         fmt = self._config.output_format
+         if fmt.startswith("mp3"):
+             return "mp3"
+         elif fmt.startswith("pcm"):
+             return "pcm_s16le"
+         elif fmt.startswith("ulaw"):
+             return "ulaw"
+         return "mp3"
+
+     async def close(self) -> None:
+         """Release resources."""
+         if self._client:
+             await self._client.aclose()
+             self._client = None
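
A minimal streaming sketch, not from the package itself: the import path `roomkit.voice.tts.elevenlabs` is inferred, a real API key is required, and the playback sink is left as a comment.

```python
import asyncio

from roomkit.voice.tts.elevenlabs import ElevenLabsConfig, ElevenLabsTTSProvider  # assumed path


async def main() -> None:
    tts = ElevenLabsTTSProvider(
        ElevenLabsConfig(api_key="ELEVENLABS_API_KEY", output_format="pcm_16000")
    )
    try:
        async for chunk in tts.synthesize_stream("Hello from roomkit."):
            # chunk.data is raw PCM at chunk.sample_rate; feed it to a player here
            if chunk.is_final:
                break
    finally:
        await tts.close()


asyncio.run(main())
```

Requesting `pcm_16000` keeps the chunks playable without an MP3 decoder, at the cost of larger payloads than the default `mp3_44100_128`.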
@@ -0,0 +1,51 @@
+ """Mock text-to-speech provider for testing."""
+
+ from __future__ import annotations
+
+ from collections.abc import AsyncIterator
+ from typing import TYPE_CHECKING
+ from uuid import uuid4
+
+ from roomkit.voice.base import AudioChunk
+ from roomkit.voice.tts.base import TTSProvider
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+
+
+ class MockTTSProvider(TTSProvider):
+     """Mock text-to-speech for testing."""
+
+     def __init__(self, voice: str = "mock-voice") -> None:
+         self._default_voice = voice
+         self.calls: list[dict[str, str | None]] = []
+
+     @property
+     def default_voice(self) -> str:
+         return self._default_voice
+
+     async def synthesize(
+         self, text: str, *, voice: str | None = None
+     ) -> AudioContent:
+         from roomkit.models.event import AudioContent as AudioContentModel
+
+         self.calls.append({"text": text, "voice": voice or self._default_voice})
+         return AudioContentModel(
+             url=f"https://mock.test/audio/{uuid4().hex}.mp3",
+             mime_type="audio/mpeg",
+             transcript=text,
+             duration_seconds=len(text) * 0.05,  # ~50 ms per character
+         )
+
+     async def synthesize_stream(
+         self, text: str, *, voice: str | None = None
+     ) -> AsyncIterator[AudioChunk]:
+         self.calls.append({"text": text, "voice": voice or self._default_voice})
+         # Simulate streaming with one small chunk per word
+         words = text.split()
+         for i, word in enumerate(words):
+             yield AudioChunk(
+                 data=f"mock-audio-{word}".encode(),
+                 sample_rate=16000,
+                 is_final=(i == len(words) - 1),
+             )
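
Since the mock emits one chunk per word with `is_final` set on the last, chunk counts are deterministic. A sketch of exercising it in a test, again assuming pytest-asyncio and the inferred path `roomkit.voice.tts.mock`:

```python
import pytest

from roomkit.voice.tts.mock import MockTTSProvider  # assumed path


@pytest.mark.asyncio
async def test_mock_tts_streams_one_chunk_per_word() -> None:
    tts = MockTTSProvider()
    chunks = [c async for c in tts.synthesize_stream("hello world")]
    assert len(chunks) == 2
    assert chunks[-1].is_final
    assert tts.calls[0]["text"] == "hello world"
```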