atom-audio-engine 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/METADATA +1 -1
- atom_audio_engine-0.1.6.dist-info/RECORD +32 -0
- audio_engine/__init__.py +6 -2
- audio_engine/asr/__init__.py +48 -0
- audio_engine/asr/base.py +89 -0
- audio_engine/asr/cartesia.py +350 -0
- audio_engine/asr/deepgram.py +196 -0
- audio_engine/core/__init__.py +13 -0
- audio_engine/core/config.py +162 -0
- audio_engine/core/pipeline.py +278 -0
- audio_engine/core/types.py +87 -0
- audio_engine/integrations/__init__.py +5 -0
- audio_engine/integrations/geneface.py +297 -0
- audio_engine/llm/__init__.py +40 -0
- audio_engine/llm/base.py +106 -0
- audio_engine/llm/groq.py +208 -0
- audio_engine/pipelines/__init__.py +1 -0
- audio_engine/pipelines/personaplex/__init__.py +41 -0
- audio_engine/pipelines/personaplex/client.py +259 -0
- audio_engine/pipelines/personaplex/config.py +69 -0
- audio_engine/pipelines/personaplex/pipeline.py +301 -0
- audio_engine/pipelines/personaplex/types.py +173 -0
- audio_engine/pipelines/personaplex/utils.py +192 -0
- audio_engine/streaming/__init__.py +5 -0
- audio_engine/streaming/websocket_server.py +333 -0
- audio_engine/tts/__init__.py +35 -0
- audio_engine/tts/base.py +153 -0
- audio_engine/tts/cartesia.py +370 -0
- audio_engine/utils/__init__.py +15 -0
- audio_engine/utils/audio.py +218 -0
- atom_audio_engine-0.1.4.dist-info/RECORD +0 -5
- {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/WHEEL +0 -0
- {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
audio_engine/__init__.py,sha256=KxGAd4of3q5pvGhniSnQvihi-UC-uFI6nFJdbyV2rb8,1538
|
|
2
|
+
audio_engine/asr/__init__.py,sha256=HkNjZXyDqZSYTB17kKp3koI9k5rrjID8kZgH2Idxt28,1232
|
|
3
|
+
audio_engine/asr/base.py,sha256=dC6cvAlxoOzUXNz9do-ueRObmNXqg4rtsY9fh6k67vA,2380
|
|
4
|
+
audio_engine/asr/cartesia.py,sha256=jLoetv6jiAbcfmfGkrwKRGo5sbMZ-WJTwIVvWno1Vko,13185
|
|
5
|
+
audio_engine/asr/deepgram.py,sha256=RwtG0e66Y_8HKehoHFzjgJ5JlQs8JEpu_0kpWGmsvBc,6431
|
|
6
|
+
audio_engine/core/__init__.py,sha256=aqN1HeFZGX0xAnVyfURdExXXxxmwwZEq_KFj6MqXZwo,289
|
|
7
|
+
audio_engine/core/config.py,sha256=Jmp-rr0MXXejF3qtx1-CPCEwQOorggHIH5cV3t8G-I4,5205
|
|
8
|
+
audio_engine/core/pipeline.py,sha256=rMZOlllT32xruz3nkeoYGRfnq94zgs-dzAbTahSITtU,8808
|
|
9
|
+
audio_engine/core/types.py,sha256=iFQPajgeS1YgMWXJvubA8sWbxLI1Z8nF-z1uucrgNm4,2295
|
|
10
|
+
audio_engine/integrations/__init__.py,sha256=69Hna1pfmB929WbM7GpAHlrk4xPOleKTnoaBBksFo9k,114
|
|
11
|
+
audio_engine/integrations/geneface.py,sha256=JgxGYfqDk9n-p4e1VNczoEJdMPzzfF5QGsyxxinrWr8,8790
|
|
12
|
+
audio_engine/llm/__init__.py,sha256=nCMx0QYVZSPVQaqu23LGLE8ePDnceygdw0Zz-4Oqg6g,1016
|
|
13
|
+
audio_engine/llm/base.py,sha256=vsKi2UYuhMr_nubMsoyU6hzSV6gr3DZ1sPvqkSvap3c,2862
|
|
14
|
+
audio_engine/llm/groq.py,sha256=zX4z_ZPyB5_FxhMwg-MnK5Ga6vpqRJLaRTrknSYJZXU,6682
|
|
15
|
+
audio_engine/pipelines/__init__.py,sha256=Q1iZjX38TigrZPBaFgv_5AXw21wBN1Z-4nfXPjV-xDI,49
|
|
16
|
+
audio_engine/pipelines/personaplex/__init__.py,sha256=nX37MS93pYUPKiYwY2aa9G-PEI4x2yKjdLqGeab7wWI,916
|
|
17
|
+
audio_engine/pipelines/personaplex/client.py,sha256=NAiG6V9nTWh8ozrb5jT-6h8fesTuJZDgh-l7DlHQm6M,8667
|
|
18
|
+
audio_engine/pipelines/personaplex/config.py,sha256=6fBteI-HjJJl3ZcK5QZCCa9kcKVNDgPptLIkJNZc9kg,2935
|
|
19
|
+
audio_engine/pipelines/personaplex/pipeline.py,sha256=WUkFalPQ9sxICeFpF-58HJxzfQ30vfZ4WAs-E5aI60s,10411
|
|
20
|
+
audio_engine/pipelines/personaplex/types.py,sha256=6MvU2hBukBflJxat3MtC6bGQY1b33jaOIiOi2tZJRnU,4727
|
|
21
|
+
audio_engine/pipelines/personaplex/utils.py,sha256=um_7nGRFH0QaLIIfLwPnBXgFW0fVGU7gkjF8Gm-Hq4U,5000
|
|
22
|
+
audio_engine/streaming/__init__.py,sha256=0FOorloUtads4ZeJKLdlTcqaL0l2G7Byq4ijQG3W1Fk,127
|
|
23
|
+
audio_engine/streaming/websocket_server.py,sha256=p9_ugvXfUW0TeuCkTCTUoHCmQ0vBTGq2J2Ubys1HdeY,10896
|
|
24
|
+
audio_engine/tts/__init__.py,sha256=sJIN_PgwO1_GVtr6NBR8oegqzLa9oR_qqD6ixbkDVro,967
|
|
25
|
+
audio_engine/tts/base.py,sha256=YbvdVF9XbJyv7NWf_5W7bawyquz8z83BcNcXOdA6iaY,4424
|
|
26
|
+
audio_engine/tts/cartesia.py,sha256=VipeNLgOac_hlsi2hasZe_ALYZZ7hvQ72eFeY17J600,16313
|
|
27
|
+
audio_engine/utils/__init__.py,sha256=J-XxXjgjAmvsM39W3pKI2we-C6S7rd49zfaEf9omwN8,245
|
|
28
|
+
audio_engine/utils/audio.py,sha256=RHp-FRjyCMPDaSQCOnxp7m_KO2z3Enu3iX7J5BVRD-0,5507
|
|
29
|
+
atom_audio_engine-0.1.6.dist-info/METADATA,sha256=C3C3SOBD6B-PSEeFVN5Mn67_1hcGFsIOG5jcNs9YxDI,6690
|
|
30
|
+
atom_audio_engine-0.1.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
31
|
+
atom_audio_engine-0.1.6.dist-info/top_level.txt,sha256=IyumwgFrsDL7nlZlBijX-0shiSVhhBCFPUNBRNKzWP4,13
|
|
32
|
+
atom_audio_engine-0.1.6.dist-info/RECORD,,
|
audio_engine/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ Audio Engine - Pluggable audio-to-audio conversational AI framework.
|
|
|
4
4
|
Orchestrates ASR → LLM → TTS pipeline with real-time streaming support.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.1.4"
|
|
7
|
+
__version__ = "0.1.6"
|
|
8
8
|
|
|
9
9
|
# Core exports
|
|
10
10
|
from .core.pipeline import Pipeline
|
|
@@ -33,7 +33,11 @@ except ImportError:
|
|
|
33
33
|
|
|
34
34
|
# LLM Providers
|
|
35
35
|
from .llm.base import BaseLLM
|
|
36
|
-
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
from .llm.groq import GroqLLM
|
|
39
|
+
except ImportError:
|
|
40
|
+
pass
|
|
37
41
|
|
|
38
42
|
# TTS Providers
|
|
39
43
|
from .tts.base import BaseTTS
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""ASR (Speech-to-Text) providers."""
|
|
2
|
+
|
|
3
|
+
from ..core.config import ASRConfig
|
|
4
|
+
|
|
5
|
+
from .base import BaseASR
|
|
6
|
+
from .cartesia import CartesiaASR
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from .deepgram import DeepgramASR
|
|
10
|
+
except ImportError:
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
__all__ = ["BaseASR", "DeepgramASR", "CartesiaASR", "get_asr_from_config"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_asr_from_config(config: ASRConfig) -> BaseASR:
    """
    Instantiate ASR provider from config.

    Args:
        config: ASRConfig object with provider name and settings

    Returns:
        Initialized BaseASR provider instance

    Raises:
        ValueError: If the provider name is not recognized, or the requested
            provider's optional dependency is not installed
    """
    # Provider matching is case-insensitive.
    provider_name = config.provider.lower()

    if provider_name == "deepgram":
        # DeepgramASR is imported under try/except ImportError at module top;
        # if that import failed, referencing the name directly would die with
        # an opaque NameError. Surface a clear, actionable error instead.
        if "DeepgramASR" not in globals():
            raise ValueError(
                "Deepgram ASR requested but the deepgram provider could not be "
                "imported; install its optional dependency"
            )
        return DeepgramASR(
            api_key=config.api_key,
            model=config.model or "nova-2",  # provider default model
            language=config.language,
            **config.extra,
        )
    elif provider_name == "cartesia":
        return CartesiaASR(
            api_key=config.api_key,
            model=config.model or "ink-whisper",  # provider default model
            language=config.language,
            **config.extra,
        )
    else:
        # Single f-string; the original concatenated a second f-string that
        # contained no placeholders.
        raise ValueError(
            f"Unknown ASR provider: {config.provider}. Supported: deepgram, cartesia"
        )
|
audio_engine/asr/base.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Abstract base class for ASR (Speech-to-Text) providers."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import AsyncIterator, Optional
|
|
5
|
+
|
|
6
|
+
from ..core.types import AudioChunk, TranscriptChunk
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseASR(ABC):
    """
    Abstract base class for Speech-to-Text providers.

    All ASR implementations must inherit from this class and implement
    the required methods for both batch and streaming transcription.

    Instances are usable as async context managers: ``async with`` calls
    :meth:`connect` on entry and :meth:`disconnect` on exit.
    """

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """
        Initialize the ASR provider.

        Args:
            api_key: API key for the provider (if required)
            **kwargs: Additional provider-specific configuration
        """
        self.api_key = api_key
        # Extra options are stored verbatim; concrete subclasses decide which
        # keys (if any) they honor.
        self.config = kwargs

    @abstractmethod
    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        """
        Transcribe a complete audio buffer to text.

        Args:
            audio: Raw audio bytes (PCM format expected)
            sample_rate: Sample rate of the audio in Hz

        Returns:
            Transcribed text string
        """
        pass

    @abstractmethod
    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptChunk]:
        """
        Transcribe streaming audio in real-time.

        Args:
            audio_stream: Async iterator yielding AudioChunk objects

        Yields:
            TranscriptChunk objects with partial and final transcriptions
        """
        pass

    async def __aenter__(self):
        """Async context manager entry."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit."""
        # NOTE(review): exceptions are not suppressed (implicit None return),
        # so errors raised inside the `async with` body propagate to the caller.
        await self.disconnect()

    async def connect(self) -> None:
        """
        Establish connection to the ASR service.
        Override in subclasses if needed.
        """
        # Default is a no-op: stateless providers need no connection setup.
        pass

    async def disconnect(self) -> None:
        """
        Close connection to the ASR service.
        Override in subclasses if needed.
        """
        # Default is a no-op; see connect().
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the name of this ASR provider."""
        pass

    @property
    def supports_streaming(self) -> bool:
        """Whether this provider supports real-time streaming."""
        # Defaults to True; a non-streaming subclass should override this.
        return True
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""Cartesia API implementation for ASR (Speech-to-Text) via WebSocket."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
from typing import AsyncIterator, Optional
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
|
|
9
|
+
import websockets
|
|
10
|
+
|
|
11
|
+
from ..core.types import AudioChunk, TranscriptChunk
|
|
12
|
+
from .base import BaseASR
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Cartesia API version (required header)
|
|
17
|
+
CARTESIA_VERSION = "2025-04-16"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CartesiaASR(BaseASR):
    """
    Cartesia API client for speech-to-text transcription via WebSocket.

    Supports both batch transcription and real-time streaming.
    Uses Cartesia's Whisper model (ink-whisper) for high-accuracy transcription.

    Approach:
        1. Batch mode: collect audio, send via WebSocket, wait for final result
        2. Streaming mode: send audio chunks as they arrive, yield results immediately
        3. Background receive task queues responses from server
        4. VAD (Voice Activity Detection) configurable via min_volume and max_silence_duration_secs

    Example:
        asr = CartesiaASR(api_key="sk_...")

        # Batch transcription
        text = await asr.transcribe(audio_bytes)

        # Streaming transcription
        async for chunk in asr.transcribe_stream(audio_stream):
            print(chunk.text, end="", flush=True)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "ink-whisper",
        language: str = "en",
        encoding: str = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: float = 0.0,
        max_silence_duration_secs: float = 30.0,
        **kwargs,
    ):
        """
        Initialize Cartesia ASR provider.

        Args:
            api_key: Cartesia API key
            model: Model to use (default: ink-whisper)
            language: Language code in ISO-639-1 format (default: en)
            encoding: Audio encoding format (default: pcm_s16le)
            sample_rate: Sample rate in Hz (default: 16000)
            min_volume: VAD threshold 0.0-1.0, higher = more aggressive (default: 0.0)
            max_silence_duration_secs: Max silence before endpointing (default: 30.0)
            **kwargs: Additional config (stored in self.config)
        """
        super().__init__(api_key=api_key, **kwargs)
        self.model = model
        self.language = language
        self.encoding = encoding
        self.sample_rate = sample_rate
        self.min_volume = min_volume
        self.max_silence_duration_secs = max_silence_duration_secs

        # Connection state: established lazily on first transcription call.
        self.websocket = None
        self._receive_task: Optional[asyncio.Task] = None
        self._response_queue: asyncio.Queue = asyncio.Queue()

    @property
    def name(self) -> str:
        """Return provider name."""
        return "cartesia"

    @staticmethod
    def _redacted_url(url: str) -> str:
        """Return *url* with the trailing api_key query parameter masked for logging."""
        # api_key is always the last query parameter built by connect().
        return url.split("&api_key=", 1)[0] + "&api_key=<redacted>"

    async def connect(self):
        """
        Initialize WebSocket connection to Cartesia STT endpoint.

        Approach:
            1. Construct WebSocket URL with parameters (model, language, encoding, sample_rate, VAD)
            2. Connect to wss://api.cartesia.ai/stt/websocket
            3. Launch background receive task to collect server responses
            4. Log initialization status

        Rationale: Lazy connection on first transcription; background task ensures
        responses are queued even if caller is temporarily blocked.
        """
        if self.websocket:
            return

        try:
            if not self.api_key:
                # Try to get from environment
                import os

                self.api_key = os.getenv("CARTESIA_API_KEY") or os.getenv("ASR_API_KEY")

            if not self.api_key:
                raise ValueError("Cartesia API key not provided")

            # Construct WebSocket URL with properly encoded parameters
            # API key must be URL-encoded to handle special characters
            url = (
                f"wss://api.cartesia.ai/stt/websocket?"
                f"model={quote(str(self.model))}"
                f"&language={quote(str(self.language))}"
                f"&encoding={quote(str(self.encoding))}"
                f"&sample_rate={quote(str(self.sample_rate))}"
                f"&min_volume={quote(str(self.min_volume))}"
                f"&max_silence_duration_secs={quote(str(self.max_silence_duration_secs))}"
                f"&api_key={quote(str(self.api_key))}"
            )
            # BUGFIX: never log the raw URL -- it embeds the API key as a
            # query parameter and would leak the credential into log files.
            logger.debug(f"Cartesia WebSocket URL: {self._redacted_url(url)}")

            # Connect to WebSocket with required Cartesia-Version header
            try:
                self.websocket = await asyncio.wait_for(
                    websockets.connect(
                        url, additional_headers=[("Cartesia-Version", CARTESIA_VERSION)]
                    ),
                    timeout=30.0,  # 30s timeout for initial connection
                )
                logger.debug("Cartesia WebSocket connected")
            except asyncio.TimeoutError:
                # BUGFIX: redact the API key here too (error logs are the most
                # likely to be shipped off-box).
                logger.error(f"WebSocket connection timeout to {self._redacted_url(url)}")
                raise TimeoutError("Failed to connect to Cartesia WebSocket within 30s timeout")

            # Start background receive task
            self._receive_task = asyncio.create_task(self._receive_loop())

        except Exception as e:
            logger.error(f"Failed to initialize Cartesia WebSocket: {e}")
            raise

    async def disconnect(self):
        """Close WebSocket connection and cleanup."""
        try:
            if self._receive_task:
                self._receive_task.cancel()
                try:
                    await self._receive_task
                except asyncio.CancelledError:
                    pass

            if self.websocket:
                await self.websocket.close()
                logger.debug("Cartesia WebSocket closed")

        except Exception as e:
            logger.error(f"Error disconnecting Cartesia: {e}")
        finally:
            # BUGFIX: clear connection state so a later connect() re-establishes
            # the session. Previously the closed-but-still-set websocket made
            # connect() an early-return no-op, leaving the instance unusable
            # after disconnect.
            self.websocket = None
            self._receive_task = None

    async def _receive_loop(self):
        """
        Background task: continuously receive messages from WebSocket.

        Parses JSON responses and queues them for retrieval by transcribe methods.
        Handles: transcript, flush_done, done, error message types.
        """
        try:
            if not self.websocket:
                return

            async for message in self.websocket:
                try:
                    # Parse JSON response
                    response = json.loads(message)
                    await self._response_queue.put(response)

                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse Cartesia response: {e}")
                except Exception as e:
                    logger.error(f"Error in receive loop: {e}")

        except asyncio.CancelledError:
            logger.debug("Receive loop cancelled")
        except Exception as e:
            logger.error(f"Unexpected error in receive loop: {e}")

    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        """
        Transcribe complete audio buffer to text.

        Approach:
            1. Initialize WebSocket if needed
            2. Send audio in chunks (100ms intervals)
            3. Send 'done' command to finalize
            4. Collect all responses until 'done' received
            5. Extract and return transcript text

        Rationale: Batch mode for complete audio files; simple sequential flow.

        Args:
            audio: Raw PCM audio bytes (assumed 16-bit mono -- 2 bytes/sample)
            sample_rate: Sample rate in Hz (default 16000)

        Returns:
            Transcribed text
        """
        if not self.websocket:
            await self.connect()

        try:
            logger.debug(f"Transcribing {len(audio)} bytes at {sample_rate}Hz")

            # BUGFIX: size chunks from the caller-supplied sample_rate (the rate
            # of *this* buffer); the original used self.sample_rate, silently
            # ignoring the parameter. 100ms of 16-bit mono = rate * 0.1 * 2 bytes.
            chunk_size = int(sample_rate * 0.1 * 2)
            offset = 0

            while offset < len(audio):
                chunk = audio[offset : offset + chunk_size]
                await self.websocket.send(chunk)
                offset += chunk_size

            # Send 'done' command to finalize
            await self.websocket.send("done")

            # Collect responses until 'done' received
            transcript_parts = []
            while True:
                try:
                    response = await asyncio.wait_for(self._response_queue.get(), timeout=10.0)

                    if response.get("type") == "transcript":
                        text = response.get("text", "")
                        if text:
                            transcript_parts.append(text)

                    elif response.get("type") == "done":
                        break

                    elif response.get("type") == "error":
                        error_msg = response.get("error", "Unknown error")
                        raise RuntimeError(f"Cartesia error: {error_msg}")

                except asyncio.TimeoutError:
                    # Best-effort: return whatever was transcribed so far.
                    logger.warning("Timeout waiting for Cartesia response")
                    break

            return "".join(transcript_parts)

        except Exception as e:
            logger.error(f"Cartesia transcription error: {e}")
            raise

    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptChunk]:
        """
        Transcribe streaming audio in real-time.

        Approach:
            1. Initialize WebSocket if needed
            2. For each audio chunk from stream:
               - Send binary audio via WebSocket
               - Check response queue for server responses (non-blocking)
               - Yield TranscriptChunk for each response
            3. On final audio chunk (is_final=True), send 'done' command
            4. Continue yielding responses until 'done' received
            5. Signal stream end

        Rationale: Streaming yields results immediately; low latency;
        background task queues responses so we don't block on receives.

        Args:
            audio_stream: Async iterator yielding AudioChunk objects

        Yields:
            TranscriptChunk objects with partial and final transcriptions
        """
        if not self.websocket:
            await self.connect()

        try:
            done_sent = False

            async for audio_chunk in audio_stream:
                # Send audio via WebSocket
                await self.websocket.send(audio_chunk.data)

                # Try to get responses (non-blocking)
                while not self._response_queue.empty():
                    response = self._response_queue.get_nowait()

                    if response.get("type") == "transcript":
                        text = response.get("text", "")
                        is_final = response.get("is_final", False)
                        if text:
                            yield TranscriptChunk(
                                text=text,
                                confidence=None,  # Cartesia doesn't return confidence
                                is_final=is_final,
                            )

                    elif response.get("type") == "error":
                        error_msg = response.get("error", "Unknown error")
                        logger.error(f"Cartesia error: {error_msg}")

                # If this is the final audio chunk, send 'done' command
                if audio_chunk.is_final and not done_sent:
                    await self.websocket.send("done")
                    done_sent = True

            # Continue collecting responses until 'done' received
            if done_sent:
                while True:
                    try:
                        response = await asyncio.wait_for(self._response_queue.get(), timeout=5.0)

                        if response.get("type") == "transcript":
                            text = response.get("text", "")
                            is_final = response.get("is_final", False)
                            if text:
                                yield TranscriptChunk(
                                    text=text,
                                    confidence=None,
                                    is_final=is_final,
                                )

                        elif response.get("type") == "done":
                            # Yield final chunk to signal stream end
                            yield TranscriptChunk(
                                text="",
                                confidence=None,
                                is_final=True,
                            )
                            break

                        elif response.get("type") == "error":
                            error_msg = response.get("error", "Unknown error")
                            logger.error(f"Cartesia error: {error_msg}")

                    except asyncio.TimeoutError:
                        logger.warning("Timeout waiting for Cartesia final response")
                        break

        except Exception as e:
            logger.error(f"Cartesia streaming transcription error: {e}")
            raise
|