gdmcode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdmcode-0.1.0.dist-info/METADATA +240 -0
- gdmcode-0.1.0.dist-info/RECORD +131 -0
- gdmcode-0.1.0.dist-info/WHEEL +4 -0
- gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/_internal/__init__.py +0 -0
- src/_internal/constants.py +244 -0
- src/_internal/domain_skills.py +339 -0
- src/agent/__init__.py +0 -0
- src/agent/commit_classifier.py +91 -0
- src/agent/context_budget.py +391 -0
- src/agent/daemon.py +681 -0
- src/agent/dag_validator.py +153 -0
- src/agent/debug_loop.py +473 -0
- src/agent/impact_analyzer.py +149 -0
- src/agent/impact_graph.py +117 -0
- src/agent/loop.py +1410 -0
- src/agent/orchestrator.py +141 -0
- src/agent/regression_guard.py +251 -0
- src/agent/review_gate.py +648 -0
- src/agent/risk_scorer.py +169 -0
- src/agent/self_healing.py +145 -0
- src/agent/smart_test_selector.py +89 -0
- src/agent/system_prompt.py +226 -0
- src/agent/task_tracker.py +320 -0
- src/agent/test_validator.py +210 -0
- src/agent/tool_orchestrator.py +402 -0
- src/agent/transcript.py +230 -0
- src/agent/verification_loop.py +133 -0
- src/agent/work_director.py +136 -0
- src/agent/worktree_manager.py +53 -0
- src/artifacts/__init__.py +16 -0
- src/artifacts/artifact_store.py +456 -0
- src/artifacts/verification_graph.py +75 -0
- src/auth.py +411 -0
- src/cli.py +1290 -0
- src/commands.py +1398 -0
- src/config.py +762 -0
- src/cost_tracker.py +348 -0
- src/db/__init__.py +4 -0
- src/db/migrations.py +337 -0
- src/enterprise/__init__.py +3 -0
- src/enterprise/audit_log.py +182 -0
- src/enterprise/identity.py +90 -0
- src/enterprise/rbac.py +100 -0
- src/enterprise/team_config.py +125 -0
- src/enterprise/usage_analytics.py +261 -0
- src/exceptions.py +207 -0
- src/git_workflow.py +651 -0
- src/integrations/__init__.py +6 -0
- src/integrations/github_actions.py +106 -0
- src/integrations/mcp_server.py +333 -0
- src/integrations/sentry_integration.py +100 -0
- src/integrations/sentry_server.py +82 -0
- src/integrations/webhook_security.py +19 -0
- src/main.py +27 -0
- src/memory/__init__.py +0 -0
- src/memory/code_index.py +376 -0
- src/memory/compressor.py +378 -0
- src/memory/context_memory.py +135 -0
- src/memory/continuous_memory.py +234 -0
- src/memory/conventions.py +495 -0
- src/memory/db.py +1119 -0
- src/memory/document_index.py +205 -0
- src/memory/file_cache.py +128 -0
- src/memory/project_scanner.py +178 -0
- src/memory/session_store.py +201 -0
- src/models/__init__.py +0 -0
- src/models/client.py +715 -0
- src/models/definitions.py +459 -0
- src/models/router.py +418 -0
- src/models/schemas.py +389 -0
- src/permissions.py +294 -0
- src/remote/__init__.py +5 -0
- src/remote/command_filter.py +33 -0
- src/remote/models.py +31 -0
- src/remote/permission_handler.py +79 -0
- src/remote/phone_ui.py +48 -0
- src/remote/protocol.py +59 -0
- src/remote/qr.py +65 -0
- src/remote/server.py +586 -0
- src/remote/token_manager.py +61 -0
- src/remote/tunnel.py +212 -0
- src/repl.py +475 -0
- src/runtime/__init__.py +1 -0
- src/runtime/branch_farm.py +372 -0
- src/runtime/replay.py +351 -0
- src/sandbox/__init__.py +2 -0
- src/sandbox/hermetic.py +214 -0
- src/sandbox/policy.py +44 -0
- src/sdk/__init__.py +3 -0
- src/sdk/plugin_base.py +39 -0
- src/sdk/plugin_host.py +100 -0
- src/sdk/plugin_loader.py +101 -0
- src/security.py +409 -0
- src/server/__init__.py +7 -0
- src/server/bridge.py +427 -0
- src/server/bridge_cli.py +103 -0
- src/server/bridge_client.py +170 -0
- src/server/protocol_version.py +103 -0
- src/session/__init__.py +10 -0
- src/session/event_fanout.py +46 -0
- src/session/input_broker.py +38 -0
- src/session/permission_bridge.py +100 -0
- src/tools/__init__.py +160 -0
- src/tools/_atomic.py +72 -0
- src/tools/agent_tools.py +423 -0
- src/tools/ask_user_tool.py +83 -0
- src/tools/bash_tool.py +384 -0
- src/tools/browser_tool.py +352 -0
- src/tools/browser_tools.py +179 -0
- src/tools/dep_tools.py +210 -0
- src/tools/document_reader.py +167 -0
- src/tools/document_tool.py +240 -0
- src/tools/document_writer.py +171 -0
- src/tools/impact_tools.py +240 -0
- src/tools/playwright_tool.py +172 -0
- src/tools/quality_tools.py +366 -0
- src/tools/read_tools.py +318 -0
- src/tools/result_cache.py +157 -0
- src/tools/search_tools.py +310 -0
- src/tools/shell_tools.py +311 -0
- src/tools/write_tools.py +337 -0
- src/voice/__init__.py +25 -0
- src/voice/audio_capture.py +92 -0
- src/voice/audio_playback.py +68 -0
- src/voice/errors.py +14 -0
- src/voice/models.py +35 -0
- src/voice/providers.py +143 -0
- src/voice/vad.py +55 -0
- src/voice/voice_loop.py +156 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from collections import deque
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from src.voice.errors import AudioDeviceError
|
|
10
|
+
from src.voice.models import VoiceConfig
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import sounddevice as sd # type: ignore[import-untyped]
|
|
14
|
+
_SD_AVAILABLE = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
_SD_AVAILABLE = False
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
RING_BUFFER_FRAMES = 50
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AudioCapture:
    """Capture microphone audio through sounddevice into a bounded ring buffer.

    Every callback invocation stores one ``bytes`` chunk (the raw int16
    samples of that block).  The buffer keeps at most ``RING_BUFFER_FRAMES``
    chunks; once full, appending a new chunk discards the oldest one.  All
    buffer access is serialised with an internal lock.
    """

    def __init__(self, config: VoiceConfig) -> None:
        """Store *config* and create the (empty) buffer; no device is opened."""
        self._config = config
        self._buffer: deque[bytes] = deque(maxlen=RING_BUFFER_FRAMES)
        self._lock = threading.Lock()
        self._data_event = threading.Event()
        self._stream: object | None = None
        self._running = False
        # Set by stop() so a blocked read_frames() can return right away
        # instead of polling until its deadline; reset by start().
        self._stop_requested = False

    def start(self) -> None:
        """Open and start the sounddevice ``InputStream``.

        Calling ``start`` while a stream is already open is a no-op, so a
        second call cannot leak the first stream.

        Raises
        ------
        AudioDeviceError
            If sounddevice is not installed.
        """
        if not _SD_AVAILABLE:
            raise AudioDeviceError("sounddevice not installed")
        if self._stream is not None:
            return

        # Imported once here rather than on every audio callback.
        import numpy as np  # type: ignore[import-untyped]

        def _callback(indata: object, frames: int, time_info: object, status: object) -> None:
            # The third argument is named ``time_info`` so it does not shadow
            # the ``time`` module.
            if status:
                logger.warning("AudioCapture status: %s", status)
            raw = np.asarray(indata).tobytes()
            with self._lock:
                self._buffer.append(raw)
            self._data_event.set()

        stream = sd.InputStream(
            samplerate=self._config.sample_rate,
            channels=self._config.channels,
            dtype="int16",
            blocksize=self._config.chunk_size,
            device=self._config.device_index,
            callback=_callback,
        )
        self._stop_requested = False
        try:
            stream.start()
        except Exception:
            # Do not keep a half-opened stream around; a later start() must
            # be able to try again from scratch.
            try:
                stream.close()
            except Exception:
                logger.debug("Error closing audio stream", exc_info=True)
            raise
        self._stream = stream
        self._running = True

    def stop(self) -> None:
        """Stop and close the stream, clear the buffer and wake any reader."""
        self._running = False
        self._stop_requested = True
        stream, self._stream = self._stream, None
        if stream is not None:
            try:
                try:
                    stream.stop()  # type: ignore[attr-defined]
                finally:
                    # Close even when stop() failed so the device is released.
                    stream.close()  # type: ignore[attr-defined]
            except Exception:
                logger.debug("Error stopping audio stream", exc_info=True)
        with self._lock:
            self._buffer.clear()
        self._data_event.set()  # unblock any waiting read_frames

    def read_frames(self, n: int) -> bytes:
        """Return up to *n* buffered chunks concatenated into one ``bytes``.

        Each chunk is the data delivered by one callback invocation.  The
        call blocks for at most 2 seconds waiting for data and returns
        whatever was collected by then (possibly ``b""``).  It returns early
        once ``stop()`` has been called.
        """
        collected: list[bytes] = []
        end = time.monotonic() + 2.0
        while len(collected) < n:
            with self._lock:
                while self._buffer and len(collected) < n:
                    collected.append(self._buffer.popleft())
            if len(collected) >= n or self._stop_requested:
                break
            remaining = end - time.monotonic()
            if remaining <= 0:
                break
            # Wake at least every 100 ms so the deadline is honoured even if
            # no callback fires.  Clearing after the wait is safe because the
            # next iteration re-checks the buffer before waiting again.
            self._data_event.wait(timeout=min(remaining, 0.1))
            self._data_event.clear()
        return b"".join(collected)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from src.voice.models import VoiceConfig
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import sounddevice as sd # type: ignore[import-untyped]
|
|
11
|
+
_SD_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
_SD_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
_QUEUE_MAXSIZE = 32
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AudioPlayback:
    """Queue TTS audio chunks and play them through a sounddevice output stream.

    Chunks are interpreted as interleaved int16 samples with
    ``config.channels`` channels.  The queue holds at most ``_QUEUE_MAXSIZE``
    chunks; further chunks are dropped with a warning.
    """

    def __init__(self, config: VoiceConfig) -> None:
        """Store *config* and create the empty chunk queue and stop flag."""
        self._config = config
        self._queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=_QUEUE_MAXSIZE)
        self._stop_event = asyncio.Event()

    def play_chunk(self, audio_bytes: bytes) -> None:
        """Enqueue *audio_bytes* without blocking.

        Logs a warning and does nothing when sounddevice is unavailable or
        the queue is full.
        """
        if not _SD_AVAILABLE:
            logger.warning("sounddevice not available; audio playback disabled")
            return
        try:
            self._queue.put_nowait(audio_bytes)
        except asyncio.QueueFull:
            logger.warning("AudioPlayback queue full; dropping chunk")

    def stop(self) -> None:
        """Signal the drain loop to stop and discard all pending chunks.

        The stop flag is never cleared by this class, so a ``drain()`` started
        after ``stop()`` returns without playing anything.
        """
        self._stop_event.set()
        while not self._queue.empty():
            try:
                self._queue.get_nowait()
            except asyncio.QueueEmpty:
                break

    async def drain(self) -> None:
        """Play queued chunks until ``stop()`` is called.

        Returns immediately when sounddevice is unavailable.  The blocking
        ``stream.write`` call runs in the default executor so the event loop
        stays responsive while audio is being written.
        """
        if not _SD_AVAILABLE:
            return

        import numpy as np  # type: ignore[import-untyped]

        loop = asyncio.get_running_loop()
        channels = self._config.channels
        frame_bytes = 2 * channels  # one int16 sample per channel

        with sd.OutputStream(
            samplerate=self._config.sample_rate,
            channels=channels,
            dtype="int16",
        ) as stream:
            while not self._stop_event.is_set():
                try:
                    chunk = await asyncio.wait_for(self._queue.get(), timeout=0.1)
                except asyncio.TimeoutError:
                    # Nothing queued; loop round to re-check the stop flag.
                    continue
                if self._stop_event.is_set():
                    # stop() arrived while we were waiting: do not play.
                    break
                usable = len(chunk) - (len(chunk) % frame_bytes if frame_bytes else 0)
                if usable != len(chunk):
                    # A partial trailing frame would make frombuffer/reshape
                    # raise and end playback; drop the stray bytes instead.
                    logger.warning(
                        "AudioPlayback: chunk of %d bytes is not a whole number of frames; "
                        "dropping %d trailing byte(s)",
                        len(chunk),
                        len(chunk) - usable,
                    )
                if not usable:
                    continue
                audio_array = np.frombuffer(chunk[:usable], dtype="int16").reshape(-1, channels)
                await loop.run_in_executor(None, stream.write, audio_array)
|
src/voice/errors.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class VoiceError(Exception):
    """Common base class for every exception raised by the voice package."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class AudioDeviceError(VoiceError):
    """The audio device is unavailable or misconfigured."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class STTError(VoiceError):
    """A speech-to-text step failed."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TTSError(VoiceError):
    """A text-to-speech step failed."""
|
src/voice/models.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Protocol, runtime_checkable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@runtime_checkable
class STTEngine(Protocol):
    """Structural interface for speech-to-text engines.

    Any object exposing a compatible ``transcribe`` method satisfies this
    protocol; being ``runtime_checkable`` it can be used with ``isinstance``.
    """

    def transcribe(self, audio_bytes: bytes, sample_rate: int = 16000) -> str:
        """Return the text recognised in *audio_bytes*."""
        ...
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@runtime_checkable
class TTSEngine(Protocol):
    """Structural interface for text-to-speech engines.

    Any object exposing a compatible ``synthesize`` method satisfies this
    protocol; being ``runtime_checkable`` it can be used with ``isinstance``.
    """

    def synthesize(self, text: str) -> bytes:
        """Return raw PCM bytes for *text*."""
        ...
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class VoiceEvent:
    """A single event emitted by the voice pipeline.

    ``type`` is one of ``"speech_start"``, ``"speech_end"`` or
    ``"utterance"``; ``data`` and ``text`` default to empty values.
    """

    # Event kind: "speech_start" | "speech_end" | "utterance"
    type: str
    # Binary payload attached to the event (empty by default)
    data: bytes = field(default_factory=bytes)
    # Text attached to the event (empty by default)
    text: str = ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class VoiceConfig:
    """Settings for audio capture, voice-activity detection and providers.

    Every field has a default, so ``VoiceConfig()`` yields a usable
    configuration.
    """

    # --- audio format -----------------------------------------------------
    sample_rate: int = 16000
    channels: int = 1

    # --- voice activity detection ------------------------------------------
    frame_duration_ms: int = 20  # VAD frame size
    vad_aggressiveness: int = 2  # 0-3
    silence_timeout_ms: int = 550  # silence duration → speech endpoint

    # --- capture device ------------------------------------------------------
    device_index: int | None = None  # None = system default
    chunk_size: int = 320  # frames per read

    # --- providers -----------------------------------------------------------
    stt_provider: str = "whisper"  # STT backend name
    tts_provider: str = "pyttsx3"  # TTS backend name
    language: str = "en"  # language hint for STT/TTS
|
src/voice/providers.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""STT/TTS provider abstractions for the voice layer."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
import os
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class VoiceConfig:
    """Voice settings used at the provider level.

    Holds the STT/TTS backend names, a language code and the sample rate.
    """

    # Name of the speech-to-text backend
    stt_provider: str = "whisper"
    # Name of the text-to-speech backend
    tts_provider: str = "pyttsx3"
    # Language code
    language: str = "en"
    # Audio sample rate
    sample_rate: int = 16000
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class STTProvider(ABC):
    """Base class that every speech-to-text provider derives from.

    Subclasses must implement :meth:`transcribe`; the base class cannot be
    instantiated directly.
    """

    @abstractmethod
    def transcribe(self, audio_bytes: bytes) -> str:
        """Return the text recognised in *audio_bytes*."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TTSProvider(ABC):
    """Base class that every text-to-speech provider derives from.

    Subclasses must implement :meth:`synthesize`; the base class cannot be
    instantiated directly.
    """

    @abstractmethod
    def synthesize(self, text: str) -> bytes:
        """Return audio bytes speaking *text*."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class WhisperSTTProvider(STTProvider):
    """OpenAI Whisper STT via ``client.audio.transcriptions``.

    The API key is taken from the *api_key* argument or, failing that, from
    the ``OPENAI_API_KEY`` environment variable at construction time.
    """

    def __init__(self, api_key: str | None = None) -> None:
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        # Created on first use so that ``openai`` is only imported when a
        # transcription is actually requested.
        self._client = None

    def _get_client(self):
        """Return the OpenAI client, creating it on first use and reusing it."""
        client = getattr(self, "_client", None)
        if client is None:
            import openai  # already a core dependency

            client = openai.OpenAI(api_key=self._api_key)
            self._client = client
        return client

    def transcribe(self, audio_bytes: bytes) -> str:
        """Transcribe *audio_bytes* with the ``whisper-1`` model.

        Returns ``""`` for empty input without contacting the API.
        """
        if not audio_bytes:
            return ""

        audio_file = io.BytesIO(audio_bytes)
        # NOTE(review): the upload is labelled "audio.wav"; presumably callers
        # pass WAV-encoded data — raw PCM would need a header. Confirm
        # against callers.
        audio_file.name = "audio.wav"
        result = self._get_client().audio.transcriptions.create(
            model="whisper-1", file=audio_file
        )
        return result.text
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DeepgramSTTProvider(STTProvider):
    """Placeholder for a Deepgram STT backend; not implemented here."""

    def transcribe(self, audio_bytes: bytes) -> str:
        """Always raise ``NotImplementedError``."""
        raise NotImplementedError("install deepgram-sdk")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ElevenLabsTTSProvider(TTSProvider):
    """Placeholder for an ElevenLabs TTS backend; not implemented here."""

    def synthesize(self, text: str) -> bytes:
        """Always raise ``NotImplementedError``."""
        raise NotImplementedError("install elevenlabs")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class GTTSTTSProvider(TTSProvider):
    """Text-to-speech backed by the optional ``gtts`` library."""

    def __init__(self, language: str = "en") -> None:
        """Remember the language code passed to gTTS as ``lang``."""
        self._language = language

    def synthesize(self, text: str) -> bytes:
        """Return the bytes gTTS writes for *text*.

        Raises
        ------
        ImportError
            If ``gtts`` is not installed.
        """
        try:
            from gtts import gTTS  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "gtts is not installed. Install it with: pip install gtts"
            ) from exc

        # gTTS writes into a file-like object; collect the output in memory.
        sink = io.BytesIO()
        speech = gTTS(text=text, lang=self._language)
        speech.write_to_fp(sink)
        return sink.getvalue()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class PyttsxTTSProvider(TTSProvider):
    """Text-to-speech backed by the optional ``pyttsx3`` library."""

    def synthesize(self, text: str) -> bytes:
        """Render *text* to a temporary ``.wav`` file and return its contents.

        The temporary file is removed afterwards, whether or not rendering
        succeeded.

        Raises
        ------
        ImportError
            If ``pyttsx3`` is not installed.
        """
        try:
            import pyttsx3  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "pyttsx3 is not installed. Install it with: pip install pyttsx3"
            ) from exc

        import tempfile

        engine = pyttsx3.init()
        # The engine writes to a path, so reserve one and release the
        # descriptor mkstemp opened.
        handle, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)
        try:
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
            with open(wav_path, "rb") as wav_file:
                rendered = wav_file.read()
        finally:
            # Best-effort cleanup: a failure to delete must not mask the result.
            try:
                os.remove(wav_path)
            except OSError:
                pass
        return rendered
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Provider name -> class instantiated by get_stt().
_STT_REGISTRY: dict[str, type[STTProvider]] = dict(
    whisper=WhisperSTTProvider,
    deepgram=DeepgramSTTProvider,
)

# Provider name -> class instantiated by get_tts().
_TTS_REGISTRY: dict[str, type[TTSProvider]] = dict(
    gtts=GTTSTTSProvider,
    elevenlabs=ElevenLabsTTSProvider,
    pyttsx3=PyttsxTTSProvider,
)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_stt(provider: str = "whisper") -> STTProvider:
    """Return a new STT provider instance for the registered name *provider*.

    Raises
    ------
    ValueError
        If *provider* is not a key of ``_STT_REGISTRY``.
    """
    # Look the class up separately from instantiating it, so that a KeyError
    # raised inside a provider's constructor is not reported as an unknown
    # provider name.
    provider_cls = _STT_REGISTRY.get(provider)
    if provider_cls is None:
        raise ValueError(
            f"Unknown STT provider: {provider!r}. Available: {list(_STT_REGISTRY)}"
        )
    return provider_cls()
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_tts(provider: str = "pyttsx3") -> TTSProvider:
    """Return a new TTS provider instance for the registered name *provider*.

    Raises
    ------
    ValueError
        If *provider* is not a key of ``_TTS_REGISTRY``.
    """
    # Look the class up separately from instantiating it, so that a KeyError
    # raised inside a provider's constructor is not reported as an unknown
    # provider name.
    provider_cls = _TTS_REGISTRY.get(provider)
    if provider_cls is None:
        raise ValueError(
            f"Unknown TTS provider: {provider!r}. Available: {list(_TTS_REGISTRY)}"
        )
    return provider_cls()
|
src/voice/vad.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from src.voice.models import VoiceConfig
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
import webrtcvad # type: ignore[import-untyped]
|
|
9
|
+
_VAD_AVAILABLE = True
|
|
10
|
+
except ImportError:
|
|
11
|
+
_VAD_AVAILABLE = False
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class VADProcessor:
    """Voice-activity detector built on ``webrtcvad``.

    Tracks whether speech is currently active: any speech frame activates
    it, and it is deactivated once enough consecutive non-speech frames have
    been seen to cover ``config.silence_timeout_ms``.  When ``webrtcvad`` is
    not installed every frame is treated as speech.
    """

    def __init__(self, config: VoiceConfig) -> None:
        self._config = config
        self._speech_active = False
        self._silent_frames = 0
        self._vad = webrtcvad.Vad(config.vad_aggressiveness) if _VAD_AVAILABLE else None

        # Number of consecutive silent frames that add up to
        # silence_timeout_ms (at least one); a zero frame duration falls
        # back to 20 ms.
        per_frame_ms = config.frame_duration_ms or 20
        self._silence_frame_limit = max(1, config.silence_timeout_ms // per_frame_ms)

    @property
    def is_speech_active(self) -> bool:
        """Whether speech is currently considered to be in progress."""
        return self._speech_active

    def process(self, frame: bytes) -> bool:
        """Classify one frame and update the speech-active state.

        Returns ``True`` when the frame contains speech.
        """
        detector = self._vad
        if detector is None or not _VAD_AVAILABLE:
            # Graceful degradation: without a detector everything is speech.
            self._speech_active = True
            self._silent_frames = 0
            return True

        voiced = detector.is_speech(frame, self._config.sample_rate)
        if not voiced:
            self._silent_frames += 1
            if self._silent_frames >= self._silence_frame_limit:
                self._speech_active = False
            return False

        self._speech_active = True
        self._silent_frames = 0
        return True
|
src/voice/voice_loop.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""VoiceLoop — connects mic input → STT → agent → TTS output.
|
|
2
|
+
|
|
3
|
+
Usage (push-to-talk):
|
|
4
|
+
loop = VoiceLoop(config, agent_fn)
|
|
5
|
+
audio_response = loop.push_to_talk(audio_bytes)
|
|
6
|
+
|
|
7
|
+
Usage (streaming / background):
|
|
8
|
+
loop = VoiceLoop(config, agent_fn)
|
|
9
|
+
loop.start() # launches background thread
|
|
10
|
+
...
|
|
11
|
+
loop.stop() # graceful shutdown
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import threading
|
|
17
|
+
from typing import TYPE_CHECKING, Callable
|
|
18
|
+
|
|
19
|
+
from src.voice.errors import STTError, TTSError, VoiceError
|
|
20
|
+
from src.voice.models import VoiceConfig
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from src.voice.providers import STTProvider, TTSProvider
|
|
24
|
+
|
|
25
|
+
__all__ = ["VoiceLoop", "VoiceLoopError"]
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class VoiceLoopError(VoiceError):
    """Signals an unrecoverable failure in the VoiceLoop pipeline."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class VoiceLoop:
    """Full mic → STT → agent → TTS pipeline.

    Parameters
    ----------
    config:
        Voice configuration (sample rate, provider names, …).
    agent_send_fn:
        Callable that accepts a text prompt and returns the agent's text reply.
    stt:
        Optional STT provider instance.  If *None* the provider is resolved via
        :func:`src.voice.providers.get_stt` using ``config.stt_provider``.
    tts:
        Optional TTS provider instance.  If *None* the provider is resolved via
        :func:`src.voice.providers.get_tts` using ``config.tts_provider``.
    """

    def __init__(
        self,
        config: VoiceConfig,
        agent_send_fn: Callable[[str], str],
        *,
        stt: STTProvider | None = None,
        tts: TTSProvider | None = None,
    ) -> None:
        self._config = config
        self._agent_send_fn = agent_send_fn
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None

        # Providers are imported lazily so the providers module is only
        # loaded when no instance was injected.
        if stt is not None:
            self._stt = stt
        else:
            from src.voice.providers import get_stt

            self._stt = get_stt(config.stt_provider)

        if tts is not None:
            self._tts = tts
        else:
            from src.voice.providers import get_tts

            self._tts = get_tts(config.tts_provider)

    # ------------------------------------------------------------------
    # Thread lifecycle
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Start the background listening thread.

        Calling *start* when the thread is already alive is a no-op.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._thread = threading.Thread(
            target=self._listen_loop,
            name="voice-loop",
            daemon=True,
        )
        self._thread.start()
        logger.debug("VoiceLoop: background thread started")

    def stop(self) -> None:
        """Signal the background thread to stop and wait up to 5 s for it to join.

        If the thread is still alive afterwards its reference is kept, so a
        later :meth:`start` stays a no-op until it has really finished and
        cannot clear the stop flag underneath it.
        """
        self._stop_event.set()
        worker = self._thread
        if worker is None:
            return
        # A thread cannot join itself (RuntimeError); when stop() is called
        # from the loop thread, the stop flag alone ends the loop.
        if worker is not threading.current_thread():
            worker.join(timeout=5.0)
        if worker.is_alive():
            logger.warning("VoiceLoop: background thread still running after stop request")
            return
        self._thread = None
        logger.debug("VoiceLoop: background thread stopped")

    def _listen_loop(self) -> None:
        """Background loop — runs until _stop_event is set."""
        logger.debug("VoiceLoop: listen loop entered")
        while not self._stop_event.is_set():
            # In a real implementation this would read frames from AudioCapture
            # and feed them through run_once().  For now we just wait.
            self._stop_event.wait(timeout=0.05)
        logger.debug("VoiceLoop: listen loop exited")

    # ------------------------------------------------------------------
    # Core pipeline
    # ------------------------------------------------------------------

    def run_once(self, audio_bytes: bytes) -> bytes:
        """Transcribe *audio_bytes* → call agent → synthesize → return audio bytes.

        Returns an empty ``bytes`` object if STT produced no speech (silence)
        or if the agent returned an empty reply (nothing to speak).

        Raises
        ------
        VoiceLoopError
            If the STT, agent, or TTS step fails.
        """
        # 1. Transcribe
        try:
            text = self._stt.transcribe(audio_bytes)
        except Exception as exc:
            raise VoiceLoopError(f"STT failed: {exc}") from exc

        if not text or not text.strip():
            return b""

        # 2. Send to agent
        try:
            response_text = self._agent_send_fn(text)
        except Exception as exc:
            raise VoiceLoopError(f"Agent call failed: {exc}") from exc

        if not response_text or not response_text.strip():
            # Nothing to say: skip synthesis instead of handing empty text
            # to the TTS backend.
            return b""

        # 3. Synthesize
        try:
            audio_response = self._tts.synthesize(response_text)
        except Exception as exc:
            raise VoiceLoopError(f"TTS failed: {exc}") from exc

        return audio_response

    def push_to_talk(self, audio_bytes: bytes) -> bytes:
        """Convenience alias — transcribe + agent + synthesize in one call.

        Identical to :meth:`run_once`; provided as a named entry-point that
        mirrors the push-to-talk UX pattern.
        """
        return self.run_once(audio_bytes)
|