python-voiceio 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_voiceio-0.2.0.dist-info/METADATA +260 -0
- python_voiceio-0.2.0.dist-info/RECORD +43 -0
- python_voiceio-0.2.0.dist-info/WHEEL +5 -0
- python_voiceio-0.2.0.dist-info/entry_points.txt +6 -0
- python_voiceio-0.2.0.dist-info/licenses/LICENSE +21 -0
- python_voiceio-0.2.0.dist-info/top_level.txt +1 -0
- voiceio/__init__.py +1 -0
- voiceio/__main__.py +3 -0
- voiceio/app.py +415 -0
- voiceio/backends.py +13 -0
- voiceio/cli.py +475 -0
- voiceio/config.py +136 -0
- voiceio/feedback.py +78 -0
- voiceio/health.py +194 -0
- voiceio/hotkeys/__init__.py +22 -0
- voiceio/hotkeys/base.py +27 -0
- voiceio/hotkeys/chain.py +83 -0
- voiceio/hotkeys/evdev.py +134 -0
- voiceio/hotkeys/pynput_backend.py +80 -0
- voiceio/hotkeys/socket_backend.py +77 -0
- voiceio/ibus/__init__.py +8 -0
- voiceio/ibus/engine.py +268 -0
- voiceio/platform.py +139 -0
- voiceio/recorder.py +208 -0
- voiceio/service.py +234 -0
- voiceio/sounds/__init__.py +0 -0
- voiceio/sounds/commit.wav +0 -0
- voiceio/sounds/start.wav +0 -0
- voiceio/sounds/stop.wav +0 -0
- voiceio/streaming.py +202 -0
- voiceio/transcriber.py +165 -0
- voiceio/tray.py +54 -0
- voiceio/typers/__init__.py +31 -0
- voiceio/typers/base.py +44 -0
- voiceio/typers/chain.py +79 -0
- voiceio/typers/clipboard.py +110 -0
- voiceio/typers/ibus.py +389 -0
- voiceio/typers/pynput_type.py +51 -0
- voiceio/typers/wtype.py +57 -0
- voiceio/typers/xdotool.py +45 -0
- voiceio/typers/ydotool.py +115 -0
- voiceio/wizard.py +882 -0
- voiceio/worker.py +39 -0
voiceio/ibus/engine.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""VoiceIO IBus engine: receives commands via Unix socket, injects text via IBus.
|
|
3
|
+
|
|
4
|
+
Run as a standalone process:
|
|
5
|
+
python3 -m voiceio.ibus.engine
|
|
6
|
+
|
|
7
|
+
Architecture:
|
|
8
|
+
- GLib main loop drives the IBus engine (required by IBus).
|
|
9
|
+
- Socket listener thread receives commands from voiceio daemon.
|
|
10
|
+
- Commands are dispatched to the engine via GLib.idle_add() for thread safety.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import logging.handlers
|
|
16
|
+
import os
|
|
17
|
+
import socket
|
|
18
|
+
import sys
|
|
19
|
+
import threading
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import gi
|
|
23
|
+
|
|
24
|
+
gi.require_version("IBus", "1.0")
|
|
25
|
+
from gi.repository import GLib, GObject, IBus
|
|
26
|
+
|
|
27
|
+
from voiceio.ibus import READY_PATH, SOCKET_PATH
|
|
28
|
+
|
|
29
|
+
log = logging.getLogger(__name__)
|
|
30
|
+
ENGINE_NAME = "voiceio"
|
|
31
|
+
COMPONENT_NAME = "org.voiceio.ibus"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class VoiceIOEngine(IBus.Engine):
    """IBus engine driven by socket commands instead of keystrokes.

    Key events are always declined; the engine only offers helpers to show,
    commit, or discard preedit text.
    """

    __gtype_name__ = "VoiceIOEngine"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Updated by the focus callbacks below.
        self._focused = False
        object_path = kwargs.get("object_path")
        log.info("VoiceIOEngine instance created (path=%s)", object_path)

    def do_focus_in(self):
        """Record that the engine gained focus."""
        self._focused = True

    def do_focus_out(self):
        """Record that the engine lost focus."""
        self._focused = False

    def do_process_key_event(self, keyval, keycode, state):
        """Decline every key event so typing is never intercepted.

        CRITICAL (original author's warning): this must never return True
        or raise, otherwise keyboard input stops working system-wide.
        """
        return False

    def preedit(self, text: str) -> None:
        """Show ``text`` as an underlined preedit preview; hide it if empty."""
        if text:
            length = len(text)
            preview = IBus.Text.new_from_string(text)
            # Underline the whole preview, from index 0 to its end.
            preview.append_attribute(
                IBus.AttrType.UNDERLINE,
                IBus.AttrUnderline.SINGLE,
                0,
                length,
            )
            # Cursor goes at the end of the preview; True makes it visible.
            self.update_preedit_text(preview, length, True)
        else:
            self.hide_preedit_text()

    def commit(self, text: str) -> None:
        """Hide the preedit, then commit ``text`` if it is non-empty."""
        self.hide_preedit_text()
        if not text:
            return
        self.commit_text(IBus.Text.new_from_string(text))

    def clear(self) -> None:
        """Hide the preedit without committing anything."""
        self.hide_preedit_text()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class VoiceIOEngineFactory(IBus.Factory):
    """Factory that builds ``VoiceIOEngine`` instances on explicit D-Bus paths.

    Each created engine gets the next numbered object path, is stored in the
    module-level ``_engine`` reference, and the process id is written to
    ``READY_PATH`` as a readiness signal.
    """

    __gtype_name__ = "VoiceIOEngineFactory"
    # Class-wide counter used to give each engine a distinct object path.
    _engine_count = 0

    def __init__(self, bus):
        """Create the factory at ``IBus.PATH_FACTORY`` on ``bus``'s connection."""
        self._bus = bus
        super().__init__(
            object_path=IBus.PATH_FACTORY,
            connection=bus.get_connection(),
        )
        log.info("VoiceIOEngineFactory created")

    def do_create_engine(self, engine_name):
        """Create an engine for ``engine_name`` and publish it module-wide.

        Returns:
            The newly created ``VoiceIOEngine``.
        """
        global _engine
        VoiceIOEngineFactory._engine_count += 1
        obj_path = f"/org/freedesktop/IBus/Engine/{VoiceIOEngineFactory._engine_count}"
        log.info("Creating engine '%s' at %s", engine_name, obj_path)
        engine = VoiceIOEngine(
            engine_name=engine_name,
            object_path=obj_path,
            connection=self._bus.get_connection(),
        )
        _engine = engine

        # Signal readiness to the voiceio daemon. Failure is non-fatal (the
        # engine itself works), but it is logged so a missing ready file can
        # be diagnosed instead of silently ignored.
        try:
            READY_PATH.write_text(str(os.getpid()))
        except OSError as exc:
            log.warning(
                "Could not write engine ready signal to %s: %s", READY_PATH, exc,
            )
        else:
            log.info("Engine ready signal written to %s", READY_PATH)
        return engine
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Global engine reference (set when factory creates the engine)
|
|
118
|
+
_engine: VoiceIOEngine | None = None
|
|
119
|
+
_pending_commands: list[str] = []
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _socket_listener(mainloop: GLib.MainLoop) -> None:
    """Receive commands on a Unix datagram socket; meant to run in a thread.

    Binds ``SOCKET_PATH`` and polls it with a one-second timeout for as long
    as ``mainloop`` reports that it is running. A ``ping`` datagram is
    answered with ``pong`` when the sender has an address; every other
    message is handed to ``_handle_command`` via ``GLib.idle_add``.

    The socket is closed and the socket file removed on every exit path,
    including a failed ``bind`` or an unexpected exception in the loop.
    """
    # Remove a stale socket file left behind by a previous run.
    SOCKET_PATH.unlink(missing_ok=True)
    with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as sock:
        try:
            sock.bind(str(SOCKET_PATH))
            # Short timeout so the loop condition is re-checked regularly.
            sock.settimeout(1.0)
            log.info("Socket listener started at %s", SOCKET_PATH)

            while mainloop.is_running():
                try:
                    data, addr = sock.recvfrom(65536)
                except socket.timeout:
                    continue
                except OSError:
                    break

                msg = data.decode("utf-8", errors="replace")
                log.debug("Received: %s", msg[:80])

                if msg == "ping":
                    # Respond to probe: send pong back (needs a sender address).
                    if addr:
                        try:
                            sock.sendto(b"pong", addr)
                        except OSError:
                            pass
                    continue

                # Dispatch to engine on GLib main thread
                GLib.idle_add(_handle_command, msg)
        finally:
            SOCKET_PATH.unlink(missing_ok=True)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _flush_pending() -> None:
    """Dispatch every buffered command, oldest first, emptying the buffer."""
    queue = _pending_commands
    while len(queue) > 0:
        oldest = queue.pop(0)
        _dispatch(oldest)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _dispatch(msg: str) -> None:
    """Run one command against the engine.

    Understands ``preedit:<text>``, ``commit:<text>`` and ``clear``. Anything
    else is logged as unknown. Any exception raised while handling the
    command is logged and swallowed.
    """
    preedit_tag, commit_tag = "preedit:", "commit:"
    try:
        if msg.startswith(preedit_tag):
            _engine.preedit(msg[len(preedit_tag):])
            return
        if msg.startswith(commit_tag):
            _engine.commit(msg[len(commit_tag):])
            return
        if msg == "clear":
            _engine.clear()
            return
        log.warning("Unknown command: %s", msg[:40])
    except Exception:
        log.exception("Error dispatching command: %s", msg[:40])
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _handle_command(msg: str) -> bool:
    """Process one command; scheduled on the GLib main thread via idle_add.

    While no engine exists the command is buffered. Once an engine exists,
    buffered commands are replayed first and then ``msg`` is dispatched.

    Returns:
        Always False, so GLib removes the idle source after a single run.
    """
    if _engine is not None:
        backlog = len(_pending_commands)
        if backlog:
            log.info("Engine ready, flushing %d buffered commands", backlog)
            _flush_pending()
        _dispatch(msg)
    else:
        log.debug("Engine not ready, buffering command: %s", msg[:40])
        _pending_commands.append(msg)
    return False
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def main() -> None:
    """Run the VoiceIO IBus engine process until the GLib main loop ends.

    Steps: configure logging (stderr plus a rotating file under
    ``XDG_RUNTIME_DIR`` or ``/tmp``), connect to IBus, create the engine
    factory, register the component and engine description, request the
    component bus name, then run the main loop with the socket listener in a
    background thread. The socket and ready files are removed on exit.

    Exits with status 1 when the IBus daemon cannot be reached.
    """
    # Log to file so we can debug when IBus spawns us
    log_path = Path(os.environ.get("XDG_RUNTIME_DIR", "/tmp")) / "voiceio-ibus-engine.log"
    handlers: list[logging.Handler] = [logging.StreamHandler()]
    file_log_error: OSError | None = None
    try:
        handlers.append(
            logging.handlers.RotatingFileHandler(
                str(log_path), maxBytes=1_000_000, backupCount=1,
            )
        )
    except OSError as exc:
        # An unwritable log location must not stop the engine from starting;
        # fall back to stderr only and report the problem once logging is up.
        file_log_error = exc
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        handlers=handlers,
    )
    if file_log_error is not None:
        log.warning("Cannot open log file %s: %s", log_path, file_log_error)

    IBus.init()
    bus = IBus.Bus()

    if not bus.is_connected():
        log.error("Cannot connect to IBus daemon. Is IBus running?")
        sys.exit(1)

    # Register GTypes
    GObject.type_register(VoiceIOEngine)
    GObject.type_register(VoiceIOEngineFactory)

    # Create custom factory (registers on D-Bus at IBus.PATH_FACTORY).
    # Keep a reference for the lifetime of main() so the factory's lifetime
    # does not depend on something else holding on to it.
    factory = VoiceIOEngineFactory(bus)

    # Register component so IBus knows about our engine
    component = IBus.Component.new(
        COMPONENT_NAME,
        "VoiceIO voice input",
        "1.0",
        "MIT",
        "voiceio",
        "",
        "",
        "voiceio",
    )
    engine_desc = IBus.EngineDesc.new(
        ENGINE_NAME,
        "VoiceIO",
        "Voice-to-text input",
        "other",
        "MIT",
        "voiceio",
        "",
        "us",
    )
    component.add_engine(engine_desc)
    bus.register_component(component)

    log.info("VoiceIO IBus engine registered with custom factory")
    bus.request_name(COMPONENT_NAME, 0)

    mainloop = GLib.MainLoop()

    # Socket listener runs in a background thread.
    listener = threading.Thread(
        target=_socket_listener, args=(mainloop,), daemon=True,
    )

    def _start_listener() -> bool:
        """Start the listener from inside the running main loop (one-shot)."""
        listener.start()
        return False

    # The listener loops only while mainloop.is_running() is true, and that
    # is false until run() has been entered. Starting the thread from an idle
    # callback guarantees the loop is already running when it first checks.
    GLib.idle_add(_start_listener)

    try:
        mainloop.run()
    except KeyboardInterrupt:
        pass
    finally:
        SOCKET_PATH.unlink(missing_ok=True)
        READY_PATH.unlink(missing_ok=True)
        log.info("VoiceIO IBus engine stopped")
        del factory


if __name__ == "__main__":
    main()
|
voiceio/platform.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Platform detection: OS, display server, desktop environment, available tools."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class Platform:
    """Immutable snapshot of the detected runtime environment."""

    # Core identification.
    os: str  # "linux", "darwin", "windows"
    display_server: str  # "x11", "wayland", "quartz", "unknown"
    desktop: str  # "gnome", "kde", "sway", "hyprland", "macos", "unknown"

    # Which helper executables are available.
    has_xdotool: bool = False
    has_ydotool: bool = False
    has_wtype: bool = False
    has_xclip: bool = False
    has_wl_copy: bool = False
    has_dotool: bool = False
    has_ibus: bool = False

    # Permission checks.
    has_input_group: bool = False
    has_uinput_access: bool = False

    @property
    def is_linux(self) -> bool:
        """True when ``os`` is ``"linux"``."""
        return "linux" == self.os

    @property
    def is_mac(self) -> bool:
        """True when ``os`` is ``"darwin"``."""
        return "darwin" == self.os

    @property
    def is_wayland(self) -> bool:
        """True when ``display_server`` is ``"wayland"``."""
        return "wayland" == self.display_server

    @property
    def is_x11(self) -> bool:
        """True when ``display_server`` is ``"x11"``."""
        return "x11" == self.display_server

    @property
    def is_gnome(self) -> bool:
        """True when ``desktop`` is ``"gnome"`` or ``"unity"``."""
        return self.desktop == "gnome" or self.desktop == "unity"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _detect_os() -> str:
|
|
52
|
+
if sys.platform.startswith("linux"):
|
|
53
|
+
return "linux"
|
|
54
|
+
if sys.platform == "darwin":
|
|
55
|
+
return "darwin"
|
|
56
|
+
if sys.platform == "win32":
|
|
57
|
+
return "windows"
|
|
58
|
+
return "unknown"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _detect_display_server() -> str:
    """Return "quartz", "wayland", "x11" or "unknown" for this session.

    macOS is always "quartz" and Windows always "unknown". Otherwise
    ``XDG_SESSION_TYPE`` decides when it names wayland or x11, falling back
    to the presence of ``WAYLAND_DISPLAY`` and then ``DISPLAY``.
    """
    host_os = _detect_os()
    if host_os == "darwin":
        return "quartz"
    if host_os == "windows":
        return "unknown"

    env = os.environ
    declared = env.get("XDG_SESSION_TYPE", "").lower()
    if declared in ("wayland", "x11"):
        return declared

    # Fallback heuristics: Wayland is checked before X11.
    for variable, server in (("WAYLAND_DISPLAY", "wayland"), ("DISPLAY", "x11")):
        if env.get(variable):
            return server

    return "unknown"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _detect_desktop() -> str:
    """Return a normalized desktop-environment name.

    macOS yields "macos". Otherwise the lower-cased ``XDG_CURRENT_DESKTOP``
    is searched for known names; an unrecognized non-empty value yields its
    first ``:``-separated component, and an empty value yields "unknown".
    """
    if _detect_os() == "darwin":
        return "macos"

    current = os.environ.get("XDG_CURRENT_DESKTOP", "").lower()

    # (result, substrings that select it); checked in this order.
    known = (
        ("gnome", ("gnome",)),
        ("kde", ("kde", "plasma")),
        ("sway", ("sway",)),
        ("hyprland", ("hyprland",)),
    )
    for label, needles in known:
        if any(needle in current for needle in needles):
            return label

    if not current:
        return "unknown"
    return current.split(":")[0]  # take first component
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _check_input_group() -> bool:
|
|
105
|
+
try:
|
|
106
|
+
import grp
|
|
107
|
+
input_gid = grp.getgrnam("input").gr_gid
|
|
108
|
+
return input_gid in os.getgroups()
|
|
109
|
+
except (KeyError, ImportError):
|
|
110
|
+
return False
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _check_uinput_access() -> bool:
|
|
114
|
+
try:
|
|
115
|
+
with open("/dev/uinput", "rb"):
|
|
116
|
+
pass
|
|
117
|
+
return True
|
|
118
|
+
except (PermissionError, FileNotFoundError, OSError):
|
|
119
|
+
return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@lru_cache(maxsize=1)
def detect() -> Platform:
    """Build the ``Platform`` description for this machine.

    The result is cached by ``lru_cache``, so repeated calls return the same
    object without re-running detection.
    """
    host_os = _detect_os()
    on_linux = host_os == "linux"

    def installed(tool: str) -> bool:
        """True when ``tool`` is found on PATH."""
        return shutil.which(tool) is not None

    return Platform(
        os=host_os,
        display_server=_detect_display_server(),
        desktop=_detect_desktop(),
        has_xdotool=installed("xdotool"),
        has_ydotool=installed("ydotool"),
        has_wtype=installed("wtype"),
        has_xclip=installed("xclip"),
        has_wl_copy=installed("wl-copy"),
        has_dotool=installed("dotool"),
        has_ibus=installed("ibus"),
        # Permission probes only run on Linux.
        has_input_group=on_linux and _check_input_group(),
        has_uinput_access=on_linux and _check_uinput_access(),
    )
|
voiceio/recorder.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Audio capture with pre-buffer ring to prevent clipping."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import threading
|
|
6
|
+
from typing import TYPE_CHECKING, Callable
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import sounddevice as sd
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from voiceio.config import AudioConfig
|
|
13
|
+
|
|
14
|
+
log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class RingBuffer:
    """Fixed-capacity circular store of float32 samples; oldest overwritten first."""

    def __init__(self, max_samples: int):
        self._max = max_samples
        self._buf = np.zeros(max_samples, dtype=np.float32)
        self._write_pos = 0  # index where the next sample is written
        self._filled = 0  # number of valid samples, at most _max

    def append(self, data: np.ndarray) -> None:
        """Add the flattened contents of ``data``, overwriting the oldest samples."""
        capacity = self._max
        if capacity == 0:
            return
        samples = data.flatten()
        count = len(samples)

        if count >= capacity:
            # Input alone fills the buffer: keep only its last `capacity` samples.
            self._buf[:] = samples[-capacity:]
            self._write_pos, self._filled = 0, capacity
            return

        # Destination indices, wrapping past the end of the storage.
        start = self._write_pos
        slots = (start + np.arange(count)) % capacity
        self._buf[slots] = samples

        self._write_pos = (start + count) % capacity
        self._filled = min(capacity, self._filled + count)

    def get(self) -> np.ndarray:
        """Return a copy of the buffered samples, oldest first."""
        if self._filled == 0:
            return np.zeros(0, dtype=np.float32)
        if self._filled < self._max:
            # Not yet wrapped: valid data is the leading `_filled` samples.
            return self._buf[:self._filled].copy()
        # Full ring: the oldest sample sits at the write position.
        return np.roll(self._buf, -self._write_pos)

    def clear(self) -> None:
        """Mark the buffer empty (the storage itself is not zeroed)."""
        self._write_pos = 0
        self._filled = 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class AudioRecorder:
    """Audio recorder with always-on pre-buffer ring.

    The audio stream runs continuously. A ring buffer captures the last
    `prebuffer_secs` of audio. When recording starts, the ring buffer
    contents become the start of the recording, so no first syllable is lost.
    """

    def __init__(self, cfg: AudioConfig, on_speech_pause: Callable[[], None] | None = None):
        """Read settings from ``cfg``; no audio stream is opened here.

        Args:
            cfg: Supplies ``sample_rate``, ``device``, ``prebuffer_secs``,
                ``silence_threshold`` and ``silence_duration``.
            on_speech_pause: Optional zero-argument callable invoked from the
                stream callback when a pause in speech is detected.
        """
        self.sample_rate = cfg.sample_rate
        # The string "default" selects the library's default input device.
        self.device = None if cfg.device == "default" else cfg.device
        self.prebuffer_secs = cfg.prebuffer_secs

        self._ring = RingBuffer(int(self.prebuffer_secs * self.sample_rate))
        self._chunks: list[np.ndarray] = []
        self._stream: sd.InputStream | None = None
        self._lock = threading.Lock()
        self._recording = False

        # Streaming VAD
        self._on_speech_pause = on_speech_pause
        self._silence_threshold = cfg.silence_threshold
        self._silence_duration = cfg.silence_duration
        self._silent_chunks = 0.0  # accumulated silence, in seconds
        self._last_transcribed_len = 0  # samples already transcribed
        self._total_samples = 0  # samples collected in this recording

    def open_stream(self) -> None:
        """Start the always-on audio stream (feeds ring buffer)."""
        if self._stream is not None:
            return
        self._stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype="float32",
            device=self.device,
            callback=self._callback,
        )
        self._stream.start()
        log.debug("Audio stream opened (prebuffer=%.1fs)", self.prebuffer_secs)

    def close_stream(self) -> None:
        """Stop the always-on audio stream and discard the pre-buffer."""
        if self._stream is not None:
            self._stream.stop()
            self._stream.close()
            self._stream = None
        self._ring.clear()

    def start(self) -> None:
        """Start recording. Grabs ring buffer contents as the beginning."""
        with self._lock:
            if self._recording:
                return
            # Ensure stream is running
            if self._stream is None:
                self.open_stream()
            # Grab pre-buffer; stored as a column so it concatenates with the
            # (frames, 1) chunks appended by the stream callback.
            prebuf = self._ring.get()
            self._chunks = [prebuf.reshape(-1, 1)] if len(prebuf) > 0 else []
            self._total_samples = sum(len(c) for c in self._chunks)
            self._silent_chunks = 0.0
            self._last_transcribed_len = 0
            self._recording = True
            prebuf_ms = len(prebuf) / self.sample_rate * 1000
            log.info("Recording started (%.0fms pre-buffer)", prebuf_ms)

    def stop(self) -> np.ndarray | None:
        """Stop recording and return the audio not yet marked as transcribed.

        Returns:
            A flat array of the remaining samples, or None when not
            recording, nothing was captured, or the remainder is shorter
            than 0.3 seconds.
        """
        with self._lock:
            if not self._recording:
                return None
            self._recording = False

            if not self._chunks:
                log.warning("No audio captured")
                return None

            audio = np.concatenate(self._chunks, axis=0).flatten()
            remaining = audio[self._last_transcribed_len:]
            duration = len(remaining) / self.sample_rate

            if duration < 0.3:
                # A short tail after earlier transcription is expected, so
                # only warn when nothing was transcribed at all.
                if self._last_transcribed_len == 0:
                    log.warning("Audio too short (%.1fs), skipping", duration)
                return None

            log.info("Recording stopped, %.1fs audio", duration)
            return remaining

    def get_audio_so_far(self) -> np.ndarray | None:
        """Get all audio captured so far (for streaming), or None if empty."""
        with self._lock:
            if not self._chunks:
                return None
            return np.concatenate(self._chunks, axis=0).flatten()

    def set_on_speech_pause(self, callback: Callable[[], None] | None) -> None:
        """Set/clear the speech pause callback (used by streaming session)."""
        self._on_speech_pause = callback

    def mark_transcribed(self, num_samples: int) -> None:
        """Record that the first ``num_samples`` samples have been transcribed."""
        self._last_transcribed_len = num_samples

    @property
    def is_recording(self) -> bool:
        """Whether a recording is currently in progress."""
        return self._recording

    def _callback(
        self, indata: np.ndarray, frames: int, time_info: object, status: object
    ) -> None:
        """Stream callback: feed the ring, collect chunks, detect pauses."""
        if status:
            log.warning("Audio stream status: %s", status)

        # Always feed ring buffer
        self._ring.append(indata)

        # Only collect chunks when recording
        if not self._recording:
            return

        chunk = indata.copy()
        self._chunks.append(chunk)
        self._total_samples += chunk.shape[0]

        # Streaming VAD. Snapshot the callback once: set_on_speech_pause(None)
        # may run on another thread between the None check and the call, and
        # re-reading the attribute could then try to call None.
        on_pause = self._on_speech_pause
        if on_pause is None:
            return

        flat = indata.ravel()
        rms = float(np.sqrt(np.dot(flat, flat) / len(flat)))
        chunk_secs = frames / self.sample_rate

        if rms < self._silence_threshold:
            self._silent_chunks += chunk_secs
        else:
            self._silent_chunks = 0.0

        # Require more than one second of audio beyond what was transcribed.
        has_new = self._total_samples > self._last_transcribed_len + self.sample_rate

        if self._silent_chunks >= self._silence_duration and has_new:
            self._silent_chunks = 0.0
            # Signal pause. Don't concatenate on the audio thread.
            on_pause()
|