python-voiceio 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
voiceio/ibus/engine.py ADDED
@@ -0,0 +1,268 @@
1
+ #!/usr/bin/env python3
2
+ """VoiceIO IBus engine: receives commands via Unix socket, injects text via IBus.
3
+
4
+ Run as a standalone process:
5
+ python3 -m voiceio.ibus.engine
6
+
7
+ Architecture:
8
+ - GLib main loop drives the IBus engine (required by IBus).
9
+ - Socket listener thread receives commands from voiceio daemon.
10
+ - Commands are dispatched to the engine via GLib.idle_add() for thread safety.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import logging.handlers
16
+ import os
17
+ import socket
18
+ import sys
19
+ import threading
20
+ from pathlib import Path
21
+
22
+ import gi
23
+
24
+ gi.require_version("IBus", "1.0")
25
+ from gi.repository import GLib, GObject, IBus
26
+
27
+ from voiceio.ibus import READY_PATH, SOCKET_PATH
28
+
29
# Module logger; main() attaches the stream and rotating-file handlers.
log = logging.getLogger(__name__)
# Engine name used for the IBus.EngineDesc that main() builds.
ENGINE_NAME = "voiceio"
# Component name passed to IBus.Component.new() and bus.request_name() in main().
COMPONENT_NAME = "org.voiceio.ibus"
32
+
33
+
34
class VoiceIOEngine(IBus.Engine):
    """IBus engine whose text is supplied by socket commands, not by key presses.

    Key events are never consumed; the only way text reaches the focused
    application is through :meth:`preedit`, :meth:`commit` and :meth:`clear`.
    """

    __gtype_name__ = "VoiceIOEngine"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Updated by do_focus_in / do_focus_out.
        self._focused = False
        log.info("VoiceIOEngine instance created (path=%s)", kwargs.get("object_path"))

    def do_focus_in(self):
        """Record that an input context gained focus."""
        self._focused = True

    def do_focus_out(self):
        """Record that the input context lost focus."""
        self._focused = False

    def do_process_key_event(self, keyval, keycode, state):
        """Decline every key event so normal typing is never intercepted."""
        # CRITICAL: this must always return False and must never raise.
        # Returning True (or raising) would kill ALL keyboard input system-wide.
        return False

    def preedit(self, text: str) -> None:
        """Display ``text`` as an underlined preedit preview; empty text hides it."""
        if text:
            length = len(text)
            preview = IBus.Text.new_from_string(text)
            # Underline the whole preview string.
            preview.append_attribute(
                IBus.AttrType.UNDERLINE,
                IBus.AttrUnderline.SINGLE,
                0,
                length,
            )
            # Cursor goes at the end of the preview; the preview is visible.
            self.update_preedit_text(preview, length, True)
        else:
            self.hide_preedit_text()

    def commit(self, text: str) -> None:
        """Hide the preedit and commit ``text`` (nothing is committed if empty)."""
        self.hide_preedit_text()
        if not text:
            return
        self.commit_text(IBus.Text.new_from_string(text))

    def clear(self) -> None:
        """Hide the preedit without committing anything."""
        self.hide_preedit_text()
81
+
82
+
83
class VoiceIOEngineFactory(IBus.Factory):
    """Custom factory that creates engine instances with proper D-Bus object paths."""

    __gtype_name__ = "VoiceIOEngineFactory"
    # Number of engines created so far; used to build unique object paths.
    _engine_count = 0

    def __init__(self, bus):
        """Register the factory at ``IBus.PATH_FACTORY`` on ``bus``'s connection.

        Args:
            bus: Connected ``IBus.Bus``; its connection is reused for every
                engine this factory creates.
        """
        self._bus = bus
        super().__init__(
            object_path=IBus.PATH_FACTORY,
            connection=bus.get_connection(),
        )
        log.info("VoiceIOEngineFactory created")

    def do_create_engine(self, engine_name):
        """Create a new engine, publish it as the module-level ``_engine``, signal readiness.

        Args:
            engine_name: Engine name supplied by IBus.

        Returns:
            The newly created ``VoiceIOEngine``.
        """
        global _engine
        VoiceIOEngineFactory._engine_count += 1
        obj_path = f"/org/freedesktop/IBus/Engine/{VoiceIOEngineFactory._engine_count}"
        log.info("Creating engine '%s' at %s", engine_name, obj_path)
        engine = VoiceIOEngine(
            engine_name=engine_name,
            object_path=obj_path,
            connection=self._bus.get_connection(),
        )
        _engine = engine
        # Signal readiness to the voiceio daemon by writing our PID.
        try:
            READY_PATH.write_text(str(os.getpid()))
        except OSError as exc:
            # Still best-effort (the engine itself works), but a missing ready
            # file must be visible in the log instead of failing silently.
            log.warning("Could not write engine ready signal to %s: %s", READY_PATH, exc)
        else:
            log.info("Engine ready signal written to %s", READY_PATH)
        return engine
115
+
116
+
117
# Global engine reference: assigned by VoiceIOEngineFactory.do_create_engine()
# and None until the factory has created an engine.
_engine: VoiceIOEngine | None = None
# Commands that arrived while _engine was still None, oldest first. They are
# replayed by _flush_pending() when the next command is handled after the
# engine exists.
_pending_commands: list[str] = []
120
+
121
+
122
def _socket_listener(mainloop: GLib.MainLoop) -> None:
    """Listen for commands on a Unix DGRAM socket. Runs in a thread.

    ``"ping"`` datagrams are answered with ``b"pong"`` directly from this
    thread. Every other message is handed to ``_handle_command`` through
    ``GLib.idle_add`` so that it runs on the GLib main thread.

    The loop ends when ``mainloop`` is no longer running or the socket fails.
    The socket is always closed, and the socket file is removed once it has
    been bound, even if an unexpected exception escapes the loop.

    Args:
        mainloop: The GLib main loop whose running state keeps the listener alive.
    """
    # Remove a stale socket file left by a previous run; bind() would fail otherwise.
    SOCKET_PATH.unlink(missing_ok=True)
    with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as sock:
        sock.bind(str(SOCKET_PATH))
        try:
            # Short timeout so the loop condition is re-checked about once a second.
            sock.settimeout(1.0)
            log.info("Socket listener started at %s", SOCKET_PATH)

            while mainloop.is_running():
                try:
                    data, addr = sock.recvfrom(65536)
                except socket.timeout:
                    continue
                except OSError:
                    break

                msg = data.decode("utf-8", errors="replace")
                log.debug("Received: %s", msg[:80])

                if msg == "ping":
                    # Respond to probe: send pong back (only possible when
                    # the sender supplied a return address).
                    if addr:
                        try:
                            sock.sendto(b"pong", addr)
                        except OSError:
                            pass
                    continue

                # Dispatch to engine on GLib main thread
                GLib.idle_add(_handle_command, msg)
        finally:
            # Only reached after a successful bind, so the file is ours to remove.
            SOCKET_PATH.unlink(missing_ok=True)
155
+
156
+
157
def _flush_pending() -> None:
    """Dispatch every buffered command, oldest first, leaving the buffer empty."""
    while len(_pending_commands) > 0:
        oldest = _pending_commands.pop(0)
        _dispatch(oldest)
161
+
162
+
163
def _dispatch(msg: str) -> None:
    """Run one command against the global engine.

    Recognised forms are ``preedit:<text>``, ``commit:<text>`` and ``clear``.
    Anything else is logged as unknown. Any exception raised while executing
    the command is logged and swallowed.
    """
    preedit_tag = "preedit:"
    commit_tag = "commit:"
    try:
        if msg == "clear":
            _engine.clear()
        elif msg.startswith(preedit_tag):
            _engine.preedit(msg[len(preedit_tag):])
        elif msg.startswith(commit_tag):
            _engine.commit(msg[len(commit_tag):])
        else:
            log.warning("Unknown command: %s", msg[:40])
    except Exception:
        log.exception("Error dispatching command: %s", msg[:40])
176
+
177
+
178
def _handle_command(msg: str) -> bool:
    """GLib idle callback: run ``msg`` now, or buffer it until an engine exists.

    Always returns False so GLib removes the idle source after a single call.
    """
    if _engine is not None:
        backlog = len(_pending_commands)
        if backlog > 0:
            # Older buffered commands must run before the new one.
            log.info("Engine ready, flushing %d buffered commands", backlog)
            _flush_pending()
        _dispatch(msg)
    else:
        log.debug("Engine not ready, buffering command: %s", msg[:40])
        _pending_commands.append(msg)
    return False
192
+
193
+
194
def main() -> None:
    """Run the VoiceIO IBus engine process until the GLib main loop exits.

    Configures logging (stderr plus a rotating file), connects to IBus,
    registers the engine factory and component, starts the socket listener
    thread and then runs the GLib main loop. Exits with status 1 when the
    IBus daemon is not reachable. The socket and ready files are removed on
    the way out.
    """
    # Log to file so we can debug when IBus spawns us
    log_path = Path(os.environ.get("XDG_RUNTIME_DIR", "/tmp")) / "voiceio-ibus-engine.log"
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        handlers=[
            logging.StreamHandler(),
            logging.handlers.RotatingFileHandler(
                str(log_path), maxBytes=1_000_000, backupCount=1,
            ),
        ],
    )

    IBus.init()
    bus = IBus.Bus()

    if not bus.is_connected():
        log.error("Cannot connect to IBus daemon. Is IBus running?")
        sys.exit(1)

    # Register GTypes
    GObject.type_register(VoiceIOEngine)
    GObject.type_register(VoiceIOEngineFactory)

    # Create custom factory (registers on D-Bus at IBus.PATH_FACTORY). Keep a
    # reference for the lifetime of the process so the factory object cannot
    # be garbage-collected while IBus still expects it to be registered.
    factory = VoiceIOEngineFactory(bus)  # noqa: F841

    # Register component so IBus knows about our engine
    component = IBus.Component.new(
        COMPONENT_NAME,
        "VoiceIO voice input",
        "1.0",
        "MIT",
        "voiceio",
        "",
        "",
        "voiceio",
    )
    engine_desc = IBus.EngineDesc.new(
        ENGINE_NAME,
        "VoiceIO",
        "Voice-to-text input",
        "other",
        "MIT",
        "voiceio",
        "",
        "us",
    )
    component.add_engine(engine_desc)
    bus.register_component(component)

    log.info("VoiceIO IBus engine registered with custom factory")
    bus.request_name(COMPONENT_NAME, 0)

    mainloop = GLib.MainLoop()

    # Socket listener runs in a background thread.
    listener = threading.Thread(
        target=_socket_listener, args=(mainloop,), daemon=True,
    )

    def _start_listener() -> bool:
        """Idle callback: start the listener once the main loop is running."""
        listener.start()
        return False  # run once, don't repeat

    # The listener loops on mainloop.is_running(). Starting the thread before
    # mainloop.run() would let it observe "not running" and exit immediately,
    # so start it from inside the loop instead.
    GLib.idle_add(_start_listener)

    try:
        mainloop.run()
    except KeyboardInterrupt:
        pass
    finally:
        SOCKET_PATH.unlink(missing_ok=True)
        READY_PATH.unlink(missing_ok=True)
        log.info("VoiceIO IBus engine stopped")


if __name__ == "__main__":
    main()
voiceio/platform.py ADDED
@@ -0,0 +1,139 @@
1
+ """Platform detection: OS, display server, desktop environment, available tools."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import shutil
6
+ import sys
7
+ from dataclasses import dataclass
8
+ from functools import lru_cache
9
+
10
+
11
@dataclass(frozen=True)
class Platform:
    """Immutable snapshot of the host: OS, display stack, tools and permissions."""

    os: str  # one of "linux", "darwin", "windows"
    display_server: str  # one of "x11", "wayland", "quartz", "unknown"
    desktop: str  # e.g. "gnome", "kde", "sway", "hyprland", "macos", "unknown"

    # Which helper executables were found
    has_xdotool: bool = False
    has_ydotool: bool = False
    has_wtype: bool = False
    has_xclip: bool = False
    has_wl_copy: bool = False
    has_dotool: bool = False
    has_ibus: bool = False

    # Permission checks
    has_input_group: bool = False
    has_uinput_access: bool = False

    @property
    def is_linux(self) -> bool:
        """True when ``os`` is ``"linux"``."""
        return self.os == "linux"

    @property
    def is_mac(self) -> bool:
        """True when ``os`` is ``"darwin"``."""
        return self.os == "darwin"

    @property
    def is_wayland(self) -> bool:
        """True when ``display_server`` is ``"wayland"``."""
        return self.display_server == "wayland"

    @property
    def is_x11(self) -> bool:
        """True when ``display_server`` is ``"x11"``."""
        return self.display_server == "x11"

    @property
    def is_gnome(self) -> bool:
        """True when ``desktop`` is ``"gnome"`` or ``"unity"``."""
        return self.desktop == "gnome" or self.desktop == "unity"
49
+
50
+
51
+ def _detect_os() -> str:
52
+ if sys.platform.startswith("linux"):
53
+ return "linux"
54
+ if sys.platform == "darwin":
55
+ return "darwin"
56
+ if sys.platform == "win32":
57
+ return "windows"
58
+ return "unknown"
59
+
60
+
61
def _detect_display_server() -> str:
    """Return "quartz", "wayland", "x11" or "unknown" for the current session.

    macOS is always "quartz" and Windows always "unknown". Elsewhere
    ``XDG_SESSION_TYPE`` decides when it names wayland or x11; otherwise a
    non-empty ``WAYLAND_DISPLAY`` or ``DISPLAY`` is used, in that order.
    """
    host_os = _detect_os()
    if host_os == "darwin":
        return "quartz"
    if host_os == "windows":
        return "unknown"

    declared = os.environ.get("XDG_SESSION_TYPE", "").lower()
    if declared in ("wayland", "x11"):
        return declared

    # Fallback heuristics: Wayland is checked before X11.
    for env_var, server in (("WAYLAND_DISPLAY", "wayland"), ("DISPLAY", "x11")):
        if os.environ.get(env_var):
            return server

    return "unknown"
81
+
82
+
83
def _detect_desktop() -> str:
    """Return a short desktop-environment name derived from ``XDG_CURRENT_DESKTOP``.

    macOS is always "macos". Known desktops are matched by substring; any
    other non-empty value yields its first colon-separated component, and an
    unset/empty variable yields "unknown".
    """
    if _detect_os() == "darwin":
        return "macos"

    current = os.environ.get("XDG_CURRENT_DESKTOP", "").lower()
    if not current:
        return "unknown"

    # (substring, reported name), checked in priority order.
    known_desktops = (
        ("gnome", "gnome"),
        ("kde", "kde"),
        ("plasma", "kde"),
        ("sway", "sway"),
        ("hyprland", "hyprland"),
    )
    for needle, label in known_desktops:
        if needle in current:
            return label

    return current.split(":")[0]  # take first component
102
+
103
+
104
+ def _check_input_group() -> bool:
105
+ try:
106
+ import grp
107
+ input_gid = grp.getgrnam("input").gr_gid
108
+ return input_gid in os.getgroups()
109
+ except (KeyError, ImportError):
110
+ return False
111
+
112
+
113
+ def _check_uinput_access() -> bool:
114
+ try:
115
+ with open("/dev/uinput", "rb"):
116
+ pass
117
+ return True
118
+ except (PermissionError, FileNotFoundError, OSError):
119
+ return False
120
+
121
+
122
@lru_cache(maxsize=1)
def detect() -> Platform:
    """Build the ``Platform`` description of this machine.

    The result is cached by ``lru_cache``, so repeated calls return the same
    object without probing again.
    """
    current_os = _detect_os()
    on_linux = current_os == "linux"

    def available(tool: str) -> bool:
        """True when ``tool`` is found on PATH."""
        return shutil.which(tool) is not None

    return Platform(
        os=current_os,
        display_server=_detect_display_server(),
        desktop=_detect_desktop(),
        has_xdotool=available("xdotool"),
        has_ydotool=available("ydotool"),
        has_wtype=available("wtype"),
        has_xclip=available("xclip"),
        has_wl_copy=available("wl-copy"),
        has_dotool=available("dotool"),
        has_ibus=available("ibus"),
        # Permission probes are only attempted on Linux.
        has_input_group=on_linux and _check_input_group(),
        has_uinput_access=on_linux and _check_uinput_access(),
    )
voiceio/recorder.py ADDED
@@ -0,0 +1,208 @@
1
+ """Audio capture with pre-buffer ring to prevent clipping."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import threading
6
+ from typing import TYPE_CHECKING, Callable
7
+
8
+ import numpy as np
9
+ import sounddevice as sd
10
+
11
+ if TYPE_CHECKING:
12
+ from voiceio.config import AudioConfig
13
+
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
class RingBuffer:
    """Circular store holding the most recent ``max_samples`` float32 samples."""

    def __init__(self, max_samples: int):
        self._max = max_samples
        self._buf = np.zeros(max_samples, dtype=np.float32)
        self._write_pos = 0  # index where the next sample is written
        self._filled = 0  # number of valid samples, capped at _max

    def append(self, data: np.ndarray) -> None:
        """Add ``data`` (flattened), overwriting the oldest samples when full."""
        capacity = self._max
        if capacity == 0:
            return

        samples = data.flatten()
        count = len(samples)

        if count >= capacity:
            # Incoming data alone fills the buffer: keep only its tail.
            self._buf[:] = samples[-capacity:]
            self._write_pos, self._filled = 0, capacity
            return

        # Split the write into the part that fits before the end of the
        # buffer and the part (possibly empty) that wraps to the front.
        room_at_end = capacity - self._write_pos
        head, wrapped = samples[:room_at_end], samples[room_at_end:]
        self._buf[self._write_pos:self._write_pos + len(head)] = head
        self._buf[:len(wrapped)] = wrapped

        self._write_pos = (self._write_pos + count) % capacity
        self._filled = min(self._filled + count, capacity)

    def get(self) -> np.ndarray:
        """Return a copy of the buffered samples, oldest first."""
        if self._filled == 0:
            return np.zeros(0, dtype=np.float32)
        if self._filled == self._max:
            # Full ring: the oldest sample sits at the write position, so
            # rotate it to the front.
            return np.roll(self._buf, -self._write_pos)
        # Not yet wrapped: valid samples occupy the start of the buffer.
        return self._buf[:self._filled].copy()

    def clear(self) -> None:
        """Forget all buffered samples."""
        self._write_pos = 0
        self._filled = 0
64
+
65
+
66
class AudioRecorder:
    """Audio recorder with always-on pre-buffer ring.

    The audio stream runs continuously. A ring buffer captures the last
    `prebuffer_secs` of audio. When recording starts, the ring buffer
    contents become the start of the recording, so no first syllable is lost.
    """

    # Recordings whose untranscribed remainder is shorter than this are dropped by stop().
    _MIN_RECORDING_SECS = 0.3

    def __init__(self, cfg: AudioConfig, on_speech_pause: Callable[[], None] | None = None):
        """Configure the recorder from ``cfg`` without opening the stream.

        Args:
            cfg: Audio settings; ``sample_rate``, ``device``, ``prebuffer_secs``,
                ``silence_threshold`` and ``silence_duration`` are read.
            on_speech_pause: Optional callback invoked from the audio callback
                when a pause in speech is detected while recording.
        """
        self.sample_rate = cfg.sample_rate
        # "default" means: let sounddevice choose the device.
        self.device = None if cfg.device == "default" else cfg.device
        self.prebuffer_secs = cfg.prebuffer_secs

        self._ring = RingBuffer(int(self.prebuffer_secs * self.sample_rate))
        self._chunks: list[np.ndarray] = []
        self._stream: sd.InputStream | None = None
        self._lock = threading.Lock()
        self._recording = False

        # Streaming VAD
        self._on_speech_pause = on_speech_pause
        self._silence_threshold = cfg.silence_threshold
        self._silence_duration = cfg.silence_duration
        self._silent_chunks = 0.0  # accumulated silence, in seconds
        self._last_transcribed_len = 0  # samples already handed to the transcriber
        self._total_samples = 0  # samples collected in the current recording

    def open_stream(self) -> None:
        """Start the always-on audio stream (feeds ring buffer). No-op if already open."""
        if self._stream is not None:
            return
        self._stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype="float32",
            device=self.device,
            callback=self._callback,
        )
        self._stream.start()
        log.debug("Audio stream opened (prebuffer=%.1fs)", self.prebuffer_secs)

    def close_stream(self) -> None:
        """Stop the always-on audio stream and discard the pre-buffer."""
        if self._stream is not None:
            self._stream.stop()
            self._stream.close()
            self._stream = None
        self._ring.clear()

    def start(self) -> None:
        """Start recording. Grabs ring buffer contents as the beginning.

        Does nothing when a recording is already in progress.
        """
        with self._lock:
            if self._recording:
                return
            # Ensure stream is running
            if self._stream is None:
                self.open_stream()
            # Grab pre-buffer; stored as a column so it matches callback chunks.
            prebuf = self._ring.get()
            self._chunks = [prebuf.reshape(-1, 1)] if len(prebuf) > 0 else []
            self._total_samples = sum(len(c) for c in self._chunks)
            self._silent_chunks = 0.0
            self._last_transcribed_len = 0
            self._recording = True
        prebuf_ms = len(prebuf) / self.sample_rate * 1000
        log.info("Recording started (%.0fms pre-buffer)", prebuf_ms)

    def stop(self) -> np.ndarray | None:
        """Stop recording, return captured audio.

        Returns:
            The flattened audio recorded after the last ``mark_transcribed``
            position, or None when no recording was active, nothing was
            captured, or the remaining audio is shorter than 0.3 seconds.
        """
        with self._lock:
            if not self._recording:
                return None
            self._recording = False

            if not self._chunks:
                log.warning("No audio captured")
                return None

            audio = np.concatenate(self._chunks, axis=0).flatten()
            remaining = audio[self._last_transcribed_len:]
            duration = len(remaining) / self.sample_rate

            if duration < self._MIN_RECORDING_SECS:
                # A short tail after earlier transcription is expected, so
                # only warn when nothing was transcribed at all.
                if self._last_transcribed_len == 0:
                    log.warning("Audio too short (%.1fs), skipping", duration)
                return None

            log.info("Recording stopped, %.1fs audio", duration)
            return remaining

    def get_audio_so_far(self) -> np.ndarray | None:
        """Get all audio captured so far (for streaming), or None if there is none."""
        with self._lock:
            if not self._chunks:
                return None
            return np.concatenate(self._chunks, axis=0).flatten()

    def set_on_speech_pause(self, callback: Callable[[], None] | None) -> None:
        """Set/clear the speech pause callback (used by streaming session)."""
        self._on_speech_pause = callback

    def mark_transcribed(self, num_samples: int) -> None:
        """Record that the first ``num_samples`` samples have been transcribed."""
        self._last_transcribed_len = num_samples

    @property
    def is_recording(self) -> bool:
        """Whether a recording is currently in progress."""
        return self._recording

    def _callback(
        self, indata: np.ndarray, frames: int, time_info: object, status: object
    ) -> None:
        """Stream callback: feed the ring buffer, collect chunks, run the pause detector.

        Args:
            indata: Captured samples for this block.
            frames: Number of frames in ``indata``.
            time_info: Unused timing information from the stream.
            status: Stream status flags; logged when truthy.
        """
        if status:
            log.warning("Audio stream status: %s", status)

        # Always feed ring buffer
        self._ring.append(indata)

        # Only collect chunks when recording
        if not self._recording:
            return

        chunk = indata.copy()
        self._chunks.append(chunk)
        self._total_samples += chunk.shape[0]

        # Streaming VAD. Read the callback exactly once: set_on_speech_pause()
        # may clear it from another thread, and re-reading the attribute at
        # call time could then try to call None.
        on_pause = self._on_speech_pause
        if on_pause is None:
            return

        flat = indata.ravel()
        if flat.size == 0:
            # An empty block carries no level information (and would divide by zero).
            return

        rms = float(np.sqrt(np.dot(flat, flat) / len(flat)))
        chunk_secs = frames / self.sample_rate

        if rms < self._silence_threshold:
            self._silent_chunks += chunk_secs
        else:
            self._silent_chunks = 0.0

        # Require more than one second of audio beyond the transcribed position.
        has_new = self._total_samples > self._last_transcribed_len + self.sample_rate

        if self._silent_chunks >= self._silence_duration and has_new:
            self._silent_chunks = 0.0
            # Signal pause. Don't concatenate on the audio thread.
            on_pause()