@codexstar/pi-listen 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/daemon.py ADDED
@@ -0,0 +1,517 @@
+ #!/usr/bin/env python3
+ """
+ pi-voice STT daemon — keeps model warm in memory for zero cold-start transcription.
+
+ Protocol: Unix domain socket, newline-delimited JSON.
+
+ Commands (send JSON, receive JSON response):
+ {"cmd": "transcribe", "audio": "/path/to/file.wav"}
+ {"cmd": "transcribe", "audio": "/path/to/file.wav", "backend": "faster-whisper", "model": "small", "language": "en"}
+ {"cmd": "status"}
+ {"cmd": "load", "backend": "faster-whisper", "model": "small"}
+ {"cmd": "shutdown"}
+ {"cmd": "ping"}
+
+ The daemon auto-exits after 5 minutes of inactivity.
+ """
+ import sys
+ import os
+ import json
+ import time
+ import socket
+ import signal
+ import argparse
+ import threading
+ import tempfile
+ import traceback
+
+ # Add our own directory to path so we can import transcribe module
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+ from transcribe import BACKENDS, resolve_backend_and_model
+
+
+ # ─── Config ──────────────────────────────────────────────────────────────────
+
+ DEFAULT_SOCKET = os.path.join(tempfile.gettempdir(), "pi-voice-daemon.sock")
+ IDLE_TIMEOUT = 300  # 5 minutes
+ MAX_MSG_SIZE = 1024 * 1024  # 1MB
+
+
+ # ─── Warm Model Cache ────────────────────────────────────────────────────────
+
+ class ModelCache:
+     """Keeps a single STT model loaded in memory."""
+
+     def __init__(self):
+         self.backend_name: str | None = None
+         self.model_name: str | None = None
+         self._model = None
+         self._lock = threading.Lock()
+
+     def load(self, backend: str, model: str) -> dict:
+         with self._lock:
+             if self.backend_name == backend and self.model_name == model and self._model is not None:
+                 return {"status": "already_loaded", "backend": backend, "model": model}
+
+             # Unload previous
+             self._model = None
+             self.backend_name = backend
+             self.model_name = model
+
+             start = time.time()
+             try:
+                 if backend == "faster-whisper":
+                     from faster_whisper import WhisperModel
+                     self._model = WhisperModel(model, device="cpu", compute_type="int8")
+                 elif backend == "parakeet":
+                     import nemo.collections.asr as nemo_asr
+                     self._model = nemo_asr.models.ASRModel.from_pretrained(model)
+                 else:
+                     # moonshine, whisper-cpp, deepgram don't have persistent model objects
+                     # We still mark them as "loaded" so the daemon knows what to use
+                     self._model = "external"
+
+                 elapsed = round(time.time() - start, 2)
+                 return {"status": "loaded", "backend": backend, "model": model, "load_time": elapsed}
+             except Exception as e:
+                 self._model = None
+                 return {"status": "error", "error": str(e)}
+
+     def transcribe(self, audio_path: str, language: str = "en") -> dict:
+         with self._lock:
+             if not self.backend_name:
+                 return {"error": "No model loaded. Send 'load' first."}
+
+             backend = self.backend_name
+             model = self.model_name
+             start = time.time()
+
+             try:
+                 if backend == "faster-whisper" and self._model and self._model != "external":
+                     segments, info = self._model.transcribe(
+                         audio_path,
+                         language=language if language != "auto" else None,
+                         beam_size=1,
+                         vad_filter=True,
+                     )
+                     text = " ".join(seg.text.strip() for seg in segments)
+                     return {
+                         "text": text,
+                         "duration": round(time.time() - start, 2),
+                         "backend": backend,
+                         "model": model,
+                         "language": getattr(info, 'language', language),
+                     }
+
+                 elif backend == "parakeet" and self._model and self._model != "external":
+                     result = self._model.transcribe([audio_path])[0]
+                     if isinstance(result, list):
+                         result = result[0]
+                     return {
+                         "text": str(result),
+                         "duration": round(time.time() - start, 2),
+                         "backend": backend,
+                         "model": model,
+                     }
+
+                 else:
+                     # For backends without warm models, delegate to their transcribe fn
+                     info = BACKENDS.get(backend)
+                     if not info:
+                         return {"error": f"Unknown backend: {backend}"}
+                     result = info["fn"](audio_path, model, language)
+                     return result
+
+             except Exception as e:
+                 log(f"Transcription error: {traceback.format_exc()}")
+                 return {"error": str(e), "backend": backend, "model": model}
+
+
+ # ─── VAD (Voice Activity Detection) ─────────────────────────────────────────
+
+ _vad_model = None
+
+ def get_vad_model():
+     """Lazy-load Silero VAD model."""
+     global _vad_model
+     if _vad_model is None:
+         try:
+             import torch
+             model, utils = torch.hub.load(
+                 repo_or_dir='snakers4/silero-vad',
+                 model='silero_vad',
+                 force_reload=False,
+                 onnx=True,
+             )
+             _vad_model = (model, utils)
+         except Exception:
+             _vad_model = False  # Mark as unavailable
+     return _vad_model if _vad_model else None
+
+ def run_vad(audio_path: str) -> dict:
+     """Run VAD on audio file, return speech segments and whether speech was detected."""
+     vad = get_vad_model()
+     if not vad:
+         return {"has_speech": True, "vad_available": False}  # Assume speech if no VAD
+
+     try:
+         model, utils = vad
+         (get_speech_timestamps, _, read_audio, _, _) = utils
+         wav = read_audio(audio_path, sampling_rate=16000)
+         timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
+         has_speech = len(timestamps) > 0
+         return {
+             "has_speech": has_speech,
+             "vad_available": True,
+             "segments": len(timestamps),
+             "speech_duration_ms": sum(t['end'] - t['start'] for t in timestamps) * 1000 // 16000 if timestamps else 0,
+         }
+     except Exception as e:
+         return {"has_speech": True, "vad_available": False, "error": str(e)}
+
+
+ # ─── Server ──────────────────────────────────────────────────────────────────
+
+ class DaemonServer:
+     def __init__(self, socket_path: str, backend: str | None = None, model: str | None = None):
+         self.socket_path = socket_path
+         self.cache = ModelCache()
+         self.running = False
+         self.last_activity = time.time()
+         self.server_socket: socket.socket | None = None
+         self._idle_thread: threading.Thread | None = None
+         self.start_time = time.time()
+         self.request_count = 0
+
+         # Auto-load model if specified
+         if backend and model:
+             result = self.cache.load(backend, model)
+             log(f"Pre-loaded model: {json.dumps(result)}")
+
+     def handle_client(self, conn: socket.socket):
+         """Handle a single client connection (one request-response per line)."""
+         self.last_activity = time.time()
+         buf = b""
+
+         try:
+             while True:
+                 chunk = conn.recv(4096)
+                 if not chunk:
+                     break
+                 buf += chunk
+
+                 # Guard against unbounded buffer growth from clients
+                 # that never send a newline delimiter
+                 if len(buf) > MAX_MSG_SIZE:
+                     response = {"error": "Message exceeds maximum size"}
+                     try:
+                         conn.sendall((json.dumps(response) + "\n").encode())
+                     except (ConnectionResetError, BrokenPipeError):
+                         pass
+                     break
+
+                 while b"\n" in buf:
+                     line, buf = buf.split(b"\n", 1)
+                     if not line.strip():
+                         continue
+
+                     try:
+                         req = json.loads(line.decode("utf-8"))
+                     except json.JSONDecodeError as e:
+                         response = {"error": f"Invalid JSON: {e}"}
+                         conn.sendall((json.dumps(response) + "\n").encode())
+                         continue
+
+                     response = self.dispatch(req)
+                     self.request_count += 1
+                     self.last_activity = time.time()
+                     conn.sendall((json.dumps(response) + "\n").encode())
+
+                     # Shutdown command
+                     if req.get("cmd") == "shutdown":
+                         self.running = False
+                         return
+
+         except (ConnectionResetError, BrokenPipeError):
+             pass
+         finally:
+             conn.close()
+
+     def dispatch(self, req: dict) -> dict:
+         cmd = req.get("cmd", "")
+
+         if cmd == "ping":
+             return {"status": "ok", "pid": os.getpid()}
+
+         elif cmd == "status":
+             return {
+                 "status": "running",
+                 "pid": os.getpid(),
+                 "uptime": round(time.time() - self.start_time, 1),
+                 "requests": self.request_count,
+                 "idle": round(time.time() - self.last_activity, 1),
+                 "backend": self.cache.backend_name,
+                 "model": self.cache.model_name,
+                 "model_loaded": self.cache._model is not None,
+             }
+
+         elif cmd == "load":
+             backend, model = resolve_backend_and_model(req.get("backend"), req.get("model"))
+             if backend == "none" or backend not in BACKENDS:
+                 return {"status": "error", "error": "No STT backend found"}
+             return self.cache.load(backend, model)
+
+         elif cmd == "transcribe":
+             audio = req.get("audio")
+             if not audio or not os.path.exists(audio):
+                 return {"error": f"Audio file not found: {audio}"}
+
+             language = req.get("language", "en")
+
+             # Auto-load if not loaded yet
+             if not self.cache.backend_name:
+                 backend, model = resolve_backend_and_model(req.get("backend"), req.get("model"))
+                 if backend == "none" or backend not in BACKENDS:
+                     return {"status": "error", "error": "No STT backend found"}
+                 load_result = self.cache.load(backend, model)
+                 if load_result.get("status") == "error":
+                     return load_result
+
+             # Optional VAD pre-check
+             if req.get("vad", False):
+                 vad_result = run_vad(audio)
+                 if not vad_result.get("has_speech", True):
+                     return {"text": "", "duration": 0, "vad": vad_result, "skipped": True}
+
+             return self.cache.transcribe(audio, language)
+
+         elif cmd == "vad":
+             audio = req.get("audio")
+             if not audio or not os.path.exists(audio):
+                 return {"error": f"Audio file not found: {audio}"}
+             return run_vad(audio)
+
+         elif cmd == "shutdown":
+             return {"status": "shutting_down"}
+
+         elif cmd == "backends":
+             result = []
+             for name, info in BACKENDS.items():
+                 available = info["available"]()
+                 result.append({
+                     "name": name,
+                     "available": available,
+                     "type": info["type"],
+                     "default_model": info["default_model"],
+                     "models": info["models"],
+                 })
+             return {"backends": result}
+
+         else:
+             return {"error": f"Unknown command: {cmd}"}
+
+     def idle_watcher(self):
+         """Background thread that shuts down the daemon after idle timeout."""
+         while self.running:
+             time.sleep(10)
+             idle = time.time() - self.last_activity
+             if idle > IDLE_TIMEOUT:
+                 log(f"Idle for {idle:.0f}s, shutting down")
+                 self.running = False
+                 # Connect to self to unblock accept()
+                 try:
+                     s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+                     s.connect(self.socket_path)
+                     s.sendall(b'{"cmd":"shutdown"}\n')
+                     s.close()
+                 except Exception:
+                     pass
+                 break
+
+     def start(self):
+         # Clean up stale socket
+         if os.path.exists(self.socket_path):
+             try:
+                 # Check if another daemon is running
+                 test = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+                 test.settimeout(1)
+                 test.connect(self.socket_path)
+                 test.sendall(b'{"cmd":"ping"}\n')
+                 resp = test.recv(1024)
+                 test.close()
+                 if resp:
+                     log("Another daemon is already running")
+                     print(json.dumps({"error": "already_running", "socket": self.socket_path}))
+                     sys.exit(1)
+             except (ConnectionRefusedError, FileNotFoundError, OSError):
+                 os.unlink(self.socket_path)
+
+         self.server_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+         self.server_socket.bind(self.socket_path)
+         self.server_socket.listen(5)
+         self.server_socket.settimeout(1.0)
+         self.running = True
+
+         # Write PID file
+         pid_path = self.socket_path + ".pid"
+         with open(pid_path, "w") as f:
+             f.write(str(os.getpid()))
+
+         # Start idle watcher
+         self._idle_thread = threading.Thread(target=self.idle_watcher, daemon=True)
+         self._idle_thread.start()
+
+         log(f"Daemon started: pid={os.getpid()} socket={self.socket_path}")
+         print(json.dumps({
+             "status": "started",
+             "pid": os.getpid(),
+             "socket": self.socket_path,
+         }), flush=True)
+
+         try:
+             while self.running:
+                 try:
+                     conn, _ = self.server_socket.accept()
+                     thread = threading.Thread(target=self.handle_client, args=(conn,), daemon=True)
+                     thread.start()
+                 except socket.timeout:
+                     continue
+                 except OSError:
+                     break
+         finally:
+             self.cleanup()
+
+     def cleanup(self):
+         log("Daemon shutting down")
+         self.running = False
+         if self.server_socket:
+             try:
+                 self.server_socket.close()
+             except Exception:
+                 pass
+         if os.path.exists(self.socket_path):
+             try:
+                 os.unlink(self.socket_path)
+             except Exception:
+                 pass
+         pid_path = self.socket_path + ".pid"
+         if os.path.exists(pid_path):
+             try:
+                 os.unlink(pid_path)
+             except Exception:
+                 pass
+
+
+ def log(msg: str):
+     print(f"[pi-voice-daemon] {msg}", file=sys.stderr, flush=True)
+
+
+ # ─── Client helper (for testing / CLI) ───────────────────────────────────────
+
+ def send_command(socket_path: str, cmd: dict, timeout: float = 30) -> dict:
+     """Send a command to the daemon and return the response."""
+     sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+     sock.settimeout(timeout)
+     try:
+         sock.connect(socket_path)
+         sock.sendall((json.dumps(cmd) + "\n").encode())
+         buf = b""
+         while b"\n" not in buf:
+             chunk = sock.recv(4096)
+             if not chunk:
+                 break
+             buf += chunk
+         if buf:
+             return json.loads(buf.decode("utf-8").strip())
+         return {"error": "No response from daemon"}
+     except ConnectionRefusedError:
+         return {"error": "Daemon not running", "socket": socket_path}
+     except FileNotFoundError:
+         return {"error": "Daemon not running (socket not found)", "socket": socket_path}
+     finally:
+         sock.close()
+
+
+ def is_daemon_running(socket_path: str) -> bool:
+     """Check if the daemon is running."""
+     result = send_command(socket_path, {"cmd": "ping"}, timeout=2)
+     return result.get("status") == "ok"
+
+
+ # ─── Main ────────────────────────────────────────────────────────────────────
+
+ def main():
+     parser = argparse.ArgumentParser(description="pi-voice STT daemon")
+     sub = parser.add_subparsers(dest="action", help="Action to perform")
+
+     # Start daemon
+     start_p = sub.add_parser("start", help="Start the daemon")
+     start_p.add_argument("--socket", default=DEFAULT_SOCKET, help="Unix socket path")
+     start_p.add_argument("--backend", default=None, help="STT backend to pre-load")
+     start_p.add_argument("--model", default=None, help="Model to pre-load")
+
+     # Client commands
+     status_p = sub.add_parser("status", help="Get daemon status")
+     status_p.add_argument("--socket", default=DEFAULT_SOCKET)
+
+     stop_p = sub.add_parser("stop", help="Stop the daemon")
+     stop_p.add_argument("--socket", default=DEFAULT_SOCKET)
+
+     ping_p = sub.add_parser("ping", help="Ping the daemon")
+     ping_p.add_argument("--socket", default=DEFAULT_SOCKET)
+
+     tx_p = sub.add_parser("transcribe", help="Transcribe audio file")
+     tx_p.add_argument("audio", help="Path to audio file")
+     tx_p.add_argument("--socket", default=DEFAULT_SOCKET)
+     tx_p.add_argument("--language", default="en")
+     tx_p.add_argument("--vad", action="store_true", help="Run VAD before transcription")
+
+     load_p = sub.add_parser("load", help="Load a model")
+     load_p.add_argument("--socket", default=DEFAULT_SOCKET)
+     load_p.add_argument("--backend", default=None)
+     load_p.add_argument("--model", default=None)
+
+     args = parser.parse_args()
+
+     if args.action == "start":
+         # Handle signals
+         server = DaemonServer(args.socket, args.backend, args.model)
+         signal.signal(signal.SIGTERM, lambda *_: setattr(server, 'running', False))
+         signal.signal(signal.SIGINT, lambda *_: setattr(server, 'running', False))
+         server.start()
+
+     elif args.action == "status":
+         print(json.dumps(send_command(args.socket, {"cmd": "status"}), indent=2))
+
+     elif args.action == "stop":
+         print(json.dumps(send_command(args.socket, {"cmd": "shutdown"}), indent=2))
+
+     elif args.action == "ping":
+         result = send_command(args.socket, {"cmd": "ping"}, timeout=2)
+         print(json.dumps(result, indent=2))
+         sys.exit(0 if result.get("status") == "ok" else 1)
+
+     elif args.action == "transcribe":
+         result = send_command(args.socket, {
+             "cmd": "transcribe",
+             "audio": os.path.abspath(args.audio),
+             "language": args.language,
+             "vad": args.vad,
+         })
+         print(json.dumps(result, indent=2))
+
+     elif args.action == "load":
+         result = send_command(args.socket, {
+             "cmd": "load",
+             "backend": args.backend,
+             "model": args.model,
+         })
+         print(json.dumps(result, indent=2))
+
+     else:
+         parser.print_help()
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
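
A minimal usage sketch of the newline-delimited JSON protocol documented in the module docstring, driving the daemon through the client helpers this file already defines (send_command, is_daemon_running, DEFAULT_SOCKET). It assumes the daemon was started separately (for example: python3 daemon.py start --backend faster-whisper --model small), that transcribe.py from this package is importable alongside daemon.py, and that the audio path below is a placeholder to replace with a real WAV file:

import json
from daemon import DEFAULT_SOCKET, send_command, is_daemon_running

# Only issue requests if the daemon answers a ping on the default socket.
if is_daemon_running(DEFAULT_SOCKET):
    print(json.dumps(send_command(DEFAULT_SOCKET, {"cmd": "status"}), indent=2))
    # One JSON request per line; the daemon replies with one JSON object per line.
    result = send_command(DEFAULT_SOCKET, {
        "cmd": "transcribe",
        "audio": "/path/to/clip.wav",  # placeholder path
        "language": "en",
        "vad": True,  # skip transcription if VAD detects no speech
    })
    print(result.get("text", ""))

The same exchanges are available from the shell via the subcommands defined in main(), e.g. python3 daemon.py transcribe /path/to/clip.wav --vad.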