pi-friday 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/wake_daemon.py ADDED
@@ -0,0 +1,318 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Friday Wake Word Daemon
4
+
5
+ Listens for a wake word using openwakeword, records speech after detection,
6
+ transcribes with faster-whisper, and writes the result to a command file
7
+ that the Friday pi extension picks up.
8
+
9
+ Usage:
10
+ python3 wake_daemon.py <command_file> [--wake-word hey_jarvis] [--threshold 0.5] [--whisper-model tiny.en] [--data-dir DIR]
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import logging
16
+ import sys
17
+ import time
18
+ import os
19
+ import struct
20
+ import tempfile
21
+ import signal
22
+ import numpy as np
23
+
24
# --- Audio capture format -------------------------------------------------
SAMPLE_RATE = 16000   # Hz
CHUNK_SIZE = 1280     # samples per read: 80 ms at 16 kHz, the frame size openwakeword expects
FORMAT_WIDTH = 2      # bytes per sample (16-bit)
CHANNELS = 1

# --- Recording / silence detection ----------------------------------------
SILENCE_THRESHOLD = 500          # RMS amplitude below this counts as silence
SILENCE_DURATION = 2.0           # seconds of continuous silence that end a recording
INITIAL_WAIT_SECONDS = 3.5       # seconds to wait for speech after the wake word
MAX_RECORD_SECONDS = 30          # hard upper bound on one recording
MIN_RECORD_SECONDS = 0.5         # recordings shorter than this are discarded

# --- Marker files, looked up next to the command file ----------------------
MUTE_FILE_NAME = "tts_playing"   # while present, wake word detection is skipped
LISTEN_NOW_FILE = "listen_now"   # the extension's request to start recording right away

logging.basicConfig(
    format="[friday-wake] %(asctime)s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
log = logging.getLogger("friday-wake")
45
+
46
+
47
def rms(audio_chunk: bytes) -> float:
    """Return the RMS amplitude of a little-endian signed 16-bit audio chunk.

    Args:
        audio_chunk: Raw sample bytes, two bytes per sample.

    Returns:
        The root-mean-square of the samples, or ``0.0`` when the chunk does
        not contain a complete sample.
    """
    count = len(audio_chunk) // 2
    if count == 0:
        return 0.0
    # unpack_from reads exactly `count` samples and tolerates a dangling
    # trailing byte; struct.unpack would raise struct.error on an
    # odd-length buffer.
    samples = struct.unpack_from(f"<{count}h", audio_chunk)
    return (sum(s * s for s in samples) / count) ** 0.5
54
+
55
+
56
def record_until_silence(stream, sample_rate: int, chunk_size: int,
                         wait_for_speech: float = 0,
                         max_record: float = 0) -> bytes:
    """Capture audio from ``stream`` until a sustained stretch of silence.

    Args:
        stream: Object whose ``read(chunk_size, exception_on_overflow=False)``
            returns one chunk of raw 16-bit audio bytes.
        sample_rate: Samples per second delivered by ``stream``.
        chunk_size: Samples requested per read.
        wait_for_speech: Seconds to wait for a chunk at or above
            SILENCE_THRESHOLD before recording starts. 0 skips the wait and
            records immediately; if the window passes without such a chunk,
            nothing is recorded.
        max_record: Upper bound on the recording phase in seconds.
            0 falls back to MAX_RECORD_SECONDS.

    Returns:
        The concatenated raw audio, or ``b""`` when the wait window expired
        without speech.
    """
    def to_chunks(seconds: float) -> int:
        # Number of whole chunks that fit into `seconds` of audio.
        return int(seconds * sample_rate / chunk_size)

    record_limit = max_record if max_record > 0 else MAX_RECORD_SECONDS
    silence_limit = to_chunks(SILENCE_DURATION)
    chunk_budget = to_chunks(record_limit)
    minimum_chunks = to_chunks(MIN_RECORD_SECONDS)
    captured = []

    # Phase 1 (optional): discard audio until the first loud chunk arrives.
    if wait_for_speech > 0:
        patience = to_chunks(wait_for_speech)
        log.info(f"Waiting up to {wait_for_speech:.0f}s for speech...")
        for _ in range(patience):
            chunk = stream.read(chunk_size, exception_on_overflow=False)
            if rms(chunk) >= SILENCE_THRESHOLD:
                # The chunk that crossed the threshold is part of the recording.
                captured.append(chunk)
                break
        else:
            log.info("No speech detected within wait window, giving up")
            return b""

    # Phase 2: keep every chunk until the silence run is long enough.
    log.info("Recording... (speak now)")
    quiet_run = 0
    for index in range(chunk_budget):
        chunk = stream.read(chunk_size, exception_on_overflow=False)
        captured.append(chunk)
        quiet_run = quiet_run + 1 if rms(chunk) < SILENCE_THRESHOLD else 0

        # Sustained silence ends the recording, but never before the minimum length.
        if index >= minimum_chunks and quiet_run >= silence_limit:
            break

    duration = len(captured) * chunk_size / sample_rate
    log.info(f"Recorded {duration:.1f}s of audio")
    return b"".join(captured)
112
+
113
+
114
def transcribe(audio_bytes: bytes, model) -> str:
    """Transcribe raw 16-bit PCM audio bytes using faster-whisper.

    Args:
        audio_bytes: Raw little-endian signed 16-bit samples.
        model: Object whose ``transcribe(audio, beam_size=..., language=...,
            vad_filter=...)`` returns ``(segments, info)``, where each segment
            has a ``text`` attribute.

    Returns:
        The stripped segment texts joined by single spaces, or ``""`` when
        ``audio_bytes`` does not contain a single complete sample.
    """
    sample_count = len(audio_bytes) // 2
    if sample_count == 0:
        # Nothing to decode; do not call the model with an empty array.
        return ""

    # `count` makes frombuffer ignore a dangling trailing byte instead of
    # raising ValueError on an odd-length buffer.
    samples = np.frombuffer(audio_bytes, dtype="<i2", count=sample_count)
    # Scale int16 to float32 in [-1.0, 1.0).
    audio_array = samples.astype(np.float32) / 32768.0

    segments, _info = model.transcribe(
        audio_array,
        beam_size=5,
        language="en",
        vad_filter=True,
    )

    return " ".join(segment.text.strip() for segment in segments).strip()
128
+
129
+
130
def write_signal(command_file: str, signal_type: str, text: str = ""):
    """Write a signal/command to the command file for the extension.

    The payload is a single line of JSON with the keys ``type``, ``text`` and
    ``timestamp`` (``time.time()``). It is written to ``<command_file>.tmp``
    first and then moved over ``command_file``, so a reader should not
    observe a half-written file.

    Args:
        command_file: Destination path that the extension reads.
        signal_type: Value stored under ``"type"``.
        text: Value stored under ``"text"``; also logged when non-empty.

    Raises:
        OSError: If the temporary file cannot be written or moved.
    """
    payload = json.dumps({"type": signal_type, "text": text, "timestamp": time.time()})
    tmp = command_file + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        f.write(payload + "\n")
    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename fails on Windows once the command file already exists.
    os.replace(tmp, command_file)
    if text:
        log.info(f"Sent to pi: {text}")
139
+
140
+
141
def _play_tone(duration: str, frequency: str, volume: str) -> None:
    """Start sox's ``play`` to emit a short sine beep without waiting for it.

    Playback is best-effort: if the process cannot be started (for example
    ``play`` is not installed) the failure is logged at debug level and
    otherwise ignored.
    """
    import subprocess  # local import: only needed for these feedback beeps

    command = ["play", "-q", "-n", "synth", duration, "sin", frequency, "vol", volume]
    try:
        # Not waited on, so the caller is never blocked by audio playback.
        subprocess.Popen(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as exc:
        logging.getLogger("friday-wake").debug("Could not play tone: %s", exc)


def play_listening_sound():
    """Play a subtle sound to indicate we're listening (0.1 s, 800 Hz)."""
    _play_tone("0.1", "800", "0.3")


def play_done_sound():
    """Play a subtle sound to indicate we're done recording (0.05 s, 600 Hz)."""
    _play_tone("0.05", "600", "0.2")
156
+
157
+
158
def _parse_listen_request(raw):
    """Extract ``(wait_for_speech, max_record)`` in seconds from a listen_now payload.

    The payload is expected to be a JSON object with optional
    ``waitForSpeech`` and ``maxRecord`` numbers. Anything else (invalid JSON,
    a non-object, non-numeric or non-finite values) yields the defaults of
    5 and 10 seconds.
    """
    import math  # local import: only needed to validate the payload

    fallback = (5.0, 10.0)
    try:
        payload = json.loads(raw)
        wait_for_speech = float(payload.get("waitForSpeech", 5))
        max_record = float(payload.get("maxRecord", 10))
    except (AttributeError, TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; AttributeError/TypeError cover
        # valid JSON that is not an object or holds null/non-numeric values.
        return fallback
    if not (math.isfinite(wait_for_speech) and math.isfinite(max_record)):
        # inf/nan cannot be turned into a chunk count by the recorder.
        return fallback
    return wait_for_speech, max_record


def _consume_listen_request(path):
    """Read and delete the listen_now file at ``path``.

    Returns the parsed ``(wait_for_speech, max_record)`` tuple, or ``None``
    when the file could not be read or removed (the request is then not
    honoured on this pass).
    """
    try:
        # errors="replace" keeps undecodable bytes from raising; such a
        # payload simply fails JSON parsing and gets the defaults.
        with open(path, encoding="utf-8", errors="replace") as request_file:
            raw = request_file.read().strip()
        os.remove(path)
    except OSError as exc:
        log.debug(f"Could not consume {path}: {exc}")
        return None
    return _parse_listen_request(raw)


def main():
    """Run the wake word daemon until SIGTERM/SIGINT.

    Parses the command line, loads the openwakeword and faster-whisper
    models, opens the microphone and then loops over 80 ms audio chunks:

    * while the ``tts_playing`` marker exists next to the command file,
      detection is skipped;
    * a ``listen_now`` marker triggers an immediate recording with the wait
      and maximum durations taken from its JSON payload;
    * otherwise a wake word score at or above ``--threshold`` writes a
      ``wake`` signal and starts a recording.

    Each recording is transcribed and, when non-empty, written to the command
    file as a ``command`` signal.
    """
    parser = argparse.ArgumentParser(description="Friday Wake Word Daemon")
    parser.add_argument("command_file", help="File to write transcribed commands to")
    parser.add_argument("--wake-word", default="hey_jarvis", help="Wake word model name")
    parser.add_argument("--threshold", type=float, default=0.5, help="Wake word detection threshold")
    parser.add_argument("--whisper-model", default="tiny.en", help="Whisper model size")
    parser.add_argument("--data-dir", default=None, help="Directory for custom wake word models")
    args = parser.parse_args()

    # Handle shutdown gracefully: the handlers only flip a flag that the main
    # loop checks between chunks.
    running = True

    def shutdown(sig, frame):
        nonlocal running
        log.info("Shutting down...")
        running = False

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # Load wake word model (imported lazily so argument errors surface first)
    log.info(f"Loading wake word model: {args.wake_word}")
    from openwakeword.model import Model as WakeModel

    # Prefer a custom "<wake_word>.onnx" in the data dir over a built-in model name
    data_dir = args.data_dir or os.path.join(os.path.expanduser("~"), ".pi/agent/friday")
    custom_model_path = os.path.join(data_dir, f"{args.wake_word}.onnx")
    if os.path.exists(custom_model_path):
        log.info(f"Using custom model: {custom_model_path}")
        model_ref = custom_model_path
    else:
        model_ref = args.wake_word

    wake_model = WakeModel(
        wakeword_models=[model_ref],
        inference_framework="onnx",
    )

    # Load whisper model
    log.info(f"Loading whisper model: {args.whisper_model}")
    from faster_whisper import WhisperModel
    whisper_model = WhisperModel(args.whisper_model, device="cpu", compute_type="int8")

    # Marker files live next to the command file; the paths never change, so
    # they are computed once instead of on every 80 ms chunk.
    base_dir = os.path.dirname(args.command_file)
    mute_path = os.path.join(base_dir, MUTE_FILE_NAME)
    listen_now_path = os.path.join(base_dir, LISTEN_NOW_FILE)

    # RMS levels are logged periodically for mic diagnostics
    rms_sample_counter = 0
    rms_max_seen = 0

    import pyaudio
    pa = pyaudio.PyAudio()
    stream = None

    try:
        # Opened inside the try so pa.terminate() still runs if this fails.
        stream = pa.open(
            format=pyaudio.paInt16,
            channels=CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        log.info(f"Listening for '{args.wake_word}' (threshold: {args.threshold})...")
        log.info(f"Command file: {args.command_file}")

        while running:
            # Read audio chunk; a failed read is retried after a short pause.
            try:
                audio = stream.read(CHUNK_SIZE, exception_on_overflow=False)
            except Exception:
                time.sleep(0.01)
                continue

            # Periodic RMS diagnostics (every 25 chunks, ~2s at 80ms chunks)
            current_rms = rms(audio)
            rms_max_seen = max(rms_max_seen, current_rms)
            rms_sample_counter += 1
            if rms_sample_counter % 25 == 0:
                log.info(f"[mic] RMS: {current_rms:.0f} | max seen: {rms_max_seen:.0f} | threshold: {SILENCE_THRESHOLD}")

            # Skip detection while TTS is playing (prevents self-triggering)
            if os.path.exists(mute_path):
                wake_model.reset()
                continue

            # Check if the extension wants us to listen immediately (question asked)
            request = None
            if os.path.exists(listen_now_path):
                request = _consume_listen_request(listen_now_path)
            immediate_listen = request is not None

            if immediate_listen:
                wait_for_speech, max_record = request
                log.info(f"Auto-listen triggered (wait={wait_for_speech:.0f}s, max_record={max_record:.0f}s)")
            else:
                # Run wake word detection on the chunk as int16 samples
                prediction = wake_model.predict(np.frombuffer(audio, dtype=np.int16))
                score = prediction.get(args.wake_word, 0)
                if score < args.threshold:
                    continue

                log.info(f"Wake word detected! (score: {score:.2f})")
                # Signal extension IMMEDIATELY to kill any playing TTS
                write_signal(args.command_file, "wake", "")
                wait_for_speech, max_record = INITIAL_WAIT_SECONDS, MAX_RECORD_SECONDS

            play_listening_sound()

            # Reset the wake word model to avoid re-triggering
            wake_model.reset()

            # Record speech until silence, waiting up to wait_for_speech
            # seconds for the user to start talking before giving up
            audio_data = record_until_silence(
                stream, SAMPLE_RATE, CHUNK_SIZE,
                wait_for_speech=wait_for_speech,
                max_record=max_record,
            )
            play_done_sound()

            if len(audio_data) < SAMPLE_RATE * MIN_RECORD_SECONDS * FORMAT_WIDTH:
                log.info("Recording too short, ignoring")
                continue

            # Transcribe
            log.info("Transcribing...")
            text = transcribe(audio_data, whisper_model).strip()

            if text:
                write_signal(args.command_file, "command", text)
            else:
                log.info("No speech detected in recording")

    except KeyboardInterrupt:
        pass
    finally:
        log.info("Cleaning up...")
        if stream is not None:
            stream.stop_stream()
            stream.close()
        pa.terminate()


if __name__ == "__main__":
    main()