pi-friday 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -0
- package/acks.ts +166 -0
- package/daemon.ts +161 -0
- package/index.ts +509 -0
- package/package.json +19 -0
- package/panel.ts +338 -0
- package/prompt.ts +34 -0
- package/settings.json +18 -0
- package/settings.ts +75 -0
- package/voice.ts +400 -0
- package/wake_daemon.py +318 -0
package/wake_daemon.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Friday Wake Word Daemon
|
|
4
|
+
|
|
5
|
+
Listens for a wake word using openwakeword, records speech after detection,
|
|
6
|
+
transcribes with faster-whisper, and writes the result to a command file
|
|
7
|
+
that the Friday pi extension picks up.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python3 wake_daemon.py <command_file> [--wake-word hey_jarvis] [--threshold 0.5]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
import json
import logging
import os
import signal
import struct
import subprocess
import sys
import tempfile
import time

import numpy as np
|
|
23
|
+
|
|
24
|
+
# --- Audio capture configuration ---
SAMPLE_RATE = 16000  # Hz; both openwakeword and whisper operate on 16 kHz audio
CHUNK_SIZE = 1280  # samples per read: 80ms at 16kHz — openwakeword expects this frame size
FORMAT_WIDTH = 2  # bytes per sample (16-bit PCM)
CHANNELS = 1  # mono capture

# --- Silence / end-of-utterance detection ---
SILENCE_THRESHOLD = 500  # RMS amplitude threshold for silence (chunks below count as quiet)
SILENCE_DURATION = 2.0  # seconds of sustained silence that ends a recording
INITIAL_WAIT_SECONDS = 3.5  # seconds to wait for speech after wake word
MAX_RECORD_SECONDS = 30  # safety cap on a single recording
MIN_RECORD_SECONDS = 0.5  # ignore very short recordings (likely noise)
MUTE_FILE_NAME = "tts_playing"  # skip detection while this file exists (prevents self-triggering)
LISTEN_NOW_FILE = "listen_now"  # extension signals: start recording immediately

# Daemon-wide logger: short timestamps, fixed prefix identifying the process.
logging.basicConfig(
    level=logging.INFO,
    format="[friday-wake] %(asctime)s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("friday-wake")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def rms(audio_chunk: bytes) -> float:
    """Return the RMS amplitude of little-endian 16-bit PCM audio.

    Args:
        audio_chunk: Raw PCM bytes. A trailing odd (truncated) byte is
            ignored instead of raising.

    Returns:
        RMS amplitude as a float; 0.0 when the chunk holds no full sample.
    """
    count = len(audio_chunk) // 2
    if count == 0:
        return 0.0
    # unpack_from only requires the buffer to be at least the format size,
    # so an odd-length chunk no longer raises struct.error (plain unpack
    # demands an exact length match).
    shorts = struct.unpack_from(f"<{count}h", audio_chunk)
    return (sum(s * s for s in shorts) / count) ** 0.5
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def record_until_silence(stream, sample_rate: int, chunk_size: int,
                         wait_for_speech: float = 0,
                         max_record: float = 0) -> bytes:
    """Capture audio from *stream* and stop once the speaker goes quiet.

    Two-phase capture: optionally wait for speech to begin, then keep
    reading chunks until a sustained run of silence or a hard time cap.

    Args:
        stream: PyAudio-style input stream exposing ``read()``.
        sample_rate: Capture rate in Hz.
        chunk_size: Samples per read.
        wait_for_speech: Max seconds to wait for the user to START talking.
            0 = start recording immediately (wake-word mode).
            >0 = wait up to this many seconds for speech before giving up
            (question-response mode).
        max_record: Max recording duration in seconds. 0 = use
            MAX_RECORD_SECONDS default.

    Returns:
        Raw PCM bytes, or ``b""`` if no speech arrived within the window.
    """
    cap_seconds = max_record if max_record > 0 else MAX_RECORD_SECONDS
    stop_after_quiet = int(SILENCE_DURATION * sample_rate / chunk_size)
    cap_chunks = int(cap_seconds * sample_rate / chunk_size)
    floor_chunks = int(MIN_RECORD_SECONDS * sample_rate / chunk_size)
    captured = []

    # Phase 1: optionally hold off until the speaker actually starts.
    if wait_for_speech > 0:
        log.info(f"Waiting up to {wait_for_speech:.0f}s for speech...")
        for _ in range(int(wait_for_speech * sample_rate / chunk_size)):
            chunk = stream.read(chunk_size, exception_on_overflow=False)
            if rms(chunk) >= SILENCE_THRESHOLD:
                # First audible chunk belongs to the utterance — keep it.
                captured.append(chunk)
                break
        else:
            # Loop exhausted without ever crossing the loudness threshold.
            log.info("No speech detected within wait window, giving up")
            return b""

    # Phase 2: capture until the quiet streak is long enough to stop.
    log.info("Recording... (speak now)")

    quiet_streak = 0
    for idx in range(cap_chunks):
        chunk = stream.read(chunk_size, exception_on_overflow=False)
        captured.append(chunk)
        quiet_streak = quiet_streak + 1 if rms(chunk) < SILENCE_THRESHOLD else 0
        # Only end on silence after the minimum recording length is met.
        if quiet_streak >= stop_after_quiet and idx >= floor_chunks:
            break

    log.info(f"Recorded {len(captured) * chunk_size / sample_rate:.1f}s of audio")
    return b"".join(captured)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def transcribe(audio_bytes: bytes, model) -> str:
    """Transcribe raw 16-bit PCM audio with a faster-whisper model.

    Args:
        audio_bytes: Little-endian 16-bit mono PCM samples.
        model: A faster-whisper ``WhisperModel`` (or compatible object).

    Returns:
        The transcription with per-segment whitespace collapsed, joined
        by single spaces; empty string when nothing was recognized.
    """
    # Whisper expects normalized float32 samples in [-1.0, 1.0).
    samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    segments, _ = model.transcribe(
        samples,
        beam_size=5,
        language="en",
        vad_filter=True,
    )

    pieces = [segment.text.strip() for segment in segments]
    return " ".join(pieces).strip()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def write_signal(command_file: str, signal_type: str, text: str = "") -> None:
    """Atomically write a JSON signal to the command file for the extension.

    The payload is written to a sibling ``.tmp`` file and swapped into
    place so the extension never observes a partially written command.

    Args:
        command_file: Destination path polled by the pi extension.
        signal_type: Signal kind, e.g. ``"wake"`` or ``"command"``.
        text: Optional transcribed text delivered with the signal.
    """
    payload = json.dumps({"type": signal_type, "text": text, "timestamp": time.time()})
    tmp = command_file + ".tmp"
    with open(tmp, "w") as f:
        f.write(payload + "\n")
    # os.replace (unlike os.rename) atomically overwrites an existing
    # destination on every platform — os.rename raises FileExistsError
    # on Windows when the command file is still present.
    os.replace(tmp, command_file)
    if text:
        log.info(f"Sent to pi: {text}")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def play_listening_sound() -> None:
    """Play a subtle beep (via sox's ``play``) to signal we're listening.

    Best-effort: if sox is missing or no audio device exists, the failure
    is swallowed — the beep is cosmetic and must never break the daemon.
    """
    try:
        # Popen fires-and-forgets without spawning a shell; os.system with
        # a shell string never signalled failure (its try/except was dead).
        subprocess.Popen(
            ["play", "-q", "-n", "synth", "0.1", "sin", "800", "vol", "0.3"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        pass
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def play_done_sound() -> None:
    """Play a subtle beep (via sox's ``play``) to signal recording ended.

    Best-effort: if sox is missing or no audio device exists, the failure
    is swallowed — the beep is cosmetic and must never break the daemon.
    """
    try:
        # Popen fires-and-forgets without spawning a shell; os.system with
        # a shell string never signalled failure (its try/except was dead).
        subprocess.Popen(
            ["play", "-q", "-n", "synth", "0.05", "sin", "600", "vol", "0.2"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        pass
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def main() -> None:
    """Run the wake-word daemon loop: detect, record, transcribe, hand off.

    Blocks forever reading microphone chunks. On wake-word detection (or an
    explicit "listen now" request dropped by the extension) it records until
    silence, transcribes with faster-whisper, and writes the text to the
    command file given on the command line.
    """
    parser = argparse.ArgumentParser(description="Friday Wake Word Daemon")
    parser.add_argument("command_file", help="File to write transcribed commands to")
    parser.add_argument("--wake-word", default="hey_jarvis", help="Wake word model name")
    parser.add_argument("--threshold", type=float, default=0.5, help="Wake word detection threshold")
    parser.add_argument("--whisper-model", default="tiny.en", help="Whisper model size")
    parser.add_argument("--data-dir", default=None, help="Directory for custom wake word models")
    args = parser.parse_args()

    # Handle shutdown gracefully: flip the loop flag instead of dying
    # mid-iteration so the finally block can release the audio device.
    running = True

    def shutdown(sig, frame):
        nonlocal running
        log.info("Shutting down...")
        running = False

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # Load wake word model (import deferred: openwakeword is slow to load
    # and only needed by the daemon process).
    log.info(f"Loading wake word model: {args.wake_word}")
    from openwakeword.model import Model as WakeModel

    # Check if it's a custom model file in the data dir; otherwise pass the
    # name through and let openwakeword resolve a bundled model.
    data_dir = args.data_dir or os.path.join(os.path.expanduser("~"), ".pi/agent/friday")
    custom_model_path = os.path.join(data_dir, f"{args.wake_word}.onnx")
    if os.path.exists(custom_model_path):
        log.info(f"Using custom model: {custom_model_path}")
        model_ref = custom_model_path
    else:
        model_ref = args.wake_word

    wake_model = WakeModel(
        wakeword_models=[model_ref],
        inference_framework="onnx",
    )

    # Load whisper model (CPU, int8 quantized for low-footprint inference).
    log.info(f"Loading whisper model: {args.whisper_model}")
    from faster_whisper import WhisperModel
    whisper_model = WhisperModel(args.whisper_model, device="cpu", compute_type="int8")

    # Open microphone: mono 16 kHz, 80 ms frames sized for openwakeword.
    import pyaudio
    pa = pyaudio.PyAudio()
    stream = pa.open(
        format=pyaudio.paInt16,
        channels=CHANNELS,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=CHUNK_SIZE,
    )

    log.info(f"Listening for '{args.wake_word}' (threshold: {args.threshold})...")
    log.info(f"Command file: {args.command_file}")

    # Log RMS levels periodically for mic diagnostics.
    rms_sample_counter = 0
    rms_max_seen = 0

    try:
        while running:
            # Read one audio chunk; on transient device errors, back off
            # briefly and retry rather than crashing the daemon.
            try:
                audio = stream.read(CHUNK_SIZE, exception_on_overflow=False)
            except Exception:
                time.sleep(0.01)
                continue

            # Convert to numpy for openwakeword.
            audio_array = np.frombuffer(audio, dtype=np.int16)

            # Periodic RMS diagnostics (every ~2s) to help tune mic levels.
            current_rms = rms(audio)
            if current_rms > rms_max_seen:
                rms_max_seen = current_rms
            rms_sample_counter += 1
            if rms_sample_counter % 25 == 0:  # ~2s at 80ms chunks
                log.info(f"[mic] RMS: {current_rms:.0f} | max seen: {rms_max_seen:.0f} | threshold: {SILENCE_THRESHOLD}")

            # Signal files live alongside the command file.
            base_dir = os.path.dirname(args.command_file)

            # Skip detection while TTS is playing (prevents self-triggering);
            # reset the model so buffered TTS audio can't fire later.
            mute_path = os.path.join(base_dir, MUTE_FILE_NAME)
            if os.path.exists(mute_path):
                wake_model.reset()
                continue

            # Check if extension wants us to listen immediately (question asked).
            listen_now_path = os.path.join(base_dir, LISTEN_NOW_FILE)
            immediate_listen = False
            wait_for_speech = 0
            max_record = 0  # 0 = use MAX_RECORD_SECONDS default
            if os.path.exists(listen_now_path):
                try:
                    # Consume the signal file so it triggers exactly once.
                    raw = open(listen_now_path).read().strip()
                    os.remove(listen_now_path)
                    immediate_listen = True
                    # Parse JSON payload for waitForSpeech parameter; a
                    # malformed payload falls back to 5s wait / 10s cap.
                    try:
                        payload = json.loads(raw)
                        wait_for_speech = float(payload.get("waitForSpeech", 5))
                        max_record = float(payload.get("maxRecord", 10))
                    except (json.JSONDecodeError, ValueError):
                        wait_for_speech = 5
                        max_record = 10
                    log.info(f"Auto-listen triggered (wait={wait_for_speech:.0f}s, max_record={max_record:.0f}s)")
                except Exception:
                    # NOTE(review): best-effort — a read/remove race with the
                    # extension is silently ignored and we fall back to
                    # normal wake-word detection this iteration.
                    pass

            if not immediate_listen:
                # Run wake word detection on this chunk.
                # NOTE(review): prediction keys come from openwakeword; when a
                # custom .onnx path is used, the key may not equal
                # args.wake_word — confirm the score isn't always 0 then.
                prediction = wake_model.predict(audio_array)
                score = prediction.get(args.wake_word, 0)
                if score < args.threshold:
                    continue

            # NOTE(review): `if True:` is leftover scaffolding — reaching this
            # point already means wake word detected OR immediate listen.
            if True:  # wake word detected OR immediate listen
                if not immediate_listen:
                    log.info(f"Wake word detected! (score: {score:.2f})")
                # Signal extension IMMEDIATELY to kill any playing TTS.
                write_signal(args.command_file, "wake", "")

                play_listening_sound()

                # Reset the wake word model to avoid re-triggering on the
                # audio still buffered from the detection itself.
                wake_model.reset()

                # Record speech until silence.
                # For auto-listen (questions), wait up to wait_for_speech
                # seconds for the user to start talking before giving up.
                audio_data = record_until_silence(
                    stream, SAMPLE_RATE, CHUNK_SIZE,
                    wait_for_speech=wait_for_speech if immediate_listen else INITIAL_WAIT_SECONDS,
                    max_record=max_record if immediate_listen else MAX_RECORD_SECONDS,
                )
                play_done_sound()

                # Discard recordings shorter than MIN_RECORD_SECONDS
                # (length check is in bytes: rate * seconds * bytes/sample).
                if len(audio_data) < SAMPLE_RATE * MIN_RECORD_SECONDS * FORMAT_WIDTH:
                    log.info("Recording too short, ignoring")
                    continue

                # Transcribe and hand the text to the extension.
                log.info("Transcribing...")
                text = transcribe(audio_data, whisper_model)

                if text and len(text.strip()) > 0:
                    write_signal(args.command_file, "command", text.strip())
                else:
                    log.info("No speech detected in recording")

    except KeyboardInterrupt:
        pass
    finally:
        # Always release the audio device, even on crash or signal.
        log.info("Cleaning up...")
        stream.stop_stream()
        stream.close()
        pa.terminate()


if __name__ == "__main__":
    main()
|