pi-voice 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/main/index.js CHANGED
@@ -1,76 +1,435 @@
1
- import { app, session as session$1, BrowserWindow, ipcMain } from "electron";
1
+ import { systemPreferences, app, BrowserWindow, ipcMain } from "electron";
2
2
  import { fileURLToPath } from "node:url";
3
- import iohook from "iohook-macos";
4
- import { GoogleGenAI } from "@google/genai";
5
- import { createAgentSession, SessionManager } from "@mariozechner/pi-coding-agent";
3
+ import { uIOhook, UiohookKey } from "uiohook-napi";
6
4
  import { join } from "node:path";
7
5
  import { homedir } from "node:os";
8
- import { writeFileSync, existsSync, unlinkSync, mkdirSync } from "node:fs";
6
+ import pino from "pino";
7
+ import { readFileSync, existsSync, mkdirSync, createWriteStream, unlinkSync, writeFileSync } from "node:fs";
8
+ import { z } from "zod";
9
+ import { GoogleGenAI } from "@google/genai";
10
+ import OpenAI, { toFile } from "openai";
11
+ import { WhisperFullParams, WhisperSamplingStrategy, Whisper } from "@napi-rs/whisper";
12
+ import { Readable } from "node:stream";
13
+ import { finished } from "node:stream/promises";
14
+ import { spawn } from "node:child_process";
15
+ import { createAgentSession, SessionManager } from "@mariozechner/pi-coding-agent";
9
16
  import { createServer } from "node:net";
17
+ function resolveLogPath() {
18
+ const envPath = process.env["PI_VOICE_LOG_PATH"];
19
+ if (envPath) return envPath;
20
+ const configHome = process.env["XDG_CONFIG_HOME"] || join(homedir(), ".config");
21
+ return join(configHome, "pi-voice", "daemon.log");
22
+ }
23
+ const logPath = resolveLogPath();
24
+ const logger = pino(
25
+ {
26
+ level: "debug"
27
+ },
28
+ pino.multistream([
29
+ // Console output (human-readable via stdout)
30
+ { level: "debug", stream: process.stdout },
31
+ // File output (JSON, auto-creates parent directories)
32
+ {
33
+ level: "debug",
34
+ stream: pino.destination({ dest: logPath, mkdir: true, sync: false })
35
+ }
36
+ ])
37
+ );
38
+ function getReleaseCodes(binding) {
39
+ const codes = [binding.keycode];
40
+ if (binding.ctrl) {
41
+ codes.push(UiohookKey.Ctrl, UiohookKey.CtrlRight);
42
+ }
43
+ if (binding.shift) {
44
+ codes.push(UiohookKey.Shift, UiohookKey.ShiftRight);
45
+ }
46
+ if (binding.alt) {
47
+ codes.push(UiohookKey.Alt, UiohookKey.AltRight);
48
+ }
49
+ if (binding.meta) {
50
+ codes.push(UiohookKey.Meta, UiohookKey.MetaRight);
51
+ }
52
+ return codes;
53
+ }
10
54
  class FnHook {
11
- fnDown = false;
55
+ active = false;
12
56
  callbacks;
13
57
  started = false;
14
- constructor(callbacks) {
58
+ binding;
59
+ releaseCodes;
60
+ displayName;
61
+ constructor(callbacks, binding, displayName) {
15
62
  this.callbacks = callbacks;
63
+ this.binding = binding;
64
+ this.releaseCodes = new Set(getReleaseCodes(binding));
65
+ this.displayName = displayName;
16
66
  }
17
67
  start() {
18
68
  if (this.started) return;
19
- const perms = iohook.checkAccessibilityPermissions();
20
- if (!perms.hasPermissions) {
21
- console.log(
22
- "[FnHook] Accessibility permissions not granted. Requesting..."
23
- );
24
- iohook.requestAccessibilityPermissions();
25
- throw new Error(
26
- "Accessibility permissions required. Please grant access in System Preferences > Privacy & Security > Accessibility, then restart the app."
27
- );
69
+ if (process.platform === "darwin") {
70
+ const trusted = systemPreferences.isTrustedAccessibilityClient(true);
71
+ if (!trusted) {
72
+ throw new Error(
73
+ "Accessibility permissions required. Please grant access in System Preferences > Privacy & Security > Accessibility, then restart the app."
74
+ );
75
+ }
28
76
  }
29
- iohook.setEventFilter({
30
- filterByEventType: true,
31
- allowKeyboard: true,
32
- allowMouse: false,
33
- allowScroll: false
34
- });
35
- iohook.enablePerformanceMode();
36
- iohook.on("flagsChanged", (event) => {
37
- const fnNow = event.modifiers.fn;
38
- if (fnNow && !this.fnDown) {
39
- this.fnDown = true;
77
+ uIOhook.on("keydown", (e) => {
78
+ if (this.active) return;
79
+ if (e.keycode === this.binding.keycode && e.ctrlKey === this.binding.ctrl && e.shiftKey === this.binding.shift && e.altKey === this.binding.alt && e.metaKey === this.binding.meta) {
80
+ this.active = true;
40
81
  this.callbacks.onFnDown();
41
- } else if (!fnNow && this.fnDown) {
42
- this.fnDown = false;
82
+ }
83
+ });
84
+ uIOhook.on("keyup", (e) => {
85
+ if (!this.active) return;
86
+ if (this.releaseCodes.has(e.keycode)) {
87
+ this.active = false;
43
88
  this.callbacks.onFnUp();
44
89
  }
45
90
  });
46
- iohook.startMonitoring();
91
+ uIOhook.start();
47
92
  this.started = true;
48
- console.log("[FnHook] Started monitoring Fn key");
93
+ logger.info({ key: this.displayName }, "Started monitoring key");
49
94
  }
50
95
  stop() {
51
96
  if (!this.started) return;
52
- iohook.stopMonitoring();
97
+ uIOhook.stop();
53
98
  this.started = false;
54
- this.fnDown = false;
55
- console.log("[FnHook] Stopped monitoring");
99
+ this.active = false;
100
+ logger.info("Stopped monitoring key");
56
101
  }
57
102
  get isFnDown() {
58
- return this.fnDown;
103
+ return this.active;
104
+ }
105
+ }
106
+ const KEY_MAP = {
107
+ // Letters
108
+ a: UiohookKey.A,
109
+ b: UiohookKey.B,
110
+ c: UiohookKey.C,
111
+ d: UiohookKey.D,
112
+ e: UiohookKey.E,
113
+ f: UiohookKey.F,
114
+ g: UiohookKey.G,
115
+ h: UiohookKey.H,
116
+ i: UiohookKey.I,
117
+ j: UiohookKey.J,
118
+ k: UiohookKey.K,
119
+ l: UiohookKey.L,
120
+ m: UiohookKey.M,
121
+ n: UiohookKey.N,
122
+ o: UiohookKey.O,
123
+ p: UiohookKey.P,
124
+ q: UiohookKey.Q,
125
+ r: UiohookKey.R,
126
+ s: UiohookKey.S,
127
+ t: UiohookKey.T,
128
+ u: UiohookKey.U,
129
+ v: UiohookKey.V,
130
+ w: UiohookKey.W,
131
+ x: UiohookKey.X,
132
+ y: UiohookKey.Y,
133
+ z: UiohookKey.Z,
134
+ // Numbers
135
+ "0": UiohookKey[0],
136
+ "1": UiohookKey[1],
137
+ "2": UiohookKey[2],
138
+ "3": UiohookKey[3],
139
+ "4": UiohookKey[4],
140
+ "5": UiohookKey[5],
141
+ "6": UiohookKey[6],
142
+ "7": UiohookKey[7],
143
+ "8": UiohookKey[8],
144
+ "9": UiohookKey[9],
145
+ // Function keys
146
+ f1: UiohookKey.F1,
147
+ f2: UiohookKey.F2,
148
+ f3: UiohookKey.F3,
149
+ f4: UiohookKey.F4,
150
+ f5: UiohookKey.F5,
151
+ f6: UiohookKey.F6,
152
+ f7: UiohookKey.F7,
153
+ f8: UiohookKey.F8,
154
+ f9: UiohookKey.F9,
155
+ f10: UiohookKey.F10,
156
+ f11: UiohookKey.F11,
157
+ f12: UiohookKey.F12,
158
+ // Special keys
159
+ space: UiohookKey.Space,
160
+ enter: UiohookKey.Enter,
161
+ return: UiohookKey.Enter,
162
+ escape: UiohookKey.Escape,
163
+ esc: UiohookKey.Escape,
164
+ tab: UiohookKey.Tab,
165
+ backspace: UiohookKey.Backspace,
166
+ delete: UiohookKey.Delete,
167
+ insert: UiohookKey.Insert,
168
+ home: UiohookKey.Home,
169
+ end: UiohookKey.End,
170
+ pageup: UiohookKey.PageUp,
171
+ pagedown: UiohookKey.PageDown,
172
+ // Arrow keys
173
+ up: UiohookKey.ArrowUp,
174
+ down: UiohookKey.ArrowDown,
175
+ left: UiohookKey.ArrowLeft,
176
+ right: UiohookKey.ArrowRight,
177
+ arrowup: UiohookKey.ArrowUp,
178
+ arrowdown: UiohookKey.ArrowDown,
179
+ arrowleft: UiohookKey.ArrowLeft,
180
+ arrowright: UiohookKey.ArrowRight,
181
+ // Punctuation
182
+ semicolon: UiohookKey.Semicolon,
183
+ equal: UiohookKey.Equal,
184
+ comma: UiohookKey.Comma,
185
+ minus: UiohookKey.Minus,
186
+ period: UiohookKey.Period,
187
+ slash: UiohookKey.Slash,
188
+ backquote: UiohookKey.Backquote,
189
+ bracketleft: UiohookKey.BracketLeft,
190
+ backslash: UiohookKey.Backslash,
191
+ bracketright: UiohookKey.BracketRight,
192
+ quote: UiohookKey.Quote
193
+ };
194
+ function parseKeyBinding(keyStr) {
195
+ const parts = keyStr.toLowerCase().split("+").map((s) => s.trim());
196
+ if (parts.length === 0 || parts.some((p) => p === "")) {
197
+ throw new Error(`Invalid key binding: "${keyStr}"`);
198
+ }
199
+ let ctrl = false;
200
+ let shift = false;
201
+ let alt = false;
202
+ let meta = false;
203
+ let mainKey;
204
+ for (const part of parts) {
205
+ if (part === "ctrl" || part === "control") {
206
+ ctrl = true;
207
+ } else if (part === "shift") {
208
+ shift = true;
209
+ } else if (part === "alt" || part === "opt" || part === "option") {
210
+ alt = true;
211
+ } else if (part === "meta" || part === "cmd" || part === "command" || part === "super" || part === "win") {
212
+ meta = true;
213
+ } else {
214
+ if (mainKey !== void 0) {
215
+ throw new Error(`Multiple main keys in key binding: "${keyStr}"`);
216
+ }
217
+ mainKey = part;
218
+ }
219
+ }
220
+ if (mainKey === void 0) {
221
+ throw new Error(`No main key specified in key binding: "${keyStr}"`);
222
+ }
223
+ const keycode = KEY_MAP[mainKey];
224
+ if (keycode === void 0) {
225
+ throw new Error(`Unknown key "${mainKey}" in key binding: "${keyStr}"`);
226
+ }
227
+ return { keycode, ctrl, shift, alt, meta };
228
+ }
229
+ function formatKeyDisplay(binding) {
230
+ const isMac = process.platform === "darwin";
231
+ const parts = [];
232
+ if (binding.ctrl) parts.push(isMac ? "⌃" : "Ctrl");
233
+ if (binding.alt) parts.push(isMac ? "⌥" : "Alt");
234
+ if (binding.shift) parts.push(isMac ? "⇧" : "Shift");
235
+ if (binding.meta) parts.push(isMac ? "⌘" : "Win");
236
+ const keyName = Object.entries(KEY_MAP).find(([, v]) => v === binding.keycode)?.[0]?.toUpperCase() ?? "?";
237
+ parts.push(keyName);
238
+ return parts.join(isMac ? "" : "+");
239
+ }
240
+ const DEFAULT_KEY_STRING = "meta+shift+i";
241
+ const DEFAULT_PROVIDER = "local";
242
+ function defaultConfig() {
243
+ const binding = parseKeyBinding(DEFAULT_KEY_STRING);
244
+ return {
245
+ key: binding,
246
+ keyDisplay: formatKeyDisplay(binding),
247
+ provider: DEFAULT_PROVIDER
248
+ };
249
+ }
250
+ const configFileSchema = z.object({
251
+ key: z.string().refine(
252
+ (v) => {
253
+ try {
254
+ parseKeyBinding(v);
255
+ return true;
256
+ } catch {
257
+ return false;
258
+ }
259
+ },
260
+ { message: "Invalid key binding" }
261
+ ).optional().default(DEFAULT_KEY_STRING),
262
+ provider: z.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
263
+ });
264
+ class ConfigError extends Error {
265
+ constructor(configPath, details) {
266
+ super(`Invalid config at ${configPath}:
267
+ ${details}`);
268
+ this.configPath = configPath;
269
+ this.details = details;
270
+ this.name = "ConfigError";
59
271
  }
60
272
  }
61
- let ai$1 = null;
62
- function getClient$1() {
63
- if (ai$1) return ai$1;
273
+ function loadConfig(cwd) {
274
+ const configPath = join(cwd, ".pi", "pi-voice.json");
275
+ let raw;
276
+ try {
277
+ raw = readFileSync(configPath, "utf-8");
278
+ } catch (err) {
279
+ if (err.code === "ENOENT") {
280
+ logger.info({ configPath }, "No config file found, using defaults");
281
+ return defaultConfig();
282
+ }
283
+ throw new ConfigError(configPath, `Failed to read file: ${err.message}`);
284
+ }
285
+ let json;
286
+ try {
287
+ json = JSON.parse(raw);
288
+ } catch {
289
+ throw new ConfigError(configPath, "Invalid JSON syntax");
290
+ }
291
+ const result = configFileSchema.safeParse(json);
292
+ if (!result.success) {
293
+ const details = result.error.issues.map((issue) => {
294
+ const path = issue.path.length > 0 ? `"${issue.path.join(".")}"` : "(root)";
295
+ return ` - ${path}: ${issue.message}`;
296
+ }).join("\n");
297
+ throw new ConfigError(configPath, details);
298
+ }
299
+ const parsed = result.data;
300
+ const binding = parseKeyBinding(parsed.key);
301
+ const display = formatKeyDisplay(binding);
302
+ logger.info({ key: display, provider: parsed.provider, configPath }, "Loaded config");
303
+ return { key: binding, keyDisplay: display, provider: parsed.provider };
304
+ }
305
+ const DEFAULT_MODEL = "medium-q5_0";
306
+ const HF_BASE_URL = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
307
+ function modelCacheDir() {
308
+ return join(homedir(), ".pi-agent", "whisper");
309
+ }
310
+ function modelFileName(model) {
311
+ return `ggml-${model}.bin`;
312
+ }
313
+ async function downloadModel(model, destPath) {
314
+ const url = `${HF_BASE_URL}/${modelFileName(model)}`;
315
+ logger.info({ model, url }, "Downloading Whisper model");
316
+ const response = await fetch(url, { method: "GET", redirect: "follow" });
317
+ if (!response.ok) {
318
+ throw new Error(
319
+ `Failed to download Whisper model "${model}": HTTP ${response.status}`
320
+ );
321
+ }
322
+ const totalBytes = Number(response.headers.get("content-length") ?? 0);
323
+ const totalMB = totalBytes > 0 ? (totalBytes / 1024 / 1024).toFixed(0) : "?";
324
+ const tmpPath = destPath + ".tmp";
325
+ try {
326
+ const fileStream = createWriteStream(tmpPath);
327
+ let downloadedBytes = 0;
328
+ let lastPercent = -1;
329
+ const body = response.body;
330
+ if (!body) {
331
+ throw new Error("Response body is null");
332
+ }
333
+ const reader = body.getReader();
334
+ const progressStream = new ReadableStream({
335
+ async pull(controller) {
336
+ const { done, value } = await reader.read();
337
+ if (done) {
338
+ controller.close();
339
+ return;
340
+ }
341
+ downloadedBytes += value.byteLength;
342
+ const downloadedMB = (downloadedBytes / 1024 / 1024).toFixed(0);
343
+ if (totalBytes > 0) {
344
+ const percent = Math.floor(
345
+ downloadedBytes / totalBytes * 100
346
+ );
347
+ if (percent !== lastPercent) {
348
+ lastPercent = percent;
349
+ process.stderr.write(
350
+ `\r[Whisper] Downloading model "${model}"... ${percent}% (${downloadedMB}/${totalMB} MB)`
351
+ );
352
+ }
353
+ } else {
354
+ process.stderr.write(
355
+ `\r[Whisper] Downloading model "${model}"... ${downloadedMB} MB`
356
+ );
357
+ }
358
+ controller.enqueue(value);
359
+ }
360
+ });
361
+ await finished(
362
+ Readable.fromWeb(progressStream).pipe(fileStream)
363
+ );
364
+ process.stderr.write("\n");
365
+ const { renameSync } = await import("node:fs");
366
+ renameSync(tmpPath, destPath);
367
+ logger.info({ destPath }, "Whisper model saved");
368
+ } catch (err) {
369
+ try {
370
+ unlinkSync(tmpPath);
371
+ } catch {
372
+ }
373
+ throw err;
374
+ }
375
+ }
376
+ async function resolveModelPath() {
377
+ const envPath = process.env.WHISPER_MODEL_PATH;
378
+ if (envPath) {
379
+ if (!existsSync(envPath)) {
380
+ throw new Error(
381
+ `WHISPER_MODEL_PATH points to "${envPath}" but the file does not exist`
382
+ );
383
+ }
384
+ return envPath;
385
+ }
386
+ const model = process.env.WHISPER_MODEL ?? DEFAULT_MODEL;
387
+ const cacheDir = modelCacheDir();
388
+ const destPath = join(cacheDir, modelFileName(model));
389
+ if (existsSync(destPath)) {
390
+ return destPath;
391
+ }
392
+ mkdirSync(cacheDir, { recursive: true });
393
+ await downloadModel(model, destPath);
394
+ return destPath;
395
+ }
396
+ let geminiClient$1 = null;
397
+ function getGeminiClient$1() {
398
+ if (geminiClient$1) return geminiClient$1;
64
399
  const project = process.env.GOOGLE_CLOUD_PROJECT;
65
400
  const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
66
401
  if (!project) {
67
402
  throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
68
403
  }
69
- ai$1 = new GoogleGenAI({ vertexai: true, project, location });
70
- return ai$1;
404
+ geminiClient$1 = new GoogleGenAI({ vertexai: true, project, location });
405
+ return geminiClient$1;
406
+ }
407
+ let openaiClient$1 = null;
408
+ function getOpenAIClient$1() {
409
+ if (openaiClient$1) return openaiClient$1;
410
+ const apiKey = process.env.OPENAI_API_KEY;
411
+ if (!apiKey) {
412
+ throw new Error("OPENAI_API_KEY environment variable is required");
413
+ }
414
+ openaiClient$1 = new OpenAI({ apiKey });
415
+ return openaiClient$1;
71
416
  }
72
- async function transcribe(audioBuffer) {
73
- const client = getClient$1();
417
+ let whisperInstance = null;
418
+ let whisperInitPromise = null;
419
+ async function getWhisperInstance() {
420
+ if (whisperInstance) return whisperInstance;
421
+ if (whisperInitPromise) return whisperInitPromise;
422
+ whisperInitPromise = (async () => {
423
+ const modelPath = await resolveModelPath();
424
+ logger.info({ modelPath }, "Loading Whisper model");
425
+ whisperInstance = new Whisper(modelPath);
426
+ logger.info("Whisper model loaded");
427
+ return whisperInstance;
428
+ })();
429
+ return whisperInitPromise;
430
+ }
431
+ async function transcribeGemini(audioBuffer) {
432
+ const client = getGeminiClient$1();
74
433
  const base64Audio = audioBuffer.toString("base64");
75
434
  const response = await client.models.generateContent({
76
435
  model: "gemini-2.5-flash",
@@ -91,26 +450,74 @@ async function transcribe(audioBuffer) {
91
450
  }
92
451
  ]
93
452
  });
94
- const text = response.text?.trim() ?? "";
95
- console.log(`[STT] Transcribed: "${text}"`);
453
+ return response.text?.trim() ?? "";
454
+ }
455
+ async function transcribeOpenAI(audioBuffer) {
456
+ const client = getOpenAIClient$1();
457
+ const file = await toFile(audioBuffer, "recording.webm");
458
+ const transcription = await client.audio.transcriptions.create({
459
+ model: "gpt-4o-mini-transcribe",
460
+ file
461
+ });
462
+ return transcription.text?.trim() ?? "";
463
+ }
464
+ async function transcribeLocal(samples) {
465
+ const whisper = await getWhisperInstance();
466
+ const params = new WhisperFullParams(WhisperSamplingStrategy.Greedy);
467
+ params.language = "auto";
468
+ params.printProgress = false;
469
+ params.printRealtime = false;
470
+ params.printTimestamps = false;
471
+ params.singleSegment = false;
472
+ params.noTimestamps = true;
473
+ return whisper.full(params, samples);
474
+ }
475
+ async function transcribe(audioData, provider = "local") {
476
+ let text;
477
+ switch (provider) {
478
+ case "local": {
479
+ const samples = new Float32Array(audioData);
480
+ text = await transcribeLocal(samples);
481
+ break;
482
+ }
483
+ case "openai":
484
+ text = await transcribeOpenAI(Buffer.from(audioData));
485
+ break;
486
+ case "gemini":
487
+ default:
488
+ text = await transcribeGemini(Buffer.from(audioData));
489
+ break;
490
+ }
491
+ logger.info({ provider, text }, "Transcribed");
96
492
  return text;
97
493
  }
98
- let ai = null;
99
- function getClient() {
100
- if (ai) return ai;
494
+ let geminiClient = null;
495
+ function getGeminiClient() {
496
+ if (geminiClient) return geminiClient;
101
497
  const project = process.env.GOOGLE_CLOUD_PROJECT;
102
498
  const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
103
499
  if (!project) {
104
500
  throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
105
501
  }
106
- ai = new GoogleGenAI({ vertexai: true, project, location });
107
- return ai;
502
+ geminiClient = new GoogleGenAI({ vertexai: true, project, location });
503
+ return geminiClient;
504
+ }
505
+ let openaiClient = null;
506
+ function getOpenAIClient() {
507
+ if (openaiClient) return openaiClient;
508
+ const apiKey = process.env.OPENAI_API_KEY;
509
+ if (!apiKey) {
510
+ throw new Error("OPENAI_API_KEY environment variable is required");
511
+ }
512
+ openaiClient = new OpenAI({ apiKey });
513
+ return openaiClient;
108
514
  }
109
515
  const TTS_SAMPLE_RATE = 24e3;
110
516
  const TTS_CHANNELS = 1;
111
517
  const TTS_BITS_PER_SAMPLE = 16;
112
- async function* synthesizeStream(text) {
113
- const client = getClient();
518
+ const OPENAI_PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
519
+ async function* synthesizeStreamGemini(text) {
520
+ const client = getGeminiClient();
114
521
  const response = await client.models.generateContentStream({
115
522
  model: "gemini-2.5-flash-preview-tts",
116
523
  contents: [
@@ -159,10 +566,80 @@ async function* synthesizeStream(text) {
159
566
  totalBytes += leftover.length;
160
567
  yield leftover;
161
568
  }
162
- console.log(
163
- `[TTS] Streamed ${totalBytes} bytes of PCM audio for "${text.substring(0, 50)}..."`
569
+ logger.info(
570
+ { provider: "gemini", totalBytes, text: text.substring(0, 50) },
571
+ "Streamed PCM audio"
164
572
  );
165
573
  }
574
+ async function* synthesizeStreamOpenAI(text) {
575
+ const client = getOpenAIClient();
576
+ const response = await client.audio.speech.create({
577
+ model: "gpt-4o-mini-tts",
578
+ voice: "alloy",
579
+ input: text,
580
+ response_format: "pcm"
581
+ // raw 24kHz 16-bit signed LE mono PCM
582
+ });
583
+ const arrayBuffer = await response.arrayBuffer();
584
+ const fullBuffer = Buffer.from(arrayBuffer);
585
+ let totalBytes = 0;
586
+ let offset = 0;
587
+ while (offset < fullBuffer.length) {
588
+ const end = Math.min(offset + OPENAI_PCM_CHUNK_SIZE, fullBuffer.length);
589
+ const chunk = fullBuffer.subarray(offset, end);
590
+ totalBytes += chunk.length;
591
+ yield chunk;
592
+ offset = end;
593
+ }
594
+ logger.info(
595
+ { provider: "openai", totalBytes, text: text.substring(0, 50) },
596
+ "Streamed PCM audio"
597
+ );
598
+ }
599
+ function speakLocal(text) {
600
+ return new Promise((resolve, reject) => {
601
+ if (process.platform !== "darwin") {
602
+ reject(new Error("Local TTS (say command) is only supported on macOS"));
603
+ return;
604
+ }
605
+ const voice = process.env.SAY_VOICE;
606
+ const args = [];
607
+ if (voice) {
608
+ args.push("-v", voice);
609
+ }
610
+ args.push(text);
611
+ const child = spawn("say", args, { stdio: "ignore" });
612
+ child.on("error", (err) => {
613
+ reject(new Error(`Failed to spawn say command: ${err.message}`));
614
+ });
615
+ child.on("close", (code) => {
616
+ if (code === 0) {
617
+ logger.info(
618
+ { provider: "local", text: text.substring(0, 50) },
619
+ "Spoke text"
620
+ );
621
+ resolve();
622
+ } else {
623
+ reject(new Error(`say command exited with code ${code}`));
624
+ }
625
+ });
626
+ });
627
+ }
628
+ async function* synthesizeStream(text, provider = "local") {
629
+ switch (provider) {
630
+ case "local":
631
+ throw new Error(
632
+ "Local TTS does not support PCM streaming. Use speakLocal() instead."
633
+ );
634
+ case "openai":
635
+ yield* synthesizeStreamOpenAI(text);
636
+ break;
637
+ case "gemini":
638
+ default:
639
+ yield* synthesizeStreamGemini(text);
640
+ break;
641
+ }
642
+ }
166
643
  let session = null;
167
644
  let sessionCwd = process.cwd();
168
645
  function setSessionCwd(cwd) {
@@ -170,13 +647,13 @@ function setSessionCwd(cwd) {
170
647
  }
171
648
  async function getOrCreateSession() {
172
649
  if (session) return session;
173
- console.log(`[PiSession] Creating new agent session (cwd: ${sessionCwd})...`);
650
+ logger.info({ cwd: sessionCwd }, "Creating new agent session");
174
651
  const result = await createAgentSession({
175
652
  cwd: sessionCwd,
176
653
  sessionManager: SessionManager.inMemory()
177
654
  });
178
655
  session = result.session;
179
- console.log("[PiSession] Session created");
656
+ logger.info("Agent session created");
180
657
  return session;
181
658
  }
182
659
  async function prompt(text, options) {
@@ -185,7 +662,7 @@ async function prompt(text, options) {
185
662
  if (event.type === "message_update" && event.assistantMessageEvent.type === "text_end") {
186
663
  const content = event.assistantMessageEvent.content.trim();
187
664
  if (content.length > 0) {
188
- console.log(`[PiSession] Response: ${content}`);
665
+ logger.info({ content }, "Agent response");
189
666
  options?.onTextEnd?.(content);
190
667
  }
191
668
  }
@@ -200,7 +677,7 @@ function dispose() {
200
677
  if (session) {
201
678
  session.dispose();
202
679
  session = null;
203
- console.log("[PiSession] Session disposed");
680
+ logger.info("Agent session disposed");
204
681
  }
205
682
  }
206
683
  const IPC = {
@@ -210,8 +687,6 @@ const IPC = {
210
687
  PLAY_AUDIO_STREAM_START: "play-audio-stream-start",
211
688
  PLAY_AUDIO_STREAM_CHUNK: "play-audio-stream-chunk",
212
689
  PLAY_AUDIO_STREAM_END: "play-audio-stream-end",
213
- STATE_CHANGED: "state-changed",
214
- STATUS_MESSAGE: "status-message",
215
690
  // renderer -> main
216
691
  RECORDING_DATA: "recording-data",
217
692
  RECORDING_ERROR: "recording-error",
@@ -275,7 +750,7 @@ function startDaemonServer(handler) {
275
750
  });
276
751
  });
277
752
  server.listen(socketPath);
278
- console.log(`[DaemonIPC] Listening on ${socketPath}`);
753
+ logger.info({ socketPath }, "DaemonIPC listening");
279
754
  return socketPath;
280
755
  }
281
756
  function stopDaemonServer() {
@@ -290,31 +765,24 @@ function stopDaemonServer() {
290
765
  } catch {
291
766
  }
292
767
  }
293
- console.log("[DaemonIPC] Server stopped");
768
+ logger.info("DaemonIPC server stopped");
294
769
  }
295
770
  const workingCwd = process.env["PI_VOICE_CWD"] || process.cwd();
296
771
  let mainWindow = null;
297
772
  let fnHook = null;
298
773
  let currentState = "idle";
299
- let forceQuit = false;
300
774
  setSessionCwd(workingCwd);
301
775
  function setState(state, message) {
302
776
  currentState = state;
303
- console.log(`[Main] State: ${state}${message ? ` - ${message}` : ""}`);
304
- mainWindow?.webContents.send(IPC.STATE_CHANGED, state);
305
- if (message) {
306
- mainWindow?.webContents.send(IPC.STATUS_MESSAGE, message);
307
- }
777
+ logger.info({ state, message }, "State changed");
308
778
  }
309
779
  function createWindow() {
310
780
  mainWindow = new BrowserWindow({
311
781
  width: 400,
312
782
  height: 300,
313
- resizable: true,
314
- alwaysOnTop: true,
315
- titleBarStyle: "hiddenInset",
316
- // Daemon-first: window starts hidden
783
+ // Hidden audio worker – never shown to user
317
784
  show: false,
785
+ skipTaskbar: true,
318
786
  webPreferences: {
319
787
  preload: fileURLToPath(
320
788
  new URL("../preload/index.cjs", import.meta.url)
@@ -332,80 +800,81 @@ function createWindow() {
332
800
  )
333
801
  );
334
802
  }
335
- mainWindow.on("close", (e) => {
336
- if (!forceQuit) {
337
- e.preventDefault();
338
- mainWindow?.hide();
339
- }
340
- });
341
803
  mainWindow.on("closed", () => {
342
804
  mainWindow = null;
343
805
  });
344
806
  }
345
- function showWindow() {
346
- if (mainWindow) {
347
- mainWindow.show();
348
- mainWindow.focus();
349
- } else {
350
- createWindow();
351
- mainWindow.once("ready-to-show", () => {
352
- mainWindow.show();
353
- mainWindow.focus();
354
- });
355
- }
356
- }
357
- function setupIpcHandlers() {
807
+ function setupIpcHandlers(provider) {
358
808
  ipcMain.on(IPC.RECORDING_DATA, async (_event, data) => {
359
809
  if (currentState !== "recording") return;
360
- const audioBuffer = Buffer.from(data);
361
- if (audioBuffer.length < 1e3) {
362
- console.log("[Main] Recording too short, ignoring");
810
+ if (data.byteLength < 1e3) {
811
+ logger.info("Recording too short, ignoring");
363
812
  setState("idle", "Recording too short");
364
813
  return;
365
814
  }
366
815
  try {
367
816
  setState("transcribing", "Transcribing...");
368
- const text = await transcribe(audioBuffer);
817
+ const text = await transcribe(data, provider);
369
818
  if (!text) {
370
819
  setState("idle", "No speech detected");
371
820
  return;
372
821
  }
373
822
  setState("thinking", `Sent: "${text}"`);
374
- let streamStarted = false;
375
- let ttsChain = Promise.resolve();
376
- await prompt(text, {
377
- onTextEnd: (segment) => {
378
- if (!streamStarted) {
379
- streamStarted = true;
380
- setState("speaking", "Generating speech...");
381
- mainWindow?.webContents.send(IPC.PLAY_AUDIO_STREAM_START, {
382
- sampleRate: TTS_SAMPLE_RATE,
383
- channels: TTS_CHANNELS,
384
- bitsPerSample: TTS_BITS_PER_SAMPLE
385
- });
386
- }
387
- ttsChain = ttsChain.then(async () => {
388
- for await (const pcmChunk of synthesizeStream(segment)) {
389
- mainWindow?.webContents.send(
390
- IPC.PLAY_AUDIO_STREAM_CHUNK,
391
- pcmChunk.buffer.slice(
392
- pcmChunk.byteOffset,
393
- pcmChunk.byteOffset + pcmChunk.byteLength
394
- )
395
- );
823
+ if (provider === "local") {
824
+ let speakStarted = false;
825
+ let ttsChain = Promise.resolve();
826
+ await prompt(text, {
827
+ onTextEnd: (segment) => {
828
+ if (!speakStarted) {
829
+ speakStarted = true;
830
+ setState("speaking", "Speaking...");
396
831
  }
397
- });
832
+ ttsChain = ttsChain.then(() => speakLocal(segment));
833
+ }
834
+ });
835
+ await ttsChain;
836
+ if (!speakStarted) {
837
+ setState("idle", "No response from pi");
838
+ } else {
839
+ setState("idle");
398
840
  }
399
- });
400
- await ttsChain;
401
- if (streamStarted) {
402
- mainWindow?.webContents.send(IPC.PLAY_AUDIO_STREAM_END);
403
841
  } else {
404
- setState("idle", "No response from pi");
842
+ let streamStarted = false;
843
+ let ttsChain = Promise.resolve();
844
+ await prompt(text, {
845
+ onTextEnd: (segment) => {
846
+ if (!streamStarted) {
847
+ streamStarted = true;
848
+ setState("speaking", "Generating speech...");
849
+ mainWindow?.webContents.send(IPC.PLAY_AUDIO_STREAM_START, {
850
+ sampleRate: TTS_SAMPLE_RATE,
851
+ channels: TTS_CHANNELS,
852
+ bitsPerSample: TTS_BITS_PER_SAMPLE
853
+ });
854
+ }
855
+ ttsChain = ttsChain.then(async () => {
856
+ for await (const pcmChunk of synthesizeStream(segment, provider)) {
857
+ mainWindow?.webContents.send(
858
+ IPC.PLAY_AUDIO_STREAM_CHUNK,
859
+ pcmChunk.buffer.slice(
860
+ pcmChunk.byteOffset,
861
+ pcmChunk.byteOffset + pcmChunk.byteLength
862
+ )
863
+ );
864
+ }
865
+ });
866
+ }
867
+ });
868
+ await ttsChain;
869
+ if (streamStarted) {
870
+ mainWindow?.webContents.send(IPC.PLAY_AUDIO_STREAM_END);
871
+ } else {
872
+ setState("idle", "No response from pi");
873
+ }
405
874
  }
406
875
  } catch (err) {
407
876
  const msg = err instanceof Error ? err.message : String(err);
408
- console.error("[Main] Pipeline error:", msg);
877
+ logger.error({ err: msg }, "Pipeline error");
409
878
  setState("error", msg);
410
879
  setTimeout(() => {
411
880
  if (currentState === "error") setState("idle");
@@ -413,7 +882,7 @@ function setupIpcHandlers() {
413
882
  }
414
883
  });
415
884
  ipcMain.on(IPC.RECORDING_ERROR, (_event, error) => {
416
- console.error("[Main] Recording error:", error);
885
+ logger.error({ err: error }, "Recording error");
417
886
  setState("error", error);
418
887
  setTimeout(() => {
419
888
  if (currentState === "error") setState("idle");
@@ -425,45 +894,37 @@ function setupIpcHandlers() {
425
894
  }
426
895
  });
427
896
  }
428
- function setupFnHook() {
429
- fnHook = new FnHook({
430
- onFnDown: () => {
431
- if (currentState !== "idle") {
432
- console.log(
433
- `[Main] Fn pressed but state is ${currentState}, ignoring`
434
- );
435
- return;
897
+ function setupFnHook(config) {
898
+ const recordingFormat = config.provider === "local" ? "pcm" : "webm";
899
+ fnHook = new FnHook(
900
+ {
901
+ onFnDown: () => {
902
+ if (currentState !== "idle") {
903
+ logger.info(
904
+ { key: config.keyDisplay, state: currentState },
905
+ "Key pressed but not idle, ignoring"
906
+ );
907
+ return;
908
+ }
909
+ setState("recording", "Recording...");
910
+ mainWindow?.webContents.send(IPC.START_RECORDING, recordingFormat);
911
+ },
912
+ onFnUp: () => {
913
+ if (currentState !== "recording") return;
914
+ mainWindow?.webContents.send(IPC.STOP_RECORDING);
436
915
  }
437
- setState("recording", "Recording...");
438
- mainWindow?.webContents.send(IPC.START_RECORDING);
439
916
  },
440
- onFnUp: () => {
441
- if (currentState !== "recording") return;
442
- mainWindow?.webContents.send(IPC.STOP_RECORDING);
443
- }
444
- });
917
+ config.key,
918
+ config.keyDisplay
919
+ );
445
920
  try {
446
921
  fnHook.start();
447
922
  } catch (err) {
448
923
  const msg = err instanceof Error ? err.message : String(err);
449
- console.error("[Main] FnHook error:", msg);
924
+ logger.error({ err: msg }, "FnHook error");
450
925
  setState("error", msg);
451
926
  }
452
927
  }
453
- function setupCsp() {
454
- const isDev = !app.isPackaged && !!process.env["ELECTRON_RENDERER_URL"];
455
- session$1.defaultSession.webRequest.onHeadersReceived(
456
- (details, callback) => {
457
- const csp = isDev ? "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; connect-src 'self' ws://localhost:* http://localhost:*; media-src 'self' blob:" : "default-src 'self'; style-src 'self' 'unsafe-inline'; media-src 'self' blob:";
458
- callback({
459
- responseHeaders: {
460
- ...details.responseHeaders,
461
- "Content-Security-Policy": [csp]
462
- }
463
- });
464
- }
465
- );
466
- }
467
928
  function handleDaemonCommand(command) {
468
929
  switch (command) {
469
930
  case "status":
@@ -474,12 +935,8 @@ function handleDaemonCommand(command) {
474
935
  pid: process.pid,
475
936
  uptime: process.uptime()
476
937
  };
477
- case "show":
478
- showWindow();
479
- return { ok: true };
480
938
  case "stop":
481
939
  setImmediate(() => {
482
- forceQuit = true;
483
940
  app.quit();
484
941
  });
485
942
  return { ok: true };
@@ -488,7 +945,7 @@ function handleDaemonCommand(command) {
488
945
  }
489
946
  }
490
947
  function gracefulShutdown() {
491
- console.log("[Main] Shutting down...");
948
+ logger.info("Shutting down...");
492
949
  fnHook?.stop();
493
950
  dispose();
494
951
  stopDaemonServer();
@@ -496,33 +953,45 @@ function gracefulShutdown() {
496
953
  }
497
954
  process.on("SIGTERM", () => {
498
955
  gracefulShutdown();
499
- forceQuit = true;
500
956
  app.quit();
501
957
  });
502
958
  const gotLock = app.requestSingleInstanceLock();
503
959
  if (!gotLock) {
504
- console.log("[Main] Another instance is already running. Exiting.");
960
+ logger.warn("Another instance is already running. Exiting.");
505
961
  app.quit();
506
- } else {
507
- app.on("second-instance", () => {
508
- showWindow();
509
- });
510
962
  }
511
- app.whenReady().then(() => {
512
- setupCsp();
963
+ app.whenReady().then(async () => {
964
+ let config;
965
+ try {
966
+ config = loadConfig(workingCwd);
967
+ } catch (err) {
968
+ if (err instanceof ConfigError) {
969
+ logger.error({ err: err.message }, "Config error");
970
+ } else {
971
+ logger.error({ err: err instanceof Error ? err.message : String(err) }, "Failed to load config");
972
+ }
973
+ app.quit();
974
+ return;
975
+ }
976
+ if (config.provider === "local") {
977
+ try {
978
+ await resolveModelPath();
979
+ } catch (err) {
980
+ const msg = err instanceof Error ? err.message : String(err);
981
+ logger.error({ err: msg }, "Failed to prepare Whisper model");
982
+ app.quit();
983
+ return;
984
+ }
985
+ }
513
986
  createWindow();
514
- setupIpcHandlers();
515
- setupFnHook();
987
+ setupIpcHandlers(config.provider);
988
+ setupFnHook(config);
516
989
  startDaemonServer(handleDaemonCommand);
517
990
  saveRuntimeState(workingCwd);
518
- console.log(`[Main] pi-voice daemon started (cwd: ${workingCwd})`);
991
+ logger.info({ cwd: workingCwd }, "pi-voice daemon started");
519
992
  });
520
993
  app.on("window-all-closed", () => {
521
994
  });
522
- app.on("activate", () => {
523
- showWindow();
524
- });
525
995
  app.on("before-quit", () => {
526
- forceQuit = true;
527
996
  gracefulShutdown();
528
997
  });