pi-voice 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/out/cli/cli.js +4 -4
- package/out/main/index.js +102 -27
- package/package.json +6 -1
package/README.md
CHANGED
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
Headless voice interface for the [Pi Coding Agent](https://github.com/badlogic/pi-mono). Hold a key, speak, and pi executes your instructions with voice feedback.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
#### Demo using ElevenLabs provider (make sure unmuted)
|
|
6
|
+
|
|
7
|
+
https://github.com/user-attachments/assets/76adb941-83cf-4394-b8d2-f6d73a1df8bc
|
|
6
8
|
|
|
7
9
|
## Installation
|
|
8
10
|
|
|
@@ -52,15 +54,16 @@ You can configure pi-voice in `.pi/pi-voice.json`:
|
|
|
52
54
|
| Key | Description |
|
|
53
55
|
| --- | --- |
|
|
54
56
|
| `key` | Push-to-talk shortcut. Combine modifiers (`ctrl`, `shift`, `alt`/`opt`, `meta`/`cmd`) and a main key with `+`. Examples: `"ctrl+t"`, `"alt+space"`, `"ctrl+shift+r"`. Default: `"meta+shift+i"`. |
|
|
55
|
-
| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI), or `"openai"`. Default: `"local"`. |
|
|
57
|
+
| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), `"openai"`, or `"elevenlabs"`. Default: `"local"`. |
|
|
56
58
|
|
|
57
59
|
### Environment variables
|
|
58
60
|
|
|
59
61
|
| Provider | Required variables |
|
|
60
62
|
| --- | --- |
|
|
61
63
|
| `local` | None (model is auto-downloaded on first launch). Optional: `WHISPER_MODEL_PATH` (custom model path), `WHISPER_MODEL` (model name, default `medium-q5_0`), `SAY_VOICE` (macOS `say` voice name, e.g. `"Kyoko"`). |
|
|
62
|
-
| `gemini` | `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`) |
|
|
64
|
+
| `gemini` | **Vertex AI:** `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`). **Gemini API:** `GEMINI_API_KEY` or `GOOGLE_API_KEY`. If `GOOGLE_CLOUD_PROJECT` is set, Vertex AI is used; set `GOOGLE_GENAI_USE_VERTEXAI=false` to force API key mode. |
|
|
63
65
|
| `openai` | `OPENAI_API_KEY` |
|
|
66
|
+
| `elevenlabs` | `ELEVENLABS_API_KEY`. Optional: `ELEVENLABS_VOICE_ID` (TTS voice, default `CwhRBWXzGAHq8TQ4Fs17`), `ELEVENLABS_TTS_MODEL` (default `eleven_flash_v2_5`). |
|
|
64
67
|
|
|
65
68
|
#### Logging
|
|
66
69
|
|
package/out/cli/cli.js
CHANGED
|
@@ -1952,7 +1952,7 @@ var require_indexes = __commonJS((exports, module) => {
|
|
|
1952
1952
|
|
|
1953
1953
|
// node_modules/thread-stream/index.js
|
|
1954
1954
|
var require_thread_stream = __commonJS((exports, module) => {
|
|
1955
|
-
var __dirname = "/
|
|
1955
|
+
var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/thread-stream";
|
|
1956
1956
|
var { version } = require_package();
|
|
1957
1957
|
var { EventEmitter } = __require("events");
|
|
1958
1958
|
var { Worker } = __require("worker_threads");
|
|
@@ -2377,7 +2377,7 @@ var require_thread_stream = __commonJS((exports, module) => {
|
|
|
2377
2377
|
|
|
2378
2378
|
// node_modules/pino/lib/transport.js
|
|
2379
2379
|
var require_transport = __commonJS((exports, module) => {
|
|
2380
|
-
var __dirname = "/
|
|
2380
|
+
var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/pino/lib";
|
|
2381
2381
|
var { createRequire: createRequire2 } = __require("module");
|
|
2382
2382
|
var { existsSync: existsSync2 } = __require("node:fs");
|
|
2383
2383
|
var getCallers = require_caller();
|
|
@@ -4492,7 +4492,7 @@ var require_node_gyp_build2 = __commonJS((exports, module) => {
|
|
|
4492
4492
|
|
|
4493
4493
|
// node_modules/uiohook-napi/dist/index.js
|
|
4494
4494
|
var require_dist = __commonJS((exports) => {
|
|
4495
|
-
var __dirname = "/
|
|
4495
|
+
var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/uiohook-napi/dist";
|
|
4496
4496
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4497
4497
|
exports.uIOhook = exports.UiohookKey = exports.WheelDirection = exports.EventType = undefined;
|
|
4498
4498
|
var events_1 = __require("events");
|
|
@@ -18511,7 +18511,7 @@ var configFileSchema = exports_external.object({
|
|
|
18511
18511
|
return false;
|
|
18512
18512
|
}
|
|
18513
18513
|
}, { message: "Invalid key binding" }).optional().default(DEFAULT_KEY_STRING),
|
|
18514
|
-
provider: exports_external.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
|
|
18514
|
+
provider: exports_external.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
|
|
18515
18515
|
});
|
|
18516
18516
|
|
|
18517
18517
|
class ConfigError extends Error {
|
package/out/main/index.js
CHANGED
|
@@ -6,9 +6,10 @@ import { homedir } from "node:os";
|
|
|
6
6
|
import pino from "pino";
|
|
7
7
|
import { readFileSync, existsSync, mkdirSync, createWriteStream, unlinkSync, writeFileSync } from "node:fs";
|
|
8
8
|
import { z } from "zod";
|
|
9
|
-
import { GoogleGenAI } from "@google/genai";
|
|
10
9
|
import OpenAI, { toFile } from "openai";
|
|
10
|
+
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
|
|
11
11
|
import { WhisperFullParams, WhisperSamplingStrategy, Whisper } from "@napi-rs/whisper";
|
|
12
|
+
import { GoogleGenAI } from "@google/genai";
|
|
12
13
|
import { Readable } from "node:stream";
|
|
13
14
|
import { finished } from "node:stream/promises";
|
|
14
15
|
import { spawn } from "node:child_process";
|
|
@@ -259,7 +260,7 @@ const configFileSchema = z.object({
|
|
|
259
260
|
},
|
|
260
261
|
{ message: "Invalid key binding" }
|
|
261
262
|
).optional().default(DEFAULT_KEY_STRING),
|
|
262
|
-
provider: z.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
|
|
263
|
+
provider: z.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
|
|
263
264
|
});
|
|
264
265
|
class ConfigError extends Error {
|
|
265
266
|
constructor(configPath, details) {
|
|
@@ -302,6 +303,26 @@ function loadConfig(cwd) {
|
|
|
302
303
|
logger.info({ key: display, provider: parsed.provider, configPath }, "Loaded config");
|
|
303
304
|
return { key: binding, keyDisplay: display, provider: parsed.provider };
|
|
304
305
|
}
|
|
306
|
+
let geminiClient = null;
|
|
307
|
+
function getGeminiClient() {
|
|
308
|
+
if (geminiClient) return geminiClient;
|
|
309
|
+
const forceVertexOff = process.env.GOOGLE_GENAI_USE_VERTEXAI === "false";
|
|
310
|
+
const project = process.env.GOOGLE_CLOUD_PROJECT;
|
|
311
|
+
const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
|
|
312
|
+
const apiKey = process.env.GEMINI_API_KEY ?? process.env.GOOGLE_API_KEY;
|
|
313
|
+
if (project && !forceVertexOff) {
|
|
314
|
+
logger.info({ project, location }, "Initializing Gemini client (Vertex AI)");
|
|
315
|
+
geminiClient = new GoogleGenAI({ vertexai: true, project, location });
|
|
316
|
+
} else if (apiKey) {
|
|
317
|
+
logger.info("Initializing Gemini client (API key)");
|
|
318
|
+
geminiClient = new GoogleGenAI({ apiKey });
|
|
319
|
+
} else {
|
|
320
|
+
throw new Error(
|
|
321
|
+
"Gemini provider requires either GOOGLE_CLOUD_PROJECT (for Vertex AI) or GEMINI_API_KEY / GOOGLE_API_KEY (for Gemini API)."
|
|
322
|
+
);
|
|
323
|
+
}
|
|
324
|
+
return geminiClient;
|
|
325
|
+
}
|
|
305
326
|
const DEFAULT_MODEL = "medium-q5_0";
|
|
306
327
|
const HF_BASE_URL = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
|
|
307
328
|
function modelCacheDir() {
|
|
@@ -393,17 +414,6 @@ async function resolveModelPath() {
|
|
|
393
414
|
await downloadModel(model, destPath);
|
|
394
415
|
return destPath;
|
|
395
416
|
}
|
|
396
|
-
let geminiClient$1 = null;
|
|
397
|
-
function getGeminiClient$1() {
|
|
398
|
-
if (geminiClient$1) return geminiClient$1;
|
|
399
|
-
const project = process.env.GOOGLE_CLOUD_PROJECT;
|
|
400
|
-
const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
|
|
401
|
-
if (!project) {
|
|
402
|
-
throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
|
|
403
|
-
}
|
|
404
|
-
geminiClient$1 = new GoogleGenAI({ vertexai: true, project, location });
|
|
405
|
-
return geminiClient$1;
|
|
406
|
-
}
|
|
407
417
|
let openaiClient$1 = null;
|
|
408
418
|
function getOpenAIClient$1() {
|
|
409
419
|
if (openaiClient$1) return openaiClient$1;
|
|
@@ -429,7 +439,7 @@ async function getWhisperInstance() {
|
|
|
429
439
|
return whisperInitPromise;
|
|
430
440
|
}
|
|
431
441
|
async function transcribeGemini(audioBuffer) {
|
|
432
|
-
const client = getGeminiClient$1();
|
|
442
|
+
const client = getGeminiClient();
|
|
433
443
|
const base64Audio = audioBuffer.toString("base64");
|
|
434
444
|
const response = await client.models.generateContent({
|
|
435
445
|
model: "gemini-2.5-flash",
|
|
@@ -461,6 +471,34 @@ async function transcribeOpenAI(audioBuffer) {
|
|
|
461
471
|
});
|
|
462
472
|
return transcription.text?.trim() ?? "";
|
|
463
473
|
}
|
|
474
|
+
let elevenlabsClient$1 = null;
|
|
475
|
+
function getElevenLabsClient$1() {
|
|
476
|
+
if (elevenlabsClient$1) return elevenlabsClient$1;
|
|
477
|
+
const apiKey = process.env.ELEVENLABS_API_KEY;
|
|
478
|
+
if (!apiKey) {
|
|
479
|
+
throw new Error("ELEVENLABS_API_KEY environment variable is required");
|
|
480
|
+
}
|
|
481
|
+
elevenlabsClient$1 = new ElevenLabsClient({ apiKey });
|
|
482
|
+
return elevenlabsClient$1;
|
|
483
|
+
}
|
|
484
|
+
async function transcribeElevenLabs(audioBuffer) {
|
|
485
|
+
const client = getElevenLabsClient$1();
|
|
486
|
+
const result = await client.speechToText.convert({
|
|
487
|
+
file: {
|
|
488
|
+
data: audioBuffer,
|
|
489
|
+
filename: "recording.webm",
|
|
490
|
+
contentType: "audio/webm"
|
|
491
|
+
},
|
|
492
|
+
modelId: "scribe_v2"
|
|
493
|
+
});
|
|
494
|
+
if ("text" in result) {
|
|
495
|
+
return (result.text ?? "").trim();
|
|
496
|
+
}
|
|
497
|
+
if ("transcripts" in result && result.transcripts?.[0]) {
|
|
498
|
+
return (result.transcripts[0].text ?? "").trim();
|
|
499
|
+
}
|
|
500
|
+
return "";
|
|
501
|
+
}
|
|
464
502
|
async function transcribeLocal(samples) {
|
|
465
503
|
const whisper = await getWhisperInstance();
|
|
466
504
|
const params = new WhisperFullParams(WhisperSamplingStrategy.Greedy);
|
|
@@ -483,6 +521,9 @@ async function transcribe(audioData, provider = "local") {
|
|
|
483
521
|
case "openai":
|
|
484
522
|
text = await transcribeOpenAI(Buffer.from(audioData));
|
|
485
523
|
break;
|
|
524
|
+
case "elevenlabs":
|
|
525
|
+
text = await transcribeElevenLabs(Buffer.from(audioData));
|
|
526
|
+
break;
|
|
486
527
|
case "gemini":
|
|
487
528
|
default:
|
|
488
529
|
text = await transcribeGemini(Buffer.from(audioData));
|
|
@@ -491,17 +532,6 @@ async function transcribe(audioData, provider = "local") {
|
|
|
491
532
|
logger.info({ provider, text }, "Transcribed");
|
|
492
533
|
return text;
|
|
493
534
|
}
|
|
494
|
-
let geminiClient = null;
|
|
495
|
-
function getGeminiClient() {
|
|
496
|
-
if (geminiClient) return geminiClient;
|
|
497
|
-
const project = process.env.GOOGLE_CLOUD_PROJECT;
|
|
498
|
-
const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
|
|
499
|
-
if (!project) {
|
|
500
|
-
throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
|
|
501
|
-
}
|
|
502
|
-
geminiClient = new GoogleGenAI({ vertexai: true, project, location });
|
|
503
|
-
return geminiClient;
|
|
504
|
-
}
|
|
505
535
|
let openaiClient = null;
|
|
506
536
|
function getOpenAIClient() {
|
|
507
537
|
if (openaiClient) return openaiClient;
|
|
@@ -515,7 +545,7 @@ function getOpenAIClient() {
|
|
|
515
545
|
const TTS_SAMPLE_RATE = 24e3;
|
|
516
546
|
const TTS_CHANNELS = 1;
|
|
517
547
|
const TTS_BITS_PER_SAMPLE = 16;
|
|
518
|
-
const
|
|
548
|
+
const PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
|
|
519
549
|
async function* synthesizeStreamGemini(text) {
|
|
520
550
|
const client = getGeminiClient();
|
|
521
551
|
const response = await client.models.generateContentStream({
|
|
@@ -585,7 +615,7 @@ async function* synthesizeStreamOpenAI(text) {
|
|
|
585
615
|
let totalBytes = 0;
|
|
586
616
|
let offset = 0;
|
|
587
617
|
while (offset < fullBuffer.length) {
|
|
588
|
-
const end = Math.min(offset +
|
|
618
|
+
const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
|
|
589
619
|
const chunk = fullBuffer.subarray(offset, end);
|
|
590
620
|
totalBytes += chunk.length;
|
|
591
621
|
yield chunk;
|
|
@@ -596,6 +626,48 @@ async function* synthesizeStreamOpenAI(text) {
|
|
|
596
626
|
"Streamed PCM audio"
|
|
597
627
|
);
|
|
598
628
|
}
|
|
629
|
+
let elevenlabsClient = null;
|
|
630
|
+
function getElevenLabsClient() {
|
|
631
|
+
if (elevenlabsClient) return elevenlabsClient;
|
|
632
|
+
const apiKey = process.env.ELEVENLABS_API_KEY;
|
|
633
|
+
if (!apiKey) {
|
|
634
|
+
throw new Error("ELEVENLABS_API_KEY environment variable is required");
|
|
635
|
+
}
|
|
636
|
+
elevenlabsClient = new ElevenLabsClient({ apiKey });
|
|
637
|
+
return elevenlabsClient;
|
|
638
|
+
}
|
|
639
|
+
const DEFAULT_ELEVENLABS_VOICE_ID = "CwhRBWXzGAHq8TQ4Fs17";
|
|
640
|
+
async function* synthesizeStreamElevenLabs(text) {
|
|
641
|
+
const client = getElevenLabsClient();
|
|
642
|
+
const voiceId = process.env.ELEVENLABS_VOICE_ID ?? DEFAULT_ELEVENLABS_VOICE_ID;
|
|
643
|
+
const modelId = process.env.ELEVENLABS_TTS_MODEL ?? "eleven_flash_v2_5";
|
|
644
|
+
const audio = await client.textToSpeech.convert(voiceId, {
|
|
645
|
+
text,
|
|
646
|
+
modelId,
|
|
647
|
+
outputFormat: "pcm_24000"
|
|
648
|
+
});
|
|
649
|
+
const chunks = [];
|
|
650
|
+
const reader = audio.getReader();
|
|
651
|
+
while (true) {
|
|
652
|
+
const { done, value } = await reader.read();
|
|
653
|
+
if (done) break;
|
|
654
|
+
chunks.push(value);
|
|
655
|
+
}
|
|
656
|
+
const fullBuffer = Buffer.concat(chunks);
|
|
657
|
+
let totalBytes = 0;
|
|
658
|
+
let offset = 0;
|
|
659
|
+
while (offset < fullBuffer.length) {
|
|
660
|
+
const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
|
|
661
|
+
const chunk = fullBuffer.subarray(offset, end);
|
|
662
|
+
totalBytes += chunk.length;
|
|
663
|
+
yield chunk;
|
|
664
|
+
offset = end;
|
|
665
|
+
}
|
|
666
|
+
logger.info(
|
|
667
|
+
{ provider: "elevenlabs", totalBytes, text: text.substring(0, 50) },
|
|
668
|
+
"Streamed PCM audio"
|
|
669
|
+
);
|
|
670
|
+
}
|
|
599
671
|
function speakLocal(text) {
|
|
600
672
|
return new Promise((resolve, reject) => {
|
|
601
673
|
if (process.platform !== "darwin") {
|
|
@@ -634,6 +706,9 @@ async function* synthesizeStream(text, provider = "local") {
|
|
|
634
706
|
case "openai":
|
|
635
707
|
yield* synthesizeStreamOpenAI(text);
|
|
636
708
|
break;
|
|
709
|
+
case "elevenlabs":
|
|
710
|
+
yield* synthesizeStreamElevenLabs(text);
|
|
711
|
+
break;
|
|
637
712
|
case "gemini":
|
|
638
713
|
default:
|
|
639
714
|
yield* synthesizeStreamGemini(text);
|
package/package.json
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-voice",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Voice interface for pi coding agent",
|
|
5
5
|
"author": "Yuku Kotani",
|
|
6
6
|
"license": "MIT",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "https://github.com/yukukotani/pi-voice"
|
|
10
|
+
},
|
|
7
11
|
"type": "module",
|
|
8
12
|
"main": "./out/main/index.js",
|
|
9
13
|
"bin": {
|
|
@@ -32,6 +36,7 @@
|
|
|
32
36
|
"typescript": "^5"
|
|
33
37
|
},
|
|
34
38
|
"dependencies": {
|
|
39
|
+
"@elevenlabs/elevenlabs-js": "^2.35.0",
|
|
35
40
|
"@google/genai": "^1.40.0",
|
|
36
41
|
"@mariozechner/pi-coding-agent": "^0.52.7",
|
|
37
42
|
"@napi-rs/whisper": "^0.0.4",
|