pi-voice 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/out/cli/cli.js +1 -1
- package/out/main/index.js +80 -3
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
Headless voice interface for the [Pi Coding Agent](https://github.com/badlogic/pi-mono). Hold a key, speak, and pi executes your instructions with voice feedback.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
#### Demo using ElevenLabs provider (make sure unmuted)
|
|
6
|
+
|
|
7
|
+
https://github.com/user-attachments/assets/76adb941-83cf-4394-b8d2-f6d73a1df8bc
|
|
6
8
|
|
|
7
9
|
## Installation
|
|
8
10
|
|
|
@@ -52,7 +54,7 @@ You can configure pi-voice in `.pi/pi-voice.json`:
|
|
|
52
54
|
| Key | Description |
|
|
53
55
|
| --- | --- |
|
|
54
56
|
| `key` | Push-to-talk shortcut. Combine modifiers (`ctrl`, `shift`, `alt`/`opt`, `meta`/`cmd`) and a main key with `+`. Examples: `"ctrl+t"`, `"alt+space"`, `"ctrl+shift+r"`. Default: `"meta+shift+i"`. |
|
|
55
|
-
| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), or `"openai"`. Default: `"local"`. |
|
|
57
|
+
| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), `"openai"`, or `"elevenlabs"`. Default: `"local"`. |
|
|
56
58
|
|
|
57
59
|
### Environment variables
|
|
58
60
|
|
|
@@ -61,6 +63,7 @@ You can configure pi-voice in `.pi/pi-voice.json`:
|
|
|
61
63
|
| `local` | None (model is auto-downloaded on first launch). Optional: `WHISPER_MODEL_PATH` (custom model path), `WHISPER_MODEL` (model name, default `medium-q5_0`), `SAY_VOICE` (macOS `say` voice name, e.g. `"Kyoko"`). |
|
|
62
64
|
| `gemini` | **Vertex AI:** `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`). **Gemini API:** `GEMINI_API_KEY` or `GOOGLE_API_KEY`. If `GOOGLE_CLOUD_PROJECT` is set, Vertex AI is used; set `GOOGLE_GENAI_USE_VERTEXAI=false` to force API key mode. |
|
|
63
65
|
| `openai` | `OPENAI_API_KEY` |
|
|
66
|
+
| `elevenlabs` | `ELEVENLABS_API_KEY`. Optional: `ELEVENLABS_VOICE_ID` (TTS voice, default `CwhRBWXzGAHq8TQ4Fs17`), `ELEVENLABS_TTS_MODEL` (default `eleven_flash_v2_5`). |
|
|
64
67
|
|
|
65
68
|
#### Logging
|
|
66
69
|
|
package/out/cli/cli.js
CHANGED
|
@@ -18511,7 +18511,7 @@ var configFileSchema = exports_external.object({
|
|
|
18511
18511
|
return false;
|
|
18512
18512
|
}
|
|
18513
18513
|
}, { message: "Invalid key binding" }).optional().default(DEFAULT_KEY_STRING),
|
|
18514
|
-
provider: exports_external.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
|
|
18514
|
+
provider: exports_external.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
|
|
18515
18515
|
});
|
|
18516
18516
|
|
|
18517
18517
|
class ConfigError extends Error {
|
package/out/main/index.js
CHANGED
|
@@ -7,6 +7,7 @@ import pino from "pino";
|
|
|
7
7
|
import { readFileSync, existsSync, mkdirSync, createWriteStream, unlinkSync, writeFileSync } from "node:fs";
|
|
8
8
|
import { z } from "zod";
|
|
9
9
|
import OpenAI, { toFile } from "openai";
|
|
10
|
+
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
|
|
10
11
|
import { WhisperFullParams, WhisperSamplingStrategy, Whisper } from "@napi-rs/whisper";
|
|
11
12
|
import { GoogleGenAI } from "@google/genai";
|
|
12
13
|
import { Readable } from "node:stream";
|
|
@@ -259,7 +260,7 @@ const configFileSchema = z.object({
|
|
|
259
260
|
},
|
|
260
261
|
{ message: "Invalid key binding" }
|
|
261
262
|
).optional().default(DEFAULT_KEY_STRING),
|
|
262
|
-
provider: z.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
|
|
263
|
+
provider: z.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
|
|
263
264
|
});
|
|
264
265
|
class ConfigError extends Error {
|
|
265
266
|
constructor(configPath, details) {
|
|
@@ -470,6 +471,34 @@ async function transcribeOpenAI(audioBuffer) {
|
|
|
470
471
|
});
|
|
471
472
|
return transcription.text?.trim() ?? "";
|
|
472
473
|
}
|
|
474
|
+
let elevenlabsClient$1 = null;
|
|
475
|
+
function getElevenLabsClient$1() {
|
|
476
|
+
if (elevenlabsClient$1) return elevenlabsClient$1;
|
|
477
|
+
const apiKey = process.env.ELEVENLABS_API_KEY;
|
|
478
|
+
if (!apiKey) {
|
|
479
|
+
throw new Error("ELEVENLABS_API_KEY environment variable is required");
|
|
480
|
+
}
|
|
481
|
+
elevenlabsClient$1 = new ElevenLabsClient({ apiKey });
|
|
482
|
+
return elevenlabsClient$1;
|
|
483
|
+
}
|
|
484
|
+
async function transcribeElevenLabs(audioBuffer) {
|
|
485
|
+
const client = getElevenLabsClient$1();
|
|
486
|
+
const result = await client.speechToText.convert({
|
|
487
|
+
file: {
|
|
488
|
+
data: audioBuffer,
|
|
489
|
+
filename: "recording.webm",
|
|
490
|
+
contentType: "audio/webm"
|
|
491
|
+
},
|
|
492
|
+
modelId: "scribe_v2"
|
|
493
|
+
});
|
|
494
|
+
if ("text" in result) {
|
|
495
|
+
return (result.text ?? "").trim();
|
|
496
|
+
}
|
|
497
|
+
if ("transcripts" in result && result.transcripts?.[0]) {
|
|
498
|
+
return (result.transcripts[0].text ?? "").trim();
|
|
499
|
+
}
|
|
500
|
+
return "";
|
|
501
|
+
}
|
|
473
502
|
async function transcribeLocal(samples) {
|
|
474
503
|
const whisper = await getWhisperInstance();
|
|
475
504
|
const params = new WhisperFullParams(WhisperSamplingStrategy.Greedy);
|
|
@@ -492,6 +521,9 @@ async function transcribe(audioData, provider = "local") {
|
|
|
492
521
|
case "openai":
|
|
493
522
|
text = await transcribeOpenAI(Buffer.from(audioData));
|
|
494
523
|
break;
|
|
524
|
+
case "elevenlabs":
|
|
525
|
+
text = await transcribeElevenLabs(Buffer.from(audioData));
|
|
526
|
+
break;
|
|
495
527
|
case "gemini":
|
|
496
528
|
default:
|
|
497
529
|
text = await transcribeGemini(Buffer.from(audioData));
|
|
@@ -513,7 +545,7 @@ function getOpenAIClient() {
|
|
|
513
545
|
const TTS_SAMPLE_RATE = 24e3;
|
|
514
546
|
const TTS_CHANNELS = 1;
|
|
515
547
|
const TTS_BITS_PER_SAMPLE = 16;
|
|
516
|
-
const
|
|
548
|
+
const PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
|
|
517
549
|
async function* synthesizeStreamGemini(text) {
|
|
518
550
|
const client = getGeminiClient();
|
|
519
551
|
const response = await client.models.generateContentStream({
|
|
@@ -583,7 +615,7 @@ async function* synthesizeStreamOpenAI(text) {
|
|
|
583
615
|
let totalBytes = 0;
|
|
584
616
|
let offset = 0;
|
|
585
617
|
while (offset < fullBuffer.length) {
|
|
586
|
-
const end = Math.min(offset +
|
|
618
|
+
const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
|
|
587
619
|
const chunk = fullBuffer.subarray(offset, end);
|
|
588
620
|
totalBytes += chunk.length;
|
|
589
621
|
yield chunk;
|
|
@@ -594,6 +626,48 @@ async function* synthesizeStreamOpenAI(text) {
|
|
|
594
626
|
"Streamed PCM audio"
|
|
595
627
|
);
|
|
596
628
|
}
|
|
629
|
+
let elevenlabsClient = null;
|
|
630
|
+
function getElevenLabsClient() {
|
|
631
|
+
if (elevenlabsClient) return elevenlabsClient;
|
|
632
|
+
const apiKey = process.env.ELEVENLABS_API_KEY;
|
|
633
|
+
if (!apiKey) {
|
|
634
|
+
throw new Error("ELEVENLABS_API_KEY environment variable is required");
|
|
635
|
+
}
|
|
636
|
+
elevenlabsClient = new ElevenLabsClient({ apiKey });
|
|
637
|
+
return elevenlabsClient;
|
|
638
|
+
}
|
|
639
|
+
const DEFAULT_ELEVENLABS_VOICE_ID = "CwhRBWXzGAHq8TQ4Fs17";
|
|
640
|
+
async function* synthesizeStreamElevenLabs(text) {
|
|
641
|
+
const client = getElevenLabsClient();
|
|
642
|
+
const voiceId = process.env.ELEVENLABS_VOICE_ID ?? DEFAULT_ELEVENLABS_VOICE_ID;
|
|
643
|
+
const modelId = process.env.ELEVENLABS_TTS_MODEL ?? "eleven_flash_v2_5";
|
|
644
|
+
const audio = await client.textToSpeech.convert(voiceId, {
|
|
645
|
+
text,
|
|
646
|
+
modelId,
|
|
647
|
+
outputFormat: "pcm_24000"
|
|
648
|
+
});
|
|
649
|
+
const chunks = [];
|
|
650
|
+
const reader = audio.getReader();
|
|
651
|
+
while (true) {
|
|
652
|
+
const { done, value } = await reader.read();
|
|
653
|
+
if (done) break;
|
|
654
|
+
chunks.push(value);
|
|
655
|
+
}
|
|
656
|
+
const fullBuffer = Buffer.concat(chunks);
|
|
657
|
+
let totalBytes = 0;
|
|
658
|
+
let offset = 0;
|
|
659
|
+
while (offset < fullBuffer.length) {
|
|
660
|
+
const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
|
|
661
|
+
const chunk = fullBuffer.subarray(offset, end);
|
|
662
|
+
totalBytes += chunk.length;
|
|
663
|
+
yield chunk;
|
|
664
|
+
offset = end;
|
|
665
|
+
}
|
|
666
|
+
logger.info(
|
|
667
|
+
{ provider: "elevenlabs", totalBytes, text: text.substring(0, 50) },
|
|
668
|
+
"Streamed PCM audio"
|
|
669
|
+
);
|
|
670
|
+
}
|
|
597
671
|
function speakLocal(text) {
|
|
598
672
|
return new Promise((resolve, reject) => {
|
|
599
673
|
if (process.platform !== "darwin") {
|
|
@@ -632,6 +706,9 @@ async function* synthesizeStream(text, provider = "local") {
|
|
|
632
706
|
case "openai":
|
|
633
707
|
yield* synthesizeStreamOpenAI(text);
|
|
634
708
|
break;
|
|
709
|
+
case "elevenlabs":
|
|
710
|
+
yield* synthesizeStreamElevenLabs(text);
|
|
711
|
+
break;
|
|
635
712
|
case "gemini":
|
|
636
713
|
default:
|
|
637
714
|
yield* synthesizeStreamGemini(text);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-voice",
|
|
3
|
-
"version": "0.3.3",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Voice interface for pi coding agent",
|
|
5
5
|
"author": "Yuku Kotani",
|
|
6
6
|
"license": "MIT",
|
|
@@ -36,6 +36,7 @@
|
|
|
36
36
|
"typescript": "^5"
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
|
+
"@elevenlabs/elevenlabs-js": "^2.35.0",
|
|
39
40
|
"@google/genai": "^1.40.0",
|
|
40
41
|
"@mariozechner/pi-coding-agent": "^0.52.7",
|
|
41
42
|
"@napi-rs/whisper": "^0.0.4",
|