pi-voice 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,9 @@
2
2
 
3
3
  Headless voice interface for the [Pi Coding Agent](https://github.com/badlogic/pi-mono). Hold a key, speak, and pi executes your instructions with voice feedback.
4
4
 
5
- https://github.com/user-attachments/assets/06a0c56f-76bf-48cb-9de3-f9ca48a7245d
5
+ #### Demo using the ElevenLabs provider (make sure your audio is unmuted)
6
+
7
+ https://github.com/user-attachments/assets/76adb941-83cf-4394-b8d2-f6d73a1df8bc
6
8
 
7
9
  ## Installation
8
10
 
@@ -52,7 +54,7 @@ You can configure pi-voice in `.pi/pi-voice.json`:
52
54
  | Key | Description |
53
55
  | --- | --- |
54
56
  | `key` | Push-to-talk shortcut. Combine modifiers (`ctrl`, `shift`, `alt`/`opt`, `meta`/`cmd`) and a main key with `+`. Examples: `"ctrl+t"`, `"alt+space"`, `"ctrl+shift+r"`. Default: `"meta+shift+i"`. |
55
- | `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), or `"openai"`. Default: `"local"`. |
57
+ | `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), `"openai"`, or `"elevenlabs"`. Default: `"local"`. |
56
58
 
57
59
  ### Environment variables
58
60
 
@@ -61,6 +63,7 @@ You can configure pi-voice in `.pi/pi-voice.json`:
61
63
  | `local` | None (model is auto-downloaded on first launch). Optional: `WHISPER_MODEL_PATH` (custom model path), `WHISPER_MODEL` (model name, default `medium-q5_0`), `SAY_VOICE` (macOS `say` voice name, e.g. `"Kyoko"`). |
62
64
  | `gemini` | **Vertex AI:** `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`). **Gemini API:** `GEMINI_API_KEY` or `GOOGLE_API_KEY`. If `GOOGLE_CLOUD_PROJECT` is set, Vertex AI is used; set `GOOGLE_GENAI_USE_VERTEXAI=false` to force API key mode. |
63
65
  | `openai` | `OPENAI_API_KEY` |
66
+ | `elevenlabs` | `ELEVENLABS_API_KEY`. Optional: `ELEVENLABS_VOICE_ID` (TTS voice, default `CwhRBWXzGAHq8TQ4Fs17`), `ELEVENLABS_TTS_MODEL` (default `eleven_flash_v2_5`). |
64
67
 
65
68
  #### Logging
66
69
 
package/out/cli/cli.js CHANGED
@@ -18511,7 +18511,7 @@ var configFileSchema = exports_external.object({
18511
18511
  return false;
18512
18512
  }
18513
18513
  }, { message: "Invalid key binding" }).optional().default(DEFAULT_KEY_STRING),
18514
- provider: exports_external.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
18514
+ provider: exports_external.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
18515
18515
  });
18516
18516
 
18517
18517
  class ConfigError extends Error {
package/out/main/index.js CHANGED
@@ -7,6 +7,7 @@ import pino from "pino";
7
7
  import { readFileSync, existsSync, mkdirSync, createWriteStream, unlinkSync, writeFileSync } from "node:fs";
8
8
  import { z } from "zod";
9
9
  import OpenAI, { toFile } from "openai";
10
+ import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
10
11
  import { WhisperFullParams, WhisperSamplingStrategy, Whisper } from "@napi-rs/whisper";
11
12
  import { GoogleGenAI } from "@google/genai";
12
13
  import { Readable } from "node:stream";
@@ -259,7 +260,7 @@ const configFileSchema = z.object({
259
260
  },
260
261
  { message: "Invalid key binding" }
261
262
  ).optional().default(DEFAULT_KEY_STRING),
262
- provider: z.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
263
+ provider: z.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
263
264
  });
264
265
  class ConfigError extends Error {
265
266
  constructor(configPath, details) {
@@ -470,6 +471,34 @@ async function transcribeOpenAI(audioBuffer) {
470
471
  });
471
472
  return transcription.text?.trim() ?? "";
472
473
  }
474
// Lazily created ElevenLabs client used by the speech-to-text path.
let elevenlabsClient$1 = null;

/**
 * Returns the cached ElevenLabs client, creating it on first use.
 *
 * The API key is read from the ELEVENLABS_API_KEY environment variable and
 * trimmed before use, so a blank value is rejected here and stray
 * whitespace/newlines around a pasted key are not sent to the API.
 *
 * @returns {ElevenLabsClient} The shared client instance.
 * @throws {Error} If ELEVENLABS_API_KEY is unset, empty, or whitespace-only.
 */
function getElevenLabsClient$1() {
  if (elevenlabsClient$1) return elevenlabsClient$1;
  const apiKey = process.env.ELEVENLABS_API_KEY?.trim();
  if (!apiKey) {
    throw new Error("ELEVENLABS_API_KEY environment variable is required");
  }
  elevenlabsClient$1 = new ElevenLabsClient({ apiKey });
  return elevenlabsClient$1;
}
484
/**
 * Transcribes a recorded audio buffer with ElevenLabs speech-to-text.
 *
 * The buffer is uploaded as "recording.webm" (audio/webm) using the
 * "scribe_v2" model.
 *
 * @param {Buffer} audioBuffer - Encoded audio to transcribe.
 * @returns {Promise<string>} The trimmed transcript, or "" when the response
 *   carries no usable text.
 */
async function transcribeElevenLabs(audioBuffer) {
  const client = getElevenLabsClient$1();
  const result = await client.speechToText.convert({
    file: {
      data: audioBuffer,
      filename: "recording.webm",
      contentType: "audio/webm",
    },
    modelId: "scribe_v2",
  });

  // Guard the response shape first: the `in` operator throws a TypeError on
  // null/undefined or primitive values.
  if (result === null || typeof result !== "object") {
    return "";
  }

  // Single-transcript response shape.
  if ("text" in result) {
    return typeof result.text === "string" ? result.text.trim() : "";
  }

  // Response shape with a `transcripts` array: keep every entry's text
  // instead of silently dropping everything after the first one.
  if (Array.isArray(result.transcripts)) {
    return result.transcripts
      .map((transcript) => (typeof transcript?.text === "string" ? transcript.text.trim() : ""))
      .filter((channelText) => channelText.length > 0)
      .join(" ");
  }

  return "";
}
473
502
  async function transcribeLocal(samples) {
474
503
  const whisper = await getWhisperInstance();
475
504
  const params = new WhisperFullParams(WhisperSamplingStrategy.Greedy);
@@ -492,6 +521,9 @@ async function transcribe(audioData, provider = "local") {
492
521
  case "openai":
493
522
  text = await transcribeOpenAI(Buffer.from(audioData));
494
523
  break;
524
+ case "elevenlabs":
525
+ text = await transcribeElevenLabs(Buffer.from(audioData));
526
+ break;
495
527
  case "gemini":
496
528
  default:
497
529
  text = await transcribeGemini(Buffer.from(audioData));
@@ -513,7 +545,7 @@ function getOpenAIClient() {
513
545
  const TTS_SAMPLE_RATE = 24e3;
514
546
  const TTS_CHANNELS = 1;
515
547
  const TTS_BITS_PER_SAMPLE = 16;
516
- const OPENAI_PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
548
+ const PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
517
549
  async function* synthesizeStreamGemini(text) {
518
550
  const client = getGeminiClient();
519
551
  const response = await client.models.generateContentStream({
@@ -583,7 +615,7 @@ async function* synthesizeStreamOpenAI(text) {
583
615
  let totalBytes = 0;
584
616
  let offset = 0;
585
617
  while (offset < fullBuffer.length) {
586
- const end = Math.min(offset + OPENAI_PCM_CHUNK_SIZE, fullBuffer.length);
618
+ const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
587
619
  const chunk = fullBuffer.subarray(offset, end);
588
620
  totalBytes += chunk.length;
589
621
  yield chunk;
@@ -594,6 +626,48 @@ async function* synthesizeStreamOpenAI(text) {
594
626
  "Streamed PCM audio"
595
627
  );
596
628
  }
629
// Lazily created ElevenLabs client used by the text-to-speech path.
let elevenlabsClient = null;

/**
 * Returns the cached ElevenLabs client, creating it on first use.
 *
 * The API key is read from the ELEVENLABS_API_KEY environment variable and
 * trimmed before use, so a blank value is rejected here and stray
 * whitespace/newlines around a pasted key are not sent to the API.
 *
 * @returns {ElevenLabsClient} The shared client instance.
 * @throws {Error} If ELEVENLABS_API_KEY is unset, empty, or whitespace-only.
 */
function getElevenLabsClient() {
  if (elevenlabsClient) return elevenlabsClient;
  const apiKey = process.env.ELEVENLABS_API_KEY?.trim();
  if (!apiKey) {
    throw new Error("ELEVENLABS_API_KEY environment variable is required");
  }
  elevenlabsClient = new ElevenLabsClient({ apiKey });
  return elevenlabsClient;
}
639
// Voice used when ELEVENLABS_VOICE_ID is unset or blank.
const DEFAULT_ELEVENLABS_VOICE_ID = "CwhRBWXzGAHq8TQ4Fs17";

/**
 * Synthesizes `text` with ElevenLabs and yields the PCM audio in slices of
 * at most PCM_CHUNK_SIZE bytes.
 *
 * The voice and model come from ELEVENLABS_VOICE_ID and ELEVENLABS_TTS_MODEL.
 * A blank value (e.g. `ELEVENLABS_VOICE_ID=`) falls back to the default
 * instead of being sent to the API as an empty identifier.
 *
 * @param {string} text - Text to speak.
 * @yields {Buffer} Consecutive slices of the "pcm_24000" response body.
 */
async function* synthesizeStreamElevenLabs(text) {
  const client = getElevenLabsClient();
  // `||` rather than `??`: an env var that is set but empty must also fall back.
  const voiceId = process.env.ELEVENLABS_VOICE_ID?.trim() || DEFAULT_ELEVENLABS_VOICE_ID;
  const modelId = process.env.ELEVENLABS_TTS_MODEL?.trim() || "eleven_flash_v2_5";
  const audio = await client.textToSpeech.convert(voiceId, {
    text,
    modelId,
    outputFormat: "pcm_24000",
  });

  // The whole response is collected before slicing, so every yielded chunk
  // except the last is exactly PCM_CHUNK_SIZE bytes.
  const chunks = [];
  const reader = audio.getReader();
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      chunks.push(value);
    }
  } finally {
    // Release the lock even when a read rejects so the stream is not left
    // locked to this reader.
    reader.releaseLock();
  }

  const fullBuffer = Buffer.concat(chunks);
  let totalBytes = 0;
  for (let offset = 0; offset < fullBuffer.length; offset += PCM_CHUNK_SIZE) {
    const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
    const chunk = fullBuffer.subarray(offset, end);
    totalBytes += chunk.length;
    yield chunk;
  }

  logger.info(
    { provider: "elevenlabs", totalBytes, text: text.substring(0, 50) },
    "Streamed PCM audio"
  );
}
597
671
  function speakLocal(text) {
598
672
  return new Promise((resolve, reject) => {
599
673
  if (process.platform !== "darwin") {
@@ -632,6 +706,9 @@ async function* synthesizeStream(text, provider = "local") {
632
706
  case "openai":
633
707
  yield* synthesizeStreamOpenAI(text);
634
708
  break;
709
+ case "elevenlabs":
710
+ yield* synthesizeStreamElevenLabs(text);
711
+ break;
635
712
  case "gemini":
636
713
  default:
637
714
  yield* synthesizeStreamGemini(text);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-voice",
3
- "version": "0.3.3",
3
+ "version": "0.4.0",
4
4
  "description": "Voice interface for pi coding agent",
5
5
  "author": "Yuku Kotani",
6
6
  "license": "MIT",
@@ -36,6 +36,7 @@
36
36
  "typescript": "^5"
37
37
  },
38
38
  "dependencies": {
39
+ "@elevenlabs/elevenlabs-js": "^2.35.0",
39
40
  "@google/genai": "^1.40.0",
40
41
  "@mariozechner/pi-coding-agent": "^0.52.7",
41
42
  "@napi-rs/whisper": "^0.0.4",