npm - pi-voice - Versions diffs - 0.2.0 → 0.4.0 - Mend

pi-voice 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED Viewed

@@ -2,7 +2,9 @@
 Headless voice interface for the [Pi Coding Agent](https://github.com/badlogic/pi-mono). Hold a key, speak, and pi executes your instructions with voice feedback.
-https://github.com/user-attachments/assets/06a0c56f-76bf-48cb-9de3-f9ca48a7245d
+#### Demo using ElevenLabs provider (make sure unmuted)
+https://github.com/user-attachments/assets/76adb941-83cf-4394-b8d2-f6d73a1df8bc
 ## Installation
@@ -52,15 +54,16 @@ You can configure pi-voice in `.pi/pi-voice.json`:
 | Key | Description |
 | --- | --- |
 | `key` | Push-to-talk shortcut. Combine modifiers (`ctrl`, `shift`, `alt`/`opt`, `meta`/`cmd`) and a main key with `+`. Examples: `"ctrl+t"`, `"alt+space"`, `"ctrl+shift+r"`. Default: `"meta+shift+i"`. |
-| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI), or `"openai"`. Default: `"local"`. |
+| `provider` | Speech provider for STT & TTS. `"local"`, `"gemini"` (Vertex AI or Gemini API), `"openai"`, or `"elevenlabs"`. Default: `"local"`. |
 ### Environment variables
 | Provider | Required variables |
 | --- | --- |
 | `local` | None (model is auto-downloaded on first launch). Optional: `WHISPER_MODEL_PATH` (custom model path), `WHISPER_MODEL` (model name, default `medium-q5_0`), `SAY_VOICE` (macOS `say` voice name, e.g. `"Kyoko"`). |
-| `gemini` | `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`) |
+| `gemini` | **Vertex AI:** `GOOGLE_CLOUD_PROJECT`, `GOOGLE_CLOUD_LOCATION` (optional, default `us-central1`). **Gemini API:** `GEMINI_API_KEY` or `GOOGLE_API_KEY`. If `GOOGLE_CLOUD_PROJECT` is set, Vertex AI is used; set `GOOGLE_GENAI_USE_VERTEXAI=false` to force API key mode. |
 | `openai` | `OPENAI_API_KEY` |
+| `elevenlabs` | `ELEVENLABS_API_KEY`. Optional: `ELEVENLABS_VOICE_ID` (TTS voice, default `CwhRBWXzGAHq8TQ4Fs17`), `ELEVENLABS_TTS_MODEL` (default `eleven_flash_v2_5`). |
 #### Logging

package/out/cli/cli.js CHANGED Viewed

@@ -1952,7 +1952,7 @@ var require_indexes = __commonJS((exports, module) => {
 // node_modules/thread-stream/index.js
 var require_thread_stream = __commonJS((exports, module) => {
-  var __dirname = "/Users/yukukotani/ghq/github.com/yukukotani/pi-voice/node_modules/thread-stream";
+  var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/thread-stream";
   var { version } = require_package();
   var { EventEmitter } = __require("events");
   var { Worker } = __require("worker_threads");
@@ -2377,7 +2377,7 @@ var require_thread_stream = __commonJS((exports, module) => {
 // node_modules/pino/lib/transport.js
 var require_transport = __commonJS((exports, module) => {
-  var __dirname = "/Users/yukukotani/ghq/github.com/yukukotani/pi-voice/node_modules/pino/lib";
+  var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/pino/lib";
   var { createRequire: createRequire2 } = __require("module");
   var { existsSync: existsSync2 } = __require("node:fs");
   var getCallers = require_caller();
@@ -4492,7 +4492,7 @@ var require_node_gyp_build2 = __commonJS((exports, module) => {
 // node_modules/uiohook-napi/dist/index.js
 var require_dist = __commonJS((exports) => {
-  var __dirname = "/Users/yukukotani/ghq/github.com/yukukotani/pi-voice/node_modules/uiohook-napi/dist";
+  var __dirname = "/home/runner/work/pi-voice/pi-voice/node_modules/uiohook-napi/dist";
   Object.defineProperty(exports, "__esModule", { value: true });
   exports.uIOhook = exports.UiohookKey = exports.WheelDirection = exports.EventType = undefined;
   var events_1 = __require("events");
@@ -18511,7 +18511,7 @@ var configFileSchema = exports_external.object({
       return false;
     }
   }, { message: "Invalid key binding" }).optional().default(DEFAULT_KEY_STRING),
-  provider: exports_external.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
+  provider: exports_external.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
 });
 class ConfigError extends Error {

package/out/main/index.js CHANGED Viewed

@@ -6,9 +6,10 @@ import { homedir } from "node:os";
 import pino from "pino";
 import { readFileSync, existsSync, mkdirSync, createWriteStream, unlinkSync, writeFileSync } from "node:fs";
 import { z } from "zod";
-import { GoogleGenAI } from "@google/genai";
 import OpenAI, { toFile } from "openai";
+import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
 import { WhisperFullParams, WhisperSamplingStrategy, Whisper } from "@napi-rs/whisper";
+import { GoogleGenAI } from "@google/genai";
 import { Readable } from "node:stream";
 import { finished } from "node:stream/promises";
 import { spawn } from "node:child_process";
@@ -259,7 +260,7 @@ const configFileSchema = z.object({
     },
     { message: "Invalid key binding" }
   ).optional().default(DEFAULT_KEY_STRING),
-  provider: z.enum(["local", "gemini", "openai"]).optional().default(DEFAULT_PROVIDER)
+  provider: z.enum(["local", "gemini", "openai", "elevenlabs"]).optional().default(DEFAULT_PROVIDER)
 });
 class ConfigError extends Error {
   constructor(configPath, details) {
@@ -302,6 +303,26 @@ function loadConfig(cwd) {
   logger.info({ key: display, provider: parsed.provider, configPath }, "Loaded config");
   return { key: binding, keyDisplay: display, provider: parsed.provider };
 }
+let geminiClient = null;
+function getGeminiClient() {
+  if (geminiClient) return geminiClient;
+  const forceVertexOff = process.env.GOOGLE_GENAI_USE_VERTEXAI === "false";
+  const project = process.env.GOOGLE_CLOUD_PROJECT;
+  const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
+  const apiKey = process.env.GEMINI_API_KEY ?? process.env.GOOGLE_API_KEY;
+  if (project && !forceVertexOff) {
+    logger.info({ project, location }, "Initializing Gemini client (Vertex AI)");
+    geminiClient = new GoogleGenAI({ vertexai: true, project, location });
+  } else if (apiKey) {
+    logger.info("Initializing Gemini client (API key)");
+    geminiClient = new GoogleGenAI({ apiKey });
+  } else {
+    throw new Error(
+      "Gemini provider requires either GOOGLE_CLOUD_PROJECT (for Vertex AI) or GEMINI_API_KEY / GOOGLE_API_KEY (for Gemini API)."
+    );
+  }
+  return geminiClient;
+}
 const DEFAULT_MODEL = "medium-q5_0";
 const HF_BASE_URL = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
 function modelCacheDir() {
@@ -393,17 +414,6 @@ async function resolveModelPath() {
   await downloadModel(model, destPath);
   return destPath;
 }
-let geminiClient$1 = null;
-function getGeminiClient$1() {
-  if (geminiClient$1) return geminiClient$1;
-  const project = process.env.GOOGLE_CLOUD_PROJECT;
-  const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
-  if (!project) {
-    throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
-  }
-  geminiClient$1 = new GoogleGenAI({ vertexai: true, project, location });
-  return geminiClient$1;
-}
 let openaiClient$1 = null;
 function getOpenAIClient$1() {
   if (openaiClient$1) return openaiClient$1;
@@ -429,7 +439,7 @@ async function getWhisperInstance() {
   return whisperInitPromise;
 }
 async function transcribeGemini(audioBuffer) {
-  const client = getGeminiClient$1();
+  const client = getGeminiClient();
   const base64Audio = audioBuffer.toString("base64");
   const response = await client.models.generateContent({
     model: "gemini-2.5-flash",
@@ -461,6 +471,34 @@ async function transcribeOpenAI(audioBuffer) {
   });
   return transcription.text?.trim() ?? "";
 }
+let elevenlabsClient$1 = null;
+function getElevenLabsClient$1() {
+  if (elevenlabsClient$1) return elevenlabsClient$1;
+  const apiKey = process.env.ELEVENLABS_API_KEY;
+  if (!apiKey) {
+    throw new Error("ELEVENLABS_API_KEY environment variable is required");
+  }
+  elevenlabsClient$1 = new ElevenLabsClient({ apiKey });
+  return elevenlabsClient$1;
+}
+async function transcribeElevenLabs(audioBuffer) {
+  const client = getElevenLabsClient$1();
+  const result = await client.speechToText.convert({
+    file: {
+      data: audioBuffer,
+      filename: "recording.webm",
+      contentType: "audio/webm"
+    },
+    modelId: "scribe_v2"
+  });
+  if ("text" in result) {
+    return (result.text ?? "").trim();
+  }
+  if ("transcripts" in result && result.transcripts?.[0]) {
+    return (result.transcripts[0].text ?? "").trim();
+  }
+  return "";
+}
 async function transcribeLocal(samples) {
   const whisper = await getWhisperInstance();
   const params = new WhisperFullParams(WhisperSamplingStrategy.Greedy);
@@ -483,6 +521,9 @@ async function transcribe(audioData, provider = "local") {
     case "openai":
       text = await transcribeOpenAI(Buffer.from(audioData));
       break;
+    case "elevenlabs":
+      text = await transcribeElevenLabs(Buffer.from(audioData));
+      break;
     case "gemini":
     default:
       text = await transcribeGemini(Buffer.from(audioData));
@@ -491,17 +532,6 @@ async function transcribe(audioData, provider = "local") {
   logger.info({ provider, text }, "Transcribed");
   return text;
 }
-let geminiClient = null;
-function getGeminiClient() {
-  if (geminiClient) return geminiClient;
-  const project = process.env.GOOGLE_CLOUD_PROJECT;
-  const location = process.env.GOOGLE_CLOUD_LOCATION ?? "us-central1";
-  if (!project) {
-    throw new Error("GOOGLE_CLOUD_PROJECT environment variable is required");
-  }
-  geminiClient = new GoogleGenAI({ vertexai: true, project, location });
-  return geminiClient;
-}
 let openaiClient = null;
 function getOpenAIClient() {
   if (openaiClient) return openaiClient;
@@ -515,7 +545,7 @@ function getOpenAIClient() {
 const TTS_SAMPLE_RATE = 24e3;
 const TTS_CHANNELS = 1;
 const TTS_BITS_PER_SAMPLE = 16;
-const OPENAI_PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
+const PCM_CHUNK_SIZE = TTS_SAMPLE_RATE * (TTS_BITS_PER_SAMPLE / 8) * TTS_CHANNELS * 0.1;
 async function* synthesizeStreamGemini(text) {
   const client = getGeminiClient();
   const response = await client.models.generateContentStream({
@@ -585,7 +615,7 @@ async function* synthesizeStreamOpenAI(text) {
   let totalBytes = 0;
   let offset = 0;
   while (offset < fullBuffer.length) {
-    const end = Math.min(offset + OPENAI_PCM_CHUNK_SIZE, fullBuffer.length);
+    const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
     const chunk = fullBuffer.subarray(offset, end);
     totalBytes += chunk.length;
     yield chunk;
@@ -596,6 +626,48 @@ async function* synthesizeStreamOpenAI(text) {
     "Streamed PCM audio"
   );
 }
+let elevenlabsClient = null;
+function getElevenLabsClient() {
+  if (elevenlabsClient) return elevenlabsClient;
+  const apiKey = process.env.ELEVENLABS_API_KEY;
+  if (!apiKey) {
+    throw new Error("ELEVENLABS_API_KEY environment variable is required");
+  }
+  elevenlabsClient = new ElevenLabsClient({ apiKey });
+  return elevenlabsClient;
+}
+const DEFAULT_ELEVENLABS_VOICE_ID = "CwhRBWXzGAHq8TQ4Fs17";
+async function* synthesizeStreamElevenLabs(text) {
+  const client = getElevenLabsClient();
+  const voiceId = process.env.ELEVENLABS_VOICE_ID ?? DEFAULT_ELEVENLABS_VOICE_ID;
+  const modelId = process.env.ELEVENLABS_TTS_MODEL ?? "eleven_flash_v2_5";
+  const audio = await client.textToSpeech.convert(voiceId, {
+    text,
+    modelId,
+    outputFormat: "pcm_24000"
+  });
+  const chunks = [];
+  const reader = audio.getReader();
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    chunks.push(value);
+  }
+  const fullBuffer = Buffer.concat(chunks);
+  let totalBytes = 0;
+  let offset = 0;
+  while (offset < fullBuffer.length) {
+    const end = Math.min(offset + PCM_CHUNK_SIZE, fullBuffer.length);
+    const chunk = fullBuffer.subarray(offset, end);
+    totalBytes += chunk.length;
+    yield chunk;
+    offset = end;
+  }
+  logger.info(
+    { provider: "elevenlabs", totalBytes, text: text.substring(0, 50) },
+    "Streamed PCM audio"
+  );
+}
 function speakLocal(text) {
   return new Promise((resolve, reject) => {
     if (process.platform !== "darwin") {
@@ -634,6 +706,9 @@ async function* synthesizeStream(text, provider = "local") {
     case "openai":
       yield* synthesizeStreamOpenAI(text);
       break;
+    case "elevenlabs":
+      yield* synthesizeStreamElevenLabs(text);
+      break;
     case "gemini":
     default:
       yield* synthesizeStreamGemini(text);

package/package.json CHANGED Viewed

@@ -1,9 +1,13 @@
 {
   "name": "pi-voice",
-  "version": "0.2.0",
+  "version": "0.4.0",
   "description": "Voice interface for pi coding agent",
   "author": "Yuku Kotani",
   "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/yukukotani/pi-voice"
+  },
   "type": "module",
   "main": "./out/main/index.js",
   "bin": {
@@ -32,6 +36,7 @@
     "typescript": "^5"
   },
   "dependencies": {
+    "@elevenlabs/elevenlabs-js": "^2.35.0",
     "@google/genai": "^1.40.0",
     "@mariozechner/pi-coding-agent": "^0.52.7",
     "@napi-rs/whisper": "^0.0.4",