@acpfx/stt-elevenlabs 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,21 @@
1
+ # @acpfx/stt-elevenlabs
2
+
3
+ ## 0.2.0
4
+
5
+ ### Minor Changes
6
+
7
+ - d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
8
+
9
+ - Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
10
+ - Node manifests (manifest.yaml) declaring consumes/emits contracts
11
+ - Orchestrator event filtering: nodes only receive declared events
12
+ - Rust orchestrator with ratatui TUI (--ui flag)
13
+ - node-sdk with structured logging helpers
14
+ - CI/CD with GitHub Actions and changesets
15
+ - Platform-specific npm packages for Rust binaries (esbuild-style distribution)
16
+
17
+ ### Patch Changes
18
+
19
+ - Updated dependencies [d757640]
20
+ - @acpfx/core@0.2.0
21
+ - @acpfx/node-sdk@0.2.0
package/manifest.yaml ADDED
@@ -0,0 +1,12 @@
1
+ name: stt-elevenlabs
2
+ description: Speech-to-text via ElevenLabs streaming API
3
+ consumes:
4
+ - audio.chunk
5
+ emits:
6
+ - speech.partial
7
+ - speech.delta
8
+ - speech.final
9
+ - speech.pause
10
+ - lifecycle.ready
11
+ - lifecycle.done
12
+ - control.error
package/package.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "@acpfx/stt-elevenlabs",
3
+ "version": "0.2.0",
4
+ "type": "module",
5
+ "bin": {
6
+ "acpfx-stt-elevenlabs": "./dist/index.js"
7
+ },
8
+ "main": "./dist/index.js",
9
+ "dependencies": {
10
+ "@acpfx/core": "0.2.0",
11
+ "@acpfx/node-sdk": "0.2.0"
12
+ },
13
+ "scripts": {
14
+ "build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external --banner:js='#!/usr/bin/env node'"
15
+ }
16
+ }
package/src/index.ts ADDED
@@ -0,0 +1,273 @@
1
/**
 * stt-elevenlabs node — ElevenLabs Scribe v2 Realtime STT with built-in VAD.
 *
 * Reads audio.chunk events from stdin, streams to ElevenLabs WebSocket,
 * emits speech.partial, speech.delta, speech.final, and speech.pause events.
 *
 * Uses commit_strategy=vad so ElevenLabs handles pause detection server-side.
 *
 * Settings (via ACPFX_SETTINGS):
 *   language?: string — language code (default: "en")
 *   apiKey?: string — ElevenLabs API key (falls back to ELEVENLABS_API_KEY env)
 *   pauseMs?: number — VAD silence threshold hint (default: 600)
 *   vadThreshold?: number — 0-1, higher = less sensitive (default: 0.5)
 *   minSpeechDurationMs?: number — ignore shorter noise bursts (default: 250)
 *   minSilenceDurationMs?: number — minimum silence duration (default: 100)
 */

import { emit, log, onEvent, handleManifestFlag } from "@acpfx/node-sdk";

// Presumably prints manifest.yaml and exits when the manifest flag is passed;
// implementation lives in node-sdk — NOTE(review): confirm flag name/behavior.
handleManifestFlag();

const WS_URL = "wss://api.elevenlabs.io/v1/speech-to-text/realtime";
const MODEL = "scribe_v2_realtime";

// Shape of the ACPFX_SETTINGS JSON blob; every field is optional.
type Settings = {
  language?: string;
  apiKey?: string;
  pauseMs?: number;
  vadThreshold?: number; // 0-1, default 0.5 (higher = less sensitive)
  minSpeechDurationMs?: number; // default 250 (ignore short noise bursts)
  minSilenceDurationMs?: number; // default 100
};

// NOTE(review): JSON.parse output is trusted without validation; a malformed
// ACPFX_SETTINGS value throws here and kills the process at startup.
const settings: Settings = JSON.parse(process.env.ACPFX_SETTINGS || "{}");
const LANGUAGE = settings.language ?? "en";
const API_KEY = settings.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "";
const TRACK_ID = "stt"; // single fixed track id for all emitted speech events

// Fail fast: nothing below works without credentials.
if (!API_KEY) {
  log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
  process.exit(1);
}

// --- Mutable connection/transcription state (single event-loop thread) ---
let ws: WebSocket | null = null; // live socket, or null after teardown
let connected = false; // true between the "open" and "close" events
let reconnecting = false; // a reconnect attempt is currently in flight
let interrupted = false; // while set, incoming transcripts are dropped
let lastPartialText = ""; // previous partial, used to detect corrections
let accumulatedText = ""; // committed finals joined since the last pause
let partialStaleTimer: ReturnType<typeof setTimeout> | null = null;
const PARTIAL_STALE_MS = 3000; // force a commit if a partial stalls this long
51
/**
 * Open the ElevenLabs realtime STT WebSocket and install all handlers.
 *
 * Resolves once the socket's "open" event fires; rejects if the first
 * "error" event arrives before that. After connecting, installs persistent
 * message/error/close handlers. Reconnection itself is NOT done here — the
 * "close" handler only flips `connected`, and main() reconnects lazily on
 * the next audio chunk.
 */
async function connectWebSocket(): Promise<void> {
  // VAD tuning goes in the query string; the API takes the silence threshold
  // in seconds while settings.pauseMs is milliseconds.
  const vadSilenceSecs = (settings.pauseMs ?? 600) / 1000;
  const vadThreshold = settings.vadThreshold ?? 0.5;
  const minSpeechMs = settings.minSpeechDurationMs ?? 250;
  const minSilenceMs = settings.minSilenceDurationMs ?? 100;
  const url =
    `${WS_URL}?model_id=${MODEL}` +
    `&language_code=${encodeURIComponent(LANGUAGE)}` +
    `&sample_rate=16000` +
    `&encoding=pcm_s16le` +
    `&commit_strategy=vad` +
    `&vad_silence_threshold_secs=${vadSilenceSecs}` +
    `&vad_threshold=${vadThreshold}` +
    `&min_speech_duration_ms=${minSpeechMs}` +
    `&min_silence_duration_ms=${minSilenceMs}`;

  // NOTE(review): the standard WebSocket constructor signature is
  // (url, protocols); the double cast smuggles a { headers } options object
  // past the type checker. Runtimes that accept an options bag (e.g. Bun)
  // honor the header; Node's built-in (undici) WebSocket does not — confirm
  // the intended runtime before changing this.
  ws = new WebSocket(url, {
    headers: { "xi-api-key": API_KEY },
  } as unknown as string[]);

  // Settle on the first "open" or first "error"; both listeners are
  // { once: true } so a later event cannot re-settle the promise.
  await new Promise<void>((resolve, reject) => {
    ws!.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to ElevenLabs STT");
        resolve();
      },
      { once: true },
    );

    ws!.addEventListener(
      "error",
      () => {
        reject(new Error("WebSocket connection failed"));
      },
      { once: true },
    );
  });

  // Server frames may be text or binary; both are decoded as UTF-8 JSON.
  // Unparseable frames are dropped on purpose.
  ws.addEventListener("message", (event: MessageEvent) => {
    try {
      const data =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(event.data as ArrayBuffer).toString("utf-8");
      const msg = JSON.parse(data);
      handleServerMessage(msg);
    } catch {
      // ignore parse errors
    }
  });

  // Post-connect socket errors are reported as non-fatal control.error events.
  ws.addEventListener("error", (event: Event) => {
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-elevenlabs",
      message: "WebSocket error",
      fatal: false,
    });
  });

  // Lazy reconnect: just mark disconnected; main() reconnects on next audio.
  ws.addEventListener("close", () => {
    connected = false;
    log.info("WebSocket closed — will reconnect on next audio");
  });
}
119
+
120
+ function handleServerMessage(msg: Record<string, unknown>): void {
121
+ const msgType = msg.message_type as string;
122
+
123
+ if (interrupted) return;
124
+
125
+ if (msgType === "partial_transcript") {
126
+ const text = (msg.text as string) ?? "";
127
+ if (!text) return;
128
+
129
+ // Check if this is a correction of a previous partial
130
+ if (lastPartialText && text !== lastPartialText && !text.startsWith(lastPartialText)) {
131
+ // This is a correction — emit speech.delta with replaces
132
+ emit({
133
+ type: "speech.delta",
134
+ trackId: TRACK_ID,
135
+ text,
136
+ replaces: lastPartialText,
137
+ });
138
+ } else {
139
+ emit({
140
+ type: "speech.partial",
141
+ trackId: TRACK_ID,
142
+ text,
143
+ });
144
+ }
145
+ lastPartialText = text;
146
+
147
+ // If partial never gets committed, force a commit after timeout.
148
+ // Continuous audio stream means the API may never see "end of speech."
149
+ if (partialStaleTimer) clearTimeout(partialStaleTimer);
150
+ partialStaleTimer = setTimeout(() => {
151
+ if (lastPartialText && !interrupted && ws && connected) {
152
+ log.info(`Stale partial: forcing commit`);
153
+ ws.send(JSON.stringify({
154
+ message_type: "input_audio_chunk",
155
+ audio_base_64: "",
156
+ commit: true,
157
+ sample_rate: 16000,
158
+ }));
159
+ }
160
+ partialStaleTimer = null;
161
+ }, PARTIAL_STALE_MS);
162
+ } else if (
163
+ msgType === "committed_transcript" ||
164
+ msgType === "committed_transcript_with_timestamps"
165
+ ) {
166
+ const text = (msg.text as string) ?? "";
167
+ if (!text) return;
168
+ // Clear stale timer — proper commit arrived
169
+ if (partialStaleTimer) { clearTimeout(partialStaleTimer); partialStaleTimer = null; }
170
+ lastPartialText = "";
171
+
172
+ // Emit speech.final
173
+ emit({
174
+ type: "speech.final",
175
+ trackId: TRACK_ID,
176
+ text,
177
+ });
178
+
179
+ accumulatedText = accumulatedText ? `${accumulatedText} ${text}` : text;
180
+
181
+ // When using VAD commit_strategy, a committed_transcript means
182
+ // ElevenLabs detected a pause. Emit speech.pause.
183
+ emit({
184
+ type: "speech.pause",
185
+ trackId: TRACK_ID,
186
+ pendingText: accumulatedText,
187
+ silenceMs: settings.pauseMs ?? 600,
188
+ });
189
+
190
+ // Reset for next utterance
191
+ lastPartialText = "";
192
+ accumulatedText = "";
193
+ } else if (msgType === "auth_error" || msgType === "error") {
194
+ const errMsg =
195
+ (msg.message as string) ?? (msg.error as string) ?? msgType;
196
+ log.error(`Server error: ${errMsg}`);
197
+ emit({
198
+ type: "control.error",
199
+ component: "stt-elevenlabs",
200
+ message: errMsg,
201
+ fatal: msgType === "auth_error",
202
+ });
203
+ }
204
+ }
205
+
206
+ function sendAudio(base64Data: string): void {
207
+ if (!ws || !connected) return;
208
+ ws.send(
209
+ JSON.stringify({
210
+ message_type: "input_audio_chunk",
211
+ audio_base_64: base64Data,
212
+ commit: false,
213
+ sample_rate: 16000,
214
+ }),
215
+ );
216
+ }
217
+
218
+ function closeWebSocket(): void {
219
+ connected = false;
220
+ if (ws) {
221
+ try {
222
+ ws.close();
223
+ } catch {
224
+ // ignore
225
+ }
226
+ ws = null;
227
+ }
228
+ }
229
+
230
+ // --- Main ---
231
+
232
+ async function main(): Promise<void> {
233
+ await connectWebSocket();
234
+
235
+ // Emit lifecycle.ready after WS is connected
236
+ emit({ type: "lifecycle.ready", component: "stt-elevenlabs" });
237
+
238
+ const rl = onEvent((event) => {
239
+ if (event.type === "audio.chunk") {
240
+ if (!connected && !reconnecting) {
241
+ reconnecting = true;
242
+ interrupted = false;
243
+ log.info("Reconnecting...");
244
+ connectWebSocket().then(() => {
245
+ reconnecting = false;
246
+ sendAudio(event.data as string);
247
+ }).catch(() => {
248
+ reconnecting = false;
249
+ });
250
+ } else if (connected && !interrupted) {
251
+ sendAudio(event.data as string);
252
+ }
253
+ } else if (event.type === "control.interrupt") {
254
+ // Don't close WebSocket — STT should keep listening for barge-in.
255
+ }
256
+ });
257
+
258
+ rl.on("close", () => {
259
+ closeWebSocket();
260
+ emit({ type: "lifecycle.done", component: "stt-elevenlabs" });
261
+ process.exit(0);
262
+ });
263
+
264
+ process.on("SIGTERM", () => {
265
+ closeWebSocket();
266
+ process.exit(0);
267
+ });
268
+ }
269
+
270
+ main().catch((err) => {
271
+ log.error(`Fatal: ${err.message}`);
272
+ process.exit(1);
273
+ });