npm - @cheeko-ai/esp32-voice - Versions diffs - 2026.2.21 - Mend

@cheeko-ai/esp32-voice 2026.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/NPM_PUBLISH_READINESS.md +299 -0
package/README.md +226 -0
package/TODO.md +418 -0
package/index.ts +128 -0
package/openclaw.plugin.json +9 -0
package/package.json +62 -0
package/src/accounts.ts +110 -0
package/src/channel.ts +270 -0
package/src/config-schema.ts +37 -0
package/src/device/device-otp.ts +173 -0
package/src/http-handler.ts +154 -0
package/src/monitor.ts +124 -0
package/src/onboarding.ts +575 -0
package/src/runtime.ts +14 -0
package/src/stt/deepgram.ts +215 -0
package/src/stt/stt-provider.ts +107 -0
package/src/stt/stt-registry.ts +71 -0
package/src/tts/elevenlabs.ts +215 -0
package/src/tts/tts-provider.ts +111 -0
package/src/tts/tts-registry.ts +71 -0
package/src/types.ts +136 -0
package/src/voice/voice-endpoint.ts +296 -0
package/src/voice/voice-session.ts +1041 -0

package/src/tts/tts-provider.ts ADDED Viewed

@@ -0,0 +1,111 @@
+/**
+ * Text-to-Speech (TTS) provider interface.
+ *
+ * All TTS providers must implement this interface. Providers can be
+ * streaming (WebSocket-based, receiving audio chunks in real time) or
+ * batch (send complete text, receive full audio).
+ *
+ * Modeled after OpenClaw's multi-provider architecture — new providers
+ * can be added by implementing this interface and registering with
+ * the TTS registry.
+ */
+export type TtsAudioCallback = (pcmChunk: Buffer) => void | Promise<void>; // Receives one chunk of synthesized PCM audio; may be sync or return a promise.
+export type TtsDoneCallback = () => void | Promise<void>; // Takes no arguments; may be sync or return a promise.
+export interface TtsProviderConfig { // Settings passed to a TtsProviderFactory when creating a provider instance.
+  /** Provider-specific API key. The only required field. */
+  apiKey: string;
+  /** Voice identifier (e.g., ElevenLabs voice ID). */
+  voiceId?: string;
+  /** Model identifier. */
+  model?: string;
+  /** Language code. */
+  language?: string;
+  /** Output sample rate in Hz (default: 24000). */
+  sampleRate?: number;
+  /** Additional provider-specific options; values are untyped and must be narrowed by the provider. */
+  options?: Record<string, unknown>;
+}
+export interface TtsProvider { // Contract every TTS backend implements: connect → synthesize (one or more times) → flush → close.
+  /** Unique provider identifier (e.g., "elevenlabs", "google", "edge-tts"). */
+  readonly id: string;
+  /** Human-readable provider name. */
+  readonly name: string;
+  /** Whether this provider supports real-time streaming. */
+  readonly streaming: boolean;
+  /** Output sample rate in Hz. */
+  readonly outputSampleRate: number;
+  /**
+   * Called when PCM audio data is received from the TTS service.
+   * Audio format: 16-bit signed LE PCM, mono, at `outputSampleRate` Hz.
+   * Set this before calling `connect()`.
+   * May be `null` when no consumer is attached.
+   */
+  onAudio: TtsAudioCallback | null;
+  /**
+   * Called when the TTS synthesis is complete (all audio sent).
+   * Set this before calling `connect()`.
+   * May be `null` when no consumer is attached.
+   */
+  onDone: TtsDoneCallback | null;
+  /**
+   * Open a connection to the TTS service.
+   * For streaming providers, this opens a WebSocket.
+   */
+  connect(): Promise<void>;
+  /**
+   * Send text to be synthesized into speech.
+   * For streaming providers: can be called multiple times for partial text.
+   * For batch providers: should be called once with the full text.
+   *
+   * @param text - The text to synthesize.
+   */
+  synthesize(text: string): Promise<void>;
+  /**
+   * Signal end of text input and wait for all audio to be delivered.
+   * After this resolves, all audio has been sent via `onAudio`.
+   */
+  flush(): Promise<void>;
+  /**
+   * Close the connection and release resources.
+   */
+  close(): Promise<void>;
+}
+/**
+ * Factory function type for creating TTS provider instances.
+ *
+ * Takes a `TtsProviderConfig` and synchronously returns a `TtsProvider`.
+ * The TTS registry stores one factory per provider and calls it from `create()`.
+ */
+export type TtsProviderFactory = (config: TtsProviderConfig) => TtsProvider;
+/**
+ * Metadata about a registered TTS provider.
+ *
+ * Supplied to the registry's `register()` together with the factory and
+ * returned unchanged by its `getMeta()` and `list()` methods.
+ */
+export interface TtsProviderMeta {
+  /** Provider ID. Used as the registry key. */
+  id: string;
+  /** Human-readable name. */
+  name: string;
+  /** Short description. */
+  description: string;
+  /** Whether it supports streaming. */
+  streaming: boolean;
+  /** Required environment variable for the API key. */
+  envVar: string;
+  /** Default voice ID. */
+  defaultVoiceId?: string;
+  /** Default model. */
+  defaultModel?: string;
+  /** Output sample rate. */
+  outputSampleRate: number;
+  /** Documentation URL. */
+  docsUrl?: string;
+}

package/src/tts/tts-registry.ts ADDED Viewed

@@ -0,0 +1,71 @@
+/**
+ * TTS Provider Registry.
+ *
+ * Central registry for text-to-speech providers. Providers register
+ * themselves with a factory function, and the voice session creates
+ * instances as needed.
+ *
+ * Usage:
+ *   ttsRegistry.register(elevenlabsMeta, createElevenLabsTts);
+ *   const provider = ttsRegistry.create("elevenlabs", { apiKey: "..." });
+ */
+import type { TtsProvider, TtsProviderConfig, TtsProviderFactory, TtsProviderMeta } from "./tts-provider.js";
+interface RegisteredTtsProvider { // One registry entry: a provider's metadata paired with the factory that builds it.
+  meta: TtsProviderMeta;
+  factory: TtsProviderFactory;
+}
+/**
+ * In-memory lookup table of TTS providers, keyed by provider ID.
+ */
+class TtsRegistry {
+  private providers = new Map<string, RegisteredTtsProvider>();
+  /**
+   * Add a provider to the registry.
+   * Registering an ID that already exists logs a warning and replaces the entry.
+   *
+   * @param meta - Descriptive metadata; `meta.id` becomes the lookup key.
+   * @param factory - Function used later by `create()` to build instances.
+   */
+  register(meta: TtsProviderMeta, factory: TtsProviderFactory): void {
+    const isDuplicate = this.providers.has(meta.id);
+    if (isDuplicate) {
+      console.warn(`[tts-registry] Provider "${meta.id}" is already registered, overwriting.`);
+    }
+    const entry: RegisteredTtsProvider = { meta, factory };
+    this.providers.set(meta.id, entry);
+    console.log(`[tts-registry] Registered TTS provider: ${meta.name} (${meta.id})`);
+  }
+  /**
+   * Build a provider instance by invoking the registered factory.
+   *
+   * @param providerId - ID the provider was registered under.
+   * @param config - Configuration forwarded to the factory unchanged.
+   * @returns Whatever the factory returns.
+   * @throws Error listing the available IDs when `providerId` is unknown.
+   */
+  create(providerId: string, config: TtsProviderConfig): TtsProvider {
+    const entry = this.providers.get(providerId);
+    if (entry) {
+      return entry.factory(config);
+    }
+    const available = Array.from(this.providers.keys()).join(", ");
+    throw new Error(
+      `TTS provider "${providerId}" not found. Available: ${available || "none"}`,
+    );
+  }
+  /**
+   * Look up the metadata of a provider.
+   *
+   * @returns The metadata, or `undefined` when the ID is not registered.
+   */
+  getMeta(providerId: string): TtsProviderMeta | undefined {
+    const entry = this.providers.get(providerId);
+    return entry === undefined ? undefined : entry.meta;
+  }
+  /**
+   * Metadata of every registered provider, in registration order.
+   */
+  list(): TtsProviderMeta[] {
+    return Array.from(this.providers.values(), (entry) => entry.meta);
+  }
+  /**
+   * Whether a provider with the given ID has been registered.
+   */
+  has(providerId: string): boolean {
+    return this.providers.has(providerId);
+  }
+}
+/**
+ * Global TTS provider registry.
+ *
+ * A single module-level instance: every importer of this module shares it,
+ * so a provider registered anywhere is visible to all callers of `create()`.
+ */
+export const ttsRegistry = new TtsRegistry();

package/src/types.ts ADDED Viewed

@@ -0,0 +1,136 @@
+import type { DmPolicy } from "openclaw/plugin-sdk";
+// ── STT / TTS Provider Config ─────────────────────────────────
+/**
+ * Speech-to-text provider ID: one of the well-known names, or any other string
+ * for a custom provider.
+ *
+ * The open-ended arm is written as `string & Record<never, never>` rather than
+ * plain `string`. A bare `| string` makes the compiler collapse the whole union
+ * to `string`, which discards the listed names (no completion, no documentation
+ * value). The intersection still accepts every string and is assignable to
+ * `string`, so existing callers are unaffected.
+ */
+export type SttProviderChoice = "deepgram" | "google" | "whisper" | "azure" | (string & Record<never, never>);
+/**
+ * Text-to-speech provider ID: one of the well-known names, or any other string
+ * for a custom provider. Uses the same `string & Record<never, never>` form as
+ * `SttProviderChoice` so the listed names survive in the type.
+ */
+export type TtsProviderChoice = "elevenlabs" | "google" | "edge-tts" | "azure" | (string & Record<never, never>);
+// ── Device / Account Config ───────────────────────────────────
+/**
+ * Configuration for a single ESP32 Voice device account.
+ *
+ * Every field is optional. The defaults quoted on individual fields describe
+ * the intended fallback; they are not applied by this type itself.
+ */
+export type Esp32VoiceAccountConfig = {
+  /** Optional human-readable name for this device. */
+  name?: string;
+  /** If false, do not accept messages from this device. Default: true. */
+  enabled?: boolean;
+  // ── Authentication ──
+  /**
+   * Shared secret token for device authentication.
+   * Generated during OTP pairing or set manually.
+   */
+  deviceToken?: string;
+  /** Unique device identifier. */
+  deviceId?: string;
+  // ── Security ──
+  /** DM security policy. Default: "pairing". */
+  dmPolicy?: DmPolicy;
+  /** Allowlist of device IDs allowed to communicate. */
+  allowFrom?: string[];
+  // ── STT Configuration ──
+  /** Speech-to-text provider ID. Default: "deepgram". */
+  sttProvider?: SttProviderChoice;
+  /** STT API key (overrides env var). */
+  sttApiKey?: string;
+  /** STT model (e.g., "nova-2" for Deepgram). */
+  sttModel?: string;
+  // ── TTS Configuration ──
+  /** Text-to-speech provider ID. Default: "elevenlabs". */
+  ttsProvider?: TtsProviderChoice;
+  /** TTS API key (overrides env var). */
+  ttsApiKey?: string;
+  /** TTS voice ID. */
+  ttsVoiceId?: string;
+  /** TTS model ID. */
+  ttsModel?: string;
+  // ── Voice Pipeline ──
+  /** Max response length in characters. Default: 500. */
+  maxResponseLength?: number;
+  /** Whether to optimize prompts for voice output. Default: true. */
+  voiceOptimized?: boolean;
+  /** Language code (ISO 639-1). Default: "en". */
+  language?: string;
+};
+/**
+ * Top-level ESP32 Voice channel configuration.
+ * Supports single-device (flat) and multi-device (accounts map) modes.
+ *
+ * The type intersects with `Esp32VoiceAccountConfig`, so in flat mode the
+ * per-device fields sit directly on this object; in multi-device mode each
+ * entry of `accounts` carries its own `Esp32VoiceAccountConfig`.
+ */
+export type Esp32VoiceConfig = {
+  /** Per-device configuration (multi-device mode). Keys are presumably account IDs — confirm against the account resolver. */
+  accounts?: Record<string, Esp32VoiceAccountConfig>;
+} & Esp32VoiceAccountConfig;
+/**
+ * Resolved account with computed defaults and source tracking.
+ *
+ * Compared with `Esp32VoiceAccountConfig`, the fields `enabled`, `sttProvider`,
+ * `ttsProvider`, `maxResponseLength`, `voiceOptimized` and `language` are
+ * required here, i.e. a value is always present after resolution.
+ */
+export type ResolvedEsp32VoiceAccount = {
+  accountId: string;
+  name?: string;
+  enabled: boolean;
+  deviceToken?: string;
+  deviceTokenSource: "config" | "env" | "none"; // presumably where `deviceToken` was read from (config, environment, or not set) — confirm in the resolver
+  deviceId?: string;
+  sttProvider: string;
+  sttApiKey?: string;
+  sttModel?: string;
+  ttsProvider: string;
+  ttsApiKey?: string;
+  ttsVoiceId?: string;
+  ttsModel?: string;
+  maxResponseLength: number;
+  voiceOptimized: boolean;
+  language: string;
+  config: Esp32VoiceAccountConfig; // presumably the raw account config the resolved values were derived from — TODO confirm
+};
+// ── Voice Protocol Messages ───────────────────────────────────
+/**
+ * Hello message sent by the ESP32 during WebSocket handshake.
+ * Contains OpenClaw credentials, STT/TTS provider config, and optional OTP.
+ *
+ * Several fields are secrets (`otp`, `openclaw.token`, `stt.apiKey`,
+ * `tts.apiKey`); avoid writing this message to logs verbatim.
+ * Only `type` is required; everything else is optional.
+ */
+export type Esp32VoiceHelloMessage = {
+  type: "hello";
+  /** Device identifier. */
+  deviceId?: string;
+  /** Device firmware version. */
+  version?: number;
+  /** Transport type (always "websocket" for ESP32). */
+  transport?: string;
+  /** Audio parameters. */
+  audio_params?: {
+    format: string;
+    sample_rate: number;
+    channels: number;
+    frame_duration?: number;
+  };
+  /** OTP code for initial pairing. */
+  otp?: string;
+  /** OpenClaw Gateway credentials. */
+  openclaw?: {
+    url: string;
+    token: string;
+  };
+  /** STT provider overrides. */
+  stt?: {
+    provider?: string;
+    apiKey?: string;
+    model?: string;
+  };
+  /** TTS provider overrides. */
+  tts?: {
+    provider?: string;
+    apiKey?: string;
+    voiceId?: string;
+    model?: string;
+  };
+  /** Language code. */
+  language?: string;
+};

package/src/voice/voice-endpoint.ts ADDED Viewed

@@ -0,0 +1,296 @@
+/**
+ * WebSocket endpoint for voice streaming.
+ *
+ * Ported from cheekoclaw_bridge/voice_endpoint.py
+ *
+ * Runs a **standalone** HTTP + WebSocket server on its own port (default 8765)
+ * because the OpenClaw Gateway plugin API does not support WebSocket upgrade
+ * registration. The ESP32 connects directly to this server.
+ *
+ * Routes any WebSocket connection to a VoiceSession per client.
+ */
+import crypto from "node:crypto";
+import { networkInterfaces, homedir } from "node:os";
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
+import { join } from "node:path";
+import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
+import type { Duplex } from "node:stream";
+import { WebSocketServer, WebSocket } from "ws";
+import { VoiceSession } from "./voice-session.js";
+// Import providers to trigger auto-registration
+import "../stt/deepgram.js";
+import "../tts/elevenlabs.js";
+/**
+ * Default port for the standalone voice WebSocket server.
+ *
+ * Taken from the ESP32_VOICE_PORT environment variable when it parses to an
+ * integer in the valid TCP range (0–65535). An unset, empty, non-numeric or
+ * out-of-range value falls back to 8765 instead of producing NaN or an
+ * invalid port that would only fail later when the server tries to listen.
+ */
+const DEFAULT_VOICE_PORT = ((): number => {
+  const fallbackPort = 8765;
+  const parsed = Number.parseInt(process.env.ESP32_VOICE_PORT || String(fallbackPort), 10);
+  const isValidPort = Number.isInteger(parsed) && parsed >= 0 && parsed <= 65535;
+  return isValidPort ? parsed : fallbackPort;
+})();
+// ── Cheeko Dashboard Pairing (optional) ──────────────────────────────────────
+// If CHEEKO_PAIR env var is set, the plugin registers itself with the Cheeko
+// dashboard on startup so the user does not have to manually type their URL.
+// Set CHEEKO_DASHBOARD_URL to point at your dashboard (default: http://64.227.170.31:8001).
+//
+// Usage:
+//   CHEEKO_PAIR=XK9-2M4 openclaw gateway
+//
+// The dashboard generates the token (GET /user/openclaw-pair/generate) and
+// displays the command for the user to copy and run. Once the plugin calls home,
+// the dashboard stores the voice URL and advances the onboarding flow.
+/**
+ * Pick the IPv4 address other devices on the LAN should use to reach this machine.
+ *
+ * Resolution order:
+ *   1. The MAC_IP environment variable, when non-empty (manual override).
+ *   2. The first external IPv4 address on en0, en1, eth0, wlan0 or wlo1, in that order.
+ *   3. The first external IPv4 address on any interface.
+ *   4. "127.0.0.1" when nothing else is found.
+ *
+ * Exported so the onboarding wizard can use the same detection logic.
+ *
+ * @returns The chosen IPv4 address as a string.
+ */
+export function detectLocalIp(): string {
+  const manualOverride = process.env.MAC_IP;
+  if (manualOverride) return manualOverride;
+  const interfaces = networkInterfaces();
+  // macOS WiFi, macOS Ethernet, Linux eth, Linux wlan — checked before everything else.
+  const preferredNames = ["en0", "en1", "eth0", "wlan0", "wlo1"];
+  // One ordered list of address groups: the preferred interfaces first, then all of them.
+  const searchOrder = [
+    ...preferredNames.map((ifaceName) => interfaces[ifaceName]),
+    ...Object.values(interfaces),
+  ];
+  for (const group of searchOrder) {
+    const match = group?.find((entry) => entry.family === "IPv4" && !entry.internal);
+    if (match) return match.address;
+  }
+  return "127.0.0.1";
+}
+/**
+ * Dashboard registration overview (implemented by `registerWithCheekoDashboard` below).
+ *
+ * Registers this plugin with the Cheeko dashboard using a one-time pairing token.
+ *
+ * Called at server startup when the CHEEKO_PAIR environment variable is set.
+ * The plugin POSTs its voice WebSocket URL to the dashboard backend so the
+ * user's devices can be configured without manual URL entry.
+ *
+ * Environment variables:
+ *   CHEEKO_PAIR            — one-time pairing token (generated by dashboard)
+ *   CHEEKO_DASHBOARD_URL   — dashboard base URL (default: http://64.227.170.31:8001)
+ *   CHEEKO_API_URL         — backend API base URL that receives the pairing POST (default: http://64.227.170.31:8002/toy)
+ *   MAC_IP                 — override auto-detected LAN IP
+ *   ESP32_VOICE_PORT       — voice server port (default: 8765)
+ */
+/**
+ * Persist CHEEKO_PAIR and CHEEKO_DASHBOARD_URL into ~/.openclaw/.env so the
+ * plugin auto-registers on every subsequent `openclaw gateway` restart without
+ * the user needing to pass the env vars again on the command line.
+ *
+ * The state directory is OPENCLAW_STATE_DIR when set, otherwise ~/.openclaw.
+ * Keys are upserted: an existing `KEY=` line is replaced in place, a missing
+ * key is appended. Other lines are kept, with line endings normalised to LF.
+ *
+ * Failures are logged as a warning and never thrown — persistence is best-effort.
+ *
+ * @param token - Pairing token to store under CHEEKO_PAIR.
+ * @param dashboardUrl - Dashboard base URL to store under CHEEKO_DASHBOARD_URL.
+ */
+function savePairTokenToEnv(token: string, dashboardUrl: string): void {
+  const stateDir = process.env.OPENCLAW_STATE_DIR ?? join(homedir(), ".openclaw");
+  const envPath = join(stateDir, ".env");
+  try {
+    // `recursive: true` is a no-op when the directory already exists.
+    mkdirSync(stateDir, { recursive: true });
+    // Split on LF or CRLF so files edited on Windows do not keep stray "\r" characters.
+    const lines: string[] = existsSync(envPath) ? readFileSync(envPath, "utf8").split(/\r?\n/) : [];
+    // Drop trailing blank lines (including the empty element produced by a final
+    // newline) so an appended key does not land after an empty line.
+    while (lines.length > 0 && lines[lines.length - 1]?.trim() === "") {
+      lines.pop();
+    }
+    // Upsert helper — replaces existing key or pushes new line
+    const upsert = (key: string, value: string): void => {
+      const line = `${key}=${value}`;
+      const idx = lines.findIndex((l) => l.trimStart().startsWith(`${key}=`));
+      if (idx === -1) {
+        lines.push(line);
+      } else {
+        lines[idx] = line;
+      }
+    };
+    upsert("CHEEKO_PAIR", token);
+    upsert("CHEEKO_DASHBOARD_URL", dashboardUrl);
+    // The file holds a credential: create it owner-read/write only. `mode` applies
+    // only when the file is newly created; an existing file keeps its permissions.
+    writeFileSync(envPath, lines.join("\n").trimEnd() + "\n", { encoding: "utf8", mode: 0o600 });
+    console.log(`[esp32voice] ✅ Saved CHEEKO_PAIR to ${envPath} — auto-registers on next restart`);
+  } catch (err) {
+    // Non-fatal — token was already used successfully, just won't auto-persist
+    console.warn(`[esp32voice] ⚠️  Could not save token to ${envPath}: ${err instanceof Error ? err.message : err}`);
+  }
+}
+/**
+ * Register this plugin's voice WebSocket URL with the Cheeko dashboard backend.
+ *
+ * Does nothing when CHEEKO_PAIR is unset. Otherwise POSTs
+ * `{ token, url, localIp }` to `<CHEEKO_API_URL>/api/openclaw/pair` with a
+ * 10-second timeout. On an `ok` response the token and dashboard URL are
+ * persisted via `savePairTokenToEnv`. Every failure path only logs a warning;
+ * this function never rejects because of a network or dashboard error.
+ *
+ * Environment variables read:
+ *   CHEEKO_PAIR           — pairing token; registration is skipped without it
+ *   CHEEKO_DASHBOARD_URL  — dashboard base URL (default: http://64.227.170.31:8001)
+ *   CHEEKO_API_URL        — backend API base URL (default: http://64.227.170.31:8002/toy)
+ *
+ * NOTE(review): both defaults are plain-HTTP addresses, so the pairing token is
+ * sent unencrypted unless CHEEKO_API_URL is overridden — confirm this is intended.
+ *
+ * @param voicePort - Port the standalone voice server is listening on.
+ */
+async function registerWithCheekoDashboard(voicePort: number): Promise<void> {
+  const pairToken = process.env.CHEEKO_PAIR;
+  if (!pairToken) return; // No pairing token — skip silently
+  // A single trailing slash is stripped so path joins below do not produce "//".
+  const dashboardUrl = (process.env.CHEEKO_DASHBOARD_URL || "http://64.227.170.31:8001").replace(/\/$/, "");
+  // Backend API runs on port 8002 with /toy context path
+  const backendApiUrl = (process.env.CHEEKO_API_URL || "http://64.227.170.31:8002/toy").replace(/\/$/, "");
+  const localIp = detectLocalIp();
+  const voiceUrl = `ws://${localIp}:${voicePort}/`;
+  console.log(`[esp32voice] CHEEKO_PAIR detected — registering with dashboard API: ${backendApiUrl}`);
+  console.log(`[esp32voice] Voice URL to register: ${voiceUrl}`);
+  try {
+    const response = await fetch(`${backendApiUrl}/api/openclaw/pair`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        token: pairToken,
+        url: voiceUrl,
+        localIp,
+      }),
+      signal: AbortSignal.timeout(10_000), // 10-second timeout
+    });
+    if (!response.ok) {
+      const errorText = await response.text().catch(() => "(no body)");
+      console.warn(`[esp32voice] ⚠️  Dashboard registration failed (HTTP ${response.status}): ${errorText}`);
+      return;
+    }
+    // The body is external input: treat it as unknown and narrow before reading
+    // fields, so a `null` or non-object payload is reported as a rejection
+    // instead of throwing and being mislabelled as a connectivity failure.
+    const payload: unknown = await response.json();
+    const result: { ok?: unknown; error?: unknown } =
+      typeof payload === "object" && payload !== null ? payload : {};
+    if (result.ok) {
+      console.log(`[esp32voice] ✅ Registered with Cheeko dashboard — your device will connect to ${voiceUrl}`);
+      // Persist token so future restarts auto-register without passing CHEEKO_PAIR again
+      savePairTokenToEnv(pairToken, dashboardUrl);
+    } else {
+      const reason = typeof result.error === "string" ? result.error : "unknown error";
+      console.warn(`[esp32voice] ⚠️  Dashboard registration rejected: ${reason}`);
+    }
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    // Non-fatal — dashboard may be down, user can still use manual URL entry
+    console.warn(`[esp32voice] ⚠️  Could not reach Cheeko dashboard: ${message}`);
+    console.warn(`[esp32voice]    Manual setup: set your OpenClaw URL to ${voiceUrl} in the dashboard`);
+  }
+}
+/**
+ * Mask the values of credential-bearing JSON fields (`token`, `apiKey`, `otp`)
+ * so a text frame can be written to the log without leaking secrets.
+ */
+function redactSecretsForLog(text: string): string {
+  return text.replace(/("(?:token|apiKey|otp)"\s*:\s*")[^"]*(")/g, "$1[redacted]$2");
+}
+/**
+ * Create the voice WebSocket server (noServer mode — for manual upgrade).
+ *
+ * Each connection gets its own `VoiceSession` with a random 32-hex-character
+ * session ID. Incoming frames are forwarded to `session.handleMessage`; when
+ * the socket closes or errors, `session.cleanup()` is run exactly once.
+ * Handler failures are logged and never escape as unhandled rejections.
+ *
+ * @returns The configured `WebSocketServer`; the caller routes HTTP upgrades to it.
+ */
+export function createVoiceWebSocketServer(): WebSocketServer {
+  const wss = new WebSocketServer({ noServer: true });
+  wss.on("connection", (ws: WebSocket) => {
+    const sessionId = crypto.randomUUID().replace(/-/g, "");
+    const session = new VoiceSession(ws, sessionId);
+    console.log(`[esp32voice] Voice client connected [${sessionId.slice(0, 8)}]`);
+    ws.on("message", async (data: Buffer | string, isBinary: boolean) => {
+      // The whole handler is guarded: this callback is async, so anything thrown
+      // here would otherwise surface as an unhandled promise rejection.
+      try {
+        // NOTE(review): with ws >= 8 text frames are also delivered as a Buffer
+        // (with isBinary === false), so `Buffer.isBuffer(data)` would send them
+        // down the binary path. Routing is left unchanged here — confirm the
+        // installed ws version and what VoiceSession.handleMessage expects.
+        if (isBinary || Buffer.isBuffer(data)) {
+          const buf = Buffer.isBuffer(data) ? data : Buffer.from(data);
+          console.log(`[esp32voice] [${sessionId.slice(0, 8)}] ← BINARY frame: ${buf.length} bytes`);
+          await session.handleMessage(buf);
+        } else {
+          const text = typeof data === "string" ? data : String(data);
+          // Text frames include the hello message, which carries tokens, API keys
+          // and OTP codes — redact those before logging the preview.
+          console.log(`[esp32voice] [${sessionId.slice(0, 8)}] ← TEXT message: ${redactSecretsForLog(text).slice(0, 300)}`);
+          await session.handleMessage(text);
+        }
+      } catch (err) {
+        console.error(`[esp32voice] [${sessionId.slice(0, 8)}] Message error: ${err}`);
+      }
+    });
+    // An errored socket emits "error" and then "close"; make sure the session is
+    // torn down once, and that a failing cleanup is logged rather than unhandled.
+    let cleanedUp = false;
+    const cleanupOnce = async (): Promise<void> => {
+      if (cleanedUp) return;
+      cleanedUp = true;
+      try {
+        await session.cleanup();
+      } catch (err) {
+        console.error(`[esp32voice] [${sessionId.slice(0, 8)}] Cleanup error: ${err}`);
+      }
+    };
+    ws.on("close", () => {
+      console.log(`[esp32voice] [${sessionId.slice(0, 8)}] Voice client disconnected`);
+      void cleanupOnce();
+    });
+    ws.on("error", (err) => {
+      console.error(`[esp32voice] [${sessionId.slice(0, 8)}] WebSocket error: ${err.message}`);
+      void cleanupOnce();
+    });
+  });
+  console.log("[esp32voice] Voice WebSocket server created");
+  return wss;
+}
+/**
+ * Complete an HTTP → WebSocket upgrade on the voice server.
+ *
+ * No path or header filtering is done: the server is dedicated to voice, so
+ * every upgrade request is accepted and announced as a "connection" event.
+ *
+ * @param wss - WebSocket server that performs the handshake.
+ * @param request - The incoming HTTP upgrade request.
+ * @param socket - Underlying network socket of the request.
+ * @param head - First packet of the upgraded stream.
+ */
+export function handleVoiceUpgrade(
+  wss: WebSocketServer,
+  request: IncomingMessage,
+  socket: Duplex,
+  head: Buffer,
+): void {
+  // Once the handshake finishes, hand the new socket to the "connection" listeners.
+  const announceConnection = (client: WebSocket): void => {
+    wss.emit("connection", client, request);
+  };
+  wss.handleUpgrade(request, socket, head, announceConnection);
+}
+/**
+ * Start a standalone HTTP + WebSocket server for ESP32 voice streaming.
+ *
+ * This server runs on its own port (separate from the Gateway) so that
+ * WebSocket upgrades are handled directly by the esp32-voice plugin
+ * without needing core Gateway changes.
+ *
+ * Plain HTTP requests to `/` or `/health` (query string ignored) get a JSON
+ * health document; any other path gets a JSON 404. WebSocket upgrades are
+ * accepted on any path. The server binds to 0.0.0.0, i.e. all interfaces.
+ * Once listening, dashboard registration is kicked off in the background.
+ *
+ * Listen failures (for example the port already being in use) are logged
+ * rather than thrown as an uncaught `'error'` event, so a busy port does not
+ * take down the hosting process.
+ *
+ * @param port - Port to listen on; defaults to `DEFAULT_VOICE_PORT`.
+ * @returns The HTTP server and WebSocket server instances (for cleanup) and the port used.
+ */
+export function startStandaloneVoiceServer(port?: number): {
+  httpServer: ReturnType<typeof createServer>;
+  wss: WebSocketServer;
+  port: number;
+} {
+  const listenPort = port ?? DEFAULT_VOICE_PORT;
+  const wss = createVoiceWebSocketServer();
+  const httpServer = createServer((req: IncomingMessage, res: ServerResponse) => {
+    // Compare on the path only, so `/health?probe=1` is still a health check.
+    const [path = "/"] = (req.url ?? "/").split("?", 1);
+    // Health check endpoint
+    if (path === "/" || path === "/health") {
+      res.writeHead(200, { "Content-Type": "application/json" });
+      res.end(
+        JSON.stringify({
+          ok: true,
+          service: "esp32-voice",
+          type: "websocket",
+          hint: "Connect via WebSocket to this server for voice streaming",
+          sttConfigured: Boolean(process.env.DEEPGRAM_API_KEY),
+          ttsConfigured: Boolean(process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
+        }),
+      );
+      return;
+    }
+    // Fallback
+    res.writeHead(404, { "Content-Type": "application/json" });
+    res.end(JSON.stringify({ error: "Not found. Connect via WebSocket for voice streaming." }));
+  });
+  // Handle WebSocket upgrades on ANY path on this server
+  httpServer.on("upgrade", (request, socket, head) => {
+    handleVoiceUpgrade(wss, request, socket as Duplex, head);
+  });
+  // Without a listener, an 'error' event (e.g. EADDRINUSE) is thrown as an
+  // uncaught exception. Callers may still attach their own listener as well.
+  httpServer.on("error", (err: Error) => {
+    console.error(`[esp32voice] Voice server error on port ${listenPort}: ${err.message}`);
+  });
+  httpServer.listen(listenPort, "0.0.0.0", () => {
+    console.log(`[esp32voice] Standalone voice server listening on ws://0.0.0.0:${listenPort}`);
+    console.log(`[esp32voice] ESP32 should connect to ws://<your-ip>:${listenPort}/`);
+    // Register with Cheeko dashboard if CHEEKO_PAIR env var is set.
+    // Fire-and-forget — startup is not blocked by network call.
+    registerWithCheekoDashboard(listenPort).catch(() => {
+      // Already logged inside registerWithCheekoDashboard
+    });
+  });
+  return { httpServer, wss, port: listenPort };
+}