npm - @kidsinai/kids-client - Versions diffs - 0.0.17 → 0.0.18 - Mend

@kidsinai/kids-client 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/package.json +4 -3
package/src/core/voice/controller.ts +116 -0
package/src/core/voice/recorder.ts +114 -0
package/src/core/voice/state.ts +92 -0
package/src/core/voice/stt.ts +118 -0
package/src/core/voice/vad.ts +92 -0
package/src/render/ink/App.tsx +25 -3
package/src/render/ink/screens/MissionScreen.tsx +81 -23
package/src/render/ink/useVoiceInput.ts +146 -0
package/src/voice-demo.tsx +78 -0

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "$schema": "https://json.schemastore.org/package.json",
   "name": "@kidsinai/kids-client",
-  "version": "0.0.17",
+  "version": "0.0.18",
   "type": "module",
   "description": "Own-client TUI for Kids OpenCode — talks to local `opencode serve` via @opencode-ai/sdk v2 with kid-warm rendering, mission progress, permission dialog, and stderr-tail audit pipeline.",
   "license": "MIT",
@@ -24,7 +24,8 @@
   "files": ["src", "bin", "README.md", "LICENSE"],
   "scripts": {
     "typecheck": "tsc --noEmit",
-    "test": "bun test"
+    "test": "bun test",
+    "voice-demo": "bun src/voice-demo.tsx"
   },
   "peerDependencies": {
     "@opencode-ai/sdk": ">=1.14.0"
@@ -34,7 +35,7 @@
     "ink-spinner": "^5.0.0",
     "ink-text-input": "^6.0.0",
     "react": "^18.3.1",
-    "@kidsinai/kids-opencode-plugin": "^0.0.17"
+    "@kidsinai/kids-opencode-plugin": "^0.0.18"
   },
   "devDependencies": {
     "@opencode-ai/sdk": "^1.14.51",

package/src/core/voice/controller.ts ADDED Viewed

@@ -0,0 +1,116 @@
+/**
+ * Voice controller — wires the parts into one "voice engine" the UI drives:
+ *
+ *   spacebar ─▶ start() ─▶ recorder captures, feedLevel() streams energy
+ *                            │
+ *                   VAD says stop (silence/maxlen) ──▶ stop()
+ *                            │
+ *                   recorder → AudioClip → STT → text ──▶ onTranscript(text)
+ *                                                          (UI calls session.prompt)
+ *
+ * Deliberately owns NO timers and does NO spawning itself — the recorder
+ * produces level events, the UI/recorder calls feedLevel(), and this class
+ * only advances the state machine and decides start/stop/cancel. That keeps
+ * it pure enough to unit-test the whole orchestration with a mock recorder +
+ * MockStt, no microphone or clock required.
+ */
+import { transition, type VoiceState } from "./state.ts"
+import { shouldAutoStop, DEFAULT_VAD, type VadOptions } from "./vad.ts"
+import type { AudioClip, SttAdapter } from "./stt.ts"
+/**
+ * Minimal recorder surface the controller needs (Recorder implements it; tests mock it).
+ * The controller never spawns or reads anything itself; it only calls these three methods.
+ */
+export interface RecorderLike {
+  /** Begin capturing. VoiceController.start() calls this right after moving to "listening". */
+  start(): void
+  /** End capture and resolve with the recorded clip, which the controller hands to STT. */
+  stop(): Promise<AudioClip>
+  /** Abort capture without producing a clip. The controller only calls this while listening. */
+  cancel(): Promise<void>
+}
+export interface VoiceControllerEvents {
+  /** Every state change — UI re-renders mic indicator / meter / spinner. */
+  onState?: (state: VoiceState) => void
+  /** Latest mic energy 0..1 — UI draws the meter. */
+  onLevel?: (level: number) => void
+  /** STT produced text — UI sends it via session.prompt and echoes it. */
+  onTranscript?: (text: string) => void
+  /** Recording/STT failed — UI shows a gentle retry hint. */
+  onError?: (err: Error) => void
+}
+/**
+ * Orchestrates one voice turn at a time: opens the recorder, accumulates level
+ * samples for the VAD, runs STT on the finished clip and reports the outcome
+ * through `events`. Every state change goes through the pure `transition` table.
+ *
+ * Cancellation is authoritative: once cancel() has taken effect, an in-flight
+ * stop() drops its clip/transcript/error instead of emitting it, so a cancelled
+ * turn is never sent to the LLM.
+ */
+export class VoiceController {
+  private state: VoiceState = "idle"
+  private levels: number[] = []
+  /** Bumped by every start() and every effective cancel(); lets an in-flight stop() notice it went stale. */
+  private turn = 0
+  /** True while recorder.cancel() is still shutting the capture down; start() is ignored meanwhile. */
+  private closing = false
+  constructor(
+    private recorder: RecorderLike,
+    private stt: SttAdapter,
+    private events: VoiceControllerEvents = {},
+    private vad: VadOptions = DEFAULT_VAD,
+  ) {}
+  /** Current state of the voice machine. */
+  getState(): VoiceState {
+    return this.state
+  }
+  /** Store the new state and notify onState, but only when it actually changed. */
+  private set(next: VoiceState): void {
+    if (next === this.state) return
+    this.state = next
+    this.events.onState?.(next)
+  }
+  /** Run one event through the transition table (illegal events leave the state untouched). */
+  private apply(event: Parameters<typeof transition>[1]): void {
+    this.set(transition(this.state, event))
+  }
+  /** Move to the error state and report the failure as an Error instance. */
+  private fail(err: unknown): void {
+    this.apply({ type: "ERROR" })
+    this.events.onError?.(err instanceof Error ? err : new Error(String(err)))
+  }
+  /**
+   * Spacebar while idle. Opens the mic. No-op if not idle, or while a previous
+   * cancel is still closing the recorder. A recorder that throws on start is
+   * reported through onError (state → error) instead of leaving the machine
+   * stuck in "listening".
+   */
+  start(): void {
+    if (this.state !== "idle" || this.closing) return
+    this.levels = []
+    this.turn++
+    this.apply({ type: "START" })
+    try {
+      this.recorder.start()
+    } catch (err) {
+      this.fail(err)
+    }
+  }
+  /**
+   * Feed one energy sample (the recorder calls this ~every sampleIntervalMs).
+   * Updates the meter and, once VAD says so, auto-stops — so the kid only ever
+   * pressed the spacebar once. No-op unless we're listening.
+   */
+  feedLevel(level: number): void {
+    if (this.state !== "listening") return
+    this.levels.push(level)
+    this.events.onLevel?.(level)
+    if (shouldAutoStop(this.levels, this.vad) !== "continue") {
+      void this.stop()
+    }
+  }
+  /**
+   * Spacebar/Enter again, or VAD auto-stop. Ends capture, runs STT, emits text.
+   * No-op unless listening. If the turn is cancelled while the recorder or STT
+   * is still working, the result (or failure) is silently dropped.
+   */
+  async stop(): Promise<void> {
+    if (this.state !== "listening") return
+    const turn = this.turn
+    this.apply({ type: "STOP" }) // → transcribing
+    try {
+      const clip = await this.recorder.stop()
+      if (this.turn !== turn) return // cancelled while the recorder was closing
+      const { text } = await this.stt.transcribe(clip)
+      if (this.turn !== turn) return // cancelled while STT was running: no send
+      this.apply({ type: "TRANSCRIBED" }) // → thinking
+      this.events.onTranscript?.(text)
+    } catch (err) {
+      // Only a live turn may surface an error; a cancelled one already returned to idle.
+      if (this.turn === turn) this.fail(err)
+    }
+  }
+  /**
+   * Esc. Throws the clip away with no send. Safe from any state: it is a no-op
+   * where the transition table does not accept CANCEL. The state leaves
+   * "listening" BEFORE the recorder is awaited, so stop()/feedLevel() cannot
+   * slip in while the capture is being torn down.
+   */
+  async cancel(): Promise<void> {
+    const before = this.state
+    const next = transition(before, { type: "CANCEL" })
+    if (next === before) return
+    this.turn++ // invalidates any stop() still in flight
+    this.set(next)
+    if (before !== "listening") return
+    this.closing = true
+    try {
+      await this.recorder.cancel()
+    } finally {
+      this.closing = false
+    }
+  }
+  /** UI signals the LLM reply landed (thinking → speaking). */
+  replied(): void {
+    this.apply({ type: "REPLIED" })
+  }
+  /** UI signals TTS finished or was skipped (speaking → idle). */
+  spoken(): void {
+    this.apply({ type: "SPOKEN" })
+  }
+  /** Kid acknowledged the error (error → idle). */
+  reset(): void {
+    this.apply({ type: "RESET" })
+  }
+}

package/src/core/voice/recorder.ts ADDED Viewed

@@ -0,0 +1,114 @@
+/**
+ * Microphone capture (side-effecting; the pure bits are extracted for tests).
+ *
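+ * Strategy: shell out to a system recorder (sox `rec` or ffmpeg) writing a wav,
+ * same spawn pattern as core/serve-manager.ts. The plan is to also read the PCM
+ * stream and compute a rolling RMS energy level so the UI can draw a live mic
+ * meter and the VAD can auto-stop — terminals can't show a waveform any other
+ * way. In this version the Recorder does not read PCM yet (the child's stdout
+ * is ignored and no level events are emitted); computeRms() is the pure piece
+ * that is ready for it.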
+ *
+ * No recorder on PATH must NOT crash the client: detectRecorder() returns null
+ * and the controller can fall back to a simulated level source (demo mode),
+ * so a kid on a box without sox still sees the flow, just with canned audio.
+ */
+import { spawn, type Subprocess } from "bun"
+import type { AudioClip } from "./stt.ts"
+export type RecorderKind = "sox" | "ffmpeg"
+export interface RecordCommand {
+  cmd: string[]
+  /** Path the recorder writes the clip to. */
+  outPath: string
+  mimeType: string
+}
+/**
+ * Build the capture command for a recorder. Pure → unit-testable. 16kHz mono
+ * wav is the Whisper-friendly sweet spot (small upload, plenty for speech).
+ *
+ * @param kind     Which recorder binary to drive.
+ * @param outPath  File the recorder writes the clip to (echoed back in the result).
+ * @param platform OS identifier in `process.platform` form. Only consulted for
+ *                 ffmpeg, whose capture device is OS-specific; defaults to the host.
+ * @returns The argv to spawn, plus the output path and MIME type of the clip.
+ */
+export function buildRecordCommand(
+  kind: RecorderKind,
+  outPath: string,
+  platform: string = process.platform,
+): RecordCommand {
+  const mimeType = "audio/wav"
+  if (kind === "sox") {
+    // `rec` is sox's record front-end. -q quiet, -c 1 mono, -r 16000 rate.
+    return { cmd: ["rec", "-q", "-c", "1", "-r", "16000", outPath], outPath, mimeType }
+  }
+  // ffmpeg has no portable microphone input: use ALSA's `default` device on
+  // Linux, and keep the AVFoundation audio device ":0" (macOS) everywhere else,
+  // which is what this function always emitted before.
+  const input = platform === "linux" ? ["-f", "alsa", "-i", "default"] : ["-f", "avfoundation", "-i", ":0"]
+  return {
+    cmd: ["ffmpeg", "-loglevel", "quiet", ...input, "-ac", "1", "-ar", "16000", "-y", outPath],
+    outPath,
+    mimeType,
+  }
+}
+/**
+ * Root-mean-square energy of a chunk of signed 16-bit PCM, scaled to 0..1.
+ * Pure → unit-testable; intended to feed both the mic meter and the VAD.
+ *
+ * @param pcm16 Samples in the Int16 range (-32768..32767).
+ * @returns 0 for an empty chunk, otherwise sqrt(mean(s²)) with every sample
+ *          divided by 32768 first.
+ */
+export function computeRms(pcm16: Int16Array): number {
+  const count = pcm16.length
+  if (count === 0) return 0
+  let energy = 0
+  for (const sample of pcm16) {
+    const unit = sample / 32768 // full-scale negative maps to exactly -1
+    energy += unit * unit
+  }
+  return Math.sqrt(energy / count)
+}
+/**
+ * Probe PATH for a usable recorder, preferring sox (`rec`) over ffmpeg.
+ * Returns null if neither is found — caller degrades to demo mode rather than
+ * crashing.
+ *
+ * Uses Bun's built-in PATH lookup instead of spawning `which`: no subprocess,
+ * no unread stdout pipe, and no dependency on a `which` binary being installed.
+ */
+export async function detectRecorder(): Promise<RecorderKind | null> {
+  const candidates: ReadonlyArray<readonly [RecorderKind, string]> = [
+    ["sox", "rec"],
+    ["ffmpeg", "ffmpeg"],
+  ]
+  for (const [kind, bin] of candidates) {
+    try {
+      if (Bun.which(bin) !== null) return kind
+    } catch {
+      /* keep probing */
+    }
+  }
+  return null
+}
+export interface RecorderEvents {
+  /** Fired ~every sampleIntervalMs with the latest normalised energy 0..1. */
+  onLevel?: (level: number) => void
+}
+/**
+ * Owns one recording. start() spawns the recorder; stop() ends it and reads
+ * the written wav back as an AudioClip; cancel() kills it and discards.
+ *
+ * The clip file is deleted as soon as it has been read (or on cancel), so raw
+ * audio never lingers at outPath and a later stop() can never hand back a
+ * previous recording.
+ */
+export class Recorder {
+  private child: Subprocess | null = null
+  private cmd: RecordCommand
+  /**
+   * @param kind    Recorder binary to drive.
+   * @param outPath File the recorder writes to; this class owns and deletes it.
+   * @param _events Accepted for API compatibility; nothing in this class emits
+   *                level events yet.
+   */
+  constructor(kind: RecorderKind, outPath: string, private _events: RecorderEvents = {}) {
+    this.cmd = buildRecordCommand(kind, outPath)
+  }
+  /** Spawn the recorder process. No-op if a recording is already running. */
+  start(): void {
+    if (this.child) return
+    this.child = spawn({ cmd: this.cmd.cmd, stdout: "ignore", stderr: "ignore" })
+  }
+  /**
+   * Stop recording and return the captured clip.
+   * @throws Error if no recording is active (previously this silently returned
+   *         whatever stale file happened to sit at outPath), or if the recorder
+   *         wrote no readable file.
+   */
+  async stop(): Promise<AudioClip> {
+    if (!this.child) throw new Error("Recorder.stop() called with no active recording")
+    await this.kill()
+    try {
+      const bytes = new Uint8Array(await Bun.file(this.cmd.outPath).arrayBuffer())
+      return { bytes, mimeType: this.cmd.mimeType }
+    } finally {
+      await this.discardFile()
+    }
+  }
+  /** Abort and discard — no clip, no STT, no send. No-op when nothing is recording. */
+  async cancel(): Promise<void> {
+    const wasRecording = this.child !== null
+    await this.kill()
+    if (wasRecording) await this.discardFile()
+  }
+  /** Signal the child (if still running) and wait for it to exit. */
+  private async kill(): Promise<void> {
+    if (this.child && !this.child.killed) {
+      this.child.kill()
+      await this.child.exited
+    }
+    this.child = null
+  }
+  /** Best-effort removal of the clip file; a missing file is not an error. */
+  private async discardFile(): Promise<void> {
+    try {
+      const { unlink } = await import("node:fs/promises")
+      await unlink(this.cmd.outPath)
+    } catch {
+      /* nothing was written, or it is already gone */
+    }
+  }
+}

package/src/core/voice/state.ts ADDED Viewed

@@ -0,0 +1,92 @@
+/**
+ * Voice-input state machine (pure, unit-testable).
+ *
+ * Why a machine and not booleans: a kid mashing the spacebar mid-transcription
+ * must never start a second recording or send a half clip. Modelling the
+ * legal transitions explicitly makes "you can only stop while listening,
+ * only cancel before we've spoken" enforceable in one place instead of
+ * scattered across the Ink components.
+ *
+ * Terminal constraint that shapes this: a TTY reports key-DOWN but not
+ * key-UP, so there is no hold-to-talk. The only press we get is a toggle.
+ * Hence START and STOP are both driven by the same spacebar press, and the
+ * machine — not the key handler — decides which one a given press means.
+ *
+ * Lifecycle:
+ *   idle ──START──▶ listening ──STOP──▶ transcribing ──TRANSCRIBED──▶ thinking
+ *                      │                     │                            │
+ *                   CANCEL                 ERROR                        REPLIED
+ *                      ▼                     ▼                            ▼
+ *                    idle                  error                      speaking ──SPOKEN──▶ idle
+ */
+export type VoiceState =
+  | "idle"
+  | "listening"
+  | "transcribing"
+  | "thinking"
+  | "speaking"
+  | "error"
+export type VoiceEvent =
+  /** Spacebar while idle: open the mic. */
+  | { type: "START" }
+  /** Spacebar/Enter again, or VAD auto-stop: close the mic, begin STT. */
+  | { type: "STOP" }
+  /** Esc at any pre-reply point: throw the clip away, no send. */
+  | { type: "CANCEL" }
+  /** STT returned text; hand it to the LLM. */
+  | { type: "TRANSCRIBED" }
+  /** LLM reply arrived (optionally about to be spoken aloud). */
+  | { type: "REPLIED" }
+  /** TTS finished (or was skipped). */
+  | { type: "SPOKEN" }
+  /** Recording / STT / TTS blew up. */
+  | { type: "ERROR" }
+  /** Kid acknowledged the error screen. */
+  | { type: "RESET" }
+/**
+ * Pure transition function of the voice state machine.
+ *
+ * Organised by event: each case names the states in which that event is legal
+ * and the state it leads to. Anything else returns the SAME state, so callers
+ * can treat "no change" as "ignored keypress" (e.g. spacebar spam during
+ * transcribing is a no-op, not a crash).
+ *
+ * @param state Current state.
+ * @param event Event to apply.
+ * @returns The next state, or `state` itself when the event is illegal here.
+ */
+export function transition(state: VoiceState, event: VoiceEvent): VoiceState {
+  switch (event.type) {
+    case "START":
+      return state === "idle" ? "listening" : state
+    case "STOP":
+      return state === "listening" ? "transcribing" : state
+    case "TRANSCRIBED":
+      return state === "transcribing" ? "thinking" : state
+    case "REPLIED":
+      return state === "thinking" ? "speaking" : state
+    case "SPOKEN":
+      return state === "speaking" ? "idle" : state
+    case "CANCEL":
+      // Cancelling while speaking lets a kid cut off a long spoken reply.
+      return state === "listening" || state === "transcribing" || state === "speaking" ? "idle" : state
+    case "ERROR":
+      // Every in-flight state can fail; idle and error itself ignore it.
+      return state === "idle" || state === "error" ? state : "error"
+    case "RESET":
+      return state === "error" ? "idle" : state
+    default:
+      return state
+  }
+}
+/**
+ * Whether audio is being captured: true only in the "listening" state.
+ * Intended for the UI's "🎙 Listening…" indicator and for audit/compliance
+ * checks asserting the mic is never open in any other state.
+ */
+export function isMicOpen(state: VoiceState): boolean {
+  const capturingState: VoiceState = "listening"
+  return state === capturingState
+}
+/**
+ * True in the states where Esc still aborts the turn: listening, transcribing
+ * and speaking (the same states in which `transition` accepts CANCEL).
+ */
+export function isCancellable(state: VoiceState): boolean {
+  switch (state) {
+    case "listening":
+    case "transcribing":
+    case "speaking":
+      return true
+    default:
+      return false
+  }
+}

package/src/core/voice/stt.ts ADDED Viewed

@@ -0,0 +1,118 @@
+/**
+ * Speech-to-text adapter (pluggable).
+ *
+ * HARD RULE (moat + minors compliance): STT MUST go through DeepRouter, never
+ * a third-party STT API directly. DeepRouter is the single gateway where we
+ * meter cost (Stars), enforce AU data residency, and capture the interaction
+ * data flywheel. Bypassing it leaks the moat — see airbotix
+ * docs/product/moat-strategy.md.
+ *
+ * The controller depends only on `SttAdapter`, so tests use the mock and a
+ * no-key dogfood run degrades to mock instead of crashing.
+ */
+export interface AudioClip {
+  /** Raw encoded audio (e.g. wav/webm bytes from the recorder). */
+  bytes: Uint8Array
+  /** MIME type, e.g. "audio/wav". Drives the multipart filename/type. */
+  mimeType: string
+}
+export interface SttResult {
+  text: string
+  /** 0..1 if the backend reports it; undefined otherwise. */
+  confidence?: number
+}
+export interface SttAdapter {
+  transcribe(clip: AudioClip): Promise<SttResult>
+}
+export interface DeepRouterSttConfig {
+  /** DeepRouter OpenAI-compatible base, e.g. https://api.deeprouter.../v1 */
+  baseUrl: string
+  apiKey: string
+  /** Whisper-style model id exposed by DeepRouter. */
+  model: string
+  /** Optional BCP-47 hint ("en", "zh") to bias recognition. */
+  language?: string
+}
+const MIME_EXT: Record<string, string> = {
+  "audio/wav": "wav",
+  "audio/x-wav": "wav",
+  "audio/webm": "webm",
+  "audio/ogg": "ogg",
+  "audio/mpeg": "mp3",
+}
+/**
+ * Build the multipart body for DeepRouter's /audio/transcriptions endpoint.
+ * Pulled out as a pure helper so the field shape is unit-testable without a
+ * live network call. Mirrors the OpenAI Whisper request contract that
+ * DeepRouter is expected to proxy (⚙️ confirm DeepRouter exposes this path).
+ *
+ * @param clip Audio to upload; its MIME type picks the filename extension.
+ * @param cfg  Supplies `model` and, when set, the `language` hint.
+ * @returns FormData with `file`, `model` and optionally `language`.
+ */
+export function buildTranscriptionForm(clip: AudioClip, cfg: DeepRouterSttConfig): FormData {
+  // MIME types are case-insensitive and may carry parameters
+  // ("audio/webm;codecs=opus"). Look up the bare type so such clips get their
+  // real extension instead of silently falling back to ".wav".
+  const bareType = (clip.mimeType.split(";")[0] ?? "").trim().toLowerCase()
+  const ext = MIME_EXT[bareType] ?? "wav"
+  const form = new FormData()
+  form.append("file", new Blob([clip.bytes as BlobPart], { type: clip.mimeType }), `clip.${ext}`)
+  form.append("model", cfg.model)
+  if (cfg.language) form.append("language", cfg.language)
+  return form
+}
+/**
+ * Pull the transcript text out of an OpenAI-compatible JSON response,
+ * tolerating the common shapes ({text} or {data:{text}}).
+ *
+ * @param payload Parsed response body of unknown shape.
+ * @returns The transcript, or null when no string `text` can be found.
+ *          `confidence` is only passed through when it really is a finite
+ *          number; anything else (string, NaN, missing) becomes undefined so
+ *          the declared `number` type is never violated at runtime.
+ */
+export function extractTranscript(payload: unknown): SttResult | null {
+  if (!payload || typeof payload !== "object") return null
+  const p = payload as { text?: unknown; confidence?: unknown; data?: { text?: unknown } | null }
+  const text = p.text ?? p.data?.text
+  if (typeof text !== "string") return null
+  const confidence =
+    typeof p.confidence === "number" && Number.isFinite(p.confidence) ? p.confidence : undefined
+  return { text, confidence }
+}
+/**
+ * STT adapter that posts the clip to DeepRouter's OpenAI-compatible
+ * `/audio/transcriptions` endpoint with a bearer token.
+ */
+export class DeepRouterStt implements SttAdapter {
+  constructor(private cfg: DeepRouterSttConfig) {}
+  /**
+   * Upload the clip and return the recognised text.
+   * @throws Error on a non-2xx status (message carries the status and the
+   *         start of the body), on a body that is not JSON, or on JSON with no
+   *         recognisable transcript.
+   */
+  async transcribe(clip: AudioClip): Promise<SttResult> {
+    // Tolerate a base URL configured with a trailing "/" so the request never
+    // goes to ".../v1//audio/transcriptions".
+    const endpoint = `${this.cfg.baseUrl.replace(/\/+$/, "")}/audio/transcriptions`
+    const res = await fetch(endpoint, {
+      method: "POST",
+      headers: { authorization: `Bearer ${this.cfg.apiKey}` },
+      body: buildTranscriptionForm(clip, this.cfg),
+    })
+    if (!res.ok) {
+      throw new Error(`DeepRouter STT ${res.status}: ${await safeText(res)}`)
+    }
+    let payload: unknown
+    try {
+      payload = await res.json()
+    } catch {
+      // e.g. an HTML error page served with status 200 by a proxy
+      throw new Error("DeepRouter STT: response was not valid JSON")
+    }
+    const result = extractTranscript(payload)
+    if (!result) throw new Error("DeepRouter STT: unrecognised response shape")
+    return result
+  }
+}
+/**
+ * Canned-answer adapter for tests and no-key dogfood runs: ignores the audio
+ * and always resolves with the configured text at confidence 1.
+ */
+export class MockStt implements SttAdapter {
+  private canned: string
+  constructor(canned = "（示例）帮我做一个会动的小猫") {
+    this.canned = canned
+  }
+  async transcribe(_clip: AudioClip): Promise<SttResult> {
+    const fixed: SttResult = { text: this.canned, confidence: 1 }
+    return fixed
+  }
+}
+/**
+ * Choose the STT adapter for the given (possibly partial) config.
+ *
+ * DeepRouter is used only when baseUrl, apiKey and model are all non-empty;
+ * otherwise the canned MockStt is returned. `mode` reports which one was
+ * picked so the UI can show a "voice is in demo mode" hint — a missing key
+ * must never hard-crash the client.
+ */
+export function resolveSttAdapter(
+  cfg: Partial<DeepRouterSttConfig> | undefined,
+): { adapter: SttAdapter; mode: "deeprouter" | "mock" } {
+  const hasCreds = Boolean(cfg?.baseUrl && cfg.apiKey && cfg.model)
+  if (!hasCreds) {
+    return { adapter: new MockStt(), mode: "mock" }
+  }
+  return { adapter: new DeepRouterStt(cfg as DeepRouterSttConfig), mode: "deeprouter" }
+}
+/**
+ * Read at most the first 200 characters of a response body for use in an
+ * error message. Never throws: if the body cannot be read, resolves with the
+ * placeholder "<no body>".
+ */
+async function safeText(res: Response): Promise<string> {
+  const maxChars = 200
+  try {
+    const body = await res.text()
+    return body.slice(0, maxChars)
+  } catch {
+    return "<no body>"
+  }
+}

package/src/core/voice/vad.ts ADDED Viewed

@@ -0,0 +1,92 @@
+/**
+ * Voice-activity detection + mic-meter rendering (pure, unit-testable).
+ *
+ * This is a deliberately tiny energy-based VAD, not a neural one. The job:
+ * let a kid press the spacebar ONCE, talk, and have the mic close on its own
+ * when they stop — so they never have to remember a second keypress. A real
+ * silero/webrtc VAD can drop in behind the same `shouldAutoStop` shape later;
+ * the controller only depends on this signature.
+ *
+ * Energy levels are normalised 0..1 (0 = silence, 1 = loud). The caller passes
+ * every level sample captured so far for the clip (not a sliding window — the
+ * max-length cap is derived from the sample count); we decide stop/continue from it.
+ */
+export interface VadOptions {
+  /** Below this normalised energy a frame counts as silence. */
+  silenceThreshold: number
+  /** Continuous silence this long (ms) auto-stops the recording. */
+  silenceMsToStop: number
+  /** Spacing between level samples (ms). */
+  sampleIntervalMs: number
+  /** Ignore silence until the kid has actually spoken this long (ms), so a
+   *  slow starter who pauses before their first word isn't cut off. */
+  minSpeechMs: number
+  /** Hard cap (ms): stop no matter what, so a stuck-open mic (or a kid who
+   *  wandered off) can't record forever. Compliance + cost guard. */
+  maxClipMs: number
+}
+export const DEFAULT_VAD: VadOptions = {
+  silenceThreshold: 0.06,
+  silenceMsToStop: 1500,
+  sampleIntervalMs: 100,
+  minSpeechMs: 400,
+  maxClipMs: 30_000,
+}
+export type VadDecision = "continue" | "stop_silence" | "stop_max_length"
+/**
+ * Decide whether recording should go on, given every level sample captured so
+ * far (oldest→newest). Pure: no clock is read — elapsed time is inferred from
+ * the sample count and `sampleIntervalMs`.
+ *
+ * Checks, first match wins:
+ *   1. Total duration has reached maxClipMs → "stop_max_length".
+ *   2. Fewer than minSpeechMs worth of samples were above the silence
+ *      threshold → "continue" (a slow starter is not cut off).
+ *   3. The unbroken run of silent samples at the end lasts at least
+ *      silenceMsToStop → "stop_silence".
+ *   4. Otherwise → "continue".
+ *
+ * A sample counts as speech only when strictly above silenceThreshold.
+ */
+export function shouldAutoStop(levels: number[], opts: VadOptions = DEFAULT_VAD): VadDecision {
+  const { silenceThreshold, silenceMsToStop, sampleIntervalMs, minSpeechMs, maxClipMs } = opts
+  const total = levels.length
+  if (total * sampleIntervalMs >= maxClipMs) return "stop_max_length"
+  let voicedFrames = 0
+  for (const level of levels) {
+    if (level > silenceThreshold) voicedFrames += 1
+  }
+  if (voicedFrames * sampleIntervalMs < minSpeechMs) return "continue"
+  // Walk back from the newest sample to the last one that was speech.
+  let cursor = total
+  while (cursor > 0 && !(levels[cursor - 1]! > silenceThreshold)) {
+    cursor -= 1
+  }
+  const trailingSilenceMs = (total - cursor) * sampleIntervalMs
+  return trailingSilenceMs >= silenceMsToStop ? "stop_silence" : "continue"
+}
+const METER_GLYPHS = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"] as const
+/**
+ * Render a live mic meter from the latest energy level. Terminals can't draw
+ * graphics, but a row of block glyphs that jumps with the kid's voice is the
+ * single most important "it's really listening to ME" signal — without it a
+ * kid stares at a frozen screen and gives up.
+ *
+ * The output is deterministic: `level` (clamped to 0..1) sets the overall
+ * height and columns nearer the centre are drawn a little taller, so the bar
+ * reads as a waveform rather than a flat line.
+ *
+ * @param level Energy, nominally 0..1. Out-of-range values are clamped and
+ *              NaN is treated as silence.
+ * @param width Number of glyph columns to produce.
+ * @returns A string of `width` block glyphs.
+ */
+export function renderMeter(level: number, width = 12): string {
+  // NaN passes straight through Math.min/Math.max and would index the glyph
+  // table with NaN, printing the literal text "undefined" for every column.
+  const clamped = Number.isNaN(level) ? 0 : Math.max(0, Math.min(1, level))
+  const tallest = METER_GLYPHS.length - 1
+  let out = ""
+  for (let i = 0; i < width; i++) {
+    // Columns toward the centre read a touch taller — cheap "waveform" feel
+    // without needing real per-column energy.
+    const centreBias = 1 - Math.abs(i - (width - 1) / 2) / (width / 2)
+    const h = clamped * (0.6 + 0.4 * centreBias)
+    const idx = Math.min(tallest, Math.round(h * tallest))
+    out += METER_GLYPHS[idx]
+  }
+  return out
+}

package/src/render/ink/App.tsx CHANGED Viewed

@@ -10,6 +10,7 @@
  */
 import React, { useSyncExternalStore } from "react"
+import { Box, useStdout } from "ink"
 import type { InstalledPack } from "../../core/course-pack.ts"
 import type { ErrorVariant, Store } from "../../core/store.ts"
 import { StartupScreen } from "./screens/StartupScreen.tsx"
@@ -79,14 +80,35 @@ export function App(deps: AppDeps): React.ReactElement {
     () => deps.store.getSnapshot(),
     () => deps.store.getSnapshot(),
   )
+  // Pin the App's footprint to the terminal's full dimensions. Without
+  // this, MissionScreen's `flexGrow={1}` middle box (chat + spinner) made
+  // the App's TOTAL rendered height shift by ±1 line on every keystroke /
+  // spinner tick / streaming chunk. Ink's diff move-cursor-up-by-N then
+  // used a stale N from the previous frame, so each new frame got drawn
+  // one row LOWER than the last — leaving the previous frame's top
+  // border behind. Result: a cascade of ┌──┐ stripes piling up above the
+  // Header. With width+height fixed to the terminal, the App's footprint
+  // never changes between renders and Ink's diff stays correct.
+  const { stdout } = useStdout()
+  const width = stdout?.columns && stdout.columns > 4 ? stdout.columns : 80
+  // -1 to leave a row for the terminal cursor / status; without it some
+  // terminals scroll the App by one line on the first render.
+  const height = stdout?.rows && stdout.rows > 4 ? stdout.rows - 1 : 23
+  const screen = renderScreen(state, deps)
+  return (
+    <Box width={width} height={height} flexDirection="column">
+      {screen}
+    </Box>
+  )
+}
+function renderScreen(state: ReturnType<Store["getSnapshot"]>, deps: AppDeps): React.ReactElement | null {
   // Dangerous-topic overlay takes absolute priority — it has to be the
   // first thing on screen the moment a pattern hits, even mid-stream.
   if (state.dangerousTopic) {
     return <DangerousTopicModal topic={state.dangerousTopic} locale={deps.locale} onAcknowledge={deps.onDangerousAcknowledge} />
   }
-  // Permission modal is the next-highest priority.
   if (state.pendingPermission) {
     return (
       <PermissionModal
@@ -98,7 +120,6 @@ export function App(deps: AppDeps): React.ReactElement {
       />
     )
   }
   switch (state.screen.kind) {
     case "loading":
       return <LoadingScreen locale={deps.locale} message={state.screen.message} />
@@ -149,4 +170,5 @@ export function App(deps: AppDeps): React.ReactElement {
         />
       )
   }
+  return null
 }

package/src/render/ink/screens/MissionScreen.tsx CHANGED Viewed

@@ -17,6 +17,7 @@ import { Input } from "../components/Input.tsx"
 import { Thinking } from "../components/Thinking.tsx"
 import { Toast } from "../components/Toast.tsx"
 import { getTheme } from "../theme.ts"
+import { useVoiceInput } from "../useVoiceInput.ts"
 import type { KidsClientState } from "../../../core/store.ts"
 interface MissionScreenProps {
@@ -33,20 +34,35 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
   const [draft, setDraft] = useState("")
   const placeholder = locale === "zh-Hans" ? "想做什么？告诉我吧（中文/英文都行）" : "What would you like to make? (English or Chinese)"
-  // Esc is overloaded so it never eats the kid's typing: while the AI is
-  // thinking it interrupts; with text typed it clears the draft; when idle and
-  // empty it leaves the mission back to the startup menu (so the kid isn't
-  // trapped here — dogfood feedback).
-  useInput((_, key) => {
-    if (!key.escape) return
-    if (state.thinking) onAbort()
-    else if (draft.length > 0) setDraft("")
-    else onExit()
+  const voice = useVoiceInput(onPrompt)
+  const voiceBusy = voice.voiceState !== "idle"
+  // Spacebar talks ONLY when the kid isn't mid-typing — a non-empty draft means
+  // they're writing, so spacebar must stay a literal space there.
+  const canTalk = !state.thinking && state.pendingPermission === null && draft.trim() === "" && voice.ready
+  // Esc is overloaded so it never eats the kid's typing: while recording it
+  // cancels voice; while the AI is thinking it interrupts; with text typed it
+  // clears the draft; when idle + empty it leaves back to the startup menu (so
+  // the kid isn't trapped here — dogfood feedback).
+  useInput((input, key) => {
+    if (voiceBusy) {
+      if (key.escape) voice.cancel()
+      else if (key.return || input === " ") voice.stopListening()
+      return
+    }
+    if (key.escape) {
+      if (state.thinking) onAbort()
+      else if (draft.length > 0) setDraft("")
+      else onExit()
+    } else if (input === " " && canTalk) {
+      setDraft("")
+      voice.startListening()
+    }
   })
   const hint = locale === "zh-Hans"
-    ? "提示：做完一关时打 /check 或「我做完了」就能验收 · 按 Esc 打断 AI / 返回菜单"
-    : "Tip: type /check or 'I'm done' to validate · Esc interrupts the AI / returns to menu"
+    ? "提示：按「空格」对小助手说话 · 打 /check 或「我做完了」验收 · Esc 打断 AI / 返回菜单"
+    : "Tip: press Space to talk · type /check or 'I'm done' to validate · Esc interrupts AI / returns to menu"
   return (
     <Box flexDirection="column">
@@ -67,18 +83,22 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
         )}
       </Box>
       <Box marginTop={1}>
-        <Input
-          value={draft}
-          onChange={setDraft}
-          onSubmit={(v) => {
-            const text = v.trim()
-            if (!text) return
-            setDraft("")
-            onPrompt(text)
-          }}
-          placeholder={placeholder}
-          disabled={state.thinking || state.pendingPermission !== null}
-        />
+        {voiceBusy ? (
+          <VoiceBar voiceState={voice.voiceState} meter={voice.meter} mode={voice.mode} locale={locale} theme={theme} />
+        ) : (
+          <Input
+            value={draft}
+            onChange={setDraft}
+            onSubmit={(v) => {
+              const text = v.trim()
+              if (!text) return
+              setDraft("")
+              onPrompt(text)
+            }}
+            placeholder={placeholder}
+            disabled={state.thinking || state.pendingPermission !== null}
+          />
+        )}
       </Box>
       {state.toast ? (
         <Box marginTop={1}>
@@ -92,3 +112,41 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
     </Box>
   )
 }
+interface VoiceBarProps {
+  voiceState: ReturnType<typeof useVoiceInput>["voiceState"]
+  meter: string
+  mode: "deeprouter" | "mock"
+  locale: "zh-Hans" | "en"
+  theme: ReturnType<typeof getTheme>
+}
+/** Status line for the given voice state, in Chinese when `zh` is set, English otherwise. */
+function voiceBarLabel(voiceState: VoiceBarProps["voiceState"], zh: boolean): string {
+  switch (voiceState) {
+    case "listening":
+      return zh ? "🎙 听你说…（再按空格 或 回车 结束，Esc 取消）" : "🎙 Listening… (Space/Enter to finish, Esc to cancel)"
+    case "transcribing":
+      return zh ? "✍️ 正在听懂你说的话…" : "✍️ Figuring out what you said…"
+    case "error":
+      // NOTE(review): this tells the kid to press Space, while MissionScreen
+      // routes Space to voice.stopListening() whenever the voice state is not
+      // idle — confirm the hook leaves the error state so the retry works.
+      return zh ? "😅 没听清，按空格再试一次" : "😅 Didn't catch that — press Space to retry"
+    default:
+      return zh ? "小助手在想…" : "Thinking…"
+  }
+}
+/**
+ * Shown in place of the text input while a voice turn is in flight. Always
+ * renders a bordered status line; while listening it adds a second row with
+ * the live meter and, in mock mode, a "demo mode" tag.
+ */
+function VoiceBar({ voiceState, meter, mode, locale, theme }: VoiceBarProps): React.ReactElement {
+  const zh = locale === "zh-Hans"
+  const listening = voiceState === "listening"
+  const demoTag = zh ? "  （演示模式）" : "  (demo mode)"
+  return (
+    <Box borderStyle="single" borderColor={theme.kid} paddingX={1} flexDirection="column">
+      <Box>
+        <Text color={theme.kid}>{voiceBarLabel(voiceState, zh)}</Text>
+      </Box>
+      {listening ? (
+        <Box>
+          <Text color={theme.accent}>{meter}</Text>
+          {mode === "mock" ? (
+            <Text color={theme.fgDim} dimColor>{demoTag}</Text>
+          ) : null}
+        </Box>
+      ) : null}
+    </Box>
+  )
+}

package/src/render/ink/useVoiceInput.ts ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * React hook that wraps the core voice engine for the Ink UI.
+ *
+ * Keeps Ink out of core/: this hook is the ONLY place that turns the pure
+ * VoiceController + Recorder + STT adapter into component state (voiceState +
+ * meter string) and a few imperative handlers the MissionScreen binds to keys.
+ *
+ * Degrade-don't-crash, by design:
+ *   - No sox/ffmpeg on PATH → demo mode: skips real capture, still walks the
+ *     kid through the flow with a canned transcript so the UX is visible.
+ *   - No DeepRouter STT creds (env) → MockStt; a missing key never crashes.
+ * Both modes are surfaced via `mode` so the UI can show a "demo" hint.
+ *
+ * Note (v1): the meter is a "recording in progress" pulse, not true mic
+ * energy, and stop is manual (space/Enter) — real-energy VAD auto-stop lands
+ * once Recorder streams PCM levels. The state machine + STT path are the real,
+ * tested ones (see core/voice/controller.ts).
+ */
+import { useCallback, useEffect, useRef, useState } from "react"
+import { tmpdir } from "node:os"
+import { join } from "node:path"
+import { VoiceController } from "../../core/voice/controller.ts"
+import { Recorder, detectRecorder, type RecorderKind } from "../../core/voice/recorder.ts"
+import { resolveSttAdapter, MockStt, type SttAdapter } from "../../core/voice/stt.ts"
+import { renderMeter } from "../../core/voice/vad.ts"
+import type { VoiceState } from "../../core/voice/state.ts"
+/** Value returned by useVoiceInput: render state plus imperative handlers. */
+export interface UseVoiceInput {
+  /** Latest state reported by the voice controller; starts as "idle". */
+  voiceState: VoiceState
+  /** Glyph bar for the mic indicator while listening. */
+  meter: string
+  /** "deeprouter" = real STT, "mock" = canned (no key / no recorder). */
+  mode: "deeprouter" | "mock"
+  /** False until the one-time detectRecorder() probe resolves, then true.
+   *  startListening() is a no-op while this is false. */
+  ready: boolean
+  /** Begins a voice turn; ignored unless the hook is ready and idle. */
+  startListening: () => void
+  /** Stops the meter pulse and asks the current controller to stop. */
+  stopListening: () => void
+  /** Stops the meter pulse and asks the current controller to cancel. */
+  cancel: () => void
+}
+/**
+ * Builds the DeepRouter STT config from the KIDS_STT_* environment variables.
+ *
+ * Returns `undefined` unless KIDS_STT_BASE_URL, KIDS_STT_API_KEY and
+ * KIDS_STT_MODEL are all set to non-empty values. KIDS_STT_LANG is passed
+ * through as `language` and may itself be undefined. The caller hands the
+ * result to resolveSttAdapter.
+ */
+function sttConfigFromEnv() {
+  const {
+    KIDS_STT_BASE_URL: baseUrl,
+    KIDS_STT_API_KEY: apiKey,
+    KIDS_STT_MODEL: model,
+    KIDS_STT_LANG: language,
+  } = process.env
+  if (baseUrl && apiKey && model) {
+    return { baseUrl, apiKey, model, language }
+  }
+  return undefined
+}
+/**
+ * React hook exposing one voice turn at a time to the Ink UI.
+ *
+ * On mount it probes once for a recorder and an STT adapter; until that probe
+ * finishes `ready` is false and startListening() does nothing. Each call to
+ * startListening() builds a fresh VoiceController (with a no-op stub recorder
+ * when no real one was detected) and drives a synthetic meter pulse until the
+ * turn produces a transcript, errors, or is stopped/cancelled.
+ *
+ * @param onTranscript called with recognised text; MissionScreen passes it to
+ *        onPrompt() so it reaches the LLM exactly like a typed message.
+ * @returns current voice state, meter string, STT mode, readiness flag and the
+ *          start/stop/cancel handlers.
+ */
+export function useVoiceInput(onTranscript: (text: string) => void): UseVoiceInput {
+  const [voiceState, setVoiceState] = useState<VoiceState>("idle")
+  const [meter, setMeter] = useState("")
+  const [mode, setMode] = useState<"deeprouter" | "mock">("mock")
+  const [ready, setReady] = useState(false)
+  const recorderKindRef = useRef<RecorderKind | null>(null)
+  const sttRef = useRef<SttAdapter>(new MockStt())
+  const controllerRef = useRef<VoiceController | null>(null)
+  const pulseRef = useRef<ReturnType<typeof setInterval> | null>(null)
+  // Mirror of the latest controller state. Unlike `voiceState` it is readable
+  // synchronously, so it is not stale between a state change and the re-render.
+  const stateRef = useRef<VoiceState>("idle")
+  // One-time capability probe: which recorder (if any) + which STT adapter.
+  useEffect(() => {
+    let cancelled = false
+    void (async () => {
+      let kind: RecorderKind | null = null
+      let adapter: SttAdapter = new MockStt()
+      let sttMode: "deeprouter" | "mock" = "mock"
+      try {
+        kind = await detectRecorder()
+        const resolved = resolveSttAdapter(sttConfigFromEnv())
+        adapter = resolved.adapter
+        sttMode = resolved.mode
+      } catch {
+        // Degrade, don't crash: a failed probe means demo mode, not an
+        // unhandled rejection with `ready` stuck at false.
+        kind = null
+      }
+      if (cancelled) return
+      recorderKindRef.current = kind
+      // No recorder → demo transcript so the flow is still walkable.
+      sttRef.current = kind ? adapter : new MockStt()
+      setMode(kind ? sttMode : "mock")
+      setReady(true)
+    })()
+    return () => {
+      cancelled = true
+      if (pulseRef.current) {
+        clearInterval(pulseRef.current)
+        pulseRef.current = null
+      }
+      // Cancel a turn that is still in flight so the recorder is not left
+      // running after the screen unmounts. Failures are swallowed on purpose:
+      // nothing is mounted any more to report them to.
+      const inFlight = controllerRef.current
+      controllerRef.current = null
+      const lastState = stateRef.current
+      if (inFlight && (lastState === "listening" || lastState === "transcribing")) {
+        void Promise.resolve()
+          .then(() => inFlight.cancel())
+          .catch(() => {})
+      }
+    }
+  }, [])
+  const stopPulse = useCallback(() => {
+    if (pulseRef.current) {
+      clearInterval(pulseRef.current)
+      pulseRef.current = null
+    }
+    setMeter("")
+  }, [])
+  const startListening = useCallback(() => {
+    // Check both the rendered state and the synchronous mirror: a second call
+    // arriving before React re-renders still sees voiceState === "idle".
+    if (voiceState !== "idle" || stateRef.current !== "idle" || !ready) return
+    const kind = recorderKindRef.current
+    const outPath = join(tmpdir(), "kids-voice-clip.wav")
+    // Real recorder when present; a stub one in demo mode (start/stop no-op,
+    // stop() returns an empty clip and MockStt supplies canned text).
+    const recorder = kind
+      ? new Recorder(kind, outPath)
+      : {
+          start() {},
+          async stop() {
+            return { bytes: new Uint8Array(0), mimeType: "audio/wav" }
+          },
+          async cancel() {},
+        }
+    const controller = new VoiceController(recorder, sttRef.current, {
+      onState: (next) => {
+        stateRef.current = next
+        setVoiceState(next)
+      },
+      onTranscript: (text) => {
+        stopPulse()
+        onTranscript(text)
+      },
+      onError: () => {
+        stopPulse()
+      },
+    })
+    controllerRef.current = controller
+    // Never orphan an interval left over from an earlier turn.
+    if (pulseRef.current) clearInterval(pulseRef.current)
+    // "I'm listening" pulse — a lively bar so the kid knows the mic is hot,
+    // even before real PCM energy drives it. It is armed BEFORE start() so
+    // that an onError fired synchronously from start() can actually stop it.
+    let t = 0
+    pulseRef.current = setInterval(() => {
+      t += 1
+      const level = 0.35 + 0.4 * Math.abs(Math.sin(t / 2))
+      setMeter(renderMeter(level))
+    }, 120)
+    try {
+      controller.start()
+    } catch (err: unknown) {
+      // Keep the original propagation, but do not leave the pulse running.
+      stopPulse()
+      throw err
+    }
+  }, [voiceState, ready, onTranscript, stopPulse])
+  const stopListening = useCallback(() => {
+    stopPulse()
+    void controllerRef.current?.stop()
+  }, [stopPulse])
+  const cancel = useCallback(() => {
+    stopPulse()
+    void controllerRef.current?.cancel()
+  }, [stopPulse])
+  return { voiceState, meter, mode, ready, startListening, stopListening, cancel }
+}

package/src/voice-demo.tsx ADDED Viewed

@@ -0,0 +1,78 @@
+/**
+ * Standalone voice-input demo — `bun run voice-demo`.
+ *
+ * Renders the real MissionScreen with a fake in-memory store and a no-op LLM,
+ * so a human can try the voice flow end-to-end (press Space → talk → it echoes
+ * the transcript as a kid message) WITHOUT needing opencode serve, a provider
+ * key, or the wallet/audit backend. This is the "you test it" harness.
+ *
+ * Behaviour by environment:
+ *   - sox or ffmpeg on PATH  → real mic capture to a wav.
+ *   - KIDS_STT_* env set      → real DeepRouter transcription of that wav.
+ *   - neither                 → demo mode: canned transcript, flow still works.
+ */
+import React, { useState } from "react"
+import { render, Box, Text } from "ink"
+import { MissionScreen } from "./render/ink/screens/MissionScreen.tsx"
+import type { ChatMessage, KidsClientState } from "./core/store.ts"
+// UI language for the demo: English only when KIDS_LOCALE is exactly "en".
+const LOCALE: "zh-Hans" | "en" = process.env.KIDS_LOCALE === "en" ? "en" : "zh-Hans"
+/**
+ * Produces the fixed client state the demo feeds to MissionScreen.
+ *
+ * Everything except `messages` is constant: a single "demo" mission, no
+ * pending permission, no toast, not thinking. Titles follow LOCALE.
+ *
+ * @param messages chat history to show; stored in the state as-is.
+ */
+function baseState(messages: ChatMessage[]): KidsClientState {
+  const english = LOCALE === "en"
+  const packTitle = english ? "Voice Demo" : "语音演示"
+  const missionTitle = english ? "Press Space and talk" : "按空格说话试试"
+  const demoState: KidsClientState = {
+    screen: { kind: "mission" },
+    sessionId: "demo",
+    messages,
+    starsBalance: 100,
+    starsBudget: 200,
+    pendingPermission: null,
+    dangerousTopic: null,
+    thinking: false,
+    coursePack: "voice-demo",
+    mission: "demo",
+    packTitle,
+    missionTitle,
+    missionIndex: 1,
+    missionTotal: 1,
+    toast: null,
+    auditBuffer: [],
+  }
+  return demoState
+}
+// Module-level sequence used to mint message ids and timestamps for the demo.
+let counter = 0
+/**
+ * Root component of the voice demo: keeps the chat history in local state and
+ * renders the real MissionScreen on top of baseState().
+ *
+ * Each prompt (typed or transcribed) appends two messages: the kid's text and
+ * a canned agent acknowledgement quoting it.
+ */
+function DemoApp(): React.ReactElement {
+  const [messages, setMessages] = useState<ChatMessage[]>([])
+  const onPrompt = (text: string) => {
+    // Allocate ids and the timestamp BEFORE calling the setter. React may run
+    // an updater function more than once, so it must not mutate `counter`.
+    const kidSeq = counter++
+    const agentSeq = counter++
+    const ts = 1_700_000_000_000 + kidSeq
+    // Echo the (typed or transcribed) text as the kid's message, then a canned
+    // "AI" acknowledgement so the loop is visibly closed.
+    const turn: ChatMessage[] = [
+      { id: `k${kidSeq}`, actor: "kid", text, streaming: false, ts },
+      {
+        id: `a${agentSeq}`,
+        actor: "agent",
+        text: LOCALE === "en" ? `Got it — you said: "${text}"` : `收到啦——你说的是：「${text}」`,
+        streaming: false,
+        ts: ts + 1,
+      },
+    ]
+    setMessages((prev) => [...prev, ...turn])
+  }
+  return (
+    <Box flexDirection="column">
+      <Box marginBottom={1}>
+        <Text color="magenta">
+          {LOCALE === "en" ? "🎙 Voice demo — press Space to talk, Esc to quit" : "🎙 语音演示 — 按空格说话，Esc 退出"}
+        </Text>
+      </Box>
+      <MissionScreen state={baseState(messages)} locale={LOCALE} onPrompt={onPrompt} onAbort={() => {}} onExit={() => process.exit(0)} />
+    </Box>
+  )
+}
+// Mount the demo app with Ink and hold on to its exit promise; the promise is
+// deliberately not awaited (fire-and-forget).
+const demoInstance = render(<DemoApp />)
+void demoInstance.waitUntilExit()