@kidsinai/kids-client 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://json.schemastore.org/package.json",
3
3
  "name": "@kidsinai/kids-client",
4
- "version": "0.0.17",
4
+ "version": "0.0.18",
5
5
  "type": "module",
6
6
  "description": "Own-client TUI for Kids OpenCode — talks to local `opencode serve` via @opencode-ai/sdk v2 with kid-warm rendering, mission progress, permission dialog, and stderr-tail audit pipeline.",
7
7
  "license": "MIT",
@@ -24,7 +24,8 @@
24
24
  "files": ["src", "bin", "README.md", "LICENSE"],
25
25
  "scripts": {
26
26
  "typecheck": "tsc --noEmit",
27
- "test": "bun test"
27
+ "test": "bun test",
28
+ "voice-demo": "bun src/voice-demo.tsx"
28
29
  },
29
30
  "peerDependencies": {
30
31
  "@opencode-ai/sdk": ">=1.14.0"
@@ -34,7 +35,7 @@
34
35
  "ink-spinner": "^5.0.0",
35
36
  "ink-text-input": "^6.0.0",
36
37
  "react": "^18.3.1",
37
- "@kidsinai/kids-opencode-plugin": "^0.0.17"
38
+ "@kidsinai/kids-opencode-plugin": "^0.0.18"
38
39
  },
39
40
  "devDependencies": {
40
41
  "@opencode-ai/sdk": "^1.14.51",
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Voice controller — wires the parts into one "voice engine" the UI drives:
3
+ *
4
+ * spacebar ─▶ start() ─▶ recorder captures, feedLevel() streams energy
5
+ * │
6
+ * VAD says stop (silence/maxlen) ──▶ stop()
7
+ * │
8
+ * recorder → AudioClip → STT → text ──▶ onTranscript(text)
9
+ * (UI calls session.prompt)
10
+ *
11
+ * Deliberately owns NO timers and does NO spawning itself — the recorder
12
+ * produces level events, the UI/recorder calls feedLevel(), and this class
13
+ * only advances the state machine and decides start/stop/cancel. That keeps
14
+ * it pure enough to unit-test the whole orchestration with a mock recorder +
15
+ * MockStt, no microphone or clock required.
16
+ */
17
+
18
+ import { transition, type VoiceState } from "./state.ts"
19
+ import { shouldAutoStop, DEFAULT_VAD, type VadOptions } from "./vad.ts"
20
+ import type { AudioClip, SttAdapter } from "./stt.ts"
21
+
22
/**
 * The slice of a recorder that VoiceController actually drives. The real
 * Recorder satisfies it; tests can pass a mock instead.
 */
export interface RecorderLike {
  /** Begin capturing. */
  start(): void
  /** End capture and resolve with the recorded clip. */
  stop(): Promise<AudioClip>
  /** Abort capture without producing a clip. */
  cancel(): Promise<void>
}
28
+
29
/** Optional callbacks through which VoiceController reports progress to the UI. */
export interface VoiceControllerEvents {
  /** Fired on every state change so the UI can redraw mic indicator / meter / spinner. */
  onState?: (state: VoiceState) => void
  /** Fired with the latest mic energy (0..1) so the UI can draw the meter. */
  onLevel?: (level: number) => void
  /** Fired when STT produced text; the UI sends it via session.prompt and echoes it. */
  onTranscript?: (text: string) => void
  /** Fired when recording or STT failed; the UI shows a gentle retry hint. */
  onError?: (err: Error) => void
}
39
+
40
/**
 * Drives one voice turn at a time: opens the recorder, accumulates level
 * samples for the VAD, runs STT on the finished clip and reports the result
 * through the optional event callbacks. All state changes go through the pure
 * `transition` function, so illegal events are silently ignored.
 */
export class VoiceController {
  private state: VoiceState = "idle"
  private levels: number[] = []
  /**
   * Turn token, bumped by start() and cancel(). stop() captures it before its
   * first await and re-checks it afterwards; a mismatch means the turn was
   * cancelled (or superseded) while recorder/STT work was in flight, so the
   * late result must be dropped rather than sent.
   */
  private turn = 0

  constructor(
    private recorder: RecorderLike,
    private stt: SttAdapter,
    private events: VoiceControllerEvents = {},
    private vad: VadOptions = DEFAULT_VAD,
  ) {}

  /** Current state of the voice turn. */
  getState(): VoiceState {
    return this.state
  }

  /** Store the new state and notify onState — only when it actually changed. */
  private set(next: VoiceState): void {
    if (next === this.state) return
    this.state = next
    this.events.onState?.(next)
  }

  /** Spacebar while idle. Opens the mic. No-op if not idle. */
  start(): void {
    if (this.state !== "idle") return
    this.levels = []
    this.turn++
    this.set(transition(this.state, { type: "START" }))
    this.recorder.start()
  }

  /**
   * Feed one energy sample. Forwards it to onLevel and, once the VAD returns
   * anything other than "continue", auto-stops — so the kid only ever pressed
   * the spacebar once. No-op unless we're listening.
   */
  feedLevel(level: number): void {
    if (this.state !== "listening") return
    this.levels.push(level)
    this.events.onLevel?.(level)
    if (shouldAutoStop(this.levels, this.vad) !== "continue") {
      // Fire-and-forget: stop() handles its own failures via onError.
      void this.stop()
    }
  }

  /**
   * Spacebar/Enter again, or VAD auto-stop. Ends capture, runs STT, emits text.
   * No-op unless listening. If cancel() lands while the recorder or STT is
   * still working, the result (transcript or error) is discarded: a cancelled
   * clip must never reach onTranscript.
   */
  async stop(): Promise<void> {
    if (this.state !== "listening") return
    const turn = this.turn
    this.set(transition(this.state, { type: "STOP" })) // → transcribing
    try {
      const clip = await this.recorder.stop()
      if (turn !== this.turn) return // cancelled while the recorder was closing
      const { text } = await this.stt.transcribe(clip)
      if (turn !== this.turn) return // cancelled while STT was running
      this.set(transition(this.state, { type: "TRANSCRIBED" })) // → thinking
      this.events.onTranscript?.(text)
    } catch (err) {
      if (turn !== this.turn) return // failure of an already-cancelled turn: ignore
      this.set(transition(this.state, { type: "ERROR" }))
      this.events.onError?.(err instanceof Error ? err : new Error(String(err)))
    }
  }

  /**
   * Esc. Throws the clip away with no send. While listening the recorder is
   * cancelled first; in every state the CANCEL event is then offered to the
   * state machine (which ignores it where it is illegal). Bumping the turn
   * token invalidates any stop() that is still awaiting recorder/STT.
   */
  async cancel(): Promise<void> {
    if (this.state === "listening") {
      await this.recorder.cancel()
    }
    this.turn++
    this.set(transition(this.state, { type: "CANCEL" }))
  }

  /** UI signals the LLM reply landed. */
  replied(): void {
    this.set(transition(this.state, { type: "REPLIED" }))
  }

  /** UI signals TTS finished (or was skipped). */
  spoken(): void {
    this.set(transition(this.state, { type: "SPOKEN" }))
  }

  /** UI signals the error was acknowledged. */
  reset(): void {
    this.set(transition(this.state, { type: "RESET" }))
  }
}
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Microphone capture (side-effecting; the pure bits are extracted for tests).
3
+ *
4
+ * Strategy: shell out to a system recorder (sox `rec` or ffmpeg) writing a wav,
5
+ * same spawn pattern as core/serve-manager.ts. We also read the PCM stream to
6
+ * compute a rolling RMS energy level so the UI can draw a live mic meter and
7
+ * the VAD can auto-stop — terminals can't show a waveform any other way.
8
+ *
9
+ * No recorder on PATH must NOT crash the client: detectRecorder() returns null
10
+ * and the controller can fall back to a simulated level source (demo mode),
11
+ * so a kid on a box without sox still sees the flow, just with canned audio.
12
+ */
13
+
14
import { rm } from "node:fs/promises"
import { spawn, type Subprocess } from "bun"
import type { AudioClip } from "./stt.ts"
16
+
17
/** System recorders this module knows how to drive. */
export type RecorderKind = "sox" | "ffmpeg"

/** A ready-to-spawn capture command together with where its output lands. */
export interface RecordCommand {
  /** Argument vector handed to spawn(), program name first. */
  cmd: string[]
  /** Path the recorder writes the clip to. */
  outPath: string
  /** MIME type reported for the written clip. */
  mimeType: string
}
25
+
26
/**
 * Build the capture command for a recorder. Pure → unit-testable. Both
 * recorders are asked for 16 kHz mono wav written to `outPath`.
 *
 * @param kind     Which recorder binary to drive.
 * @param outPath  File the recorder writes the clip to.
 * @param platform OS identifier in `process.platform` form; defaults to the
 *                 current platform. Only consulted for ffmpeg, whose capture
 *                 device flag is OS-specific: "linux" uses ALSA's default
 *                 device, every other value keeps the avfoundation (macOS)
 *                 default-mic input.
 * @returns The argv to spawn plus the output path and MIME type.
 */
export function buildRecordCommand(
  kind: RecorderKind,
  outPath: string,
  platform: string = process.platform,
): RecordCommand {
  if (kind === "sox") {
    // `rec` is sox's record front-end. -q quiet, -c 1 mono, -r 16000 rate.
    return { cmd: ["rec", "-q", "-c", "1", "-r", "16000", outPath], outPath, mimeType: "audio/wav" }
  }
  // avfoundation only exists in macOS builds of ffmpeg; on Linux the same
  // command exits immediately, so pick the ALSA input device there instead.
  const input =
    platform === "linux" ? ["-f", "alsa", "-i", "default"] : ["-f", "avfoundation", "-i", ":0"]
  return {
    cmd: ["ffmpeg", "-loglevel", "quiet", ...input, "-ac", "1", "-ar", "16000", "-y", outPath],
    outPath,
    mimeType: "audio/wav",
  }
}
42
+
43
/**
 * Root-mean-square energy of a chunk of signed 16-bit PCM, with each sample
 * scaled by 1/32768 so the result lands in 0..1. An empty chunk yields 0.
 * Pure → unit-testable.
 */
export function computeRms(pcm16: Int16Array): number {
  const count = pcm16.length
  if (count === 0) return 0
  let energy = 0
  for (const sample of pcm16) {
    const unit = sample / 32768 // scale to -1..1
    energy += unit * unit
  }
  return Math.sqrt(energy / count)
}
56
+
57
/**
 * Look for a usable recorder by running `which` on each candidate binary
 * (`rec` for sox first, then `ffmpeg`). Resolves with the first kind whose
 * probe exits 0, or null when none does — callers degrade to demo mode
 * instead of crashing.
 */
export async function detectRecorder(): Promise<RecorderKind | null> {
  const candidates: ReadonlyArray<readonly [RecorderKind, string]> = [
    ["sox", "rec"],
    ["ffmpeg", "ffmpeg"],
  ]
  for (const [kind, binary] of candidates) {
    try {
      const probe = spawn({ cmd: ["which", binary], stdout: "pipe", stderr: "ignore" })
      await probe.exited
      if (probe.exitCode === 0) return kind
    } catch {
      // The probe itself could not run — move on to the next candidate.
    }
  }
  return null
}
72
+
73
/** Optional callbacks a Recorder can be constructed with. */
export interface RecorderEvents {
  /** Intended to fire ~every sampleIntervalMs with the latest normalised energy 0..1. */
  onLevel?: (level: number) => void
}
77
+
78
/**
 * Owns one recording. start() spawns the recorder; stop() ends it and reads
 * the written wav back as an AudioClip; cancel() kills it and deletes whatever
 * was written so an aborted clip does not linger on disk.
 *
 * The events object is accepted but nothing in this class invokes it yet.
 */
export class Recorder {
  private child: Subprocess | null = null
  private cmd: RecordCommand

  constructor(kind: RecorderKind, outPath: string, private _events: RecorderEvents = {}) {
    this.cmd = buildRecordCommand(kind, outPath)
  }

  /** Spawn the recorder process. No-op while one is already running. */
  start(): void {
    if (this.child) return
    this.child = spawn({ cmd: this.cmd.cmd, stdout: "ignore", stderr: "ignore" })
  }

  /**
   * Stop recording and return the captured clip.
   * Rejects if the output file cannot be read (e.g. the recorder never wrote it).
   */
  async stop(): Promise<AudioClip> {
    await this.kill()
    const bytes = new Uint8Array(await Bun.file(this.cmd.outPath).arrayBuffer())
    return { bytes, mimeType: this.cmd.mimeType }
  }

  /** Abort and discard — no clip, no STT, no send, and no leftover audio file. */
  async cancel(): Promise<void> {
    await this.kill()
    try {
      // force: a recorder that never wrote anything is not an error.
      await rm(this.cmd.outPath, { force: true })
    } catch {
      // Best-effort cleanup: a failed delete must not make cancel() reject
      // and leave the caller stuck mid-cancel.
    }
  }

  /** Signal the child (if any, and not already killed), wait for it to exit, then forget it. */
  private async kill(): Promise<void> {
    if (this.child && !this.child.killed) {
      this.child.kill()
      await this.child.exited
    }
    this.child = null
  }
}
@@ -0,0 +1,92 @@
1
+ /**
2
+ * Voice-input state machine (pure, unit-testable).
3
+ *
4
+ * Why a machine and not booleans: a kid mashing the spacebar mid-transcription
5
+ * must never start a second recording or send a half clip. Modelling the
6
+ * legal transitions explicitly makes "you can only stop while listening,
7
+ * only cancel before we've spoken" enforceable in one place instead of
8
+ * scattered across the Ink components.
9
+ *
10
+ * Terminal constraint that shapes this: a TTY reports key-DOWN but not
11
+ * key-UP, so there is no hold-to-talk. The only press we get is a toggle.
12
+ * Hence START and STOP are both driven by the same spacebar press, and the
13
+ * machine — not the key handler — decides which one a given press means.
14
+ *
15
+ * Lifecycle:
16
+ * idle ──START──▶ listening ──STOP──▶ transcribing ──TRANSCRIBED──▶ thinking
17
+ * │ │ │
18
+ * CANCEL ERROR REPLIED
19
+ * ▼ ▼ ▼
20
+ * idle error speaking ──SPOKEN──▶ idle
21
+ */
22
+
23
/** Every phase a voice turn can be in. */
export type VoiceState = "idle" | "listening" | "transcribing" | "thinking" | "speaking" | "error"
30
+
31
/** Everything that can be offered to the voice state machine. */
export type VoiceEvent =
  | { type: "START" } // spacebar while idle: open the mic
  | { type: "STOP" } // spacebar/Enter again, or VAD auto-stop: close the mic, begin STT
  | { type: "CANCEL" } // Esc: throw the clip away, no send
  | { type: "TRANSCRIBED" } // STT returned text; hand it to the LLM
  | { type: "REPLIED" } // LLM reply arrived (optionally about to be spoken aloud)
  | { type: "SPOKEN" } // TTS finished (or was skipped)
  | { type: "ERROR" } // recording / STT / TTS blew up
  | { type: "RESET" } // kid acknowledged the error screen
48
+
49
/** Legal moves per state; any (state, event) pair not listed here is ignored. */
const NEXT_STATE: Record<VoiceState, Partial<Record<VoiceEvent["type"], VoiceState>>> = {
  idle: { START: "listening" },
  listening: { STOP: "transcribing", CANCEL: "idle", ERROR: "error" },
  transcribing: { TRANSCRIBED: "thinking", CANCEL: "idle", ERROR: "error" },
  thinking: { REPLIED: "speaking", ERROR: "error" },
  // SPOKEN closes the loop; CANCEL lets a kid cut off a long spoken reply.
  speaking: { SPOKEN: "idle", CANCEL: "idle", ERROR: "error" },
  error: { RESET: "idle" },
}

/**
 * Pure transition function. Looks the event up in the table of legal moves
 * and returns the resulting state; an event that is illegal in the current
 * state returns that SAME state, so callers can treat "no change" as an
 * ignored keypress (spacebar spam during transcribing is a no-op, not a crash).
 */
export function transition(state: VoiceState, event: VoiceEvent): VoiceState {
  return NEXT_STATE[state][event.type] ?? state
}
81
+
82
/**
 * Whether the mic is capturing: true only for "listening". Used by the UI
 * to show the "🎙 听你说…" indicator and by audit/compliance to assert the mic
 * is never open in any other state.
 */
export function isMicOpen(state: VoiceState): boolean {
  switch (state) {
    case "listening":
      return true
    default:
      return false
  }
}
88
+
89
/**
 * Whether Esc can still abort the turn: true while listening, transcribing
 * or speaking; false for idle, thinking and error.
 */
export function isCancellable(state: VoiceState): boolean {
  switch (state) {
    case "listening":
    case "transcribing":
    case "speaking":
      return true
    default:
      return false
  }
}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Speech-to-text adapter (pluggable).
3
+ *
4
+ * HARD RULE (moat + minors compliance): STT MUST go through DeepRouter, never
5
+ * a third-party STT API directly. DeepRouter is the single gateway where we
6
+ * meter cost (Stars), enforce AU data residency, and capture the interaction
7
+ * data flywheel. Bypassing it leaks the moat — see airbotix
8
+ * docs/product/moat-strategy.md.
9
+ *
10
+ * The controller depends only on `SttAdapter`, so tests use the mock and a
11
+ * no-key dogfood run degrades to mock instead of crashing.
12
+ */
13
+
14
/** One captured utterance, as handed from the recorder to an STT adapter. */
export interface AudioClip {
  /** The encoded audio itself (e.g. wav/webm bytes from the recorder). */
  bytes: Uint8Array
  /** MIME type such as "audio/wav"; it decides the multipart filename/type. */
  mimeType: string
}
20
+
21
/** What an STT adapter resolves with. */
export interface SttResult {
  /** The recognised text. */
  text: string
  /** Backend-reported confidence in 0..1; undefined when the backend gives none. */
  confidence?: number
}
26
+
27
/** The only surface the voice controller needs from a speech-to-text backend. */
export interface SttAdapter {
  /** Turn one clip into text. */
  transcribe(clip: AudioClip): Promise<SttResult>
}
30
+
31
/** Connection settings for the DeepRouter-backed STT adapter. */
export interface DeepRouterSttConfig {
  /** DeepRouter OpenAI-compatible base, e.g. https://api.deeprouter.../v1 */
  baseUrl: string
  /** Sent as the Bearer token on every request. */
  apiKey: string
  /** Whisper-style model id exposed by DeepRouter. */
  model: string
  /** Optional BCP-47 hint ("en", "zh") to bias recognition. */
  language?: string
}
40
+
41
/** File extension to advertise for each bare (parameter-free, lower-case) MIME type. */
const MIME_EXT: Record<string, string> = {
  "audio/wav": "wav",
  "audio/x-wav": "wav",
  "audio/webm": "webm",
  "audio/ogg": "ogg",
  "audio/mpeg": "mp3",
}

/**
 * Build the multipart body for DeepRouter's /audio/transcriptions endpoint.
 * Pulled out as a pure helper so the field shape is unit-testable without a
 * live network call. Mirrors the OpenAI Whisper request contract that
 * DeepRouter is expected to proxy (⚙️ confirm DeepRouter exposes this path).
 *
 * Fields: `file` (the clip, named `clip.<ext>`), `model`, and `language` only
 * when the config carries one. The extension is looked up from the clip's
 * MIME type after dropping any parameters and lower-casing it, so
 * "audio/webm;codecs=opus" is named `.webm`; unknown types fall back to `.wav`.
 */
export function buildTranscriptionForm(clip: AudioClip, cfg: DeepRouterSttConfig): FormData {
  // "type/subtype; param=value" → "type/subtype"
  const bareType = clip.mimeType.split(";")[0]!.trim().toLowerCase()
  const ext = MIME_EXT[bareType] ?? "wav"
  const form = new FormData()
  // The Blob keeps the clip's full MIME type; only the filename uses the bare one.
  form.append("file", new Blob([clip.bytes as BlobPart], { type: clip.mimeType }), `clip.${ext}`)
  form.append("model", cfg.model)
  if (cfg.language) form.append("language", cfg.language)
  return form
}
61
+
62
/** Narrow an unknown JSON value to a non-null object whose fields can be probed. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null
}

/**
 * Pull the transcript text out of an OpenAI-compatible JSON response,
 * tolerating the common shapes ({text} or {data:{text}}).
 *
 * A top-level string `text` wins; otherwise a string `data.text` is used.
 * `confidence` is passed through only when it is a finite number, so a
 * backend that sends e.g. a string there cannot break the declared type.
 *
 * @returns The result, or null when the payload is not an object or carries
 *          no string transcript in either place.
 */
export function extractTranscript(payload: unknown): SttResult | null {
  if (!isJsonObject(payload)) return null
  const nested = isJsonObject(payload.data) ? payload.data.text : undefined
  const text = typeof payload.text === "string" ? payload.text : nested
  if (typeof text !== "string") return null
  const raw = payload.confidence
  const confidence = typeof raw === "number" && Number.isFinite(raw) ? raw : undefined
  return { text, confidence }
}
71
+
72
/**
 * STT adapter that POSTs the clip to `<baseUrl>/audio/transcriptions` with a
 * Bearer token and parses the JSON reply into an SttResult.
 */
export class DeepRouterStt implements SttAdapter {
  constructor(private cfg: DeepRouterSttConfig) {}

  /**
   * Transcribe one clip.
   * @throws Error carrying the HTTP status and a body excerpt on a non-2xx
   *         reply, or when the reply has no recognisable transcript. Network
   *         and JSON-parse failures propagate from fetch / res.json().
   */
  async transcribe(clip: AudioClip): Promise<SttResult> {
    // Tolerate a configured base that ends in "/", which would otherwise
    // produce ".../v1//audio/transcriptions".
    const base = this.cfg.baseUrl.replace(/\/+$/, "")
    const res = await fetch(`${base}/audio/transcriptions`, {
      method: "POST",
      // No content-type here: fetch derives the multipart boundary from the FormData body.
      headers: { authorization: `Bearer ${this.cfg.apiKey}` },
      body: buildTranscriptionForm(clip, this.cfg),
    })
    if (!res.ok) {
      throw new Error(`DeepRouter STT ${res.status}: ${await safeText(res)}`)
    }
    const result = extractTranscript(await res.json())
    if (!result) throw new Error("DeepRouter STT: unrecognised response shape")
    return result
  }
}
89
+
90
/**
 * Adapter that ignores the clip and always resolves with the same canned
 * text at confidence 1 — for tests and no-key dogfood runs.
 */
export class MockStt implements SttAdapter {
  private canned: string

  constructor(canned = "(示例)帮我做一个会动的小猫") {
    this.canned = canned
  }

  transcribe(_clip: AudioClip): Promise<SttResult> {
    const result: SttResult = { text: this.canned, confidence: 1 }
    return Promise.resolve(result)
  }
}
97
+
98
/**
 * Choose the STT adapter for a (possibly partial or missing) config.
 *
 * Only when baseUrl, apiKey and model are all truthy is a DeepRouterStt built;
 * anything less yields a MockStt. The returned `mode` tells the caller which
 * one it got, so the UI can show a "voice is in demo mode" hint — a missing
 * key must never hard-crash the client.
 */
export function resolveSttAdapter(
  cfg: Partial<DeepRouterSttConfig> | undefined,
): { adapter: SttAdapter; mode: "deeprouter" | "mock" } {
  const hasCreds = Boolean(cfg?.baseUrl && cfg.apiKey && cfg.model)
  if (!hasCreds) {
    return { adapter: new MockStt(), mode: "mock" }
  }
  return { adapter: new DeepRouterStt(cfg as DeepRouterSttConfig), mode: "deeprouter" }
}
111
+
112
/**
 * Read at most the first 200 characters of a response body for use in an
 * error message; yields "<no body>" if reading the body fails.
 */
async function safeText(res: Response): Promise<string> {
  try {
    const body = await res.text()
    return body.slice(0, 200)
  } catch {
    return "<no body>"
  }
}
@@ -0,0 +1,92 @@
1
+ /**
2
+ * Voice-activity detection + mic-meter rendering (pure, unit-testable).
3
+ *
4
+ * This is a deliberately tiny energy-based VAD, not a neural one. The job:
5
+ * let a kid press the spacebar ONCE, talk, and have the mic close on its own
6
+ * when they stop — so they never have to remember a second keypress. A real
7
+ * silero/webrtc VAD can drop in behind the same `shouldAutoStop` shape later;
8
+ * the controller only depends on this signature.
9
+ *
10
+ * Energy levels are normalised 0..1 (0 = silence, 1 = loud). The recorder
11
+ * feeds a rolling window of recent levels; we decide stop/continue from it.
12
+ */
13
+
14
/** Tuning knobs for the energy-based auto-stop decision. */
export interface VadOptions {
  /** A frame whose normalised energy is not above this counts as silence. */
  silenceThreshold: number
  /** Length of the trailing silence (ms) that auto-stops the recording. */
  silenceMsToStop: number
  /** Time between two consecutive level samples (ms). */
  sampleIntervalMs: number
  /**
   * Amount of non-silent audio (ms) required before silence may stop the
   * recording, so a slow starter who pauses before their first word isn't
   * cut off.
   */
  minSpeechMs: number
  /**
   * Hard cap (ms): stop no matter what, so a stuck-open mic (or a kid who
   * wandered off) can't record forever. Compliance + cost guard.
   */
  maxClipMs: number
}
28
+
29
/** Defaults used whenever a caller does not supply its own VadOptions. */
export const DEFAULT_VAD: VadOptions = {
  sampleIntervalMs: 100, // one level sample every 100 ms
  silenceThreshold: 0.06,
  minSpeechMs: 400,
  silenceMsToStop: 1500,
  maxClipMs: 30_000, // 30 s hard cap
}

/** Outcome of one auto-stop check: keep going, or stop and why. */
export type VadDecision = "continue" | "stop_silence" | "stop_max_length"
38
+
39
/**
 * Decide, from every level sample captured so far (oldest→newest), whether
 * recording should go on. Pure: no clock is read — elapsed time is simply
 * `levels.length * sampleIntervalMs`.
 *
 * Checks, first match wins:
 *   1. Total length has reached maxClipMs          → "stop_max_length".
 *   2. Fewer than minSpeechMs of non-silent frames → "continue"
 *      (a slow start is not punished).
 *   3. Trailing run of silent frames ≥ silenceMsToStop → "stop_silence".
 *   4. Anything else                                → "continue".
 */
export function shouldAutoStop(levels: number[], opts: VadOptions = DEFAULT_VAD): VadDecision {
  const { silenceThreshold, silenceMsToStop, sampleIntervalMs, minSpeechMs, maxClipMs } = opts
  const isVoiced = (lvl: number): boolean => lvl > silenceThreshold

  if (levels.length * sampleIntervalMs >= maxClipMs) return "stop_max_length"

  let voicedFrames = 0
  for (const lvl of levels) {
    if (isVoiced(lvl)) voicedFrames++
  }
  if (voicedFrames * sampleIntervalMs < minSpeechMs) return "continue"

  // Walk back from the newest sample to the most recent voiced frame.
  let lastVoiced = levels.length - 1
  while (lastVoiced >= 0 && !isVoiced(levels[lastVoiced]!)) lastVoiced--
  const quietFrames = levels.length - 1 - lastVoiced

  return quietFrames * sampleIntervalMs >= silenceMsToStop ? "stop_silence" : "continue"
}
68
+
69
/** Block glyphs from lowest to tallest; the meter picks one per column. */
const METER_GLYPHS = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"] as const

/**
 * Render a live mic meter from the latest energy level. Terminals can't draw
 * graphics, but a row of block glyphs that jumps with the kid's voice is the
 * single most important "it's really listening to ME" signal — without it a
 * kid stares at a frozen screen and gives up.
 *
 * @param level Energy, clamped into 0..1. NaN is treated as 0 (silence) —
 *              otherwise it would index the glyph table with NaN and print
 *              the literal text "undefined" once per column.
 * @param width Number of glyph columns to produce (default 12).
 * @returns `width` glyphs; columns toward the centre are drawn a little
 *          taller than the edges so the bar is not flat.
 */
export function renderMeter(level: number, width = 12): string {
  const safeLevel = Number.isNaN(level) ? 0 : level
  const clamped = Math.max(0, Math.min(1, safeLevel))
  const tallest = METER_GLYPHS.length - 1
  let out = ""
  for (let i = 0; i < width; i++) {
    // Columns toward the centre read a touch taller — cheap "waveform" feel
    // without needing real per-column energy.
    const centreBias = 1 - Math.abs(i - (width - 1) / 2) / (width / 2)
    const h = clamped * (0.6 + 0.4 * centreBias)
    const idx = Math.min(tallest, Math.round(h * tallest))
    out += METER_GLYPHS[idx]
  }
  return out
}
@@ -10,6 +10,7 @@
10
10
  */
11
11
 
12
12
  import React, { useSyncExternalStore } from "react"
13
+ import { Box, useStdout } from "ink"
13
14
  import type { InstalledPack } from "../../core/course-pack.ts"
14
15
  import type { ErrorVariant, Store } from "../../core/store.ts"
15
16
  import { StartupScreen } from "./screens/StartupScreen.tsx"
@@ -79,14 +80,35 @@ export function App(deps: AppDeps): React.ReactElement {
79
80
  () => deps.store.getSnapshot(),
80
81
  () => deps.store.getSnapshot(),
81
82
  )
83
+ // Pin the App's footprint to the terminal's full dimensions. Without
84
+ // this, MissionScreen's `flexGrow={1}` middle box (chat + spinner) made
85
+ // the App's TOTAL rendered height shift by ±1 line on every keystroke /
86
+ // spinner tick / streaming chunk. Ink's diff move-cursor-up-by-N then
87
+ // used a stale N from the previous frame, so each new frame got drawn
88
+ // one row LOWER than the last — leaving the previous frame's top
89
+ // border behind. Result: a cascade of ┌──┐ stripes piling up above the
90
+ // Header. With width+height fixed to the terminal, the App's footprint
91
+ // never changes between renders and Ink's diff stays correct.
92
+ const { stdout } = useStdout()
93
+ const width = stdout?.columns && stdout.columns > 4 ? stdout.columns : 80
94
+ // -1 to leave a row for the terminal cursor / status; without it some
95
+ // terminals scroll the App by one line on the first render.
96
+ const height = stdout?.rows && stdout.rows > 4 ? stdout.rows - 1 : 23
82
97
 
98
+ const screen = renderScreen(state, deps)
99
+ return (
100
+ <Box width={width} height={height} flexDirection="column">
101
+ {screen}
102
+ </Box>
103
+ )
104
+ }
105
+
106
+ function renderScreen(state: ReturnType<Store["getSnapshot"]>, deps: AppDeps): React.ReactElement | null {
83
107
  // Dangerous-topic overlay takes absolute priority — it has to be the
84
108
  // first thing on screen the moment a pattern hits, even mid-stream.
85
109
  if (state.dangerousTopic) {
86
110
  return <DangerousTopicModal topic={state.dangerousTopic} locale={deps.locale} onAcknowledge={deps.onDangerousAcknowledge} />
87
111
  }
88
-
89
- // Permission modal is the next-highest priority.
90
112
  if (state.pendingPermission) {
91
113
  return (
92
114
  <PermissionModal
@@ -98,7 +120,6 @@ export function App(deps: AppDeps): React.ReactElement {
98
120
  />
99
121
  )
100
122
  }
101
-
102
123
  switch (state.screen.kind) {
103
124
  case "loading":
104
125
  return <LoadingScreen locale={deps.locale} message={state.screen.message} />
@@ -149,4 +170,5 @@ export function App(deps: AppDeps): React.ReactElement {
149
170
  />
150
171
  )
151
172
  }
173
+ return null
152
174
  }
@@ -17,6 +17,7 @@ import { Input } from "../components/Input.tsx"
17
17
  import { Thinking } from "../components/Thinking.tsx"
18
18
  import { Toast } from "../components/Toast.tsx"
19
19
  import { getTheme } from "../theme.ts"
20
+ import { useVoiceInput } from "../useVoiceInput.ts"
20
21
  import type { KidsClientState } from "../../../core/store.ts"
21
22
 
22
23
  interface MissionScreenProps {
@@ -33,20 +34,35 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
33
34
  const [draft, setDraft] = useState("")
34
35
  const placeholder = locale === "zh-Hans" ? "想做什么?告诉我吧(中文/英文都行)" : "What would you like to make? (English or Chinese)"
35
36
 
36
- // Esc is overloaded so it never eats the kid's typing: while the AI is
37
- // thinking it interrupts; with text typed it clears the draft; when idle and
38
- // empty it leaves the mission back to the startup menu (so the kid isn't
39
- // trapped here dogfood feedback).
40
- useInput((_, key) => {
41
- if (!key.escape) return
42
- if (state.thinking) onAbort()
43
- else if (draft.length > 0) setDraft("")
44
- else onExit()
37
+ const voice = useVoiceInput(onPrompt)
38
+ const voiceBusy = voice.voiceState !== "idle"
39
+ // Spacebar talks ONLY when the kid isn't mid-typing — a non-empty draft means
40
+ // they're writing, so spacebar must stay a literal space there.
41
+ const canTalk = !state.thinking && state.pendingPermission === null && draft.trim() === "" && voice.ready
42
+
43
+ // Esc is overloaded so it never eats the kid's typing: while recording it
44
+ // cancels voice; while the AI is thinking it interrupts; with text typed it
45
+ // clears the draft; when idle + empty it leaves back to the startup menu (so
46
+ // the kid isn't trapped here — dogfood feedback).
47
+ useInput((input, key) => {
48
+ if (voiceBusy) {
49
+ if (key.escape) voice.cancel()
50
+ else if (key.return || input === " ") voice.stopListening()
51
+ return
52
+ }
53
+ if (key.escape) {
54
+ if (state.thinking) onAbort()
55
+ else if (draft.length > 0) setDraft("")
56
+ else onExit()
57
+ } else if (input === " " && canTalk) {
58
+ setDraft("")
59
+ voice.startListening()
60
+ }
45
61
  })
46
62
 
47
63
  const hint = locale === "zh-Hans"
48
- ? "提示:做完一关时打 /check 或「我做完了」就能验收 · Esc 打断 AI / 返回菜单"
49
- : "Tip: type /check or 'I'm done' to validate · Esc interrupts the AI / returns to menu"
64
+ ? "提示:按「空格」对小助手说话 · 打 /check 或「我做完了」验收 · Esc 打断 AI / 返回菜单"
65
+ : "Tip: press Space to talk · type /check or 'I'm done' to validate · Esc interrupts AI / returns to menu"
50
66
 
51
67
  return (
52
68
  <Box flexDirection="column">
@@ -67,18 +83,22 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
67
83
  )}
68
84
  </Box>
69
85
  <Box marginTop={1}>
70
- <Input
71
- value={draft}
72
- onChange={setDraft}
73
- onSubmit={(v) => {
74
- const text = v.trim()
75
- if (!text) return
76
- setDraft("")
77
- onPrompt(text)
78
- }}
79
- placeholder={placeholder}
80
- disabled={state.thinking || state.pendingPermission !== null}
81
- />
86
+ {voiceBusy ? (
87
+ <VoiceBar voiceState={voice.voiceState} meter={voice.meter} mode={voice.mode} locale={locale} theme={theme} />
88
+ ) : (
89
+ <Input
90
+ value={draft}
91
+ onChange={setDraft}
92
+ onSubmit={(v) => {
93
+ const text = v.trim()
94
+ if (!text) return
95
+ setDraft("")
96
+ onPrompt(text)
97
+ }}
98
+ placeholder={placeholder}
99
+ disabled={state.thinking || state.pendingPermission !== null}
100
+ />
101
+ )}
82
102
  </Box>
83
103
  {state.toast ? (
84
104
  <Box marginTop={1}>
@@ -92,3 +112,41 @@ export function MissionScreen({ state, locale, onPrompt, onAbort, onExit }: Miss
92
112
  </Box>
93
113
  )
94
114
  }
115
+
116
/** Inputs for VoiceBar, the widget shown in place of the text input during a voice turn. */
interface VoiceBarProps {
  /** Current voice state as reported by useVoiceInput; selects the label. */
  voiceState: ReturnType<typeof useVoiceInput>["voiceState"]
  /** Pre-rendered glyph bar; displayed only while listening. */
  meter: string
  /** "mock" adds a demo-mode hint next to the meter. */
  mode: "deeprouter" | "mock"
  /** UI language for the labels. */
  locale: "zh-Hans" | "en"
  /** Colour palette as returned by getTheme. */
  theme: ReturnType<typeof getTheme>
}
123
+
124
/**
 * Stand-in for the input box while a voice turn is in flight. Always shows a
 * one-line status label chosen from the voice state and locale; while
 * listening it also shows the meter, plus a demo-mode hint when STT is mocked.
 */
function VoiceBar({ voiceState, meter, mode, locale, theme }: VoiceBarProps): React.ReactElement {
  const zh = locale === "zh-Hans"
  const listening = voiceState === "listening"

  let label: string
  switch (voiceState) {
    case "listening":
      label = zh ? "🎙 听你说…(再按空格 或 回车 结束,Esc 取消)" : "🎙 Listening… (Space/Enter to finish, Esc to cancel)"
      break
    case "transcribing":
      label = zh ? "✍️ 正在听懂你说的话…" : "✍️ Figuring out what you said…"
      break
    case "error":
      label = zh ? "😅 没听清,按空格再试一次" : "😅 Didn't catch that — press Space to retry"
      break
    default:
      // Every remaining state shares the generic "thinking" line.
      label = zh ? "小助手在想…" : "Thinking…"
  }

  return (
    <Box borderStyle="single" borderColor={theme.kid} paddingX={1} flexDirection="column">
      <Box>
        <Text color={theme.kid}>{label}</Text>
      </Box>
      {listening && (
        <Box>
          <Text color={theme.accent}>{meter}</Text>
          {mode === "mock" && (
            <Text color={theme.fgDim} dimColor>{zh ? " (演示模式)" : " (demo mode)"}</Text>
          )}
        </Box>
      )}
    </Box>
  )
}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * React hook that wraps the core voice engine for the Ink UI.
3
+ *
4
+ * Keeps Ink out of core/: this hook is the ONLY place that turns the pure
5
+ * VoiceController + Recorder + STT adapter into component state (voiceState +
6
+ * meter string) and a few imperative handlers the MissionScreen binds to keys.
7
+ *
8
+ * Degrade-don't-crash, by design:
9
+ * - No sox/ffmpeg on PATH → demo mode: skips real capture, still walks the
10
+ * kid through the flow with a canned transcript so the UX is visible.
11
+ * - No DeepRouter STT creds (env) → MockStt; a missing key never crashes.
12
+ * Both modes are surfaced via `mode` so the UI can show a "demo" hint.
13
+ *
14
+ * Note (v1): the meter is a "recording in progress" pulse, not true mic
15
+ * energy, and stop is manual (space/Enter) — real-energy VAD auto-stop lands
16
+ * once Recorder streams PCM levels. The state machine + STT path are the real,
17
+ * tested ones (see core/voice/controller.ts).
18
+ */
19
+
20
+ import { useCallback, useEffect, useRef, useState } from "react"
21
+ import { tmpdir } from "node:os"
22
+ import { join } from "node:path"
23
+ import { VoiceController } from "../../core/voice/controller.ts"
24
+ import { Recorder, detectRecorder, type RecorderKind } from "../../core/voice/recorder.ts"
25
+ import { resolveSttAdapter, MockStt, type SttAdapter } from "../../core/voice/stt.ts"
26
+ import { renderMeter } from "../../core/voice/vad.ts"
27
+ import type { VoiceState } from "../../core/voice/state.ts"
28
+
29
/** Everything the voice hook exposes to the Ink UI. */
export interface UseVoiceInput {
  /** Current state of the voice turn; starts as "idle". */
  voiceState: VoiceState
  /** Glyph bar for the mic indicator while listening; "" when no pulse is running. */
  meter: string
  /** "deeprouter" = real STT, "mock" = canned (no key / no recorder). */
  mode: "deeprouter" | "mock"
  /**
   * False until the one-time capability probe (recorder + STT adapter) has
   * finished, true afterwards. startListening() is a no-op while this is false.
   */
  ready: boolean
  /** Begin a voice turn; ignored unless the state is "idle" and `ready` is true. */
  startListening: () => void
  /** Stop the meter pulse and ask the current controller to stop. */
  stopListening: () => void
  /** Stop the meter pulse and ask the current controller to cancel. */
  cancel: () => void
}
41
+
42
/**
 * Read DeepRouter STT config from env (set by the wrapper / parent setup).
 *
 * Values are trimmed, and a blank or whitespace-only variable counts as unset,
 * so a stray space or trailing newline in an env file neither corrupts the
 * key/URL nor makes an empty setting look configured.
 *
 * @returns `{ baseUrl, apiKey, model, language }` when KIDS_STT_BASE_URL,
 *   KIDS_STT_API_KEY and KIDS_STT_MODEL are all set (`language` comes from
 *   KIDS_STT_LANG and is `undefined` when unset or blank); otherwise
 *   `undefined`, in which case resolveSttAdapter falls back to MockStt.
 */
function sttConfigFromEnv() {
  // Normalise one variable: trimmed value, or undefined when missing/blank.
  const readEnv = (name: string): string | undefined => {
    const value = process.env[name]?.trim()
    return value ? value : undefined
  }

  const baseUrl = readEnv("KIDS_STT_BASE_URL")
  const apiKey = readEnv("KIDS_STT_API_KEY")
  const model = readEnv("KIDS_STT_MODEL")
  if (!baseUrl || !apiKey || !model) return undefined
  return { baseUrl, apiKey, model, language: readEnv("KIDS_STT_LANG") }
}
51
+
52
/**
 * Ink-facing voice hook: turns VoiceController + Recorder + STT adapter into
 * component state (`voiceState`, `meter`, `mode`, `ready`) plus the imperative
 * handlers the screen binds to keys.
 *
 * Degrades instead of crashing: when no recorder is detected, or when the
 * capability probe itself throws, the hook runs in "mock" mode with a stub
 * recorder and MockStt so the flow stays walkable.
 *
 * @param onTranscript called with recognised text; MissionScreen passes it to
 *   onPrompt() so it reaches the LLM exactly like a typed message.
 * @returns the current voice state, meter string, mode, readiness flag and the
 *   start / stop / cancel handlers.
 */
export function useVoiceInput(onTranscript: (text: string) => void): UseVoiceInput {
  const [voiceState, setVoiceState] = useState<VoiceState>("idle")
  const [meter, setMeter] = useState("")
  const [mode, setMode] = useState<"deeprouter" | "mock">("mock")
  const [ready, setReady] = useState(false)

  const recorderKindRef = useRef<RecorderKind | null>(null)
  const sttRef = useRef<SttAdapter>(new MockStt())
  const controllerRef = useRef<VoiceController | null>(null)
  const pulseRef = useRef<ReturnType<typeof setInterval> | null>(null)

  // One-time capability probe: which recorder (if any) + which STT adapter.
  useEffect(() => {
    let cancelled = false
    void (async () => {
      let kind: RecorderKind | null = null
      let adapter: SttAdapter = new MockStt()
      let sttMode: "deeprouter" | "mock" = "mock"
      try {
        kind = await detectRecorder()
        const resolved = resolveSttAdapter(sttConfigFromEnv())
        adapter = resolved.adapter
        sttMode = resolved.mode
      } catch {
        // Degrade, don't crash: a failing probe must not become an unhandled
        // rejection or leave `ready` false forever. Fall back to demo mode.
        kind = null
      }
      if (cancelled) return
      recorderKindRef.current = kind
      // No recorder → demo transcript so the flow is still walkable.
      sttRef.current = kind ? adapter : new MockStt()
      setMode(kind ? sttMode : "mock")
      setReady(true)
    })()
    return () => {
      cancelled = true
      if (pulseRef.current) {
        clearInterval(pulseRef.current)
        pulseRef.current = null
      }
      // Unmounting mid-turn: ask the controller to cancel so an in-flight turn
      // is not left running. Best-effort: teardown errors are swallowed on purpose.
      const controller = controllerRef.current
      controllerRef.current = null
      if (controller) {
        try {
          void Promise.resolve(controller.cancel()).catch(() => {})
        } catch {
          // ignore synchronous teardown failures
        }
      }
    }
  }, [])

  /** Stop the meter animation (if running) and blank the meter. */
  const stopPulse = useCallback(() => {
    if (pulseRef.current) {
      clearInterval(pulseRef.current)
      pulseRef.current = null
    }
    setMeter("")
  }, [])

  const startListening = useCallback(() => {
    if (voiceState !== "idle" || !ready) return

    const kind = recorderKindRef.current
    const outPath = join(tmpdir(), "kids-voice-clip.wav")
    // Real recorder when present; a stub one in demo mode (start/stop no-op,
    // stop() returns an empty clip and MockStt supplies canned text).
    const recorder = kind
      ? new Recorder(kind, outPath)
      : {
          start() {},
          async stop() {
            return { bytes: new Uint8Array(0), mimeType: "audio/wav" }
          },
          async cancel() {},
        }

    const controller = new VoiceController(recorder, sttRef.current, {
      onState: setVoiceState,
      onTranscript: (text) => {
        stopPulse()
        onTranscript(text)
      },
      onError: () => {
        stopPulse()
      },
    })
    controllerRef.current = controller

    // "I'm listening" pulse — a lively bar so the kid knows the mic is hot,
    // even before real PCM energy drives it. It is armed BEFORE start() so that
    // an onError fired synchronously by start() can still clear it, and any
    // previous interval is cleared first so pulses never stack.
    if (pulseRef.current) clearInterval(pulseRef.current)
    let t = 0
    pulseRef.current = setInterval(() => {
      t += 1
      const level = 0.35 + 0.4 * Math.abs(Math.sin(t / 2))
      setMeter(renderMeter(level))
    }, 120)

    try {
      controller.start()
    } catch (err) {
      // Don't leave the meter animating for a turn that never started.
      stopPulse()
      throw err
    }
  }, [voiceState, ready, onTranscript, stopPulse])

  const stopListening = useCallback(() => {
    stopPulse()
    void controllerRef.current?.stop()
  }, [stopPulse])

  const cancel = useCallback(() => {
    stopPulse()
    void controllerRef.current?.cancel()
  }, [stopPulse])

  return { voiceState, meter, mode, ready, startListening, stopListening, cancel }
}
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Standalone voice-input demo — `bun run voice-demo`.
3
+ *
4
+ * Renders the real MissionScreen with a fake in-memory store and a no-op LLM,
5
+ * so a human can try the voice flow end-to-end (press Space → talk → it echoes
6
+ * the transcript as a kid message) WITHOUT needing opencode serve, a provider
7
+ * key, or the wallet/audit backend. This is the "you test it" harness.
8
+ *
9
+ * Behaviour by environment:
10
+ * - sox or ffmpeg on PATH → real mic capture to a wav.
11
+ * - KIDS_STT_* env set → real DeepRouter transcription of that wav.
12
+ * - neither → demo mode: canned transcript, flow still works.
13
+ */
14
+
15
+ import React, { useState } from "react"
16
+ import { render, Box, Text } from "ink"
17
+ import { MissionScreen } from "./render/ink/screens/MissionScreen.tsx"
18
+ import type { ChatMessage, KidsClientState } from "./core/store.ts"
19
+
20
// UI language for the demo: English only when KIDS_LOCALE is exactly "en",
// Simplified Chinese in every other case (including unset).
const LOCALE: "zh-Hans" | "en" = process.env.KIDS_LOCALE !== "en" ? "zh-Hans" : "en"
21
+
22
/**
 * Build the fixed client state the demo feeds to MissionScreen: a single
 * "mission" screen with hard-coded balances and titles, nothing pending, and
 * the given chat transcript.
 *
 * @param messages chat messages to show; stored on the state as-is.
 * @returns a fresh state object on every call.
 */
function baseState(messages: ChatMessage[]): KidsClientState {
  const english = LOCALE === "en"
  const packTitle = english ? "Voice Demo" : "语音演示"
  const missionTitle = english ? "Press Space and talk" : "按空格说话试试"

  const snapshot: KidsClientState = {
    screen: { kind: "mission" },
    sessionId: "demo",
    messages,
    starsBalance: 100,
    starsBudget: 200,
    pendingPermission: null,
    dangerousTopic: null,
    thinking: false,
    coursePack: "voice-demo",
    mission: "demo",
    packTitle,
    missionTitle,
    missionIndex: 1,
    missionTotal: 1,
    toast: null,
    auditBuffer: [],
  }
  return snapshot
}
42
+
43
/** Module-wide sequence used to mint unique message ids (and fake timestamps). */
let counter = 0

/**
 * Root component of the voice demo: keeps the chat transcript in local state
 * and renders the real MissionScreen over a fake store snapshot.
 */
function DemoApp(): React.ReactElement {
  const [messages, setMessages] = useState<ChatMessage[]>([])

  // Echo the (typed or transcribed) text as the kid's message, then a canned
  // "AI" acknowledgement so the loop is visibly closed.
  const onPrompt = (text: string) => {
    // Allocate ids BEFORE touching state: React state updater functions must
    // be pure (they may run more than once), so the counter is never bumped
    // inside the updater.
    const kidSeq = counter++
    const agentSeq = counter++
    const ts = 1_700_000_000_000 + kidSeq
    const reply = LOCALE === "en" ? `Got it — you said: "${text}"` : `收到啦——你说的是:「${text}」`
    const turn: ChatMessage[] = [
      { id: `k${kidSeq}`, actor: "kid", text, streaming: false, ts },
      { id: `a${agentSeq}`, actor: "agent", text: reply, streaming: false, ts: ts + 1 },
    ]
    setMessages((prev) => [...prev, ...turn])
  }

  return (
    <Box flexDirection="column">
      <Box marginBottom={1}>
        <Text color="magenta">
          {LOCALE === "en" ? "🎙 Voice demo — press Space to talk, Esc to quit" : "🎙 语音演示 — 按空格说话,Esc 退出"}
        </Text>
      </Box>
      <MissionScreen state={baseState(messages)} locale={LOCALE} onPrompt={onPrompt} onAbort={() => {}} onExit={() => process.exit(0)} />
    </Box>
  )
}
76
+
77
// Mount the demo app and deliberately ignore the promise returned by
// Ink's waitUntilExit().
const demoInstance = render(<DemoApp />)
void demoInstance.waitUntilExit()