@oh-my-pi/pi-coding-agent 15.12.4 → 15.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (291)
  1. package/CHANGELOG.md +304 -6
  2. package/dist/cli.js +1015 -881
  3. package/dist/types/async/job-manager.d.ts +15 -0
  4. package/dist/types/autolearn/controller.d.ts +25 -0
  5. package/dist/types/autolearn/managed-skills.d.ts +45 -0
  6. package/dist/types/autoresearch/state.d.ts +1 -1
  7. package/dist/types/autoresearch/types.d.ts +1 -1
  8. package/dist/types/cli/args.d.ts +19 -1
  9. package/dist/types/cli/session-picker.d.ts +1 -1
  10. package/dist/types/cli/setup-cli.d.ts +1 -1
  11. package/dist/types/cli/setup-model-picker.d.ts +14 -0
  12. package/dist/types/collab/protocol.d.ts +1 -1
  13. package/dist/types/commands/say.d.ts +24 -0
  14. package/dist/types/config/keybindings.d.ts +3 -3
  15. package/dist/types/config/model-registry.d.ts +10 -0
  16. package/dist/types/config/models-config-schema.d.ts +12 -0
  17. package/dist/types/config/models-config.d.ts +8 -2
  18. package/dist/types/config/settings-schema.d.ts +261 -58
  19. package/dist/types/export/html/index.d.ts +2 -1
  20. package/dist/types/extensibility/extensions/model-api.d.ts +17 -0
  21. package/dist/types/extensibility/extensions/runner.d.ts +3 -1
  22. package/dist/types/extensibility/extensions/types.d.ts +47 -1
  23. package/dist/types/extensibility/hooks/index.d.ts +2 -1
  24. package/dist/types/extensibility/plugins/legacy-pi-compat.d.ts +9 -0
  25. package/dist/types/extensibility/plugins/loader.d.ts +11 -0
  26. package/dist/types/extensibility/shared-events.d.ts +1 -1
  27. package/dist/types/extensibility/skills.d.ts +10 -0
  28. package/dist/types/goals/guided-setup.d.ts +18 -0
  29. package/dist/types/goals/state.d.ts +1 -1
  30. package/dist/types/hindsight/transcript.d.ts +1 -1
  31. package/dist/types/index.d.ts +5 -0
  32. package/dist/types/internal-urls/local-protocol.d.ts +4 -2
  33. package/dist/types/main.d.ts +4 -3
  34. package/dist/types/mcp/startup-events.d.ts +11 -0
  35. package/dist/types/memories/index.d.ts +7 -0
  36. package/dist/types/memory-backend/local-backend.d.ts +4 -3
  37. package/dist/types/mnemopi/config.d.ts +4 -4
  38. package/dist/types/modes/components/agent-hub.d.ts +6 -0
  39. package/dist/types/modes/components/assistant-message.d.ts +1 -2
  40. package/dist/types/modes/components/compaction-summary-message.d.ts +15 -1
  41. package/dist/types/modes/components/custom-editor.d.ts +39 -1
  42. package/dist/types/modes/components/custom-editor.test.d.ts +1 -0
  43. package/dist/types/modes/components/session-selector.d.ts +1 -1
  44. package/dist/types/modes/components/tool-execution.d.ts +26 -16
  45. package/dist/types/modes/components/transcript-container.d.ts +23 -2
  46. package/dist/types/modes/components/tree-selector.d.ts +1 -1
  47. package/dist/types/modes/components/usage-row.d.ts +3 -0
  48. package/dist/types/modes/controllers/command-controller.d.ts +2 -2
  49. package/dist/types/modes/controllers/input-controller.d.ts +14 -0
  50. package/dist/types/modes/controllers/selector-controller.d.ts +3 -1
  51. package/dist/types/modes/gradient-highlight.d.ts +9 -4
  52. package/dist/types/modes/image-references.d.ts +6 -0
  53. package/dist/types/modes/interactive-mode.d.ts +27 -3
  54. package/dist/types/modes/magic-keywords.d.ts +13 -1
  55. package/dist/types/modes/rpc/rpc-mode.d.ts +35 -1
  56. package/dist/types/modes/rpc/rpc-types.d.ts +9 -1
  57. package/dist/types/modes/runtime-init.d.ts +4 -0
  58. package/dist/types/modes/theme/theme.d.ts +13 -2
  59. package/dist/types/modes/types.d.ts +8 -2
  60. package/dist/types/modes/utils/ui-helpers.d.ts +1 -1
  61. package/dist/types/registry/agent-registry.d.ts +17 -0
  62. package/dist/types/secrets/obfuscator.d.ts +1 -1
  63. package/dist/types/session/agent-session.d.ts +14 -2
  64. package/dist/types/session/indexed-session-storage.d.ts +3 -4
  65. package/dist/types/session/session-context.d.ts +39 -0
  66. package/dist/types/session/session-entries.d.ts +159 -0
  67. package/dist/types/session/session-listing.d.ts +69 -0
  68. package/dist/types/session/session-loader.d.ts +16 -0
  69. package/dist/types/session/session-manager.d.ts +82 -474
  70. package/dist/types/session/session-migrations.d.ts +12 -0
  71. package/dist/types/session/session-paths.d.ts +25 -0
  72. package/dist/types/session/session-persistence.d.ts +8 -0
  73. package/dist/types/session/session-storage.d.ts +11 -12
  74. package/dist/types/session/snapcompact-inline.d.ts +12 -1
  75. package/dist/types/session/snapcompact-savings-journal.d.ts +46 -0
  76. package/dist/types/session/tool-choice-queue.d.ts +6 -6
  77. package/dist/types/stt/asr-client.d.ts +90 -0
  78. package/dist/types/stt/asr-protocol.d.ts +97 -0
  79. package/dist/types/stt/asr-worker.d.ts +2 -0
  80. package/dist/types/stt/downloader.d.ts +38 -0
  81. package/dist/types/stt/endpointer.d.ts +59 -0
  82. package/dist/types/stt/index.d.ts +5 -1
  83. package/dist/types/stt/models.d.ts +120 -0
  84. package/dist/types/stt/recorder.d.ts +17 -0
  85. package/dist/types/stt/stt-controller.d.ts +6 -0
  86. package/dist/types/stt/transcriber.d.ts +5 -7
  87. package/dist/types/stt/wav.d.ts +29 -0
  88. package/dist/types/system-prompt.d.ts +4 -0
  89. package/dist/types/task/executor.d.ts +2 -0
  90. package/dist/types/task/index.d.ts +9 -1
  91. package/dist/types/task/types.d.ts +36 -0
  92. package/dist/types/tools/bash.d.ts +2 -2
  93. package/dist/types/tools/eval-render.d.ts +1 -1
  94. package/dist/types/tools/index.d.ts +11 -1
  95. package/dist/types/tools/irc.d.ts +1 -0
  96. package/dist/types/tools/learn.d.ts +51 -0
  97. package/dist/types/tools/manage-skill.d.ts +40 -0
  98. package/dist/types/tools/plan-mode-guard.d.ts +10 -0
  99. package/dist/types/tools/renderers.d.ts +7 -11
  100. package/dist/types/tools/ssh.d.ts +1 -1
  101. package/dist/types/tools/todo.d.ts +1 -1
  102. package/dist/types/tools/tts.d.ts +25 -0
  103. package/dist/types/tools/write.d.ts +1 -1
  104. package/dist/types/tts/downloader.d.ts +20 -0
  105. package/dist/types/tts/index.d.ts +8 -0
  106. package/dist/types/tts/models.d.ts +82 -0
  107. package/dist/types/tts/player.d.ts +32 -0
  108. package/dist/types/tts/runtime.d.ts +6 -0
  109. package/dist/types/tts/streaming-player.d.ts +41 -0
  110. package/dist/types/tts/tts-client.d.ts +93 -0
  111. package/dist/types/tts/tts-protocol.d.ts +95 -0
  112. package/dist/types/tts/tts-worker.d.ts +2 -0
  113. package/dist/types/tts/vocalizer.d.ts +41 -0
  114. package/dist/types/tts/wav.d.ts +8 -0
  115. package/dist/types/utils/tool-choice.d.ts +8 -0
  116. package/dist/types/utils/tools-manager.d.ts +2 -1
  117. package/dist/types/utils/tools-manager.test.d.ts +1 -0
  118. package/dist/types/web/scrapers/github.d.ts +1 -1
  119. package/package.json +15 -14
  120. package/src/async/job-manager.ts +49 -0
  121. package/src/autolearn/controller.ts +139 -0
  122. package/src/autolearn/managed-skills.ts +257 -0
  123. package/src/autoresearch/state.ts +1 -1
  124. package/src/autoresearch/types.ts +1 -1
  125. package/src/cli/args.ts +56 -2
  126. package/src/cli/session-picker.ts +2 -1
  127. package/src/cli/setup-cli.ts +148 -47
  128. package/src/cli/setup-model-picker.ts +43 -0
  129. package/src/cli-commands.ts +1 -0
  130. package/src/cli.ts +45 -13
  131. package/src/collab/host.ts +1 -1
  132. package/src/collab/protocol.ts +1 -1
  133. package/src/commands/say.ts +102 -0
  134. package/src/commands/setup.ts +1 -1
  135. package/src/commit/agentic/tools/analyze-file.ts +3 -0
  136. package/src/config/keybindings.ts +2 -2
  137. package/src/config/model-discovery.ts +11 -5
  138. package/src/config/model-registry.ts +64 -9
  139. package/src/config/models-config-schema.ts +4 -1
  140. package/src/config/models-config.ts +2 -1
  141. package/src/config/settings-schema.ts +248 -32
  142. package/src/config/settings.ts +10 -0
  143. package/src/discovery/builtin.ts +23 -1
  144. package/src/discovery/claude-plugins.ts +44 -5
  145. package/src/discovery/helpers.ts +41 -1
  146. package/src/eval/__tests__/budget-bridge.test.ts +1 -1
  147. package/src/eval/js/shared/prelude.txt +69 -17
  148. package/src/export/html/index.ts +3 -6
  149. package/src/extensibility/extensions/model-api.ts +41 -0
  150. package/src/extensibility/extensions/runner.ts +4 -0
  151. package/src/extensibility/extensions/types.ts +52 -1
  152. package/src/extensibility/extensions/wrapper.ts +41 -5
  153. package/src/extensibility/hooks/index.ts +2 -1
  154. package/src/extensibility/plugins/legacy-pi-compat.ts +43 -13
  155. package/src/extensibility/plugins/loader.ts +30 -19
  156. package/src/extensibility/plugins/manager.ts +221 -90
  157. package/src/extensibility/shared-events.ts +1 -1
  158. package/src/extensibility/skills.ts +96 -15
  159. package/src/goals/guided-setup.ts +133 -0
  160. package/src/goals/state.ts +1 -1
  161. package/src/hindsight/transcript.ts +1 -1
  162. package/src/index.ts +5 -0
  163. package/src/internal-urls/docs-index.generated.ts +10 -10
  164. package/src/internal-urls/history-protocol.ts +1 -1
  165. package/src/internal-urls/local-protocol.ts +29 -7
  166. package/src/main.ts +27 -7
  167. package/src/mcp/startup-events.ts +21 -0
  168. package/src/mcp/transports/stdio.ts +2 -1
  169. package/src/memories/index.ts +146 -11
  170. package/src/memory-backend/local-backend.ts +11 -5
  171. package/src/mnemopi/backend.ts +1 -0
  172. package/src/mnemopi/config.ts +26 -10
  173. package/src/modes/acp/acp-agent.ts +3 -5
  174. package/src/modes/components/agent-hub.ts +49 -4
  175. package/src/modes/components/assistant-message.ts +4 -37
  176. package/src/modes/components/compaction-summary-message.ts +125 -26
  177. package/src/modes/components/custom-editor.test.ts +96 -0
  178. package/src/modes/components/custom-editor.ts +164 -8
  179. package/src/modes/components/session-selector.ts +1 -1
  180. package/src/modes/components/settings-defs.ts +7 -0
  181. package/src/modes/components/tool-execution.ts +82 -43
  182. package/src/modes/components/transcript-container.ts +70 -1
  183. package/src/modes/components/tree-selector.ts +1 -1
  184. package/src/modes/components/usage-row.ts +18 -0
  185. package/src/modes/components/user-message.ts +4 -2
  186. package/src/modes/controllers/command-controller.ts +14 -4
  187. package/src/modes/controllers/event-controller.ts +78 -11
  188. package/src/modes/controllers/extension-ui-controller.ts +6 -0
  189. package/src/modes/controllers/input-controller.ts +258 -27
  190. package/src/modes/controllers/selector-controller.ts +12 -2
  191. package/src/modes/gradient-highlight.ts +21 -9
  192. package/src/modes/image-references.ts +20 -0
  193. package/src/modes/interactive-mode.ts +286 -40
  194. package/src/modes/magic-keywords.ts +27 -5
  195. package/src/modes/rpc/rpc-mode.ts +146 -14
  196. package/src/modes/rpc/rpc-subagents.ts +2 -2
  197. package/src/modes/rpc/rpc-types.ts +8 -2
  198. package/src/modes/runtime-init.ts +28 -3
  199. package/src/modes/theme/theme.ts +98 -50
  200. package/src/modes/types.ts +6 -2
  201. package/src/modes/utils/hotkeys-markdown.ts +1 -1
  202. package/src/modes/utils/ui-helpers.ts +34 -6
  203. package/src/priority.json +5 -1
  204. package/src/prompts/agents/task.md +1 -0
  205. package/src/prompts/goals/guided-goal-interview.md +8 -0
  206. package/src/prompts/goals/guided-goal-system.md +12 -0
  207. package/src/prompts/memories/read-path.md +6 -0
  208. package/src/prompts/system/autolearn-guidance-learn.md +1 -0
  209. package/src/prompts/system/autolearn-guidance.md +7 -0
  210. package/src/prompts/system/autolearn-nudge.md +3 -0
  211. package/src/prompts/system/eager-task.md +7 -0
  212. package/src/prompts/system/eager-todo.md +11 -6
  213. package/src/prompts/system/subagent-system-prompt.md +4 -0
  214. package/src/prompts/system/system-prompt.md +10 -5
  215. package/src/prompts/system/title-marker-instruction.md +1 -0
  216. package/src/prompts/system/title-system-marker.md +16 -0
  217. package/src/prompts/tools/job.md +1 -0
  218. package/src/prompts/tools/learn.md +7 -0
  219. package/src/prompts/tools/manage-skill.md +9 -0
  220. package/src/prompts/tools/task.md +3 -0
  221. package/src/registry/agent-registry.ts +30 -0
  222. package/src/sdk.ts +88 -24
  223. package/src/secrets/obfuscator.ts +1 -1
  224. package/src/session/agent-session.ts +209 -87
  225. package/src/session/history-storage.ts +2 -2
  226. package/src/session/indexed-session-storage.ts +7 -17
  227. package/src/session/session-context.ts +352 -0
  228. package/src/session/session-entries.ts +194 -0
  229. package/src/session/session-listing.ts +588 -0
  230. package/src/session/session-loader.ts +106 -0
  231. package/src/session/session-manager.ts +933 -3145
  232. package/src/session/session-migrations.ts +78 -0
  233. package/src/session/session-paths.ts +193 -0
  234. package/src/session/session-persistence.ts +131 -0
  235. package/src/session/session-storage.ts +91 -50
  236. package/src/session/snapcompact-inline.ts +21 -1
  237. package/src/session/snapcompact-savings-journal.ts +113 -0
  238. package/src/session/tool-choice-queue.ts +23 -11
  239. package/src/slash-commands/builtin-registry.ts +25 -3
  240. package/src/stt/asr-client.ts +520 -0
  241. package/src/stt/asr-protocol.ts +65 -0
  242. package/src/stt/asr-worker.ts +790 -0
  243. package/src/stt/downloader.ts +107 -47
  244. package/src/stt/endpointer.ts +259 -0
  245. package/src/stt/index.ts +5 -1
  246. package/src/stt/models.ts +150 -0
  247. package/src/stt/recorder.ts +247 -60
  248. package/src/stt/stt-controller.ts +201 -22
  249. package/src/stt/transcriber.ts +37 -68
  250. package/src/stt/wav.ts +173 -0
  251. package/src/system-prompt.ts +8 -0
  252. package/src/task/agents.ts +1 -2
  253. package/src/task/executor.ts +49 -15
  254. package/src/task/index.ts +60 -6
  255. package/src/task/render.ts +83 -8
  256. package/src/task/types.ts +53 -0
  257. package/src/tools/ask.ts +8 -0
  258. package/src/tools/bash.ts +4 -3
  259. package/src/tools/eval-render.ts +4 -3
  260. package/src/tools/index.ts +40 -4
  261. package/src/tools/irc.ts +10 -2
  262. package/src/tools/job.ts +14 -2
  263. package/src/tools/learn.ts +144 -0
  264. package/src/tools/manage-skill.ts +104 -0
  265. package/src/tools/plan-mode-guard.ts +53 -19
  266. package/src/tools/renderers.ts +7 -11
  267. package/src/tools/ssh.ts +4 -3
  268. package/src/tools/todo.ts +1 -1
  269. package/src/tools/tts.ts +203 -92
  270. package/src/tools/write.ts +18 -2
  271. package/src/tts/downloader.ts +64 -0
  272. package/src/tts/index.ts +8 -0
  273. package/src/tts/models.ts +137 -0
  274. package/src/tts/player.ts +137 -0
  275. package/src/tts/runtime.ts +21 -0
  276. package/src/tts/streaming-player.ts +266 -0
  277. package/src/tts/tts-client.ts +647 -0
  278. package/src/tts/tts-protocol.ts +60 -0
  279. package/src/tts/tts-worker.ts +497 -0
  280. package/src/tts/vocalizer.ts +162 -0
  281. package/src/tts/wav.ts +58 -0
  282. package/src/utils/title-generator.ts +48 -5
  283. package/src/utils/tool-choice.ts +16 -0
  284. package/src/utils/tools-manager.test.ts +25 -0
  285. package/src/utils/tools-manager.ts +19 -1
  286. package/src/web/scrapers/github.ts +96 -0
  287. package/src/web/search/index.ts +13 -0
  288. package/src/web/search/providers/searxng.ts +13 -1
  289. package/dist/types/stt/setup.d.ts +0 -18
  290. package/src/stt/setup.ts +0 -52
  291. package/src/stt/transcribe.py +0 -70
@@ -1,6 +1,10 @@
1
- import { $which, logger } from "@oh-my-pi/pi-utils";
2
- import { $ } from "bun";
3
- import { resolvePython } from "./transcriber";
1
+ import * as fs from "node:fs/promises";
2
+ import * as path from "node:path";
3
+ import { getTinyModelsCacheDir } from "@oh-my-pi/pi-utils";
4
+ import { sttClient } from "./asr-client";
5
+ import type { SttProgressStatus } from "./asr-protocol";
6
+ import { resolveSttModelSpec } from "./models";
7
+ import { ensureRecorder } from "./recorder";
4
8
 
5
9
  export interface DownloadProgress {
6
10
  stage: string;
@@ -9,63 +13,119 @@ export interface DownloadProgress {
9
13
 
10
14
  export interface EnsureOptions {
11
15
  modelName?: string;
16
+ signal?: AbortSignal;
12
17
  onProgress?: (progress: DownloadProgress) => void;
13
18
  }
14
19
 
15
- // ── Recording tool ─────────────────────────────────────────────────
20
+ // ── ONNX Whisper model ─────────────────────────────────────────────
16
21
 
17
- async function ensureRecordingTool(options?: EnsureOptions): Promise<void> {
18
- if ($which("sox")) return;
19
- if ($which("ffmpeg")) return;
20
- if (process.platform === "linux" && $which("arecord")) return;
22
+ /**
23
+ * Real-progress event for a speech-model download, surfaced to UI callers.
24
+ * `percent` is an integer 0–100 aggregated across all model files (encoder +
25
+ * decoder shards), so it advances monotonically toward completion.
26
+ */
27
+ export interface SttDownloadProgress {
28
+ status: SttProgressStatus;
29
+ /** Integer 0–100 aggregated across files. */
30
+ percent: number;
31
+ /** Bytes downloaded so far across all files. */
32
+ loaded: number;
33
+ /** Total bytes across all files seen so far. */
34
+ total: number;
35
+ /** The file currently downloading, when known. */
36
+ file?: string;
37
+ repo: string;
38
+ label: string;
39
+ }
21
40
 
22
- // Windows: PowerShell mciSendString is always available as fallback
23
- if (process.platform === "win32") {
24
- // Try to get ffmpeg for better quality, but don't block on failure
25
- options?.onProgress?.({ stage: "Trying to install FFmpeg via winget..." });
26
- const result = await $`winget install --id Gyan.FFmpeg -e --accept-source-agreements --accept-package-agreements`
27
- .quiet()
28
- .nothrow();
29
- if (result.exitCode === 0) {
30
- logger.debug("FFmpeg installed via winget");
41
+ /**
42
+ * Whether the selected model is already present in the local cache. For
43
+ * transformers.js Whisper tiers a complete download leaves `config.json` plus
44
+ * the `onnx/` weight files (a bare `config.json` from an interrupted fetch reads
45
+ * as not-cached); for sherpa-onnx tiers every model file (encoder/decoder/joiner
46
+ * + tokens) must be present (`.part` sidecars from an interrupted fetch are
47
+ * ignored).
48
+ */
49
+ export async function isSttModelCached(key: string): Promise<boolean> {
50
+ const spec = resolveSttModelSpec(key);
51
+ const repoDir = path.join(getTinyModelsCacheDir(), spec.repo);
52
+ if (spec.engine === "sherpa") {
53
+ try {
54
+ const root = new Set(await fs.readdir(repoDir));
55
+ for (const role in spec.files) {
56
+ if (!root.has(spec.files[role as keyof typeof spec.files])) return false;
57
+ }
58
+ return true;
59
+ } catch {
60
+ return false;
31
61
  }
32
- return;
33
62
  }
34
-
35
- throw new Error(
36
- "No audio recording tool found. Install SoX: sudo apt install sox, or FFmpeg: sudo apt install ffmpeg",
37
- );
38
- }
39
-
40
- // ── Python whisper ─────────────────────────────────────────────────
41
-
42
- async function ensurePythonWhisper(options?: EnsureOptions): Promise<void> {
43
- const pythonCmd = resolvePython();
44
- if (!pythonCmd) {
45
- throw new Error("Python not found. Install Python 3.8+ from https://python.org");
63
+ try {
64
+ const root = await fs.readdir(repoDir);
65
+ if (!root.includes("config.json")) return false;
66
+ const onnxFiles = await fs.readdir(path.join(repoDir, "onnx")).catch(() => [] as string[]);
67
+ return onnxFiles.some(file => file.endsWith(".onnx"));
68
+ } catch {
69
+ return false;
46
70
  }
71
+ }
47
72
 
48
- // Check if whisper module is already importable
49
- const check = Bun.spawnSync([pythonCmd, "-c", "import whisper"], {
50
- stdout: "pipe",
51
- stderr: "pipe",
73
+ /**
74
+ * Download (or warm from cache) the selected ONNX Whisper model via the speech
75
+ * worker, resolving once the model is fully present and loaded. Streams real
76
+ * Hub progress with an aggregated integer percent. Rejects if the worker cannot
77
+ * obtain the model. Safe to call non-interactively.
78
+ */
79
+ export async function downloadSttModel(
80
+ key: string,
81
+ onProgress?: (progress: SttDownloadProgress) => void,
82
+ options?: { signal?: AbortSignal },
83
+ ): Promise<void> {
84
+ const spec = resolveSttModelSpec(key);
85
+ const files = new Map<string, { loaded: number; total: number }>();
86
+ const ok = await sttClient.downloadModel(spec.key, {
87
+ signal: options?.signal,
88
+ onProgress: event => {
89
+ if ((event.status === "progress" || event.status === "progress_total") && event.file) {
90
+ if (typeof event.loaded === "number" && typeof event.total === "number" && event.total > 0) {
91
+ files.set(event.file, { loaded: event.loaded, total: event.total });
92
+ }
93
+ }
94
+ let loaded = 0;
95
+ let total = 0;
96
+ for (const file of files.values()) {
97
+ loaded += file.loaded;
98
+ total += file.total;
99
+ }
100
+ const settled = event.status === "ready" || event.status === "done";
101
+ const percent = total > 0 ? Math.min(100, Math.round((loaded / total) * 100)) : settled ? 100 : 0;
102
+ onProgress?.({
103
+ status: event.status,
104
+ percent,
105
+ loaded,
106
+ total,
107
+ file: event.file,
108
+ repo: spec.repo,
109
+ label: spec.label,
110
+ });
111
+ },
52
112
  });
53
- if (check.exitCode === 0) return;
54
-
55
- options?.onProgress?.({ stage: "Installing openai-whisper (this may take a few minutes)..." });
56
- logger.debug("Installing openai-whisper via pip");
57
-
58
- const install = await $`${pythonCmd} -m pip install -q openai-whisper`.quiet().nothrow();
59
- if (install.exitCode !== 0) {
60
- const stderr = install.stderr.toString().trim();
61
- throw new Error(`Failed to install openai-whisper: ${stderr.split("\n").pop()}`);
62
- }
63
- logger.debug("openai-whisper installed successfully");
113
+ if (!ok) throw new Error(`Failed to download speech model (${spec.repo}). Check your network connection.`);
64
114
  }
65
115
 
66
116
  // ── Public API ─────────────────────────────────────────────────────
67
117
 
68
118
  export async function ensureSTTDependencies(options?: EnsureOptions): Promise<void> {
69
- await ensureRecordingTool(options);
70
- await ensurePythonWhisper(options);
119
+ await ensureRecorder(progress => options?.onProgress?.(progress), options?.signal);
120
+ await downloadSttModel(
121
+ resolveSttModelSpec(options?.modelName).key,
122
+ progress => {
123
+ const stage =
124
+ progress.status === "ready" || progress.status === "done"
125
+ ? `Speech model ${progress.label} ready`
126
+ : `Downloading speech model ${progress.label}`;
127
+ options?.onProgress?.({ stage, percent: progress.percent });
128
+ },
129
+ { signal: options?.signal },
130
+ );
71
131
  }
@@ -0,0 +1,259 @@
1
/**
 * Energy-based speech endpointer for live transcription.
 *
 * The on-device ASR models we ship are non-streaming: the sherpa-onnx Parakeet
 * recognizer and the transformers.js Whisper pipelines both decode a complete
 * waveform in one shot. To transcribe *while the user is still speaking*, this
 * splits the continuous 16 kHz mono float stream into speech segments at natural
 * pauses — each segment is decoded and committed as it finalizes, and the
 * in-progress segment is re-decoded periodically for a volatile live preview.
 *
 * Segmentation is pure short-time-energy VAD with an adaptive noise floor, so it
 * needs no extra model and is engine-agnostic (it runs the same way whether the
 * downstream model is sherpa or transformers). It is deliberately simple and
 * fully deterministic so it can be unit-tested with synthetic signals.
 */

/** Tunable thresholds for {@link StreamEndpointer}. All durations in ms. */
export interface EndpointerConfig {
	/** Input sample rate (the recorder always delivers 16 kHz mono). */
	sampleRate: number;
	/** Short-time analysis frame size. */
	frameMs: number;
	/** Trailing silence inside a segment that finalizes (commits) it. */
	endSilenceMs: number;
	/** Shortest speech run that is committed; shorter runs are discarded as noise. */
	minSpeechMs: number;
	/** Hard cap on segment length so long pause-free speech still commits periodically. */
	maxSegmentMs: number;
	/** Audio retained before onset so the first phoneme of a segment is never clipped. */
	preRollMs: number;
	/** Cadence of volatile partial emissions for the in-progress segment. */
	partialIntervalMs: number;
	/** Speech threshold is `max(minThreshold, noiseFloor * energyRatio)`. */
	energyRatio: number;
	/** EMA weight tracking the ambient noise floor on non-speech frames. */
	floorAttack: number;
	/** Absolute RMS floor so a near-silent room never trips speech detection. */
	minThreshold: number;
}

export const DEFAULT_ENDPOINTER_CONFIG: EndpointerConfig = {
	sampleRate: 16_000,
	frameMs: 30,
	endSilenceMs: 600,
	minSpeechMs: 200,
	maxSegmentMs: 12_000,
	preRollMs: 240,
	partialIntervalMs: 450,
	energyRatio: 2.5,
	floorAttack: 0.05,
	minThreshold: 0.008,
};

/**
 * Emitted by {@link StreamEndpointer.push} / {@link StreamEndpointer.flush}.
 * `partial` is the volatile in-progress segment (decode and show as preview,
 * never commit); `segment` is a finalized run (decode and commit once).
 */
export type EndpointerEvent = { kind: "partial"; audio: Float32Array } | { kind: "segment"; audio: Float32Array };

/** Append-growable Float32 buffer (amortized O(1) push, no per-frame realloc). */
class FloatBuffer {
	#data: Float32Array = new Float32Array(0);
	#len = 0;

	/** Number of valid samples currently stored. */
	get length(): number {
		return this.#len;
	}

	/** Append `samples`, growing the backing store geometrically when it is full. */
	push(samples: Float32Array): void {
		const needed = this.#len + samples.length;
		if (needed > this.#data.length) {
			const next = new Float32Array(Math.max(this.#data.length * 2, needed, 1 << 14));
			next.set(this.#data.subarray(0, this.#len));
			this.#data = next;
		}
		this.#data.set(samples, this.#len);
		this.#len += samples.length;
	}

	/** Copy `[0, end)` into a fresh array the caller can retain. */
	take(end = this.#len): Float32Array {
		return this.#data.slice(0, Math.max(0, Math.min(end, this.#len)));
	}

	/** Discard everything except the newest `count` samples, in place (no allocation). */
	keepLast(count: number): void {
		const keep = Math.max(0, Math.min(count, this.#len));
		this.#data.copyWithin(0, this.#len - keep, this.#len);
		this.#len = keep;
	}

	/** Forget the stored samples; the backing store is kept for reuse. */
	reset(): void {
		this.#len = 0;
	}
}

/** Root-mean-square amplitude of `frame`; an empty frame yields 0. */
function rms(frame: Float32Array): number {
	let sum = 0;
	for (let i = 0; i < frame.length; i += 1) sum += frame[i]! * frame[i]!;
	return Math.sqrt(sum / Math.max(1, frame.length));
}

/**
 * Splits a continuous sample stream into speech segments using a per-frame RMS
 * energy test against an adaptive threshold.
 *
 * Feed audio with {@link StreamEndpointer.push}; call
 * {@link StreamEndpointer.flush} once when the stream ends. Both return the
 * events produced by that call, in order.
 */
export class StreamEndpointer {
	readonly #cfg: EndpointerConfig;
	readonly #frameSamples: number;
	readonly #preRollSamples: number;

	/** Samples left over from the previous `push` that did not fill a whole frame. */
	#leftover: Float32Array = new Float32Array(0);
	#inSpeech = false;
	#noiseFloor: number;
	/** Length of the current run of unvoiced frames inside the open segment. */
	#silenceMs = 0;
	/** Duration of the open segment, including pre-roll and trailing silence. */
	#segmentMs = 0;
	#msSincePartial = 0;
	/** True when voiced audio has arrived since the last partial was emitted. */
	#partialDirty = false;

	readonly #segment = new FloatBuffer();
	/** Ring of the most recent pre-onset frames, used as segment pre-roll. */
	readonly #preRoll = new FloatBuffer();

	/** @param config Overrides merged over {@link DEFAULT_ENDPOINTER_CONFIG}. */
	constructor(config: Partial<EndpointerConfig> = {}) {
		this.#cfg = { ...DEFAULT_ENDPOINTER_CONFIG, ...config };
		this.#frameSamples = Math.max(1, Math.round((this.#cfg.sampleRate * this.#cfg.frameMs) / 1000));
		this.#preRollSamples = Math.max(0, Math.round((this.#cfg.sampleRate * this.#cfg.preRollMs) / 1000));
		this.#noiseFloor = this.#cfg.minThreshold;
	}

	/** Feed newly-captured samples; returns ordered partial/segment events. */
	push(samples: Float32Array): EndpointerEvent[] {
		const events: EndpointerEvent[] = [];
		// Prepend the carried-over tail, then consume whole frames.
		let buf: Float32Array;
		if (this.#leftover.length === 0) {
			buf = samples;
		} else {
			buf = new Float32Array(this.#leftover.length + samples.length);
			buf.set(this.#leftover, 0);
			buf.set(samples, this.#leftover.length);
		}
		let offset = 0;
		for (; offset + this.#frameSamples <= buf.length; offset += this.#frameSamples) {
			this.#processFrame(buf.subarray(offset, offset + this.#frameSamples), events);
		}
		// `slice` copies, so the caller's array is never retained between calls.
		this.#leftover = buf.slice(offset);
		return events;
	}

	/** End the stream; returns a trailing committed segment if one is pending. */
	flush(): EndpointerEvent[] {
		const events: EndpointerEvent[] = [];
		if (this.#inSpeech && this.#leftover.length > 0) {
			this.#segment.push(this.#leftover);
			this.#segmentMs += (this.#leftover.length / this.#cfg.sampleRate) * 1000;
		}
		this.#leftover = new Float32Array(0);
		if (this.#inSpeech) {
			const speechMs = this.#segmentMs - this.#silenceMs;
			if (speechMs >= this.#cfg.minSpeechMs) {
				events.push({ kind: "segment", audio: this.#segment.take(this.#endpointKeep()) });
			}
		}
		this.#reset();
		return events;
	}

	/** Classify one whole frame and advance the idle/in-speech state machine. */
	#processFrame(frame: Float32Array, events: EndpointerEvent[]): void {
		const energy = rms(frame);
		const threshold = Math.max(this.#cfg.minThreshold, this.#noiseFloor * this.#cfg.energyRatio);
		const voiced = energy > threshold;
		// Track ambient noise on non-speech frames only, so loud speech never
		// inflates the floor (which would make the tail of an utterance read as
		// silence and clip the segment short).
		if (!voiced) {
			this.#noiseFloor = this.#noiseFloor * (1 - this.#cfg.floorAttack) + energy * this.#cfg.floorAttack;
		}

		if (!this.#inSpeech) {
			if (voiced) {
				// The onset frame goes straight into the segment. It must not be
				// added to the pre-roll as well, or #beginSegment would copy it
				// twice and every segment would start with a repeated frame.
				this.#beginSegment(frame);
				return;
			}
			this.#preRoll.push(frame);
			// Keep only the most recent pre-roll window.
			if (this.#preRoll.length > this.#preRollSamples) {
				this.#preRoll.keepLast(this.#preRollSamples);
			}
			return;
		}

		this.#segment.push(frame);
		this.#segmentMs += this.#cfg.frameMs;
		this.#msSincePartial += this.#cfg.frameMs;
		if (voiced) {
			this.#silenceMs = 0;
			this.#partialDirty = true;
		} else {
			this.#silenceMs += this.#cfg.frameMs;
		}

		if (this.#silenceMs >= this.#cfg.endSilenceMs) {
			this.#finalizeSegment(events);
			return;
		}
		if (this.#segmentMs >= this.#cfg.maxSegmentMs) {
			// Pause-free long speech: commit what we have and continue a fresh
			// segment so output keeps flowing. `#inSpeech` stays true.
			events.push({ kind: "segment", audio: this.#segment.take() });
			this.#segment.reset();
			this.#segmentMs = 0;
			this.#silenceMs = 0;
			this.#msSincePartial = 0;
			this.#partialDirty = false;
			return;
		}
		if (this.#partialDirty && this.#msSincePartial >= this.#cfg.partialIntervalMs) {
			events.push({ kind: "partial", audio: this.#segment.take() });
			this.#msSincePartial = 0;
			this.#partialDirty = false;
		}
	}

	/** Open a segment seeded with the buffered pre-onset audio followed by `onsetFrame`. */
	#beginSegment(onsetFrame: Float32Array): void {
		this.#inSpeech = true;
		this.#segment.reset();
		const preRoll = this.#preRoll.take();
		if (preRoll.length > 0) this.#segment.push(preRoll);
		this.#segment.push(onsetFrame);
		this.#preRoll.reset();
		this.#silenceMs = 0;
		this.#segmentMs = (this.#segment.length / this.#cfg.sampleRate) * 1000;
		this.#msSincePartial = 0;
		this.#partialDirty = true;
	}

	/** Close the open segment on end-silence, emitting it if it holds enough speech. */
	#finalizeSegment(events: EndpointerEvent[]): void {
		const speechMs = this.#segmentMs - this.#silenceMs;
		if (speechMs >= this.#cfg.minSpeechMs) {
			events.push({ kind: "segment", audio: this.#segment.take(this.#endpointKeep()) });
		}
		this.#inSpeech = false;
		this.#segment.reset();
		this.#silenceMs = 0;
		this.#segmentMs = 0;
		this.#msSincePartial = 0;
		this.#partialDirty = false;
	}

	/** Samples to keep when committing on silence: drop most of the trailing
	 * silence but leave a short tail so the final word is not cut. */
	#endpointKeep(): number {
		const tailMs = Math.min(this.#silenceMs, 120);
		const dropMs = Math.max(0, this.#silenceMs - tailMs);
		const drop = Math.round((this.#cfg.sampleRate * dropMs) / 1000);
		return Math.max(0, this.#segment.length - drop);
	}

	/** Return to the initial idle state, including the starting noise floor. */
	#reset(): void {
		this.#inSpeech = false;
		this.#segment.reset();
		this.#preRoll.reset();
		this.#silenceMs = 0;
		this.#segmentMs = 0;
		this.#msSincePartial = 0;
		this.#partialDirty = false;
		this.#noiseFloor = this.#cfg.minThreshold;
	}
}
package/src/stt/index.ts CHANGED
@@ -1,3 +1,7 @@
1
+ export * from "./asr-client";
2
+ export * from "./asr-protocol";
1
3
  export * from "./downloader";
2
- export * from "./setup";
4
+ export * from "./models";
3
5
  export * from "./stt-controller";
6
+ export * from "./transcriber";
7
+ export * from "./wav";
@@ -0,0 +1,150 @@
1
+ import type { TinyModelDtype } from "../tiny/dtype";
2
+
3
+ /**
4
+ * On-device speech-to-text model registry. Each tier maps a stable settings key
5
+ * onto a locally-runnable ASR model and the engine that loads it:
6
+ *
7
+ * - `transformers` — a transformers.js / ONNX Whisper repo, loaded by the
8
+ * `@huggingface/transformers` `automatic-speech-recognition` pipeline.
9
+ * - `sherpa` — a sherpa-onnx (Next-gen Kaldi) offline model, loaded by the
10
+ * native `sherpa-onnx-node` addon. Used for NVIDIA Parakeet, the Open ASR
11
+ * Leaderboard accuracy/speed leader.
12
+ *
13
+ * The worker resolves the spec by key and loads the model lazily (kept warm
14
+ * afterwards). Both engines run inside the hard-killed subprocess worker.
15
+ */
16
+
17
/**
 * Runtime that loads a tier's model: the transformers.js pipeline or the
 * native sherpa-onnx addon.
 */
export type SttEngine =
	| "transformers"
	| "sherpa";
19
+
20
/** Fields shared by every speech-to-text tier, regardless of engine. */
interface SttModelBase {
	/** Stable identifier stored in `stt.modelName` and sent over the worker protocol. */
	key: string;
	/** Engine that loads this tier's model. */
	engine: SttEngine;
	/** Hugging Face repo id: a transformers.js ONNX repo or a sherpa-onnx model repo. */
	repo: string;
	/** True for English-only checkpoints, which reject a configured source `language`. */
	englishOnly: boolean;
	/** Short display name for the tier. */
	label: string;
	/** Longer human-readable summary of the tier. */
	description: string;
	/** Approximate on-disk download size of the shipped weights (UI hint). */
	sizeHint: string;
}
33
+
34
/** Whisper-family tier that is loaded through the transformers.js ASR pipeline. */
export interface TransformersSttModelSpec extends SttModelBase {
	engine: "transformers";
	/**
	 * ONNX precision for this tier. `PI_TINY_DTYPE` / `providers.tinyModelDtype`
	 * take precedence when set.
	 */
	dtype: TinyModelDtype;
}
40
+
41
/** Offline sherpa-onnx tier (e.g. a NeMo Parakeet transducer) that is loaded natively. */
export interface SherpaSttModelSpec extends SttModelBase {
	engine: "sherpa";
	/** Offline model family understood by sherpa-onnx, e.g. `nemo_transducer`. */
	modelType: string;
	/** Model file names, relative to the repo root, that are fetched into the local cache. */
	files: {
		encoder: string;
		decoder: string;
		joiner: string;
		tokens: string;
	};
}
49
+
50
/** Any speech-to-text tier; discriminate on `engine` to reach engine-specific fields. */
export type SttModelSpec =
	| TransformersSttModelSpec
	| SherpaSttModelSpec;
51
+
52
/**
 * Registry of speech model tiers, listed from lightest to SoTA. The tier used
 * when none is configured is {@link DEFAULT_STT_MODEL_KEY}.
 *
 * - `fast`, `balanced`, `turbo`: multilingual Whisper checkpoints run through
 *   transformers.js.
 * - `parakeet`: NVIDIA Parakeet TDT 0.6B v3 run through sherpa-onnx, the Open
 *   ASR Leaderboard leader (lower WER and far higher throughput than Whisper).
 *
 * `as const` preserves each entry's literal types (notably `key`), while
 * `satisfies` still validates every entry against {@link SttModelSpec}.
 */
export const STT_MODELS = [
	// Whisper base, 8-bit quantized.
	{
		key: "fast",
		engine: "transformers",
		repo: "onnx-community/whisper-base",
		dtype: "q8",
		englishOnly: false,
		label: "Fast (Whisper base)",
		description: "Whisper base, multilingual. Smallest + fastest; lowest accuracy. Best for low-resource machines.",
		sizeHint: "~60 MB",
	},
	// Whisper small, 8-bit quantized.
	{
		key: "balanced",
		engine: "transformers",
		repo: "onnx-community/whisper-small",
		dtype: "q8",
		englishOnly: false,
		label: "Balanced (Whisper small)",
		description: "Whisper small, multilingual. More accurate than Fast, still light on CPU/RAM.",
		sizeHint: "~190 MB",
	},
	// Whisper large-v3-turbo, 4-bit quantized.
	{
		key: "turbo",
		engine: "transformers",
		repo: "onnx-community/whisper-large-v3-turbo",
		dtype: "q4",
		englishOnly: false,
		label: "Turbo (Whisper large-v3)",
		description: "Whisper large-v3-turbo, 99 languages. Widest language coverage; large download, slower.",
		sizeHint: "~600 MB",
	},
	// Parakeet TDT v3 transducer, int8 sherpa-onnx export.
	{
		key: "parakeet",
		engine: "sherpa",
		repo: "csukuangfj/sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
		modelType: "nemo_transducer",
		files: {
			encoder: "encoder.int8.onnx",
			decoder: "decoder.int8.onnx",
			joiner: "joiner.int8.onnx",
			tokens: "tokens.txt",
		},
		englishOnly: false,
		label: "Parakeet TDT v3 (SoTA)",
		description:
			"NVIDIA Parakeet TDT 0.6B v3, 25 languages. Open ASR Leaderboard leader — best accuracy and far fastest decoding. Default.",
		sizeHint: "~680 MB",
	},
] as const satisfies readonly SttModelSpec[];
107
+
108
/**
 * SoTA default — NVIDIA Parakeet TDT 0.6B v3 (sherpa-onnx). Tops the Open ASR
 * Leaderboard on accuracy while decoding ~20× faster than Whisper large-v3.
 *
 * `satisfies SttModelKey` keeps the literal type `"parakeet"` but makes the
 * compiler reject the default if that tier is ever renamed or removed from
 * {@link STT_MODELS}.
 */
export const DEFAULT_STT_MODEL_KEY = "parakeet" satisfies SttModelKey;
113
+
114
/** One concrete entry of {@link STT_MODELS}; its `key` is the literal tier union. */
export type SttModel = (typeof STT_MODELS)[number];

/** Union of the literal `key` values declared in {@link STT_MODELS}. */
export type SttModelKey = SttModel["key"];
118
+
119
/**
 * Tier keys as a literal tuple. `satisfies` rejects any value that is not a
 * registry key; the check below additionally rejects a registry key that is
 * missing from this tuple.
 */
export const STT_MODEL_VALUES = ["fast", "balanced", "turbo", "parakeet"] as const satisfies readonly SttModelKey[];

type SttModelValue = (typeof STT_MODEL_VALUES)[number];
/** Registry keys that are absent from {@link STT_MODEL_VALUES}. */
type MissingSttModelValue = Exclude<SttModelKey, SttModelValue>;
/** Tuple values that are not registry keys. */
type ExtraSttModelValue = Exclude<SttModelValue, SttModelKey>;

// Compile-time guard: the annotated type collapses to `never` (so assigning
// `true` fails to type-check) unless both difference sets are empty.
const STT_MODEL_VALUES_MATCH_REGISTRY: [MissingSttModelValue | ExtraSttModelValue] extends [never] ? true : never =
	true;
void STT_MODEL_VALUES_MATCH_REGISTRY;
129
+
130
/** One `{ value, label, description }` option per registry entry, in registry order. */
export const STT_MODEL_OPTIONS = STT_MODELS.map(model => {
	const { key: value, label, description } = model;
	return { value, label, description };
}) satisfies ReadonlyArray<{ value: SttModelKey; label: string; description: string }>;
135
+
136
/** Type guard: true when `value` is the `key` of an entry in {@link STT_MODELS}. */
export function isSttModelKey(value: string): value is SttModelKey {
	for (const { key } of STT_MODELS) {
		if (key === value) {
			return true;
		}
	}
	return false;
}
139
+
140
/**
 * Looks up a registry entry by its key.
 *
 * @returns The first entry of {@link STT_MODELS} whose `key` equals `key`, or
 *   `undefined` when there is none.
 */
export function getSttModelSpec(key: string): SttModel | undefined {
	for (const candidate of STT_MODELS) {
		if (candidate.key === key) {
			return candidate;
		}
	}
	return undefined;
}
143
+
144
/**
 * Resolve a (possibly stale or legacy) `stt.modelName` value onto a concrete
 * spec, falling back to the SoTA default when the key is missing or unknown.
 *
 * @param key Configured tier key, or `undefined` when nothing is configured.
 * @returns The matching registry entry, or the entry for {@link DEFAULT_STT_MODEL_KEY}.
 * @throws Error if the default key itself is not present in the registry. This
 *   replaces a non-null assertion that would otherwise hand callers
 *   `undefined` typed as a valid spec.
 */
export function resolveSttModelSpec(key: string | undefined): SttModel {
	const requested = key === undefined ? undefined : getSttModelSpec(key);
	if (requested !== undefined) {
		return requested;
	}

	const fallback = getSttModelSpec(DEFAULT_STT_MODEL_KEY);
	if (fallback === undefined) {
		throw new Error(`Default STT model "${DEFAULT_STT_MODEL_KEY}" is missing from STT_MODELS`);
	}
	return fallback;
}