@drakulavich/parakeet-cli 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 drakulavich
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # parakeet-cli
2
+
3
+ Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models. Zero Python. Runs on CPU.
4
+
5
+ ## Features
6
+
7
+ - **25 languages** — automatic language detection, no prompting needed
8
+ - **3x faster than Whisper** on CPU (see [benchmark](#benchmark))
9
+ - **Zero Python** — pure TypeScript/Bun with onnxruntime-node
10
+ - **Auto-downloads models** — ~3GB cached in `~/.cache/parakeet/` on first run
11
+ - **Any audio format** — ffmpeg handles OGG, MP3, WAV, FLAC, M4A, etc.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ bun install -g @drakulavich/parakeet-cli
17
+ ```
18
+
19
+ Or clone and link locally:
20
+
21
+ ```bash
22
+ git clone https://github.com/drakulavich/parakeet-cli.git
23
+ cd parakeet-cli
24
+ bun install
25
+ bun link
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ```bash
31
+ # Transcribe any audio file (language auto-detected)
32
+ parakeet audio.ogg
33
+
34
+ # Force re-download models
35
+ parakeet --no-cache audio.wav
36
+
37
+ # Show version
38
+ parakeet --version
39
+ ```
40
+
41
+ Output goes to stdout, errors to stderr. Designed for piping and scripting.
42
+
43
+ ## Benchmark
44
+
45
+ Tested on 10 real Telegram voice messages (Russian, 3-10s each).
46
+ VM: AMD EPYC 7763 8C/16T, 64GB RAM, CPU-only.
47
+
48
+ | # | Whisper | Parakeet | Whisper Transcript | Parakeet Transcript |
49
+ |---|---------|----------|--------------------|---------------------|
50
+ | 1 | 13.3s | 4.4s | Проверь все свои конфиги и перенеси секреты в .env файл. | проверь все свои конфигии и перенеси секреты в дот энф файл |
51
+ | 2 | 13.1s | 4.2s | Вынеси еще секрет от Клода, который я тебе добавил. | неси еще секрет от Клода, который я тебе добавил |
52
+ | 3 | 12.7s | 4.0s | Установи пока Клод Код | Установи пока клот кот |
53
+ | 4 | 13.1s | 4.1s | Какие еще Telegram-юзеры имеют доступ к тебе? | ки еще телеграм юзеры имеют доступ к тебе |
54
+ | 5 | 12.7s | 4.0s | Закомите изменения в ГИТ | Закомить изменения в Гет |
55
+ | 6 | 13.1s | 4.1s | Узнай второго юзера в телеграме. | Узнай второго юзера в Телеграме |
56
+ | 7 | 13.4s | 5.0s | Ты добавил себе в память информацию из Vantage Handbook Репозитория | Ты добавил себе в память информацию из Вентаж хэндбук репозитория |
57
+ | 8 | 13.1s | 4.8s | Покажи его username в телеграмме, хочу написать ему. | жи его юзернейм в телеграме хочу написать ему |
58
+ | 9 | 14.2s | 4.5s | Не нужно посылать сообщение с транскрипцией. Сразу выполняй инструкцию. | жно слать сообщение с транскрипцией сразу выполняй инструкцию |
59
+ | 10 | 13.5s | 4.8s | То, что находится в папке Workspace, ты тоже коммитишь? | То, что находится в папке Воркспейс, ты тоже комитишь? |
60
+ | **Total** | **132.1s** | **43.8s** | | |
61
+
62
+ **Parakeet is 3x faster.** Whisper handles mixed-language words better (`.env`, `Workspace`). Parakeet transliterates them phonetically. Both produce transcripts usable by LLMs.
63
+
64
+ Models: Whisper medium (int8) vs Parakeet TDT 0.6B v3 (ONNX, CPU).
65
+
66
+ ## Supported Languages
67
+
68
+ Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian.
69
+
70
+ ## How It Works
71
+
72
+ ```
73
+ parakeet audio.ogg
74
+ |
75
+ +-- ffmpeg: any format -> 16kHz mono float32
76
+ +-- nemo128.onnx: waveform -> 128-dim log-mel spectrogram
77
+ +-- per-utterance normalization (mean=0, std=1)
78
+ +-- encoder-model.onnx: mel features -> encoder output
79
+ +-- TDT greedy decoder: encoder output -> token IDs + durations
80
+ +-- vocab.txt: token IDs -> text
81
+ |
82
+ stdout: transcript
83
+ ```
84
+
85
+ Uses [NVIDIA Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) exported to ONNX by [istupakov](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v3-onnx). Models auto-download from HuggingFace on first run (~3GB).
86
+
87
+ ## Requirements
88
+
89
+ - [Bun](https://bun.sh) >= 1.3
90
+ - [ffmpeg](https://ffmpeg.org) installed and in PATH
91
+ - ~3GB disk space for model cache
92
+
93
+ ### macOS (Apple Silicon)
94
+
95
+ Works natively on M1/M2/M3/M4. Install dependencies with Homebrew:
96
+
97
+ ```bash
98
+ brew install ffmpeg
99
+ curl -fsSL https://bun.sh/install | bash
100
+ bun install -g @drakulavich/parakeet-cli
101
+ ```
102
+
103
+ ### Linux
104
+
105
+ ```bash
106
+ apt install ffmpeg # or yum, pacman, etc.
107
+ curl -fsSL https://bun.sh/install | bash
108
+ bun install -g @drakulavich/parakeet-cli
109
+ ```
110
+
111
+ ## License
112
+
113
+ MIT
@@ -0,0 +1,2 @@
1
#!/usr/bin/env bun
// Thin launcher: defer to the TypeScript CLI entry point.
await import("../src/cli.ts");
package/package.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "name": "@drakulavich/parakeet-cli",
3
+ "version": "0.1.1",
4
+ "description": "Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models",
5
+ "type": "module",
6
+ "bin": {
7
+ "parakeet": "bin/parakeet.js"
8
+ },
9
+ "files": [
10
+ "bin/",
11
+ "src/",
12
+ "package.json",
13
+ "tsconfig.json",
14
+ "LICENSE",
15
+ "README.md"
16
+ ],
17
+ "scripts": {
18
+ "test": "bun test",
19
+ "test:unit": "bun test src/__tests__/",
20
+ "test:integration": "bun test tests/integration/"
21
+ },
22
+ "keywords": [
23
+ "asr",
24
+ "speech-to-text",
25
+ "transcription",
26
+ "parakeet",
27
+ "nvidia",
28
+ "onnx",
29
+ "multilingual",
30
+ "bun",
31
+ "cli"
32
+ ],
33
+ "author": "drakulavich",
34
+ "license": "MIT",
35
+ "repository": {
36
+ "type": "git",
37
+ "url": "git+https://github.com/drakulavich/parakeet-cli.git"
38
+ },
39
+ "homepage": "https://github.com/drakulavich/parakeet-cli#readme",
40
+ "bugs": {
41
+ "url": "https://github.com/drakulavich/parakeet-cli/issues"
42
+ },
43
+ "engines": {
44
+ "bun": ">=1.3.0"
45
+ },
46
+ "devDependencies": {
47
+ "@types/bun": "latest"
48
+ },
49
+ "dependencies": {
50
+ "onnxruntime-node": "^1.24.0"
51
+ }
52
+ }
@@ -0,0 +1,26 @@
1
+ import { describe, test, expect } from "bun:test";
2
+ import { convertToFloat32PCM } from "../audio";
3
+ import { existsSync } from "fs";
4
+
5
+ describe("audio", () => {
6
+ test("converts WAV to 16kHz mono Float32Array", async () => {
7
+ const buffer = await convertToFloat32PCM("fixtures/silence.wav");
8
+ expect(buffer).toBeInstanceOf(Float32Array);
9
+ // 1 second at 16kHz = 16000 samples
10
+ expect(buffer.length).toBeGreaterThan(15000);
11
+ expect(buffer.length).toBeLessThan(17000);
12
+ });
13
+
14
+ test("throws on missing file", async () => {
15
+ expect(convertToFloat32PCM("nonexistent.wav")).rejects.toThrow(
16
+ "file not found"
17
+ );
18
+ });
19
+
20
+ test("throws on corrupt file", async () => {
21
+ await Bun.write("fixtures/corrupt.bin", "not audio data");
22
+ expect(convertToFloat32PCM("fixtures/corrupt.bin")).rejects.toThrow(
23
+ "failed to convert audio"
24
+ );
25
+ });
26
+ });
@@ -0,0 +1,57 @@
1
+ import { describe, test, expect } from "bun:test";
2
+ import { greedyDecode, type DecoderSession } from "../decoder";
3
+
4
+ function mockSession(responses: Array<{ tokenLogits: number[]; durationLogits: number[] }>): DecoderSession {
5
+ let callIndex = 0;
6
+ return {
7
+ async decode(_encoderFrame, _targets, _targetLength, _state1, _state2) {
8
+ const resp = responses[Math.min(callIndex++, responses.length - 1)];
9
+ const output = new Float32Array([...resp.tokenLogits, ...resp.durationLogits]);
10
+ const state1 = new Float32Array(1);
11
+ const state2 = new Float32Array(1);
12
+ return { output, state1, state2 };
13
+ },
14
+ vocabSize: responses[0]?.tokenLogits.length ?? 4,
15
+ blankId: (responses[0]?.tokenLogits.length ?? 4) - 1,
16
+ stateDims: { layers: 1, hidden: 1 },
17
+ };
18
+ }
19
+
20
+ describe("decoder", () => {
21
+ test("emits non-blank tokens", async () => {
22
+ const session = mockSession([
23
+ { tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
24
+ { tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0] },
25
+ { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
26
+ ]);
27
+ const tokens = await greedyDecode(session, 3);
28
+ expect(tokens).toEqual([0, 1]);
29
+ });
30
+
31
+ test("respects duration skipping", async () => {
32
+ const session = mockSession([
33
+ { tokenLogits: [10, 0, 0, -10], durationLogits: [0, 0, 10] },
34
+ { tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0, 0] },
35
+ { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0, 0] },
36
+ ]);
37
+ const tokens = await greedyDecode(session, 5);
38
+ expect(tokens).toEqual([0, 1]);
39
+ });
40
+
41
+ test("handles max_tokens_per_step limit", async () => {
42
+ const session = mockSession([
43
+ { tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
44
+ ]);
45
+ const tokens = await greedyDecode(session, 2);
46
+ expect(tokens.length).toBeLessThanOrEqual(20);
47
+ expect(tokens.length).toBeGreaterThan(0);
48
+ });
49
+
50
+ test("returns empty for zero-length encoder output", async () => {
51
+ const session = mockSession([
52
+ { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
53
+ ]);
54
+ const tokens = await greedyDecode(session, 0);
55
+ expect(tokens).toEqual([]);
56
+ });
57
+ });
@@ -0,0 +1,29 @@
1
+ import { describe, test, expect } from "bun:test";
2
+ import { getModelDir, MODEL_FILES, HF_REPOS } from "../models";
3
+ import { join } from "path";
4
+ import { homedir } from "os";
5
+
6
+ describe("models", () => {
7
+ test("getModelDir returns correct cache path for v2", () => {
8
+ const dir = getModelDir("v2");
9
+ expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v2"));
10
+ });
11
+
12
+ test("getModelDir returns correct cache path for v3", () => {
13
+ const dir = getModelDir("v3");
14
+ expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v3"));
15
+ });
16
+
17
+ test("MODEL_FILES lists required files", () => {
18
+ expect(MODEL_FILES).toContain("encoder-model.onnx");
19
+ expect(MODEL_FILES).toContain("encoder-model.onnx.data");
20
+ expect(MODEL_FILES).toContain("decoder_joint-model.onnx");
21
+ expect(MODEL_FILES).toContain("nemo128.onnx");
22
+ expect(MODEL_FILES).toContain("vocab.txt");
23
+ });
24
+
25
+ test("HF_REPOS maps versions to repo IDs", () => {
26
+ expect(HF_REPOS.v2).toBe("istupakov/parakeet-tdt-0.6b-v2-onnx");
27
+ expect(HF_REPOS.v3).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
28
+ });
29
+ });
@@ -0,0 +1,45 @@
1
+ import { describe, test, expect } from "bun:test";
2
+ import { Tokenizer } from "../tokenizer";
3
+
4
+ describe("tokenizer", () => {
5
+ test("loads vocab from file", async () => {
6
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
7
+ expect(tok.vocabSize).toBe(6);
8
+ expect(tok.blankId).toBe(5);
9
+ });
10
+
11
+ test("detokenizes token IDs to text", async () => {
12
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
13
+ const text = tok.detokenize([0, 1]);
14
+ expect(text).toBe("hello world");
15
+ });
16
+
17
+ test("handles blank tokens by skipping them", async () => {
18
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
19
+ const text = tok.detokenize([0, 5, 1]);
20
+ expect(text).toBe("hello world");
21
+ });
22
+
23
+ test("handles empty token list", async () => {
24
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
25
+ const text = tok.detokenize([]);
26
+ expect(text).toBe("");
27
+ });
28
+
29
+ test("handles only blank tokens", async () => {
30
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
31
+ const text = tok.detokenize([5, 5, 5]);
32
+ expect(text).toBe("");
33
+ });
34
+
35
+ test("joins subword tokens correctly", async () => {
36
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
37
+ const text = tok.detokenize([3, 4]);
38
+ expect(text).toBe("cats");
39
+ });
40
+
41
+ test("isAsciiDominant returns true for ASCII tokens", async () => {
42
+ const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
43
+ expect(tok.isAsciiDominant([0, 1, 2])).toBe(true);
44
+ });
45
+ });
@@ -0,0 +1,11 @@
1
+ import { describe, test, expect } from "bun:test";
2
+ import { transcribe } from "../transcribe";
3
+
4
+ describe("transcribe", () => {
5
+ test("returns empty string for very short audio", async () => {
6
+ // Audio < 0.1s (1600 samples) should return empty
7
+ // We can't easily test this without a fixture, so this is a smoke test
8
+ // that the module exports correctly
9
+ expect(typeof transcribe).toBe("function");
10
+ });
11
+ });
package/src/audio.ts ADDED
@@ -0,0 +1,45 @@
1
+ import { existsSync, unlinkSync } from "fs";
2
+ import { tmpdir } from "os";
3
+ import { join } from "path";
4
+ import { randomUUID } from "crypto";
5
+
6
+ let ffmpegChecked = false;
7
+
8
+ export async function convertToFloat32PCM(inputPath: string): Promise<Float32Array> {
9
+ if (!existsSync(inputPath)) {
10
+ throw new Error(`file not found: ${inputPath}`);
11
+ }
12
+
13
+ await assertFfmpegExists();
14
+
15
+ const tmpPath = join(tmpdir(), `parakeet-${randomUUID()}.f32le`);
16
+
17
+ try {
18
+ const proc = Bun.spawn(
19
+ ["ffmpeg", "-i", inputPath, "-ar", "16000", "-ac", "1", "-f", "f32le", "-acodec", "pcm_f32le", tmpPath, "-y"],
20
+ { stdout: "pipe", stderr: "pipe" }
21
+ );
22
+
23
+ const exitCode = await proc.exited;
24
+
25
+ if (exitCode !== 0) {
26
+ const stderr = await new Response(proc.stderr).text();
27
+ throw new Error(`failed to convert audio: ${stderr.trim().split("\n").pop()}`);
28
+ }
29
+
30
+ const raw = await Bun.file(tmpPath).arrayBuffer();
31
+ return new Float32Array(raw);
32
+ } finally {
33
+ try { unlinkSync(tmpPath); } catch {}
34
+ }
35
+ }
36
+
37
+ async function assertFfmpegExists(): Promise<void> {
38
+ if (ffmpegChecked) return;
39
+ const proc = Bun.spawn(["which", "ffmpeg"], { stdout: "pipe", stderr: "pipe" });
40
+ const exitCode = await proc.exited;
41
+ if (exitCode !== 0) {
42
+ throw new Error("ffmpeg not found in PATH");
43
+ }
44
+ ffmpegChecked = true;
45
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env bun
2
+
3
+ import { existsSync } from "fs";
4
+ import { transcribe } from "./transcribe";
5
+
6
+ async function main(): Promise<void> {
7
+ const args = process.argv.slice(2);
8
+
9
+ if (args.includes("--version")) {
10
+ const pkg = await Bun.file(new URL("../package.json", import.meta.url)).json();
11
+ console.log(pkg.version);
12
+ process.exit(0);
13
+ }
14
+
15
+ const noCache = args.includes("--no-cache");
16
+ const file = args.filter((a) => !a.startsWith("--"))[0];
17
+
18
+ if (!file) {
19
+ console.error("Usage: parakeet [--no-cache] <audio_file>");
20
+ process.exit(1);
21
+ }
22
+
23
+ if (!existsSync(file)) {
24
+ console.error(`Error: file not found: ${file}`);
25
+ process.exit(1);
26
+ }
27
+
28
+ try {
29
+ const text = await transcribe(file, { noCache });
30
+ if (text) process.stdout.write(text + "\n");
31
+ } catch (err: any) {
32
+ console.error(`Error: ${err.message}`);
33
+ process.exit(1);
34
+ }
35
+ }
36
+
37
+ main();
package/src/decoder.ts ADDED
@@ -0,0 +1,135 @@
1
+ import * as ort from "onnxruntime-node";
2
+ import { join } from "path";
3
+ import { ensureOrtBackend } from "./ort-backend-fix";
4
+
5
+ const MAX_TOKENS_PER_STEP = 10;
6
+
7
+ export interface DecoderSession {
8
+ decode(
9
+ encoderFrame: Float32Array,
10
+ targets: number[],
11
+ targetLength: number,
12
+ state1: Float32Array,
13
+ state2: Float32Array
14
+ ): Promise<{ output: Float32Array; state1: Float32Array; state2: Float32Array }>;
15
+ vocabSize: number;
16
+ blankId: number;
17
+ stateDims: { layers: number; hidden: number };
18
+ }
19
+
20
+ export async function greedyDecode(
21
+ session: DecoderSession,
22
+ encoderLength: number,
23
+ encoderData?: Float32Array,
24
+ encoderDim?: number
25
+ ): Promise<number[]> {
26
+ if (encoderLength === 0) return [];
27
+
28
+ const tokens: number[] = [];
29
+ const stateSize = session.stateDims.layers * session.stateDims.hidden;
30
+ let state1 = new Float32Array(stateSize);
31
+ let state2 = new Float32Array(stateSize);
32
+ let lastToken = session.blankId;
33
+
34
+ let t = 0;
35
+ while (t < encoderLength) {
36
+ let tokensThisStep = 0;
37
+
38
+ while (tokensThisStep < MAX_TOKENS_PER_STEP) {
39
+ let frame: Float32Array;
40
+ if (encoderData && encoderDim) {
41
+ // Must copy — ort.Tensor doesn't work with subarray views under Bun
42
+ frame = encoderData.slice(t * encoderDim, (t + 1) * encoderDim);
43
+ } else {
44
+ frame = new Float32Array(1);
45
+ }
46
+
47
+ const result = await session.decode(frame, [lastToken], 1, state1, state2);
48
+ const output = result.output;
49
+
50
+ const tokenLogits = output.slice(0, session.vocabSize);
51
+ const durationLogits = output.slice(session.vocabSize);
52
+
53
+ const tokenId = argmax(tokenLogits);
54
+ const duration = argmax(durationLogits);
55
+
56
+ state1 = result.state1;
57
+ state2 = result.state2;
58
+
59
+ if (tokenId === session.blankId) {
60
+ t += 1;
61
+ break;
62
+ }
63
+
64
+ tokens.push(tokenId);
65
+ lastToken = tokenId;
66
+ tokensThisStep++;
67
+
68
+ if (duration > 0) {
69
+ t += duration;
70
+ break;
71
+ }
72
+ }
73
+
74
+ if (tokensThisStep >= MAX_TOKENS_PER_STEP) {
75
+ t += 1;
76
+ }
77
+ }
78
+
79
+ return tokens;
80
+ }
81
+
82
+ function argmax(arr: Float32Array): number {
83
+ let maxIdx = 0;
84
+ let maxVal = arr[0];
85
+ for (let i = 1; i < arr.length; i++) {
86
+ if (arr[i] > maxVal) {
87
+ maxVal = arr[i];
88
+ maxIdx = i;
89
+ }
90
+ }
91
+ return maxIdx;
92
+ }
93
+
94
+ let onnxSession: ort.InferenceSession | null = null;
95
+
96
+ export async function initDecoder(modelDir: string): Promise<void> {
97
+ if (onnxSession) return;
98
+ ensureOrtBackend();
99
+ onnxSession = await ort.InferenceSession.create(join(modelDir, "decoder_joint-model.onnx"));
100
+ }
101
+
102
+ export function createOnnxDecoderSession(
103
+ vocabSize: number,
104
+ blankId: number,
105
+ layers: number,
106
+ hidden: number
107
+ ): DecoderSession {
108
+ return {
109
+ vocabSize,
110
+ blankId,
111
+ stateDims: { layers, hidden },
112
+ async decode(encoderFrame, targets, targetLength, state1, state2) {
113
+ if (!onnxSession) throw new Error("decoder not initialized");
114
+
115
+ const D = encoderFrame.length;
116
+ const results = await onnxSession.run({
117
+ encoder_outputs: new ort.Tensor("float32", encoderFrame, [1, D, 1]),
118
+ targets: new ort.Tensor("int32", Int32Array.from(targets), [1, targets.length]),
119
+ target_length: new ort.Tensor("int32", Int32Array.from([targetLength]), [1]),
120
+ input_states_1: new ort.Tensor("float32", state1, [layers, 1, hidden]),
121
+ input_states_2: new ort.Tensor("float32", state2, [layers, 1, hidden]),
122
+ });
123
+
124
+ return {
125
+ output: new Float32Array(results["outputs"].data as Float32Array),
126
+ state1: new Float32Array(results["output_states_1"].data as Float32Array),
127
+ state2: new Float32Array(results["output_states_2"].data as Float32Array),
128
+ };
129
+ },
130
+ };
131
+ }
132
+
133
+ export function releaseDecoder(): void {
134
+ onnxSession = null;
135
+ }
package/src/encoder.ts ADDED
@@ -0,0 +1,32 @@
1
+ import * as ort from "onnxruntime-node";
2
+ import { join } from "path";
3
+ import { ensureOrtBackend } from "./ort-backend-fix";
4
+
5
+ let session: ort.InferenceSession | null = null;
6
+
7
+ export async function initEncoder(modelDir: string): Promise<void> {
8
+ if (session) return;
9
+ ensureOrtBackend();
10
+ session = await ort.InferenceSession.create(join(modelDir, "encoder-model.onnx"));
11
+ }
12
+
13
+ export async function encode(
14
+ features: ort.Tensor,
15
+ length: ort.Tensor
16
+ ): Promise<{ encoderOutput: ort.Tensor; encodedLength: number }> {
17
+ if (!session) throw new Error("encoder not initialized");
18
+
19
+ const results = await session.run({
20
+ audio_signal: features,
21
+ length: length,
22
+ });
23
+
24
+ const encoderOutput = results["outputs"];
25
+ const encodedLength = Number((results["encoded_lengths"].data as BigInt64Array)[0]);
26
+
27
+ return { encoderOutput, encodedLength };
28
+ }
29
+
30
+ export function releaseEncoder(): void {
31
+ session = null;
32
+ }
package/src/models.ts ADDED
@@ -0,0 +1,58 @@
1
import { existsSync, mkdirSync, renameSync } from "fs";
import { homedir } from "os";
import { join } from "path";
4
+
5
+ export type ModelVersion = "v2" | "v3";
6
+
7
+ export const HF_REPOS: Record<ModelVersion, string> = {
8
+ v2: "istupakov/parakeet-tdt-0.6b-v2-onnx",
9
+ v3: "istupakov/parakeet-tdt-0.6b-v3-onnx",
10
+ };
11
+
12
+ export const MODEL_FILES = [
13
+ "encoder-model.onnx",
14
+ "encoder-model.onnx.data",
15
+ "decoder_joint-model.onnx",
16
+ "nemo128.onnx",
17
+ "vocab.txt",
18
+ ];
19
+
20
+ export function getModelDir(version: ModelVersion): string {
21
+ return join(homedir(), ".cache", "parakeet", version);
22
+ }
23
+
24
+ export function isModelCached(version: ModelVersion): boolean {
25
+ const dir = getModelDir(version);
26
+ return MODEL_FILES.every((f) => existsSync(join(dir, f)));
27
+ }
28
+
29
+ export async function ensureModel(version: ModelVersion, noCache = false): Promise<string> {
30
+ const dir = getModelDir(version);
31
+
32
+ if (!noCache && isModelCached(version)) {
33
+ return dir;
34
+ }
35
+
36
+ mkdirSync(dir, { recursive: true });
37
+
38
+ const repo = HF_REPOS[version];
39
+
40
+ for (const file of MODEL_FILES) {
41
+ const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
42
+ const dest = join(dir, file);
43
+
44
+ if (!noCache && existsSync(dest)) continue;
45
+
46
+ console.error(`Downloading ${file}...`);
47
+
48
+ const res = await fetch(url, { redirect: "follow" });
49
+
50
+ if (!res.ok) {
51
+ throw new Error(`failed to download model: ${url} (${res.status})`);
52
+ }
53
+
54
+ await Bun.write(dest, res);
55
+ }
56
+
57
+ return dir;
58
+ }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Workaround for Bun + onnxruntime-node backend registration issue.
3
+ *
4
+ * When Bun imports onnxruntime-node (CJS), the backend gets registered
5
+ * in the CJS instance of onnxruntime-common. But our ESM code gets the
6
+ * ESM instance of onnxruntime-common, which has no backends registered.
7
+ *
8
+ * This module manually registers the native backend into the ESM module.
9
+ */
10
+
11
+ let registered = false;
12
+
13
+ export function ensureOrtBackend(): void {
14
+ if (registered) return;
15
+ registered = true;
16
+
17
+ try {
18
+ // Force-load onnxruntime-node via require() to trigger CJS side-effects
19
+ // that register the native backend. Under bun test this happens
20
+ // automatically, but bun run may need the nudge.
21
+ require("onnxruntime-node");
22
+ } catch {
23
+ // If it fails, the native backend might already be registered
24
+ // (e.g. running under Node.js or a future Bun version that fixes this)
25
+ }
26
+ }
@@ -0,0 +1,59 @@
1
+ import * as ort from "onnxruntime-node";
2
+ import { join } from "path";
3
+ import { ensureOrtBackend } from "./ort-backend-fix";
4
+
5
+ let session: ort.InferenceSession | null = null;
6
+
7
+ export async function initPreprocessor(modelDir: string): Promise<void> {
8
+ if (session) return;
9
+ ensureOrtBackend();
10
+ session = await ort.InferenceSession.create(join(modelDir, "nemo128.onnx"));
11
+ }
12
+
13
+ export async function preprocess(audio: Float32Array): Promise<{ features: ort.Tensor; length: ort.Tensor }> {
14
+ if (!session) throw new Error("preprocessor not initialized");
15
+
16
+ const inputTensor = new ort.Tensor("float32", audio, [1, audio.length]);
17
+ const lengthTensor = new ort.Tensor("int64", BigInt64Array.from([BigInt(audio.length)]), [1]);
18
+
19
+ const results = await session.run({
20
+ waveforms: inputTensor,
21
+ waveforms_lens: lengthTensor,
22
+ });
23
+
24
+ const melData = results["features"].data as Float32Array;
25
+ const melDims = results["features"].dims as readonly number[];
26
+ const T = melDims[2];
27
+ const actualLength = Number((results["features_lens"].data as BigInt64Array)[0]);
28
+
29
+ const numFeatures = melDims[1];
30
+ const normalized = new Float32Array(melData.length);
31
+
32
+ for (let f = 0; f < numFeatures; f++) {
33
+ let sum = 0;
34
+ let sumSq = 0;
35
+
36
+ for (let t = 0; t < actualLength; t++) {
37
+ const val = melData[f * T + t];
38
+ sum += val;
39
+ sumSq += val * val;
40
+ }
41
+
42
+ const mean = sum / actualLength;
43
+ const variance = sumSq / actualLength - mean * mean;
44
+ const std = Math.sqrt(Math.max(variance, 1e-10));
45
+
46
+ for (let t = 0; t < T; t++) {
47
+ normalized[f * T + t] = t < actualLength ? (melData[f * T + t] - mean) / std : 0;
48
+ }
49
+ }
50
+
51
+ const featureTensor = new ort.Tensor("float32", normalized, melDims as number[]);
52
+ const outputLength = new ort.Tensor("int64", BigInt64Array.from([BigInt(actualLength)]), [1]);
53
+
54
+ return { features: featureTensor, length: outputLength };
55
+ }
56
+
57
+ export function releasePreprocessor(): void {
58
+ session = null;
59
+ }
@@ -0,0 +1,59 @@
1
+ export class Tokenizer {
2
+ private idToToken: Map<number, string>;
3
+ readonly vocabSize: number;
4
+ readonly blankId: number;
5
+
6
+ private constructor(idToToken: Map<number, string>, blankId: number) {
7
+ this.idToToken = idToToken;
8
+ this.vocabSize = idToToken.size;
9
+ this.blankId = blankId;
10
+ }
11
+
12
+ static async fromFile(path: string): Promise<Tokenizer> {
13
+ const content = await Bun.file(path).text();
14
+ const idToToken = new Map<number, string>();
15
+ let blankId = -1;
16
+
17
+ for (const line of content.trim().split("\n")) {
18
+ const lastSpace = line.lastIndexOf(" ");
19
+ if (lastSpace === -1) continue;
20
+ const token = line.slice(0, lastSpace);
21
+ const id = parseInt(line.slice(lastSpace + 1), 10);
22
+ if (isNaN(id)) continue;
23
+ idToToken.set(id, token);
24
+ if (token === "<blk>") blankId = id;
25
+ }
26
+
27
+ if (blankId === -1) {
28
+ blankId = idToToken.size - 1;
29
+ }
30
+
31
+ return new Tokenizer(idToToken, blankId);
32
+ }
33
+
34
+ detokenize(tokenIds: number[]): string {
35
+ const pieces: string[] = [];
36
+ for (const id of tokenIds) {
37
+ if (id === this.blankId) continue;
38
+ const token = this.idToToken.get(id);
39
+ if (token !== undefined) pieces.push(token);
40
+ }
41
+ return pieces.join("").replaceAll("\u2581", " ").trim();
42
+ }
43
+
44
+ isAsciiDominant(tokenIds: number[], threshold = 0.9): boolean {
45
+ const nonBlank = tokenIds.filter((id) => id !== this.blankId);
46
+ if (nonBlank.length === 0) return false;
47
+
48
+ let asciiCount = 0;
49
+ for (const id of nonBlank) {
50
+ const token = this.idToToken.get(id) ?? "";
51
+ const cleaned = token.replaceAll("\u2581", "");
52
+ if (cleaned.length > 0 && /^[\x00-\x7F]+$/.test(cleaned)) {
53
+ asciiCount++;
54
+ }
55
+ }
56
+
57
+ return asciiCount / nonBlank.length >= threshold;
58
+ }
59
+ }
@@ -0,0 +1,61 @@
1
+ import { ensureModel } from "./models";
2
+ import { convertToFloat32PCM } from "./audio";
3
+ import { initPreprocessor, preprocess } from "./preprocess";
4
+ import { initEncoder, encode } from "./encoder";
5
+ import {
6
+ initDecoder,
7
+ createOnnxDecoderSession,
8
+ greedyDecode,
9
+ } from "./decoder";
10
+ import { Tokenizer } from "./tokenizer";
11
+ import { join } from "path";
12
+
13
+ // Parakeet TDT 0.6B decoder state dimensions (from ONNX model input shapes)
14
+ const DECODER_LAYERS = 2;
15
+ const DECODER_HIDDEN = 640;
16
+
17
+ export interface TranscribeOptions {
18
+ noCache?: boolean;
19
+ }
20
+
21
+ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}): Promise<string> {
22
+ const audio = await convertToFloat32PCM(audioPath);
23
+
24
+ if (audio.length < 1600) {
25
+ return "";
26
+ }
27
+
28
+ const noCache = opts.noCache ?? false;
29
+ const modelDir = await ensureModel("v3", noCache);
30
+ const tokenizer = await Tokenizer.fromFile(join(modelDir, "vocab.txt"));
31
+
32
+ await initPreprocessor(modelDir);
33
+ await initEncoder(modelDir);
34
+ await initDecoder(modelDir);
35
+
36
+ const { features, length } = await preprocess(audio);
37
+ const { encoderOutput, encodedLength } = await encode(features, length);
38
+
39
+ const encoderData = encoderOutput.data as Float32Array;
40
+ const dims = encoderOutput.dims as readonly number[];
41
+ const D = dims[1];
42
+ const T = dims[2];
43
+
44
+ // Transpose from [1, D, T] to [T, D] so each frame is contiguous
45
+ const transposed = new Float32Array(T * D);
46
+ for (let t = 0; t < T; t++) {
47
+ for (let d = 0; d < D; d++) {
48
+ transposed[t * D + d] = encoderData[d * T + t];
49
+ }
50
+ }
51
+
52
+ const session = createOnnxDecoderSession(
53
+ tokenizer.vocabSize,
54
+ tokenizer.blankId,
55
+ DECODER_LAYERS,
56
+ DECODER_HIDDEN,
57
+ );
58
+
59
+ const tokens = await greedyDecode(session, encodedLength, transposed, D);
60
+ return tokenizer.detokenize(tokens);
61
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ESNext",
4
+ "module": "ESNext",
5
+ "moduleResolution": "bundler",
6
+ "types": ["bun-types"],
7
+ "strict": true,
8
+ "skipLibCheck": true,
9
+ "outDir": "./dist",
10
+ "rootDir": "./src"
11
+ },
12
+ "include": ["src/**/*.ts", "tests/**/*.ts"]
13
+ }