@drakulavich/parakeet-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +113 -0
- package/bin/parakeet.js +2 -0
- package/package.json +52 -0
- package/src/__tests__/audio.test.ts +26 -0
- package/src/__tests__/decoder.test.ts +57 -0
- package/src/__tests__/models.test.ts +29 -0
- package/src/__tests__/tokenizer.test.ts +45 -0
- package/src/__tests__/transcribe.test.ts +11 -0
- package/src/audio.ts +45 -0
- package/src/cli.ts +37 -0
- package/src/decoder.ts +135 -0
- package/src/encoder.ts +32 -0
- package/src/models.ts +58 -0
- package/src/ort-backend-fix.ts +26 -0
- package/src/preprocess.ts +59 -0
- package/src/tokenizer.ts +59 -0
- package/src/transcribe.ts +61 -0
- package/tsconfig.json +13 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 drakulavich
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# parakeet-cli
|
|
2
|
+
|
|
3
|
+
Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models. Zero Python. Runs on CPU.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **25 languages** — automatic language detection, no prompting needed
|
|
8
|
+
- **3x faster than Whisper** on CPU (see [benchmark](#benchmark))
|
|
9
|
+
- **Zero Python** — pure TypeScript/Bun with onnxruntime-node
|
|
10
|
+
- **Auto-downloads models** — ~3GB cached in `~/.cache/parakeet/` on first run
|
|
11
|
+
- **Any audio format** — ffmpeg handles OGG, MP3, WAV, FLAC, M4A, etc.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bun install -g @drakulavich/parakeet-cli
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or clone and link locally:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
git clone https://github.com/drakulavich/parakeet-cli.git
|
|
23
|
+
cd parakeet-cli
|
|
24
|
+
bun install
|
|
25
|
+
bun link
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Transcribe any audio file (language auto-detected)
|
|
32
|
+
parakeet audio.ogg
|
|
33
|
+
|
|
34
|
+
# Force re-download models
|
|
35
|
+
parakeet --no-cache audio.wav
|
|
36
|
+
|
|
37
|
+
# Show version
|
|
38
|
+
parakeet --version
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Output goes to stdout, errors to stderr. Designed for piping and scripting.
|
|
42
|
+
|
|
43
|
+
## Benchmark
|
|
44
|
+
|
|
45
|
+
Tested on 10 real Telegram voice messages (Russian, 3-10s each).
|
|
46
|
+
VM: AMD EPYC 7763 8C/16T, 64GB RAM, CPU-only.
|
|
47
|
+
|
|
48
|
+
| # | Whisper | Parakeet | Whisper Transcript | Parakeet Transcript |
|
|
49
|
+
|---|---------|----------|--------------------|---------------------|
|
|
50
|
+
| 1 | 13.3s | 4.4s | Проверь все свои конфиги и перенеси секреты в .env файл. | проверь все свои конфигии и перенеси секреты в дот энф файл |
|
|
51
|
+
| 2 | 13.1s | 4.2s | Вынеси еще секрет от Клода, который я тебе добавил. | неси еще секрет от Клода, который я тебе добавил |
|
|
52
|
+
| 3 | 12.7s | 4.0s | Установи пока Клод Код | Установи пока клот кот |
|
|
53
|
+
| 4 | 13.1s | 4.1s | Какие еще Telegram-юзеры имеют доступ к тебе? | ки еще телеграм юзеры имеют доступ к тебе |
|
|
54
|
+
| 5 | 12.7s | 4.0s | Закомите изменения в ГИТ | Закомить изменения в Гет |
|
|
55
|
+
| 6 | 13.1s | 4.1s | Узнай второго юзера в телеграме. | Узнай второго юзера в Телеграме |
|
|
56
|
+
| 7 | 13.4s | 5.0s | Ты добавил себе в память информацию из Vantage Handbook Репозитория | Ты добавил себе в память информацию из Вентаж хэндбук репозитория |
|
|
57
|
+
| 8 | 13.1s | 4.8s | Покажи его username в телеграмме, хочу написать ему. | жи его юзернейм в телеграме хочу написать ему |
|
|
58
|
+
| 9 | 14.2s | 4.5s | Не нужно посылать сообщение с транскрипцией. Сразу выполняй инструкцию. | жно слать сообщение с транскрипцией сразу выполняй инструкцию |
|
|
59
|
+
| 10 | 13.5s | 4.8s | То, что находится в папке Workspace, ты тоже коммитишь? | То, что находится в папке Воркспейс, ты тоже комитишь? |
|
|
60
|
+
| **Total** | **132.1s** | **43.8s** | | |
|
|
61
|
+
|
|
62
|
+
**Parakeet is 3x faster.** Whisper handles mixed-language words better (`.env`, `Workspace`). Parakeet transliterates them phonetically. Both produce transcripts usable by LLMs.
|
|
63
|
+
|
|
64
|
+
Models: Whisper medium (int8) vs Parakeet TDT 0.6B v3 (ONNX, CPU).
|
|
65
|
+
|
|
66
|
+
## Supported Languages
|
|
67
|
+
|
|
68
|
+
Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian.
|
|
69
|
+
|
|
70
|
+
## How It Works
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
parakeet audio.ogg
|
|
74
|
+
|
|
|
75
|
+
+-- ffmpeg: any format -> 16kHz mono float32
|
|
76
|
+
+-- nemo128.onnx: waveform -> 128-dim log-mel spectrogram
|
|
77
|
+
+-- per-utterance normalization (mean=0, std=1)
|
|
78
|
+
+-- encoder-model.onnx: mel features -> encoder output
|
|
79
|
+
+-- TDT greedy decoder: encoder output -> token IDs + durations
|
|
80
|
+
+-- vocab.txt: token IDs -> text
|
|
81
|
+
|
|
|
82
|
+
stdout: transcript
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Uses [NVIDIA Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) exported to ONNX by [istupakov](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v3-onnx). Models auto-download from HuggingFace on first run (~3GB).
|
|
86
|
+
|
|
87
|
+
## Requirements
|
|
88
|
+
|
|
89
|
+
- [Bun](https://bun.sh) >= 1.3
|
|
90
|
+
- [ffmpeg](https://ffmpeg.org) installed and in PATH
|
|
91
|
+
- ~3GB disk space for model cache
|
|
92
|
+
|
|
93
|
+
### macOS (Apple Silicon)
|
|
94
|
+
|
|
95
|
+
Works natively on M1/M2/M3/M4. Install dependencies with Homebrew:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
brew install ffmpeg
|
|
99
|
+
curl -fsSL https://bun.sh/install | bash
|
|
100
|
+
bun install -g @drakulavich/parakeet-cli
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Linux
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
apt install ffmpeg # or yum, pacman, etc.
|
|
107
|
+
curl -fsSL https://bun.sh/install | bash
|
|
108
|
+
bun install -g @drakulavich/parakeet-cli
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT
|
package/bin/parakeet.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@drakulavich/parakeet-cli",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"parakeet": "bin/parakeet.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"bin/",
|
|
11
|
+
"src/",
|
|
12
|
+
"package.json",
|
|
13
|
+
"tsconfig.json",
|
|
14
|
+
"LICENSE",
|
|
15
|
+
"README.md"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"test": "bun test",
|
|
19
|
+
"test:unit": "bun test src/__tests__/",
|
|
20
|
+
"test:integration": "bun test tests/integration/"
|
|
21
|
+
},
|
|
22
|
+
"keywords": [
|
|
23
|
+
"asr",
|
|
24
|
+
"speech-to-text",
|
|
25
|
+
"transcription",
|
|
26
|
+
"parakeet",
|
|
27
|
+
"nvidia",
|
|
28
|
+
"onnx",
|
|
29
|
+
"multilingual",
|
|
30
|
+
"bun",
|
|
31
|
+
"cli"
|
|
32
|
+
],
|
|
33
|
+
"author": "drakulavich",
|
|
34
|
+
"license": "MIT",
|
|
35
|
+
"repository": {
|
|
36
|
+
"type": "git",
|
|
37
|
+
"url": "git+https://github.com/drakulavich/parakeet-cli.git"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://github.com/drakulavich/parakeet-cli#readme",
|
|
40
|
+
"bugs": {
|
|
41
|
+
"url": "https://github.com/drakulavich/parakeet-cli/issues"
|
|
42
|
+
},
|
|
43
|
+
"engines": {
|
|
44
|
+
"bun": ">=1.3.0"
|
|
45
|
+
},
|
|
46
|
+
"devDependencies": {
|
|
47
|
+
"@types/bun": "latest"
|
|
48
|
+
},
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"onnxruntime-node": "^1.24.0"
|
|
51
|
+
}
|
|
52
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { describe, test, expect } from "bun:test";
|
|
2
|
+
import { convertToFloat32PCM } from "../audio";
|
|
3
|
+
import { existsSync } from "fs";
|
|
4
|
+
|
|
5
|
+
describe("audio", () => {
|
|
6
|
+
test("converts WAV to 16kHz mono Float32Array", async () => {
|
|
7
|
+
const buffer = await convertToFloat32PCM("fixtures/silence.wav");
|
|
8
|
+
expect(buffer).toBeInstanceOf(Float32Array);
|
|
9
|
+
// 1 second at 16kHz = 16000 samples
|
|
10
|
+
expect(buffer.length).toBeGreaterThan(15000);
|
|
11
|
+
expect(buffer.length).toBeLessThan(17000);
|
|
12
|
+
});
|
|
13
|
+
|
|
14
|
+
test("throws on missing file", async () => {
|
|
15
|
+
expect(convertToFloat32PCM("nonexistent.wav")).rejects.toThrow(
|
|
16
|
+
"file not found"
|
|
17
|
+
);
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
test("throws on corrupt file", async () => {
|
|
21
|
+
await Bun.write("fixtures/corrupt.bin", "not audio data");
|
|
22
|
+
expect(convertToFloat32PCM("fixtures/corrupt.bin")).rejects.toThrow(
|
|
23
|
+
"failed to convert audio"
|
|
24
|
+
);
|
|
25
|
+
});
|
|
26
|
+
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { describe, test, expect } from "bun:test";
|
|
2
|
+
import { greedyDecode, type DecoderSession } from "../decoder";
|
|
3
|
+
|
|
4
|
+
function mockSession(responses: Array<{ tokenLogits: number[]; durationLogits: number[] }>): DecoderSession {
|
|
5
|
+
let callIndex = 0;
|
|
6
|
+
return {
|
|
7
|
+
async decode(_encoderFrame, _targets, _targetLength, _state1, _state2) {
|
|
8
|
+
const resp = responses[Math.min(callIndex++, responses.length - 1)];
|
|
9
|
+
const output = new Float32Array([...resp.tokenLogits, ...resp.durationLogits]);
|
|
10
|
+
const state1 = new Float32Array(1);
|
|
11
|
+
const state2 = new Float32Array(1);
|
|
12
|
+
return { output, state1, state2 };
|
|
13
|
+
},
|
|
14
|
+
vocabSize: responses[0]?.tokenLogits.length ?? 4,
|
|
15
|
+
blankId: (responses[0]?.tokenLogits.length ?? 4) - 1,
|
|
16
|
+
stateDims: { layers: 1, hidden: 1 },
|
|
17
|
+
};
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
describe("decoder", () => {
|
|
21
|
+
test("emits non-blank tokens", async () => {
|
|
22
|
+
const session = mockSession([
|
|
23
|
+
{ tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
|
|
24
|
+
{ tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0] },
|
|
25
|
+
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
|
|
26
|
+
]);
|
|
27
|
+
const tokens = await greedyDecode(session, 3);
|
|
28
|
+
expect(tokens).toEqual([0, 1]);
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
test("respects duration skipping", async () => {
|
|
32
|
+
const session = mockSession([
|
|
33
|
+
{ tokenLogits: [10, 0, 0, -10], durationLogits: [0, 0, 10] },
|
|
34
|
+
{ tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0, 0] },
|
|
35
|
+
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0, 0] },
|
|
36
|
+
]);
|
|
37
|
+
const tokens = await greedyDecode(session, 5);
|
|
38
|
+
expect(tokens).toEqual([0, 1]);
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
test("handles max_tokens_per_step limit", async () => {
|
|
42
|
+
const session = mockSession([
|
|
43
|
+
{ tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
|
|
44
|
+
]);
|
|
45
|
+
const tokens = await greedyDecode(session, 2);
|
|
46
|
+
expect(tokens.length).toBeLessThanOrEqual(20);
|
|
47
|
+
expect(tokens.length).toBeGreaterThan(0);
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
test("returns empty for zero-length encoder output", async () => {
|
|
51
|
+
const session = mockSession([
|
|
52
|
+
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
|
|
53
|
+
]);
|
|
54
|
+
const tokens = await greedyDecode(session, 0);
|
|
55
|
+
expect(tokens).toEqual([]);
|
|
56
|
+
});
|
|
57
|
+
});
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { describe, test, expect } from "bun:test";
|
|
2
|
+
import { getModelDir, MODEL_FILES, HF_REPOS } from "../models";
|
|
3
|
+
import { join } from "path";
|
|
4
|
+
import { homedir } from "os";
|
|
5
|
+
|
|
6
|
+
describe("models", () => {
|
|
7
|
+
test("getModelDir returns correct cache path for v2", () => {
|
|
8
|
+
const dir = getModelDir("v2");
|
|
9
|
+
expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v2"));
|
|
10
|
+
});
|
|
11
|
+
|
|
12
|
+
test("getModelDir returns correct cache path for v3", () => {
|
|
13
|
+
const dir = getModelDir("v3");
|
|
14
|
+
expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v3"));
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
test("MODEL_FILES lists required files", () => {
|
|
18
|
+
expect(MODEL_FILES).toContain("encoder-model.onnx");
|
|
19
|
+
expect(MODEL_FILES).toContain("encoder-model.onnx.data");
|
|
20
|
+
expect(MODEL_FILES).toContain("decoder_joint-model.onnx");
|
|
21
|
+
expect(MODEL_FILES).toContain("nemo128.onnx");
|
|
22
|
+
expect(MODEL_FILES).toContain("vocab.txt");
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
test("HF_REPOS maps versions to repo IDs", () => {
|
|
26
|
+
expect(HF_REPOS.v2).toBe("istupakov/parakeet-tdt-0.6b-v2-onnx");
|
|
27
|
+
expect(HF_REPOS.v3).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
|
|
28
|
+
});
|
|
29
|
+
});
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { describe, test, expect } from "bun:test";
|
|
2
|
+
import { Tokenizer } from "../tokenizer";
|
|
3
|
+
|
|
4
|
+
describe("tokenizer", () => {
|
|
5
|
+
test("loads vocab from file", async () => {
|
|
6
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
7
|
+
expect(tok.vocabSize).toBe(6);
|
|
8
|
+
expect(tok.blankId).toBe(5);
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
test("detokenizes token IDs to text", async () => {
|
|
12
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
13
|
+
const text = tok.detokenize([0, 1]);
|
|
14
|
+
expect(text).toBe("hello world");
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
test("handles blank tokens by skipping them", async () => {
|
|
18
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
19
|
+
const text = tok.detokenize([0, 5, 1]);
|
|
20
|
+
expect(text).toBe("hello world");
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
test("handles empty token list", async () => {
|
|
24
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
25
|
+
const text = tok.detokenize([]);
|
|
26
|
+
expect(text).toBe("");
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
test("handles only blank tokens", async () => {
|
|
30
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
31
|
+
const text = tok.detokenize([5, 5, 5]);
|
|
32
|
+
expect(text).toBe("");
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
test("joins subword tokens correctly", async () => {
|
|
36
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
37
|
+
const text = tok.detokenize([3, 4]);
|
|
38
|
+
expect(text).toBe("cats");
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
test("isAsciiDominant returns true for ASCII tokens", async () => {
|
|
42
|
+
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
43
|
+
expect(tok.isAsciiDominant([0, 1, 2])).toBe(true);
|
|
44
|
+
});
|
|
45
|
+
});
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { describe, test, expect } from "bun:test";
|
|
2
|
+
import { transcribe } from "../transcribe";
|
|
3
|
+
|
|
4
|
+
describe("transcribe", () => {
|
|
5
|
+
test("returns empty string for very short audio", async () => {
|
|
6
|
+
// Audio < 0.1s (1600 samples) should return empty
|
|
7
|
+
// We can't easily test this without a fixture, so this is a smoke test
|
|
8
|
+
// that the module exports correctly
|
|
9
|
+
expect(typeof transcribe).toBe("function");
|
|
10
|
+
});
|
|
11
|
+
});
|
package/src/audio.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { existsSync, unlinkSync } from "fs";
|
|
2
|
+
import { tmpdir } from "os";
|
|
3
|
+
import { join } from "path";
|
|
4
|
+
import { randomUUID } from "crypto";
|
|
5
|
+
|
|
6
|
+
let ffmpegChecked = false;
|
|
7
|
+
|
|
8
|
+
export async function convertToFloat32PCM(inputPath: string): Promise<Float32Array> {
|
|
9
|
+
if (!existsSync(inputPath)) {
|
|
10
|
+
throw new Error(`file not found: ${inputPath}`);
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
await assertFfmpegExists();
|
|
14
|
+
|
|
15
|
+
const tmpPath = join(tmpdir(), `parakeet-${randomUUID()}.f32le`);
|
|
16
|
+
|
|
17
|
+
try {
|
|
18
|
+
const proc = Bun.spawn(
|
|
19
|
+
["ffmpeg", "-i", inputPath, "-ar", "16000", "-ac", "1", "-f", "f32le", "-acodec", "pcm_f32le", tmpPath, "-y"],
|
|
20
|
+
{ stdout: "pipe", stderr: "pipe" }
|
|
21
|
+
);
|
|
22
|
+
|
|
23
|
+
const exitCode = await proc.exited;
|
|
24
|
+
|
|
25
|
+
if (exitCode !== 0) {
|
|
26
|
+
const stderr = await new Response(proc.stderr).text();
|
|
27
|
+
throw new Error(`failed to convert audio: ${stderr.trim().split("\n").pop()}`);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const raw = await Bun.file(tmpPath).arrayBuffer();
|
|
31
|
+
return new Float32Array(raw);
|
|
32
|
+
} finally {
|
|
33
|
+
try { unlinkSync(tmpPath); } catch {}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
async function assertFfmpegExists(): Promise<void> {
|
|
38
|
+
if (ffmpegChecked) return;
|
|
39
|
+
const proc = Bun.spawn(["which", "ffmpeg"], { stdout: "pipe", stderr: "pipe" });
|
|
40
|
+
const exitCode = await proc.exited;
|
|
41
|
+
if (exitCode !== 0) {
|
|
42
|
+
throw new Error("ffmpeg not found in PATH");
|
|
43
|
+
}
|
|
44
|
+
ffmpegChecked = true;
|
|
45
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
import { existsSync } from "fs";
|
|
4
|
+
import { transcribe } from "./transcribe";
|
|
5
|
+
|
|
6
|
+
async function main(): Promise<void> {
|
|
7
|
+
const args = process.argv.slice(2);
|
|
8
|
+
|
|
9
|
+
if (args.includes("--version")) {
|
|
10
|
+
const pkg = await Bun.file(new URL("../package.json", import.meta.url)).json();
|
|
11
|
+
console.log(pkg.version);
|
|
12
|
+
process.exit(0);
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
const noCache = args.includes("--no-cache");
|
|
16
|
+
const file = args.filter((a) => !a.startsWith("--"))[0];
|
|
17
|
+
|
|
18
|
+
if (!file) {
|
|
19
|
+
console.error("Usage: parakeet [--no-cache] <audio_file>");
|
|
20
|
+
process.exit(1);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (!existsSync(file)) {
|
|
24
|
+
console.error(`Error: file not found: ${file}`);
|
|
25
|
+
process.exit(1);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
try {
|
|
29
|
+
const text = await transcribe(file, { noCache });
|
|
30
|
+
if (text) process.stdout.write(text + "\n");
|
|
31
|
+
} catch (err: any) {
|
|
32
|
+
console.error(`Error: ${err.message}`);
|
|
33
|
+
process.exit(1);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
main();
|
package/src/decoder.ts
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import * as ort from "onnxruntime-node";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import { ensureOrtBackend } from "./ort-backend-fix";
|
|
4
|
+
|
|
5
|
+
// Cap on consecutive non-blank emissions per encoder frame; guards against
// a degenerate model that never predicts blank from looping forever.
const MAX_TOKENS_PER_STEP = 10;

/**
 * Minimal abstraction over the decoder+joint network so greedyDecode can be
 * unit-tested against a scripted mock (see __tests__/decoder.test.ts).
 */
export interface DecoderSession {
  /**
   * Runs one decoder+joint step.
   * Returns `output` with `vocabSize` token logits first, then the duration
   * logits, plus the updated recurrent states.
   */
  decode(
    encoderFrame: Float32Array,
    targets: number[],
    targetLength: number,
    state1: Float32Array,
    state2: Float32Array
  ): Promise<{ output: Float32Array; state1: Float32Array; state2: Float32Array }>;
  // Number of token logits at the front of `output`.
  vocabSize: number;
  // Token id treated as blank (ends a frame's emissions).
  blankId: number;
  // Recurrent state tensor dimensions (flattened size = layers * hidden).
  stateDims: { layers: number; hidden: number };
}
|
|
19
|
+
|
|
20
|
+
/**
 * TDT (token-and-duration transducer) greedy decoding.
 *
 * Walks the encoder frames from t=0: at each step the decoder+joint network
 * predicts both a token and a duration. Non-blank tokens are emitted and, if
 * the predicted duration is > 0, time advances by that many frames; blank
 * always advances one frame.
 *
 * @param session       decoder abstraction (ONNX-backed or a test mock)
 * @param encoderLength number of valid encoder frames
 * @param encoderData   flattened encoder output read as frames of size
 *                      `encoderDim`; when omitted (tests) a 1-element dummy
 *                      frame is fed instead
 * @param encoderDim    feature size of one encoder frame
 * @returns emitted token ids, blanks excluded
 */
export async function greedyDecode(
  session: DecoderSession,
  encoderLength: number,
  encoderData?: Float32Array,
  encoderDim?: number
): Promise<number[]> {
  if (encoderLength === 0) return [];

  const tokens: number[] = [];
  const stateSize = session.stateDims.layers * session.stateDims.hidden;
  // Recurrent states start zeroed; lastToken starts at blank (standard
  // transducer convention for the initial prediction-network input).
  let state1 = new Float32Array(stateSize);
  let state2 = new Float32Array(stateSize);
  let lastToken = session.blankId;

  let t = 0;
  while (t < encoderLength) {
    let tokensThisStep = 0;

    // Inner loop: emit up to MAX_TOKENS_PER_STEP tokens for frame t.
    while (tokensThisStep < MAX_TOKENS_PER_STEP) {
      let frame: Float32Array;
      if (encoderData && encoderDim) {
        // Must copy — ort.Tensor doesn't work with subarray views under Bun
        frame = encoderData.slice(t * encoderDim, (t + 1) * encoderDim);
      } else {
        frame = new Float32Array(1);
      }

      const result = await session.decode(frame, [lastToken], 1, state1, state2);
      const output = result.output;

      // Joint output layout: token logits first, duration logits after.
      const tokenLogits = output.slice(0, session.vocabSize);
      const durationLogits = output.slice(session.vocabSize);

      const tokenId = argmax(tokenLogits);
      const duration = argmax(durationLogits);

      // Carry the updated recurrent states forward regardless of outcome.
      state1 = result.state1;
      state2 = result.state2;

      if (tokenId === session.blankId) {
        // Blank: no emission, advance exactly one frame.
        // NOTE(review): the predicted duration is ignored on blank — confirm
        // this matches the reference TDT decoding for this export.
        t += 1;
        break;
      }

      tokens.push(tokenId);
      lastToken = tokenId;
      tokensThisStep++;

      if (duration > 0) {
        // Token with positive duration: skip ahead that many frames.
        t += duration;
        break;
      }
    }

    // Emission cap reached without a blank/duration: force progress so the
    // outer loop terminates.
    if (tokensThisStep >= MAX_TOKENS_PER_STEP) {
      t += 1;
    }
  }

  return tokens;
}
|
|
81
|
+
|
|
82
|
+
function argmax(arr: Float32Array): number {
|
|
83
|
+
let maxIdx = 0;
|
|
84
|
+
let maxVal = arr[0];
|
|
85
|
+
for (let i = 1; i < arr.length; i++) {
|
|
86
|
+
if (arr[i] > maxVal) {
|
|
87
|
+
maxVal = arr[i];
|
|
88
|
+
maxIdx = i;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
return maxIdx;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// Lazily-created ONNX session for the combined decoder+joint network.
let onnxSession: ort.InferenceSession | null = null;

/**
 * Loads decoder_joint-model.onnx from `modelDir`.
 * Idempotent: later calls reuse the existing session.
 */
export async function initDecoder(modelDir: string): Promise<void> {
  if (onnxSession) return;
  ensureOrtBackend();
  onnxSession = await ort.InferenceSession.create(join(modelDir, "decoder_joint-model.onnx"));
}
|
|
101
|
+
|
|
102
|
+
/**
 * Wraps the module-level ONNX session in the DecoderSession interface used
 * by greedyDecode. initDecoder() must have been called before decode() runs.
 *
 * @param vocabSize number of token logits at the front of the joint output
 * @param blankId   token id treated as blank
 * @param layers    recurrent-state dimension 0 (layer count)
 * @param hidden    recurrent-state dimension 2 (hidden size)
 */
export function createOnnxDecoderSession(
  vocabSize: number,
  blankId: number,
  layers: number,
  hidden: number
): DecoderSession {
  return {
    vocabSize,
    blankId,
    stateDims: { layers, hidden },
    async decode(encoderFrame, targets, targetLength, state1, state2) {
      if (!onnxSession) throw new Error("decoder not initialized");

      const D = encoderFrame.length;
      // NOTE(review): frame fed as [1, D, 1] — presumably (batch, dim, time=1)
      // to match the exported graph's layout; confirm against the ONNX model.
      const results = await onnxSession.run({
        encoder_outputs: new ort.Tensor("float32", encoderFrame, [1, D, 1]),
        targets: new ort.Tensor("int32", Int32Array.from(targets), [1, targets.length]),
        target_length: new ort.Tensor("int32", Int32Array.from([targetLength]), [1]),
        input_states_1: new ort.Tensor("float32", state1, [layers, 1, hidden]),
        input_states_2: new ort.Tensor("float32", state2, [layers, 1, hidden]),
      });

      // Copy outputs into fresh arrays so the returned buffers are not tied
      // to any memory the runtime may reuse on the next run() call.
      return {
        output: new Float32Array(results["outputs"].data as Float32Array),
        state1: new Float32Array(results["output_states_1"].data as Float32Array),
        state2: new Float32Array(results["output_states_2"].data as Float32Array),
      };
    },
  };
}
|
|
132
|
+
|
|
133
|
+
/** Drops the decoder session reference so it can be garbage-collected;
 *  a later initDecoder() call re-creates it. */
export function releaseDecoder(): void {
  onnxSession = null;
}
|
package/src/encoder.ts
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import * as ort from "onnxruntime-node";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import { ensureOrtBackend } from "./ort-backend-fix";
|
|
4
|
+
|
|
5
|
+
// Lazily-created ONNX session for the acoustic encoder.
let session: ort.InferenceSession | null = null;

/**
 * Loads encoder-model.onnx from `modelDir` (its weights live in the
 * companion encoder-model.onnx.data file).
 * Idempotent: later calls reuse the existing session.
 */
export async function initEncoder(modelDir: string): Promise<void> {
  if (session) return;
  ensureOrtBackend();
  session = await ort.InferenceSession.create(join(modelDir, "encoder-model.onnx"));
}
|
|
12
|
+
|
|
13
|
+
/**
 * Runs the acoustic encoder on preprocessed mel features.
 *
 * @param features normalized log-mel tensor from preprocess()
 * @param length   int64 tensor holding the valid frame count
 * @returns the encoder output tensor and the number of encoded frames
 * @throws if initEncoder() has not been called
 */
export async function encode(
  features: ort.Tensor,
  length: ort.Tensor
): Promise<{ encoderOutput: ort.Tensor; encodedLength: number }> {
  if (!session) throw new Error("encoder not initialized");

  const results = await session.run({
    audio_signal: features,
    length: length,
  });

  const encoderOutput = results["outputs"];
  // encoded_lengths is int64; convert its single element to a JS number.
  const encodedLength = Number((results["encoded_lengths"].data as BigInt64Array)[0]);

  return { encoderOutput, encodedLength };
}
|
|
29
|
+
|
|
30
|
+
/** Drops the encoder session reference so it can be garbage-collected;
 *  a later initEncoder() call re-creates it. */
export function releaseEncoder(): void {
  session = null;
}
|
package/src/models.ts
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { join } from "path";
|
|
2
|
+
import { homedir } from "os";
|
|
3
|
+
import { existsSync, mkdirSync, renameSync } from "fs";
|
|
4
|
+
|
|
5
|
+
// Supported Parakeet TDT 0.6B model generations.
export type ModelVersion = "v2" | "v3";

// HuggingFace repos hosting the ONNX exports, keyed by model version.
export const HF_REPOS: Record<ModelVersion, string> = {
  v2: "istupakov/parakeet-tdt-0.6b-v2-onnx",
  v3: "istupakov/parakeet-tdt-0.6b-v3-onnx",
};

// Files every cached model directory must contain; isModelCached() checks
// for all of them and ensureModel() downloads each one.
export const MODEL_FILES = [
  "encoder-model.onnx",
  // External-weights companion file for the encoder graph.
  "encoder-model.onnx.data",
  "decoder_joint-model.onnx",
  // Mel-spectrogram preprocessor graph.
  "nemo128.onnx",
  "vocab.txt",
];
|
|
19
|
+
|
|
20
|
+
export function getModelDir(version: ModelVersion): string {
|
|
21
|
+
return join(homedir(), ".cache", "parakeet", version);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
export function isModelCached(version: ModelVersion): boolean {
|
|
25
|
+
const dir = getModelDir(version);
|
|
26
|
+
return MODEL_FILES.every((f) => existsSync(join(dir, f)));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export async function ensureModel(version: ModelVersion, noCache = false): Promise<string> {
|
|
30
|
+
const dir = getModelDir(version);
|
|
31
|
+
|
|
32
|
+
if (!noCache && isModelCached(version)) {
|
|
33
|
+
return dir;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
mkdirSync(dir, { recursive: true });
|
|
37
|
+
|
|
38
|
+
const repo = HF_REPOS[version];
|
|
39
|
+
|
|
40
|
+
for (const file of MODEL_FILES) {
|
|
41
|
+
const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
|
|
42
|
+
const dest = join(dir, file);
|
|
43
|
+
|
|
44
|
+
if (!noCache && existsSync(dest)) continue;
|
|
45
|
+
|
|
46
|
+
console.error(`Downloading ${file}...`);
|
|
47
|
+
|
|
48
|
+
const res = await fetch(url, { redirect: "follow" });
|
|
49
|
+
|
|
50
|
+
if (!res.ok) {
|
|
51
|
+
throw new Error(`failed to download model: ${url} (${res.status})`);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
await Bun.write(dest, res);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
return dir;
|
|
58
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Workaround for Bun + onnxruntime-node backend registration issue.
|
|
3
|
+
*
|
|
4
|
+
* When Bun imports onnxruntime-node (CJS), the backend gets registered
|
|
5
|
+
* in the CJS instance of onnxruntime-common. But our ESM code gets the
|
|
6
|
+
* ESM instance of onnxruntime-common, which has no backends registered.
|
|
7
|
+
*
|
|
8
|
+
* This module manually registers the native backend into the ESM module.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
let registered = false;
|
|
12
|
+
|
|
13
|
+
export function ensureOrtBackend(): void {
|
|
14
|
+
if (registered) return;
|
|
15
|
+
registered = true;
|
|
16
|
+
|
|
17
|
+
try {
|
|
18
|
+
// Force-load onnxruntime-node via require() to trigger CJS side-effects
|
|
19
|
+
// that register the native backend. Under bun test this happens
|
|
20
|
+
// automatically, but bun run may need the nudge.
|
|
21
|
+
require("onnxruntime-node");
|
|
22
|
+
} catch {
|
|
23
|
+
// If it fails, the native backend might already be registered
|
|
24
|
+
// (e.g. running under Node.js or a future Bun version that fixes this)
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import * as ort from "onnxruntime-node";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import { ensureOrtBackend } from "./ort-backend-fix";
|
|
4
|
+
|
|
5
|
+
// Lazily-created ONNX session for the mel-spectrogram preprocessor.
let session: ort.InferenceSession | null = null;

/**
 * Loads the nemo128 preprocessor graph from `modelDir`.
 * Idempotent: later calls reuse the existing session.
 */
export async function initPreprocessor(modelDir: string): Promise<void> {
  if (session) return;
  ensureOrtBackend();
  session = await ort.InferenceSession.create(join(modelDir, "nemo128.onnx"));
}
|
|
12
|
+
|
|
13
|
+
/**
 * Converts raw 16 kHz mono samples into the normalized log-mel feature
 * tensor the encoder expects.
 *
 * Runs the nemo128 ONNX preprocessor, then normalizes each mel bin to
 * mean 0 / std 1 over the valid frames only, zeroing the padded tail.
 *
 * @param audio 16 kHz mono PCM samples
 * @returns the feature tensor plus an int64 tensor of valid frame count
 * @throws if initPreprocessor() has not been called
 */
export async function preprocess(audio: Float32Array): Promise<{ features: ort.Tensor; length: ort.Tensor }> {
  if (!session) throw new Error("preprocessor not initialized");

  const inputTensor = new ort.Tensor("float32", audio, [1, audio.length]);
  const lengthTensor = new ort.Tensor("int64", BigInt64Array.from([BigInt(audio.length)]), [1]);

  const results = await session.run({
    waveforms: inputTensor,
    waveforms_lens: lengthTensor,
  });

  // Feature output indexed as [batch, feature, time]; T is the padded time axis.
  const melData = results["features"].data as Float32Array;
  const melDims = results["features"].dims as readonly number[];
  const T = melDims[2];
  // Number of frames containing real (non-padding) data.
  const actualLength = Number((results["features_lens"].data as BigInt64Array)[0]);

  const numFeatures = melDims[1];
  const normalized = new Float32Array(melData.length);

  // Per-feature (per mel bin) mean/variance normalization.
  for (let f = 0; f < numFeatures; f++) {
    let sum = 0;
    let sumSq = 0;

    // Statistics over valid frames only, so padding doesn't skew them.
    for (let t = 0; t < actualLength; t++) {
      const val = melData[f * T + t];
      sum += val;
      sumSq += val * val;
    }

    const mean = sum / actualLength;
    const variance = sumSq / actualLength - mean * mean;
    // Floor the variance to avoid dividing by ~0 on constant (silent) bins.
    const std = Math.sqrt(Math.max(variance, 1e-10));

    // Normalize the valid frames; zero the padded tail.
    for (let t = 0; t < T; t++) {
      normalized[f * T + t] = t < actualLength ? (melData[f * T + t] - mean) / std : 0;
    }
  }

  const featureTensor = new ort.Tensor("float32", normalized, melDims as number[]);
  const outputLength = new ort.Tensor("int64", BigInt64Array.from([BigInt(actualLength)]), [1]);

  return { features: featureTensor, length: outputLength };
}
|
|
56
|
+
|
|
57
|
+
/** Drops the preprocessor session reference so it can be garbage-collected;
 *  a later initPreprocessor() call re-creates it. */
export function releasePreprocessor(): void {
  session = null;
}
|
package/src/tokenizer.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
export class Tokenizer {
|
|
2
|
+
private idToToken: Map<number, string>;
|
|
3
|
+
readonly vocabSize: number;
|
|
4
|
+
readonly blankId: number;
|
|
5
|
+
|
|
6
|
+
private constructor(idToToken: Map<number, string>, blankId: number) {
|
|
7
|
+
this.idToToken = idToToken;
|
|
8
|
+
this.vocabSize = idToToken.size;
|
|
9
|
+
this.blankId = blankId;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
static async fromFile(path: string): Promise<Tokenizer> {
|
|
13
|
+
const content = await Bun.file(path).text();
|
|
14
|
+
const idToToken = new Map<number, string>();
|
|
15
|
+
let blankId = -1;
|
|
16
|
+
|
|
17
|
+
for (const line of content.trim().split("\n")) {
|
|
18
|
+
const lastSpace = line.lastIndexOf(" ");
|
|
19
|
+
if (lastSpace === -1) continue;
|
|
20
|
+
const token = line.slice(0, lastSpace);
|
|
21
|
+
const id = parseInt(line.slice(lastSpace + 1), 10);
|
|
22
|
+
if (isNaN(id)) continue;
|
|
23
|
+
idToToken.set(id, token);
|
|
24
|
+
if (token === "<blk>") blankId = id;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
if (blankId === -1) {
|
|
28
|
+
blankId = idToToken.size - 1;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
return new Tokenizer(idToToken, blankId);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
detokenize(tokenIds: number[]): string {
|
|
35
|
+
const pieces: string[] = [];
|
|
36
|
+
for (const id of tokenIds) {
|
|
37
|
+
if (id === this.blankId) continue;
|
|
38
|
+
const token = this.idToToken.get(id);
|
|
39
|
+
if (token !== undefined) pieces.push(token);
|
|
40
|
+
}
|
|
41
|
+
return pieces.join("").replaceAll("\u2581", " ").trim();
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
isAsciiDominant(tokenIds: number[], threshold = 0.9): boolean {
|
|
45
|
+
const nonBlank = tokenIds.filter((id) => id !== this.blankId);
|
|
46
|
+
if (nonBlank.length === 0) return false;
|
|
47
|
+
|
|
48
|
+
let asciiCount = 0;
|
|
49
|
+
for (const id of nonBlank) {
|
|
50
|
+
const token = this.idToToken.get(id) ?? "";
|
|
51
|
+
const cleaned = token.replaceAll("\u2581", "");
|
|
52
|
+
if (cleaned.length > 0 && /^[\x00-\x7F]+$/.test(cleaned)) {
|
|
53
|
+
asciiCount++;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
return asciiCount / nonBlank.length >= threshold;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { ensureModel } from "./models";
|
|
2
|
+
import { convertToFloat32PCM } from "./audio";
|
|
3
|
+
import { initPreprocessor, preprocess } from "./preprocess";
|
|
4
|
+
import { initEncoder, encode } from "./encoder";
|
|
5
|
+
import {
|
|
6
|
+
initDecoder,
|
|
7
|
+
createOnnxDecoderSession,
|
|
8
|
+
greedyDecode,
|
|
9
|
+
} from "./decoder";
|
|
10
|
+
import { Tokenizer } from "./tokenizer";
|
|
11
|
+
import { join } from "path";
|
|
12
|
+
|
|
13
|
+
// Parakeet TDT 0.6B decoder state dimensions (from ONNX model input shapes).
// NOTE(review): presumably the decoder state tensor is shaped around
// [layers, batch, hidden] — confirm against the ONNX model's input metadata.
const DECODER_LAYERS = 2;
const DECODER_HIDDEN = 640;

/** Options accepted by {@link transcribe}. */
export interface TranscribeOptions {
  // Forwarded as the second argument of ensureModel(); defaults to false.
  noCache?: boolean;
}
|
|
20
|
+
|
|
21
|
+
export async function transcribe(audioPath: string, opts: TranscribeOptions = {}): Promise<string> {
|
|
22
|
+
const audio = await convertToFloat32PCM(audioPath);
|
|
23
|
+
|
|
24
|
+
if (audio.length < 1600) {
|
|
25
|
+
return "";
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const noCache = opts.noCache ?? false;
|
|
29
|
+
const modelDir = await ensureModel("v3", noCache);
|
|
30
|
+
const tokenizer = await Tokenizer.fromFile(join(modelDir, "vocab.txt"));
|
|
31
|
+
|
|
32
|
+
await initPreprocessor(modelDir);
|
|
33
|
+
await initEncoder(modelDir);
|
|
34
|
+
await initDecoder(modelDir);
|
|
35
|
+
|
|
36
|
+
const { features, length } = await preprocess(audio);
|
|
37
|
+
const { encoderOutput, encodedLength } = await encode(features, length);
|
|
38
|
+
|
|
39
|
+
const encoderData = encoderOutput.data as Float32Array;
|
|
40
|
+
const dims = encoderOutput.dims as readonly number[];
|
|
41
|
+
const D = dims[1];
|
|
42
|
+
const T = dims[2];
|
|
43
|
+
|
|
44
|
+
// Transpose from [1, D, T] to [T, D] so each frame is contiguous
|
|
45
|
+
const transposed = new Float32Array(T * D);
|
|
46
|
+
for (let t = 0; t < T; t++) {
|
|
47
|
+
for (let d = 0; d < D; d++) {
|
|
48
|
+
transposed[t * D + d] = encoderData[d * T + t];
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const session = createOnnxDecoderSession(
|
|
53
|
+
tokenizer.vocabSize,
|
|
54
|
+
tokenizer.blankId,
|
|
55
|
+
DECODER_LAYERS,
|
|
56
|
+
DECODER_HIDDEN,
|
|
57
|
+
);
|
|
58
|
+
|
|
59
|
+
const tokens = await greedyDecode(session, encodedLength, transposed, D);
|
|
60
|
+
return tokenizer.detokenize(tokens);
|
|
61
|
+
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ESNext",
|
|
4
|
+
"module": "ESNext",
|
|
5
|
+
"moduleResolution": "bundler",
|
|
6
|
+
"types": ["bun-types"],
|
|
7
|
+
"strict": true,
|
|
8
|
+
"skipLibCheck": true,
|
|
9
|
+
"outDir": "./dist",
|
|
10
|
+
"rootDir": "./src"
|
|
11
|
+
},
|
|
12
|
+
"include": ["src/**/*.ts", "tests/**/*.ts"]
|
|
13
|
+
}
|