@drakulavich/parakeet-cli 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -3
- package/package.json +3 -2
- package/src/__tests__/audio.test.ts +4 -2
- package/src/__tests__/models.test.ts +5 -11
- package/src/__tests__/tokenizer.test.ts +0 -4
- package/src/audio.ts +1 -0
- package/src/cli.ts +3 -2
- package/src/decoder.ts +9 -10
- package/src/encoder.ts +0 -4
- package/src/models.ts +9 -16
- package/src/preprocess.ts +3 -5
- package/src/tokenizer.ts +0 -16
- package/src/transcribe.ts +16 -9
- package/tsconfig.json +1 -1
package/README.md
CHANGED
|
@@ -12,10 +12,18 @@ Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models. Zer
|
|
|
12
12
|
|
|
13
13
|
## Install
|
|
14
14
|
|
|
15
|
+
Using Bun (recommended):
|
|
16
|
+
|
|
15
17
|
```bash
|
|
16
18
|
bun install -g @drakulavich/parakeet-cli
|
|
17
19
|
```
|
|
18
20
|
|
|
21
|
+
Using npm (requires Bun runtime installed):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npm install -g @drakulavich/parakeet-cli
|
|
25
|
+
```
|
|
26
|
+
|
|
19
27
|
Or clone and link locally:
|
|
20
28
|
|
|
21
29
|
```bash
|
|
@@ -25,6 +33,8 @@ bun install
|
|
|
25
33
|
bun link
|
|
26
34
|
```
|
|
27
35
|
|
|
36
|
+
> **Note:** Bun is required as the runtime — the CLI uses Bun-native APIs and TypeScript execution. You can use either `bun` or `npm` as the package manager to install it, but Bun must be available in PATH to run the `parakeet` command.
|
|
37
|
+
|
|
28
38
|
## Usage
|
|
29
39
|
|
|
30
40
|
```bash
|
|
@@ -86,9 +96,10 @@ Uses [NVIDIA Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.
|
|
|
86
96
|
|
|
87
97
|
## Requirements
|
|
88
98
|
|
|
89
|
-
- [Bun](https://bun.sh) >= 1.3
|
|
99
|
+
- [Bun](https://bun.sh) >= 1.3 (runtime)
|
|
90
100
|
- [ffmpeg](https://ffmpeg.org) installed and in PATH
|
|
91
101
|
- ~3GB disk space for model cache
|
|
102
|
+
- npm or Bun can be used as the package manager
|
|
92
103
|
|
|
93
104
|
### macOS (Apple Silicon)
|
|
94
105
|
|
|
@@ -97,7 +108,7 @@ Works natively on M1/M2/M3/M4. Install dependencies with Homebrew:
|
|
|
97
108
|
```bash
|
|
98
109
|
brew install ffmpeg
|
|
99
110
|
curl -fsSL https://bun.sh/install | bash
|
|
100
|
-
bun install -g @drakulavich/parakeet-cli
|
|
111
|
+
bun install -g @drakulavich/parakeet-cli # or: npm install -g @drakulavich/parakeet-cli
|
|
101
112
|
```
|
|
102
113
|
|
|
103
114
|
### Linux
|
|
@@ -105,7 +116,7 @@ bun install -g @drakulavich/parakeet-cli
|
|
|
105
116
|
```bash
|
|
106
117
|
apt install ffmpeg # or yum, pacman, etc.
|
|
107
118
|
curl -fsSL https://bun.sh/install | bash
|
|
108
|
-
bun install -g @drakulavich/parakeet-cli
|
|
119
|
+
bun install -g @drakulavich/parakeet-cli # or: npm install -g @drakulavich/parakeet-cli
|
|
109
120
|
```
|
|
110
121
|
|
|
111
122
|
## OpenClaw Integration
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@drakulavich/parakeet-cli",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -44,7 +44,8 @@
|
|
|
44
44
|
"bun": ">=1.3.0"
|
|
45
45
|
},
|
|
46
46
|
"devDependencies": {
|
|
47
|
-
"@types/bun": "latest"
|
|
47
|
+
"@types/bun": "latest",
|
|
48
|
+
"typescript": "^6.0.2"
|
|
48
49
|
},
|
|
49
50
|
"dependencies": {
|
|
50
51
|
"onnxruntime-node": "^1.24.0"
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import { describe, test, expect } from "bun:test";
|
|
2
2
|
import { convertToFloat32PCM } from "../audio";
|
|
3
|
-
import {
|
|
3
|
+
import { spawnSync } from "child_process";
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
const hasFfmpeg = spawnSync("which", ["ffmpeg"]).status === 0;
|
|
6
|
+
|
|
7
|
+
describe.skipIf(!hasFfmpeg)("audio", () => {
|
|
6
8
|
test("converts WAV to 16kHz mono Float32Array", async () => {
|
|
7
9
|
const buffer = await convertToFloat32PCM("fixtures/silence.wav");
|
|
8
10
|
expect(buffer).toBeInstanceOf(Float32Array);
|
|
@@ -1,16 +1,11 @@
|
|
|
1
1
|
import { describe, test, expect } from "bun:test";
|
|
2
|
-
import { getModelDir, MODEL_FILES, HF_REPOS } from "../models";
|
|
2
|
+
import { getModelDir, MODEL_FILES, HF_REPO } from "../models";
|
|
3
3
|
import { join } from "path";
|
|
4
4
|
import { homedir } from "os";
|
|
5
5
|
|
|
6
6
|
describe("models", () => {
|
|
7
|
-
test("getModelDir returns correct cache path for v2", () => {
|
|
8
|
-
const dir = getModelDir("v2");
|
|
9
|
-
expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v2"));
|
|
10
|
-
});
|
|
11
|
-
|
|
12
|
-
test("getModelDir returns correct cache path for v3", () => {
|
|
13
|
-
const dir = getModelDir("v3");
|
|
7
|
+
test("getModelDir returns correct cache path", () => {
|
|
8
|
+
const dir = getModelDir();
|
|
14
9
|
expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v3"));
|
|
15
10
|
});
|
|
16
11
|
|
|
@@ -22,8 +17,7 @@ describe("models", () => {
|
|
|
22
17
|
expect(MODEL_FILES).toContain("vocab.txt");
|
|
23
18
|
});
|
|
24
19
|
|
|
25
|
-
test("
|
|
26
|
-
expect(
|
|
27
|
-
expect(HF_REPOS.v3).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
|
|
20
|
+
test("HF_REPO points to v3 ONNX repo", () => {
|
|
21
|
+
expect(HF_REPO).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
|
|
28
22
|
});
|
|
29
23
|
});
|
|
@@ -38,8 +38,4 @@ describe("tokenizer", () => {
|
|
|
38
38
|
expect(text).toBe("cats");
|
|
39
39
|
});
|
|
40
40
|
|
|
41
|
-
test("isAsciiDominant returns true for ASCII tokens", async () => {
|
|
42
|
-
const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
|
|
43
|
-
expect(tok.isAsciiDominant([0, 1, 2])).toBe(true);
|
|
44
|
-
});
|
|
45
41
|
});
|
package/src/audio.ts
CHANGED
|
@@ -30,6 +30,7 @@ export async function convertToFloat32PCM(inputPath: string): Promise<Float32Arr
|
|
|
30
30
|
const raw = await Bun.file(tmpPath).arrayBuffer();
|
|
31
31
|
return new Float32Array(raw);
|
|
32
32
|
} finally {
|
|
33
|
+
// Best-effort cleanup; file may already be gone
|
|
33
34
|
try { unlinkSync(tmpPath); } catch {}
|
|
34
35
|
}
|
|
35
36
|
}
|
package/src/cli.ts
CHANGED
|
@@ -28,8 +28,9 @@ async function main(): Promise<void> {
|
|
|
28
28
|
try {
|
|
29
29
|
const text = await transcribe(file, { noCache });
|
|
30
30
|
if (text) process.stdout.write(text + "\n");
|
|
31
|
-
} catch (err:
|
|
32
|
-
|
|
31
|
+
} catch (err: unknown) {
|
|
32
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
33
|
+
console.error(`Error: ${message}`);
|
|
33
34
|
process.exit(1);
|
|
34
35
|
}
|
|
35
36
|
}
|
package/src/decoder.ts
CHANGED
|
@@ -2,16 +2,19 @@ import * as ort from "onnxruntime-node";
|
|
|
2
2
|
import { join } from "path";
|
|
3
3
|
import { ensureOrtBackend } from "./ort-backend-fix";
|
|
4
4
|
|
|
5
|
+
// TDT allows multiple tokens per encoder frame; cap to prevent runaway decoding
|
|
5
6
|
const MAX_TOKENS_PER_STEP = 10;
|
|
6
7
|
|
|
8
|
+
type F32 = Float32Array<ArrayBufferLike>;
|
|
9
|
+
|
|
7
10
|
export interface DecoderSession {
|
|
8
11
|
decode(
|
|
9
|
-
encoderFrame:
|
|
12
|
+
encoderFrame: F32,
|
|
10
13
|
targets: number[],
|
|
11
14
|
targetLength: number,
|
|
12
|
-
state1:
|
|
13
|
-
state2:
|
|
14
|
-
): Promise<{ output:
|
|
15
|
+
state1: F32,
|
|
16
|
+
state2: F32
|
|
17
|
+
): Promise<{ output: F32; state1: F32; state2: F32 }>;
|
|
15
18
|
vocabSize: number;
|
|
16
19
|
blankId: number;
|
|
17
20
|
stateDims: { layers: number; hidden: number };
|
|
@@ -27,8 +30,8 @@ export async function greedyDecode(
|
|
|
27
30
|
|
|
28
31
|
const tokens: number[] = [];
|
|
29
32
|
const stateSize = session.stateDims.layers * session.stateDims.hidden;
|
|
30
|
-
let state1 = new Float32Array(stateSize);
|
|
31
|
-
let state2 = new Float32Array(stateSize);
|
|
33
|
+
let state1: F32 = new Float32Array(stateSize);
|
|
34
|
+
let state2: F32 = new Float32Array(stateSize);
|
|
32
35
|
let lastToken = session.blankId;
|
|
33
36
|
|
|
34
37
|
let t = 0;
|
|
@@ -129,7 +132,3 @@ export function createOnnxDecoderSession(
|
|
|
129
132
|
},
|
|
130
133
|
};
|
|
131
134
|
}
|
|
132
|
-
|
|
133
|
-
export function releaseDecoder(): void {
|
|
134
|
-
onnxSession = null;
|
|
135
|
-
}
|
package/src/encoder.ts
CHANGED
package/src/models.ts
CHANGED
|
@@ -2,12 +2,7 @@ import { join } from "path";
|
|
|
2
2
|
import { homedir } from "os";
|
|
3
3
|
import { existsSync, mkdirSync } from "fs";
|
|
4
4
|
|
|
5
|
-
export type ModelVersion = "v2" | "v3";
|
|
6
|
-
|
|
7
|
-
export const HF_REPOS: Record<ModelVersion, string> = {
|
|
8
|
-
v2: "istupakov/parakeet-tdt-0.6b-v2-onnx",
|
|
9
|
-
v3: "istupakov/parakeet-tdt-0.6b-v3-onnx",
|
|
10
|
-
};
|
|
5
|
+
export const HF_REPO = "istupakov/parakeet-tdt-0.6b-v3-onnx";
|
|
11
6
|
|
|
12
7
|
export const MODEL_FILES = [
|
|
13
8
|
"encoder-model.onnx",
|
|
@@ -17,28 +12,26 @@ export const MODEL_FILES = [
|
|
|
17
12
|
"vocab.txt",
|
|
18
13
|
];
|
|
19
14
|
|
|
20
|
-
export function getModelDir(
|
|
21
|
-
return join(homedir(), ".cache", "parakeet",
|
|
15
|
+
export function getModelDir(): string {
|
|
16
|
+
return join(homedir(), ".cache", "parakeet", "v3");
|
|
22
17
|
}
|
|
23
18
|
|
|
24
|
-
export function isModelCached(
|
|
25
|
-
const dir = getModelDir(
|
|
19
|
+
export function isModelCached(): boolean {
|
|
20
|
+
const dir = getModelDir();
|
|
26
21
|
return MODEL_FILES.every((f) => existsSync(join(dir, f)));
|
|
27
22
|
}
|
|
28
23
|
|
|
29
|
-
export async function ensureModel(
|
|
30
|
-
const dir = getModelDir(
|
|
24
|
+
export async function ensureModel(noCache = false): Promise<string> {
|
|
25
|
+
const dir = getModelDir();
|
|
31
26
|
|
|
32
|
-
if (!noCache && isModelCached(
|
|
27
|
+
if (!noCache && isModelCached()) {
|
|
33
28
|
return dir;
|
|
34
29
|
}
|
|
35
30
|
|
|
36
31
|
mkdirSync(dir, { recursive: true });
|
|
37
32
|
|
|
38
|
-
const repo = HF_REPOS[version];
|
|
39
|
-
|
|
40
33
|
for (const file of MODEL_FILES) {
|
|
41
|
-
const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
|
|
34
|
+
const url = `https://huggingface.co/${HF_REPO}/resolve/main/${file}`;
|
|
42
35
|
const dest = join(dir, file);
|
|
43
36
|
|
|
44
37
|
if (!noCache && existsSync(dest)) continue;
|
package/src/preprocess.ts
CHANGED
|
@@ -2,6 +2,8 @@ import * as ort from "onnxruntime-node";
|
|
|
2
2
|
import { join } from "path";
|
|
3
3
|
import { ensureOrtBackend } from "./ort-backend-fix";
|
|
4
4
|
|
|
5
|
+
const NORM_EPSILON = 1e-10;
|
|
6
|
+
|
|
5
7
|
let session: ort.InferenceSession | null = null;
|
|
6
8
|
|
|
7
9
|
export async function initPreprocessor(modelDir: string): Promise<void> {
|
|
@@ -41,7 +43,7 @@ export async function preprocess(audio: Float32Array): Promise<{ features: ort.T
|
|
|
41
43
|
|
|
42
44
|
const mean = sum / actualLength;
|
|
43
45
|
const variance = sumSq / actualLength - mean * mean;
|
|
44
|
-
const std = Math.sqrt(Math.max(variance, 1e-10));
|
|
46
|
+
const std = Math.sqrt(Math.max(variance, NORM_EPSILON));
|
|
45
47
|
|
|
46
48
|
for (let t = 0; t < T; t++) {
|
|
47
49
|
normalized[f * T + t] = t < actualLength ? (melData[f * T + t] - mean) / std : 0;
|
|
@@ -53,7 +55,3 @@ export async function preprocess(audio: Float32Array): Promise<{ features: ort.T
|
|
|
53
55
|
|
|
54
56
|
return { features: featureTensor, length: outputLength };
|
|
55
57
|
}
|
|
56
|
-
|
|
57
|
-
export function releasePreprocessor(): void {
|
|
58
|
-
session = null;
|
|
59
|
-
}
|
package/src/tokenizer.ts
CHANGED
|
@@ -40,20 +40,4 @@ export class Tokenizer {
|
|
|
40
40
|
}
|
|
41
41
|
return pieces.join("").replaceAll("\u2581", " ").trim();
|
|
42
42
|
}
|
|
43
|
-
|
|
44
|
-
isAsciiDominant(tokenIds: number[], threshold = 0.9): boolean {
|
|
45
|
-
const nonBlank = tokenIds.filter((id) => id !== this.blankId);
|
|
46
|
-
if (nonBlank.length === 0) return false;
|
|
47
|
-
|
|
48
|
-
let asciiCount = 0;
|
|
49
|
-
for (const id of nonBlank) {
|
|
50
|
-
const token = this.idToToken.get(id) ?? "";
|
|
51
|
-
const cleaned = token.replaceAll("\u2581", "");
|
|
52
|
-
if (cleaned.length > 0 && /^[\x00-\x7F]+$/.test(cleaned)) {
|
|
53
|
-
asciiCount++;
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
return asciiCount / nonBlank.length >= threshold;
|
|
58
|
-
}
|
|
59
43
|
}
|
package/src/transcribe.ts
CHANGED
|
@@ -10,6 +10,16 @@ import {
|
|
|
10
10
|
import { Tokenizer } from "./tokenizer";
|
|
11
11
|
import { join } from "path";
|
|
12
12
|
|
|
13
|
+
function transpose2D(data: Float32Array, rows: number, cols: number): Float32Array {
|
|
14
|
+
const out = new Float32Array(cols * rows);
|
|
15
|
+
for (let c = 0; c < cols; c++) {
|
|
16
|
+
for (let r = 0; r < rows; r++) {
|
|
17
|
+
out[c * rows + r] = data[r * cols + c];
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
return out;
|
|
21
|
+
}
|
|
22
|
+
|
|
13
23
|
// Parakeet TDT 0.6B decoder state dimensions (from ONNX model input shapes)
|
|
14
24
|
const DECODER_LAYERS = 2;
|
|
15
25
|
const DECODER_HIDDEN = 640;
|
|
@@ -18,15 +28,18 @@ export interface TranscribeOptions {
|
|
|
18
28
|
noCache?: boolean;
|
|
19
29
|
}
|
|
20
30
|
|
|
31
|
+
// Minimum 0.1s of audio at 16kHz to produce meaningful output
|
|
32
|
+
const MIN_AUDIO_SAMPLES = 1600;
|
|
33
|
+
|
|
21
34
|
export async function transcribe(audioPath: string, opts: TranscribeOptions = {}): Promise<string> {
|
|
22
35
|
const audio = await convertToFloat32PCM(audioPath);
|
|
23
36
|
|
|
24
|
-
if (audio.length < 1600) {
|
|
37
|
+
if (audio.length < MIN_AUDIO_SAMPLES) {
|
|
25
38
|
return "";
|
|
26
39
|
}
|
|
27
40
|
|
|
28
41
|
const noCache = opts.noCache ?? false;
|
|
29
|
-
const modelDir = await ensureModel(
|
|
42
|
+
const modelDir = await ensureModel(noCache);
|
|
30
43
|
const tokenizer = await Tokenizer.fromFile(join(modelDir, "vocab.txt"));
|
|
31
44
|
|
|
32
45
|
await initPreprocessor(modelDir);
|
|
@@ -41,13 +54,7 @@ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}
|
|
|
41
54
|
const D = dims[1];
|
|
42
55
|
const T = dims[2];
|
|
43
56
|
|
|
44
|
-
|
|
45
|
-
const transposed = new Float32Array(T * D);
|
|
46
|
-
for (let t = 0; t < T; t++) {
|
|
47
|
-
for (let d = 0; d < D; d++) {
|
|
48
|
-
transposed[t * D + d] = encoderData[d * T + t];
|
|
49
|
-
}
|
|
50
|
-
}
|
|
57
|
+
const transposed = transpose2D(encoderData, D, T);
|
|
51
58
|
|
|
52
59
|
const session = createOnnxDecoderSession(
|
|
53
60
|
tokenizer.vocabSize,
|