@drakulavich/parakeet-cli 0.1.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/src/__tests__/decoder.test.ts +6 -13
- package/src/__tests__/lib.test.ts +12 -0
- package/src/cli.ts +1 -7
- package/src/decoder.ts +69 -42
- package/src/lib.ts +23 -0
- package/src/models.ts +5 -5
- package/src/transcribe.ts +6 -3
package/package.json
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@drakulavich/parakeet-cli",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
7
|
"parakeet": "bin/parakeet.js"
|
|
8
8
|
},
|
|
9
|
+
"exports": {
|
|
10
|
+
".": "./src/cli.ts",
|
|
11
|
+
"./core": "./src/lib.ts"
|
|
12
|
+
},
|
|
9
13
|
"files": [
|
|
10
14
|
"bin/",
|
|
11
15
|
"src/",
|
package/src/__tests__/decoder.test.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, test, expect } from "bun:test";
|
|
2
|
-
import { greedyDecode, type DecoderSession } from "../decoder";
|
|
2
|
+
import { beamDecode, type DecoderSession } from "../decoder";
|
|
3
3
|
|
|
4
4
|
function mockSession(responses: Array<{ tokenLogits: number[]; durationLogits: number[] }>): DecoderSession {
|
|
5
5
|
let callIndex = 0;
|
|
@@ -24,7 +24,8 @@ describe("decoder", () => {
|
|
|
24
24
|
{ tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0] },
|
|
25
25
|
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
|
|
26
26
|
]);
|
|
27
|
-
const tokens = await greedyDecode(session, 3);
|
|
27
|
+
const encoderData = new Float32Array(3);
|
|
28
|
+
const tokens = await beamDecode(session, 3, encoderData, 1, 1);
|
|
28
29
|
expect(tokens).toEqual([0, 1]);
|
|
29
30
|
});
|
|
30
31
|
|
|
@@ -34,24 +35,16 @@ describe("decoder", () => {
|
|
|
34
35
|
{ tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0, 0] },
|
|
35
36
|
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0, 0] },
|
|
36
37
|
]);
|
|
37
|
-
const tokens = await greedyDecode(session, 5);
|
|
38
|
+
const encoderData = new Float32Array(5);
|
|
39
|
+
const tokens = await beamDecode(session, 5, encoderData, 1, 1);
|
|
38
40
|
expect(tokens).toEqual([0, 1]);
|
|
39
41
|
});
|
|
40
42
|
|
|
41
|
-
test("handles max_tokens_per_step limit", async () => {
|
|
42
|
-
const session = mockSession([
|
|
43
|
-
{ tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
|
|
44
|
-
]);
|
|
45
|
-
const tokens = await greedyDecode(session, 2);
|
|
46
|
-
expect(tokens.length).toBeLessThanOrEqual(20);
|
|
47
|
-
expect(tokens.length).toBeGreaterThan(0);
|
|
48
|
-
});
|
|
49
|
-
|
|
50
43
|
test("returns empty for zero-length encoder output", async () => {
|
|
51
44
|
const session = mockSession([
|
|
52
45
|
{ tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
|
|
53
46
|
]);
|
|
54
|
-
const tokens = await greedyDecode(session, 0);
|
|
47
|
+
const tokens = await beamDecode(session, 0, new Float32Array(0), 1);
|
|
55
48
|
expect(tokens).toEqual([]);
|
|
56
49
|
});
|
|
57
50
|
});
|
package/src/__tests__/lib.test.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { describe, expect, it } from "bun:test";
|
|
2
|
+
import { transcribe } from "../lib";
|
|
3
|
+
|
|
4
|
+
describe("lib API", () => {
|
|
5
|
+
it("exports transcribe function", () => {
|
|
6
|
+
expect(typeof transcribe).toBe("function");
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
it("rejects missing file", async () => {
|
|
10
|
+
await expect(transcribe("/nonexistent/audio.wav")).rejects.toThrow("File not found");
|
|
11
|
+
});
|
|
12
|
+
});
|
package/src/cli.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
|
|
3
|
-
import { existsSync } from "fs";
|
|
4
|
-
import { transcribe } from "./transcribe";
|
|
3
|
+
import { transcribe } from "./lib";
|
|
5
4
|
|
|
6
5
|
async function main(): Promise<void> {
|
|
7
6
|
const args = process.argv.slice(2);
|
|
@@ -20,11 +19,6 @@ async function main(): Promise<void> {
|
|
|
20
19
|
process.exit(1);
|
|
21
20
|
}
|
|
22
21
|
|
|
23
|
-
if (!existsSync(file)) {
|
|
24
|
-
console.error(`Error: file not found: ${file}`);
|
|
25
|
-
process.exit(1);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
22
|
try {
|
|
29
23
|
const text = await transcribe(file, { noCache });
|
|
30
24
|
if (text) process.stdout.write(text + "\n");
|
package/src/decoder.ts
CHANGED
|
@@ -20,69 +20,87 @@ export interface DecoderSession {
|
|
|
20
20
|
stateDims: { layers: number; hidden: number };
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
const DEFAULT_BEAM_WIDTH = 4;
|
|
24
|
+
|
|
25
|
+
interface Beam {
|
|
26
|
+
tokens: number[];
|
|
27
|
+
score: number;
|
|
28
|
+
lastToken: number;
|
|
29
|
+
state1: F32;
|
|
30
|
+
state2: F32;
|
|
31
|
+
t: number;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
export async function beamDecode(
|
|
24
35
|
session: DecoderSession,
|
|
25
36
|
encoderLength: number,
|
|
26
|
-
encoderData?: Float32Array,
|
|
27
|
-
encoderDim?: number,
|
|
37
|
+
encoderData: Float32Array,
|
|
38
|
+
encoderDim: number,
|
|
39
|
+
beamWidth: number = DEFAULT_BEAM_WIDTH,
|
|
28
40
|
): Promise<number[]> {
|
|
29
41
|
if (encoderLength === 0) return [];
|
|
30
42
|
|
|
31
|
-
const tokens: number[] = [];
|
|
32
43
|
const stateSize = session.stateDims.layers * session.stateDims.hidden;
|
|
33
|
-
let state1: F32 = new Float32Array(stateSize);
|
|
34
|
-
let state2: F32 = new Float32Array(stateSize);
|
|
35
|
-
let lastToken = session.blankId;
|
|
36
|
-
|
|
37
|
-
let t = 0;
|
|
38
|
-
while (t < encoderLength) {
|
|
39
|
-
let tokensThisStep = 0;
|
|
40
|
-
|
|
41
|
-
while (tokensThisStep < MAX_TOKENS_PER_STEP) {
|
|
42
|
-
let frame: Float32Array;
|
|
43
|
-
if (encoderData && encoderDim) {
|
|
44
|
-
// Must copy — ort.Tensor doesn't work with subarray views under Bun
|
|
45
|
-
frame = encoderData.slice(t * encoderDim, (t + 1) * encoderDim);
|
|
46
|
-
} else {
|
|
47
|
-
frame = new Float32Array(1);
|
|
48
|
-
}
|
|
49
44
|
|
|
50
|
-
|
|
45
|
+
let beams: Beam[] = [{
|
|
46
|
+
tokens: [],
|
|
47
|
+
score: 0,
|
|
48
|
+
lastToken: session.blankId,
|
|
49
|
+
state1: new Float32Array(stateSize),
|
|
50
|
+
state2: new Float32Array(stateSize),
|
|
51
|
+
t: 0,
|
|
52
|
+
}];
|
|
53
|
+
|
|
54
|
+
const maxSteps = encoderLength * MAX_TOKENS_PER_STEP;
|
|
55
|
+
|
|
56
|
+
for (let step = 0; step < maxSteps; step++) {
|
|
57
|
+
const active = beams.filter(b => b.t < encoderLength);
|
|
58
|
+
if (active.length === 0) break;
|
|
59
|
+
|
|
60
|
+
const candidates: Beam[] = [];
|
|
61
|
+
|
|
62
|
+
for (const beam of active) {
|
|
63
|
+
// Must copy — ort.Tensor doesn't work with subarray views under Bun
|
|
64
|
+
const frame = encoderData.slice(beam.t * encoderDim, (beam.t + 1) * encoderDim);
|
|
65
|
+
const result = await session.decode(frame, [beam.lastToken], 1, beam.state1, beam.state2);
|
|
51
66
|
const output = result.output;
|
|
52
67
|
|
|
53
68
|
const tokenLogits = output.slice(0, session.vocabSize);
|
|
54
69
|
const durationLogits = output.slice(session.vocabSize);
|
|
55
|
-
|
|
56
|
-
const tokenId = argmax(tokenLogits);
|
|
57
70
|
const duration = argmax(durationLogits);
|
|
58
71
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
lastToken = tokenId;
|
|
69
|
-
tokensThisStep++;
|
|
72
|
+
// Blank option: advance one frame, keep same tokens
|
|
73
|
+
candidates.push({
|
|
74
|
+
tokens: beam.tokens,
|
|
75
|
+
score: beam.score + tokenLogits[session.blankId],
|
|
76
|
+
lastToken: beam.lastToken,
|
|
77
|
+
state1: result.state1,
|
|
78
|
+
state2: result.state2,
|
|
79
|
+
t: beam.t + 1,
|
|
80
|
+
});
|
|
70
81
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
82
|
+
// Top non-blank token options
|
|
83
|
+
const topK = topKIndices(tokenLogits, beamWidth, session.blankId);
|
|
84
|
+
for (const tokenId of topK) {
|
|
85
|
+
candidates.push({
|
|
86
|
+
tokens: [...beam.tokens, tokenId],
|
|
87
|
+
score: beam.score + tokenLogits[tokenId],
|
|
88
|
+
lastToken: tokenId,
|
|
89
|
+
state1: result.state1,
|
|
90
|
+
state2: result.state2,
|
|
91
|
+
t: duration > 0 ? beam.t + duration : beam.t,
|
|
92
|
+
});
|
|
74
93
|
}
|
|
75
94
|
}
|
|
76
95
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
}
|
|
96
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
97
|
+
beams = candidates.slice(0, beamWidth);
|
|
80
98
|
}
|
|
81
99
|
|
|
82
|
-
return tokens;
|
|
100
|
+
return beams[0].tokens;
|
|
83
101
|
}
|
|
84
102
|
|
|
85
|
-
function argmax(arr: Float32Array): number {
|
|
103
|
+
function argmax(arr: F32): number {
|
|
86
104
|
let maxIdx = 0;
|
|
87
105
|
let maxVal = arr[0];
|
|
88
106
|
for (let i = 1; i < arr.length; i++) {
|
|
@@ -94,6 +112,15 @@ function argmax(arr: Float32Array): number {
|
|
|
94
112
|
return maxIdx;
|
|
95
113
|
}
|
|
96
114
|
|
|
115
|
+
function topKIndices(arr: F32, k: number, excludeId: number): number[] {
|
|
116
|
+
const indexed: [number, number][] = [];
|
|
117
|
+
for (let i = 0; i < arr.length; i++) {
|
|
118
|
+
if (i !== excludeId) indexed.push([arr[i], i]);
|
|
119
|
+
}
|
|
120
|
+
indexed.sort((a, b) => b[0] - a[0]);
|
|
121
|
+
return indexed.slice(0, k).map(([, i]) => i);
|
|
122
|
+
}
|
|
123
|
+
|
|
97
124
|
let onnxSession: ort.InferenceSession | null = null;
|
|
98
125
|
|
|
99
126
|
export async function initDecoder(modelDir: string): Promise<void> {
|
package/src/lib.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { existsSync } from "fs";
|
|
2
|
+
import { transcribe as internalTranscribe } from "./transcribe";
|
|
3
|
+
|
|
4
|
+
export interface TranscribeOptions {
|
|
5
|
+
beamWidth?: number;
|
|
6
|
+
noCache?: boolean;
|
|
7
|
+
modelDir?: string;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
export async function transcribe(
|
|
11
|
+
audioPath: string,
|
|
12
|
+
options: TranscribeOptions = {},
|
|
13
|
+
): Promise<string> {
|
|
14
|
+
if (!existsSync(audioPath)) {
|
|
15
|
+
throw new Error(`File not found: ${audioPath}`);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
return internalTranscribe(audioPath, {
|
|
19
|
+
noCache: options.noCache ?? false,
|
|
20
|
+
beamWidth: options.beamWidth,
|
|
21
|
+
modelDir: options.modelDir,
|
|
22
|
+
});
|
|
23
|
+
}
|
package/src/models.ts
CHANGED
|
@@ -16,13 +16,13 @@ export function getModelDir(): string {
|
|
|
16
16
|
return join(homedir(), ".cache", "parakeet", "v3");
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
-
export function isModelCached(): boolean {
|
|
20
|
-
const
|
|
21
|
-
return MODEL_FILES.every((f) => existsSync(join(
|
|
19
|
+
export function isModelCached(dir?: string): boolean {
|
|
20
|
+
const d = dir ?? getModelDir();
|
|
21
|
+
return MODEL_FILES.every((f) => existsSync(join(d, f)));
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
-
export async function ensureModel(noCache = false): Promise<string> {
|
|
25
|
-
const dir = getModelDir();
|
|
24
|
+
export async function ensureModel(noCache = false, modelDir?: string): Promise<string> {
|
|
25
|
+
const dir = modelDir ?? getModelDir();
|
|
26
26
|
|
|
27
27
|
if (!noCache && isModelCached()) {
|
|
28
28
|
return dir;
|
package/src/transcribe.ts
CHANGED
|
@@ -5,7 +5,7 @@ import { initEncoder, encode } from "./encoder";
|
|
|
5
5
|
import {
|
|
6
6
|
initDecoder,
|
|
7
7
|
createOnnxDecoderSession,
|
|
8
|
-
|
|
8
|
+
beamDecode,
|
|
9
9
|
} from "./decoder";
|
|
10
10
|
import { Tokenizer } from "./tokenizer";
|
|
11
11
|
import { join } from "path";
|
|
@@ -26,6 +26,8 @@ const DECODER_HIDDEN = 640;
|
|
|
26
26
|
|
|
27
27
|
export interface TranscribeOptions {
|
|
28
28
|
noCache?: boolean;
|
|
29
|
+
beamWidth?: number;
|
|
30
|
+
modelDir?: string;
|
|
29
31
|
}
|
|
30
32
|
|
|
31
33
|
// Minimum 0.1s of audio at 16kHz to produce meaningful output
|
|
@@ -39,7 +41,8 @@ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}
|
|
|
39
41
|
}
|
|
40
42
|
|
|
41
43
|
const noCache = opts.noCache ?? false;
|
|
42
|
-
const modelDir = await ensureModel(noCache);
|
|
44
|
+
const beamWidth = opts.beamWidth ?? 4;
|
|
45
|
+
const modelDir = await ensureModel(noCache, opts.modelDir);
|
|
43
46
|
const tokenizer = await Tokenizer.fromFile(join(modelDir, "vocab.txt"));
|
|
44
47
|
|
|
45
48
|
await initPreprocessor(modelDir);
|
|
@@ -63,6 +66,6 @@ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}
|
|
|
63
66
|
DECODER_HIDDEN,
|
|
64
67
|
);
|
|
65
68
|
|
|
66
|
-
const tokens = await greedyDecode(session, encodedLength, transposed, D);
|
|
69
|
+
const tokens = await beamDecode(session, encodedLength, transposed, D, beamWidth);
|
|
67
70
|
return tokenizer.detokenize(tokens);
|
|
68
71
|
}
|