@drakulavich/parakeet-cli 0.1.3 → 0.2.0

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in that registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@drakulavich/parakeet-cli",
3
- "version": "0.1.3",
3
+ "version": "0.2.0",
4
4
  "description": "Fast multilingual speech-to-text CLI powered by NVIDIA Parakeet ONNX models",
5
5
  "type": "module",
6
6
  "bin": {
@@ -44,7 +44,8 @@
44
44
  "bun": ">=1.3.0"
45
45
  },
46
46
  "devDependencies": {
47
- "@types/bun": "latest"
47
+ "@types/bun": "latest",
48
+ "typescript": "^6.0.2"
48
49
  },
49
50
  "dependencies": {
50
51
  "onnxruntime-node": "^1.24.0"
@@ -1,8 +1,10 @@
1
1
  import { describe, test, expect } from "bun:test";
2
2
  import { convertToFloat32PCM } from "../audio";
3
- import { existsSync } from "fs";
3
+ import { spawnSync } from "child_process";
4
4
 
5
- describe("audio", () => {
5
+ const hasFfmpeg = spawnSync("which", ["ffmpeg"]).status === 0;
6
+
7
+ describe.skipIf(!hasFfmpeg)("audio", () => {
6
8
  test("converts WAV to 16kHz mono Float32Array", async () => {
7
9
  const buffer = await convertToFloat32PCM("fixtures/silence.wav");
8
10
  expect(buffer).toBeInstanceOf(Float32Array);
@@ -1,5 +1,5 @@
1
1
  import { describe, test, expect } from "bun:test";
2
- import { greedyDecode, type DecoderSession } from "../decoder";
2
+ import { beamDecode, type DecoderSession } from "../decoder";
3
3
 
4
4
  function mockSession(responses: Array<{ tokenLogits: number[]; durationLogits: number[] }>): DecoderSession {
5
5
  let callIndex = 0;
@@ -24,7 +24,8 @@ describe("decoder", () => {
24
24
  { tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0] },
25
25
  { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
26
26
  ]);
27
- const tokens = await greedyDecode(session, 3);
27
+ const encoderData = new Float32Array(3);
28
+ const tokens = await beamDecode(session, 3, encoderData, 1, 1);
28
29
  expect(tokens).toEqual([0, 1]);
29
30
  });
30
31
 
@@ -34,24 +35,16 @@ describe("decoder", () => {
34
35
  { tokenLogits: [0, 10, 0, -10], durationLogits: [10, 0, 0] },
35
36
  { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0, 0] },
36
37
  ]);
37
- const tokens = await greedyDecode(session, 5);
38
+ const encoderData = new Float32Array(5);
39
+ const tokens = await beamDecode(session, 5, encoderData, 1, 1);
38
40
  expect(tokens).toEqual([0, 1]);
39
41
  });
40
42
 
41
- test("handles max_tokens_per_step limit", async () => {
42
- const session = mockSession([
43
- { tokenLogits: [10, 0, 0, -10], durationLogits: [10, 0] },
44
- ]);
45
- const tokens = await greedyDecode(session, 2);
46
- expect(tokens.length).toBeLessThanOrEqual(20);
47
- expect(tokens.length).toBeGreaterThan(0);
48
- });
49
-
50
43
  test("returns empty for zero-length encoder output", async () => {
51
44
  const session = mockSession([
52
45
  { tokenLogits: [0, 0, 0, 10], durationLogits: [10, 0] },
53
46
  ]);
54
- const tokens = await greedyDecode(session, 0);
47
+ const tokens = await beamDecode(session, 0, new Float32Array(0), 1);
55
48
  expect(tokens).toEqual([]);
56
49
  });
57
50
  });
@@ -1,16 +1,11 @@
1
1
  import { describe, test, expect } from "bun:test";
2
- import { getModelDir, MODEL_FILES, HF_REPOS } from "../models";
2
+ import { getModelDir, MODEL_FILES, HF_REPO } from "../models";
3
3
  import { join } from "path";
4
4
  import { homedir } from "os";
5
5
 
6
6
  describe("models", () => {
7
- test("getModelDir returns correct cache path for v2", () => {
8
- const dir = getModelDir("v2");
9
- expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v2"));
10
- });
11
-
12
- test("getModelDir returns correct cache path for v3", () => {
13
- const dir = getModelDir("v3");
7
+ test("getModelDir returns correct cache path", () => {
8
+ const dir = getModelDir();
14
9
  expect(dir).toBe(join(homedir(), ".cache", "parakeet", "v3"));
15
10
  });
16
11
 
@@ -22,8 +17,7 @@ describe("models", () => {
22
17
  expect(MODEL_FILES).toContain("vocab.txt");
23
18
  });
24
19
 
25
- test("HF_REPOS maps versions to repo IDs", () => {
26
- expect(HF_REPOS.v2).toBe("istupakov/parakeet-tdt-0.6b-v2-onnx");
27
- expect(HF_REPOS.v3).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
20
+ test("HF_REPO points to v3 ONNX repo", () => {
21
+ expect(HF_REPO).toBe("istupakov/parakeet-tdt-0.6b-v3-onnx");
28
22
  });
29
23
  });
@@ -38,8 +38,4 @@ describe("tokenizer", () => {
38
38
  expect(text).toBe("cats");
39
39
  });
40
40
 
41
- test("isAsciiDominant returns true for ASCII tokens", async () => {
42
- const tok = await Tokenizer.fromFile("fixtures/test-vocab.txt");
43
- expect(tok.isAsciiDominant([0, 1, 2])).toBe(true);
44
- });
45
41
  });
package/src/audio.ts CHANGED
@@ -30,6 +30,7 @@ export async function convertToFloat32PCM(inputPath: string): Promise<Float32Arr
30
30
  const raw = await Bun.file(tmpPath).arrayBuffer();
31
31
  return new Float32Array(raw);
32
32
  } finally {
33
+ // Best-effort cleanup; file may already be gone
33
34
  try { unlinkSync(tmpPath); } catch {}
34
35
  }
35
36
  }
package/src/cli.ts CHANGED
@@ -28,8 +28,9 @@ async function main(): Promise<void> {
28
28
  try {
29
29
  const text = await transcribe(file, { noCache });
30
30
  if (text) process.stdout.write(text + "\n");
31
- } catch (err: any) {
32
- console.error(`Error: ${err.message}`);
31
+ } catch (err: unknown) {
32
+ const message = err instanceof Error ? err.message : String(err);
33
+ console.error(`Error: ${message}`);
33
34
  process.exit(1);
34
35
  }
35
36
  }
package/src/decoder.ts CHANGED
@@ -2,84 +2,105 @@ import * as ort from "onnxruntime-node";
2
2
  import { join } from "path";
3
3
  import { ensureOrtBackend } from "./ort-backend-fix";
4
4
 
5
+ // TDT allows multiple tokens per encoder frame; cap to prevent runaway decoding
5
6
  const MAX_TOKENS_PER_STEP = 10;
6
7
 
8
+ type F32 = Float32Array<ArrayBufferLike>;
9
+
7
10
  export interface DecoderSession {
8
11
  decode(
9
- encoderFrame: Float32Array,
12
+ encoderFrame: F32,
10
13
  targets: number[],
11
14
  targetLength: number,
12
- state1: Float32Array,
13
- state2: Float32Array
14
- ): Promise<{ output: Float32Array; state1: Float32Array; state2: Float32Array }>;
15
+ state1: F32,
16
+ state2: F32
17
+ ): Promise<{ output: F32; state1: F32; state2: F32 }>;
15
18
  vocabSize: number;
16
19
  blankId: number;
17
20
  stateDims: { layers: number; hidden: number };
18
21
  }
19
22
 
20
- export async function greedyDecode(
23
+ const DEFAULT_BEAM_WIDTH = 4;
24
+
25
+ interface Beam {
26
+ tokens: number[];
27
+ score: number;
28
+ lastToken: number;
29
+ state1: F32;
30
+ state2: F32;
31
+ t: number;
32
+ }
33
+
34
+ export async function beamDecode(
21
35
  session: DecoderSession,
22
36
  encoderLength: number,
23
- encoderData?: Float32Array,
24
- encoderDim?: number
37
+ encoderData: Float32Array,
38
+ encoderDim: number,
39
+ beamWidth: number = DEFAULT_BEAM_WIDTH,
25
40
  ): Promise<number[]> {
26
41
  if (encoderLength === 0) return [];
27
42
 
28
- const tokens: number[] = [];
29
43
  const stateSize = session.stateDims.layers * session.stateDims.hidden;
30
- let state1 = new Float32Array(stateSize);
31
- let state2 = new Float32Array(stateSize);
32
- let lastToken = session.blankId;
33
-
34
- let t = 0;
35
- while (t < encoderLength) {
36
- let tokensThisStep = 0;
37
-
38
- while (tokensThisStep < MAX_TOKENS_PER_STEP) {
39
- let frame: Float32Array;
40
- if (encoderData && encoderDim) {
41
- // Must copy — ort.Tensor doesn't work with subarray views under Bun
42
- frame = encoderData.slice(t * encoderDim, (t + 1) * encoderDim);
43
- } else {
44
- frame = new Float32Array(1);
45
- }
46
44
 
47
- const result = await session.decode(frame, [lastToken], 1, state1, state2);
45
+ let beams: Beam[] = [{
46
+ tokens: [],
47
+ score: 0,
48
+ lastToken: session.blankId,
49
+ state1: new Float32Array(stateSize),
50
+ state2: new Float32Array(stateSize),
51
+ t: 0,
52
+ }];
53
+
54
+ const maxSteps = encoderLength * MAX_TOKENS_PER_STEP;
55
+
56
+ for (let step = 0; step < maxSteps; step++) {
57
+ const active = beams.filter(b => b.t < encoderLength);
58
+ if (active.length === 0) break;
59
+
60
+ const candidates: Beam[] = [];
61
+
62
+ for (const beam of active) {
63
+ // Must copy — ort.Tensor doesn't work with subarray views under Bun
64
+ const frame = encoderData.slice(beam.t * encoderDim, (beam.t + 1) * encoderDim);
65
+ const result = await session.decode(frame, [beam.lastToken], 1, beam.state1, beam.state2);
48
66
  const output = result.output;
49
67
 
50
68
  const tokenLogits = output.slice(0, session.vocabSize);
51
69
  const durationLogits = output.slice(session.vocabSize);
52
-
53
- const tokenId = argmax(tokenLogits);
54
70
  const duration = argmax(durationLogits);
55
71
 
56
- state1 = result.state1;
57
- state2 = result.state2;
58
-
59
- if (tokenId === session.blankId) {
60
- t += 1;
61
- break;
62
- }
63
-
64
- tokens.push(tokenId);
65
- lastToken = tokenId;
66
- tokensThisStep++;
72
+ // Blank option: advance one frame, keep same tokens
73
+ candidates.push({
74
+ tokens: beam.tokens,
75
+ score: beam.score + tokenLogits[session.blankId],
76
+ lastToken: beam.lastToken,
77
+ state1: result.state1,
78
+ state2: result.state2,
79
+ t: beam.t + 1,
80
+ });
67
81
 
68
- if (duration > 0) {
69
- t += duration;
70
- break;
82
+ // Top non-blank token options
83
+ const topK = topKIndices(tokenLogits, beamWidth, session.blankId);
84
+ for (const tokenId of topK) {
85
+ candidates.push({
86
+ tokens: [...beam.tokens, tokenId],
87
+ score: beam.score + tokenLogits[tokenId],
88
+ lastToken: tokenId,
89
+ state1: result.state1,
90
+ state2: result.state2,
91
+ t: duration > 0 ? beam.t + duration : beam.t,
92
+ });
71
93
  }
72
94
  }
73
95
 
74
- if (tokensThisStep >= MAX_TOKENS_PER_STEP) {
75
- t += 1;
76
- }
96
+ candidates.sort((a, b) => b.score - a.score);
97
+ beams = candidates.slice(0, beamWidth);
77
98
  }
78
99
 
79
- return tokens;
100
+ return beams[0].tokens;
80
101
  }
81
102
 
82
- function argmax(arr: Float32Array): number {
103
+ function argmax(arr: F32): number {
83
104
  let maxIdx = 0;
84
105
  let maxVal = arr[0];
85
106
  for (let i = 1; i < arr.length; i++) {
@@ -91,6 +112,15 @@ function argmax(arr: Float32Array): number {
91
112
  return maxIdx;
92
113
  }
93
114
 
115
+ function topKIndices(arr: F32, k: number, excludeId: number): number[] {
116
+ const indexed: [number, number][] = [];
117
+ for (let i = 0; i < arr.length; i++) {
118
+ if (i !== excludeId) indexed.push([arr[i], i]);
119
+ }
120
+ indexed.sort((a, b) => b[0] - a[0]);
121
+ return indexed.slice(0, k).map(([, i]) => i);
122
+ }
123
+
94
124
  let onnxSession: ort.InferenceSession | null = null;
95
125
 
96
126
  export async function initDecoder(modelDir: string): Promise<void> {
@@ -129,7 +159,3 @@ export function createOnnxDecoderSession(
129
159
  },
130
160
  };
131
161
  }
132
-
133
- export function releaseDecoder(): void {
134
- onnxSession = null;
135
- }
package/src/encoder.ts CHANGED
@@ -26,7 +26,3 @@ export async function encode(
26
26
 
27
27
  return { encoderOutput, encodedLength };
28
28
  }
29
-
30
- export function releaseEncoder(): void {
31
- session = null;
32
- }
package/src/models.ts CHANGED
@@ -2,12 +2,7 @@ import { join } from "path";
2
2
  import { homedir } from "os";
3
3
  import { existsSync, mkdirSync } from "fs";
4
4
 
5
- export type ModelVersion = "v2" | "v3";
6
-
7
- export const HF_REPOS: Record<ModelVersion, string> = {
8
- v2: "istupakov/parakeet-tdt-0.6b-v2-onnx",
9
- v3: "istupakov/parakeet-tdt-0.6b-v3-onnx",
10
- };
5
+ export const HF_REPO = "istupakov/parakeet-tdt-0.6b-v3-onnx";
11
6
 
12
7
  export const MODEL_FILES = [
13
8
  "encoder-model.onnx",
@@ -17,28 +12,26 @@ export const MODEL_FILES = [
17
12
  "vocab.txt",
18
13
  ];
19
14
 
20
- export function getModelDir(version: ModelVersion): string {
21
- return join(homedir(), ".cache", "parakeet", version);
15
+ export function getModelDir(): string {
16
+ return join(homedir(), ".cache", "parakeet", "v3");
22
17
  }
23
18
 
24
- export function isModelCached(version: ModelVersion): boolean {
25
- const dir = getModelDir(version);
19
+ export function isModelCached(): boolean {
20
+ const dir = getModelDir();
26
21
  return MODEL_FILES.every((f) => existsSync(join(dir, f)));
27
22
  }
28
23
 
29
- export async function ensureModel(version: ModelVersion, noCache = false): Promise<string> {
30
- const dir = getModelDir(version);
24
+ export async function ensureModel(noCache = false): Promise<string> {
25
+ const dir = getModelDir();
31
26
 
32
- if (!noCache && isModelCached(version)) {
27
+ if (!noCache && isModelCached()) {
33
28
  return dir;
34
29
  }
35
30
 
36
31
  mkdirSync(dir, { recursive: true });
37
32
 
38
- const repo = HF_REPOS[version];
39
-
40
33
  for (const file of MODEL_FILES) {
41
- const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
34
+ const url = `https://huggingface.co/${HF_REPO}/resolve/main/${file}`;
42
35
  const dest = join(dir, file);
43
36
 
44
37
  if (!noCache && existsSync(dest)) continue;
package/src/preprocess.ts CHANGED
@@ -2,6 +2,8 @@ import * as ort from "onnxruntime-node";
2
2
  import { join } from "path";
3
3
  import { ensureOrtBackend } from "./ort-backend-fix";
4
4
 
5
+ const NORM_EPSILON = 1e-10;
6
+
5
7
  let session: ort.InferenceSession | null = null;
6
8
 
7
9
  export async function initPreprocessor(modelDir: string): Promise<void> {
@@ -41,7 +43,7 @@ export async function preprocess(audio: Float32Array): Promise<{ features: ort.T
41
43
 
42
44
  const mean = sum / actualLength;
43
45
  const variance = sumSq / actualLength - mean * mean;
44
- const std = Math.sqrt(Math.max(variance, 1e-10));
46
+ const std = Math.sqrt(Math.max(variance, NORM_EPSILON));
45
47
 
46
48
  for (let t = 0; t < T; t++) {
47
49
  normalized[f * T + t] = t < actualLength ? (melData[f * T + t] - mean) / std : 0;
@@ -53,7 +55,3 @@ export async function preprocess(audio: Float32Array): Promise<{ features: ort.T
53
55
 
54
56
  return { features: featureTensor, length: outputLength };
55
57
  }
56
-
57
- export function releasePreprocessor(): void {
58
- session = null;
59
- }
package/src/tokenizer.ts CHANGED
@@ -40,20 +40,4 @@ export class Tokenizer {
40
40
  }
41
41
  return pieces.join("").replaceAll("\u2581", " ").trim();
42
42
  }
43
-
44
- isAsciiDominant(tokenIds: number[], threshold = 0.9): boolean {
45
- const nonBlank = tokenIds.filter((id) => id !== this.blankId);
46
- if (nonBlank.length === 0) return false;
47
-
48
- let asciiCount = 0;
49
- for (const id of nonBlank) {
50
- const token = this.idToToken.get(id) ?? "";
51
- const cleaned = token.replaceAll("\u2581", "");
52
- if (cleaned.length > 0 && /^[\x00-\x7F]+$/.test(cleaned)) {
53
- asciiCount++;
54
- }
55
- }
56
-
57
- return asciiCount / nonBlank.length >= threshold;
58
- }
59
43
  }
package/src/transcribe.ts CHANGED
@@ -5,11 +5,21 @@ import { initEncoder, encode } from "./encoder";
5
5
  import {
6
6
  initDecoder,
7
7
  createOnnxDecoderSession,
8
- greedyDecode,
8
+ beamDecode,
9
9
  } from "./decoder";
10
10
  import { Tokenizer } from "./tokenizer";
11
11
  import { join } from "path";
12
12
 
13
+ function transpose2D(data: Float32Array, rows: number, cols: number): Float32Array {
14
+ const out = new Float32Array(cols * rows);
15
+ for (let c = 0; c < cols; c++) {
16
+ for (let r = 0; r < rows; r++) {
17
+ out[c * rows + r] = data[r * cols + c];
18
+ }
19
+ }
20
+ return out;
21
+ }
22
+
13
23
  // Parakeet TDT 0.6B decoder state dimensions (from ONNX model input shapes)
14
24
  const DECODER_LAYERS = 2;
15
25
  const DECODER_HIDDEN = 640;
@@ -18,15 +28,18 @@ export interface TranscribeOptions {
18
28
  noCache?: boolean;
19
29
  }
20
30
 
31
+ // Minimum 0.1s of audio at 16kHz to produce meaningful output
32
+ const MIN_AUDIO_SAMPLES = 1600;
33
+
21
34
  export async function transcribe(audioPath: string, opts: TranscribeOptions = {}): Promise<string> {
22
35
  const audio = await convertToFloat32PCM(audioPath);
23
36
 
24
- if (audio.length < 1600) {
37
+ if (audio.length < MIN_AUDIO_SAMPLES) {
25
38
  return "";
26
39
  }
27
40
 
28
41
  const noCache = opts.noCache ?? false;
29
- const modelDir = await ensureModel("v3", noCache);
42
+ const modelDir = await ensureModel(noCache);
30
43
  const tokenizer = await Tokenizer.fromFile(join(modelDir, "vocab.txt"));
31
44
 
32
45
  await initPreprocessor(modelDir);
@@ -41,13 +54,7 @@ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}
41
54
  const D = dims[1];
42
55
  const T = dims[2];
43
56
 
44
- // Transpose from [1, D, T] to [T, D] so each frame is contiguous
45
- const transposed = new Float32Array(T * D);
46
- for (let t = 0; t < T; t++) {
47
- for (let d = 0; d < D; d++) {
48
- transposed[t * D + d] = encoderData[d * T + t];
49
- }
50
- }
57
+ const transposed = transpose2D(encoderData, D, T);
51
58
 
52
59
  const session = createOnnxDecoderSession(
53
60
  tokenizer.vocabSize,
@@ -56,6 +63,6 @@ export async function transcribe(audioPath: string, opts: TranscribeOptions = {}
56
63
  DECODER_HIDDEN,
57
64
  );
58
65
 
59
- const tokens = await greedyDecode(session, encodedLength, transposed, D);
66
+ const tokens = await beamDecode(session, encodedLength, transposed, D);
60
67
  return tokenizer.detokenize(tokens);
61
68
  }
package/tsconfig.json CHANGED
@@ -7,7 +7,7 @@
7
7
  "strict": true,
8
8
  "skipLibCheck": true,
9
9
  "outDir": "./dist",
10
- "rootDir": "./src"
10
+ "rootDir": "."
11
11
  },
12
12
  "include": ["src/**/*.ts", "tests/**/*.ts"]
13
13
  }