parakeet.js 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/README.md +240 -239
- package/examples/hf-spaces-demo/README.md +6 -9
- package/examples/hf-spaces-demo/package.json +1 -1
- package/examples/hf-spaces-demo/src/App.js +307 -316
- package/examples/react-demo/package.json +19 -19
- package/examples/react-demo/src/App.jsx +324 -326
- package/examples/react-demo-dev/src/App.jsx +23 -24
- package/package.json +1 -1
- package/publish.ps1 +65 -0
- package/src/hub.js +235 -241
- package/src/parakeet.js +15 -8
- package/src/preprocessor.js +75 -68
- package/docs/parakeet-transformers-js/.gitattributes +0 -2
- package/docs/parakeet-transformers-js/.prettierignore +0 -8
- package/docs/parakeet-transformers-js/.prettierrc +0 -10
- package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
- package/docs/parakeet-transformers-js/LICENSE +0 -202
- package/docs/parakeet-transformers-js/README.md +0 -448
- package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
- package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
- package/docs/parakeet-transformers-js/debug_test.js +0 -84
- package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
- package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
- package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
- package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
- package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
- package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
- package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
- package/docs/parakeet-transformers-js/js_steps.json +0 -821
- package/docs/parakeet-transformers-js/package-lock.json +0 -12251
- package/docs/parakeet-transformers-js/package.json +0 -96
- package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
- package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
- package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
- package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
- package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
- package/docs/parakeet-transformers-js/src/configs.js +0 -455
- package/docs/parakeet-transformers-js/src/env.js +0 -167
- package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
- package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
- package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
- package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
- package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
- package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
- package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
- package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
- package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
- package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
- package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
- package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
- package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
- package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
- package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
- package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
- package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
- package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
- package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
- package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
- package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
- package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
- package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
- package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
- package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
- package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
- package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
- package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
- package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
- package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
- package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
- package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
- package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
- package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
- package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
- package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
- package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
- package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
- package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
- package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
- package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
- package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
- package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
- package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
- package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
- package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
- package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
- package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
- package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
- package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
- package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
- package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
- package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
- package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
- package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
- package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
- package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
- package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
- package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
- package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
- package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
- package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
- package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
- package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
- package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
- package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
- package/docs/parakeet-transformers-js/src/models.js +0 -8644
- package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
- package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
- package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
- package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
- package/docs/parakeet-transformers-js/src/processors.js +0 -16
- package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
- package/docs/parakeet-transformers-js/src/transformers.js +0 -50
- package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
- package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
- package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
- package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
- package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
- package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
- package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
- package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
- package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
- package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
- package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
- package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
- package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
- package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
- package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
- package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
- package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
- package/docs/parakeet-transformers-js/tsconfig.json +0 -21
- package/docs/parakeet-transformers-js/webpack.config.js +0 -223
package/docs/parakeet-transformers-js/src/parakeet.js
@@ -1,792 +0,0 @@
-// @ts-nocheck
-
-import * as fs from 'fs';
-import * as path from 'path';
-let ort;
-if (isNode()) {
-  // eslint-disable-next-line global-require, import/no-extraneous-dependencies
-  ort = await import('onnxruntime-node');
-  ort = ort.default ?? ort;
-} else {
-  ort = await import('onnxruntime-web').then(m=>m.default ?? m);
-}
-import { logMelSpectrogram, MEL_BINS } from './audio_features.js';
-
-/**
- * Small helper to detect if we are in Node (vs browser).
- */
-function isNode() {
-  return typeof process !== 'undefined' && process.versions?.node;
-}
-
-/**
- * ONNX-based preprocessor for audio feature extraction.
- */
-export class OnnxPreprocessor {
-  constructor(modelPath) {
-    this.modelPath = modelPath;
-    this.session = null;
-  }
-
-  async init() {
-    if (!this.session) {
-      this.session = await ort.InferenceSession.create(this.modelPath);
-    }
-  }
-
-  async process(audio) {
-    if (!this.session) await this.init();
-
-    // Create a proper Float32Array with explicit ArrayBuffer
-    const buffer = new ArrayBuffer(audio.length * 4); // 4 bytes per float32
-    const audioFloat32 = new Float32Array(buffer);
-    for (let i = 0; i < audio.length; i++) {
-      audioFloat32[i] = audio[i];
-    }
-
-    // Reshape audio to [1, audio.length] for batch processing
-    const waveforms = new ort.Tensor('float32', audioFloat32, [1, audioFloat32.length]);
-
-    // Create proper BigInt64Array for lengths
-    const lenBuffer = new ArrayBuffer(8); // 8 bytes per int64
-    const lenArray = new BigInt64Array(lenBuffer);
-    lenArray[0] = BigInt(audioFloat32.length);
-    const waveforms_lens = new ort.Tensor('int64', lenArray, [1]);
-
-    const feeds = {
-      waveforms: waveforms,
-      waveforms_lens: waveforms_lens
-    };
-
-    const outputs = await this.session.run(feeds);
-    const featuresTensor = outputs['features'];
-    const features_lens = outputs['features_lens'];
-
-    // The preprocessor ONNX model outputs features in [batch, features, time]
-    // format. The python equivalent (onnx-asr) does not transpose here, so we will
-    // match that behavior and return the raw features. The consuming `transcribe`
-    // function will be updated to handle this layout.
-    return {
-      features: featuresTensor.data,
-      length: Number(features_lens.data[0]),
-    };
-  }
-}
-
-/**
- * Load a text file returning its content as string.
- * Works in both Node and browser (using fetch).
- */
-async function loadText(filePath) {
-  if (isNode()) {
-    return fs.promises.readFile(filePath, 'utf8');
-  }
-  const resp = await fetch(filePath);
-  if (!resp.ok) throw new Error(`Failed to fetch ${filePath}: ${resp.status}`);
-  return resp.text();
-}
-
-/**
- * Basic tokenizer that only supports decoding (id -> text).
- * `tokens.txt` format: one token per line: "token id".
- */
-export class ParakeetTokenizer {
-  /**
-   * @param {string[]} id2token Array where index=id, value=token string
-   */
-  constructor(id2token) {
-    this.id2token = id2token;
-    this.blankToken = '<blk>';
-  }
-
-  static async fromTokensFile(tokensPath) {
-    const content = await loadText(tokensPath);
-    const lines = content.split(/\r?\n/).filter(Boolean);
-    const id2token = [];
-    for (const line of lines) {
-      const [tok, idStr] = line.split(/\s+/);
-      const id = parseInt(idStr, 10);
-      id2token[id] = tok;
-    }
-    return new ParakeetTokenizer(id2token);
-  }
-
-  /**
-   * Decode an array of token ids to string.
-   * Implements simple SentencePiece-style rule: leading "▁" marks space.
-   */
-  decode(ids) {
-    let text = '';
-    for (const id of ids) {
-      const token = this.id2token[id];
-      if (token === undefined) continue;
-      if (token === this.blankToken) continue;
-      if (token.startsWith('▁')) {
-        // word start
-        text += ' ' + token.slice(1);
-      } else {
-        text += token;
-      }
-    }
-    return text.trim();
-  }
-}
-
-/**
- * Skeleton for Parakeet RNNT model inference.
- */
-export class ParakeetModel {
-  constructor(opts) {
-    this.tokenizer = opts.tokenizer;
-    this.encoderSession = opts.encoderSession;
-    this.decoderSession = opts.decoderSession;
-    this.joinerSession = opts.joinerSession;
-    this.blankId = opts.blankId ?? 1024; // default
-    this.predLayers = opts.predLayers;
-    this.predHidden = opts.predHidden;
-    this._normalizer = opts.normalizer ?? (s=>s);
-    this.modelFormat = opts.modelFormat;
-    this.isCombined = this.modelFormat === 'transformers.js';
-    this.maxTokensPerStep = opts.maxTokensPerStep ?? 10;
-    this.preprocessor = opts.preprocessor; // ONNX preprocessor
-    this.subsampling = opts.subsampling ?? 8;
-    this.windowStride = opts.windowStride ?? 0.01; // Store windowStride
-
-    if (this.isCombined) {
-      // Pre-create zero state tensors expected by combined model
-      const numLayers = this.predLayers ?? 2;
-      const hidden = this.predHidden ?? 640;
-      const size = numLayers * 1 * hidden;
-      const z = new Float32Array(size);
-      this.zeroState1 = new ort.Tensor('float32', z, [numLayers, 1, hidden]);
-      this.zeroState2 = new ort.Tensor('float32', z.slice(), [numLayers, 1, hidden]);
-      this._combState1 = this.zeroState1;
-      this._combState2 = this.zeroState2;
-    }
-
-    // cache for constant inputs that quantised decoder expects
-    this.constInputs = {};
-
-    // Determine joiner input names (encoder & decoder)
-    this.joinerInputs = Object.keys(this.joinerSession.inputMetadata ?? {});
-    if (isNode()) {
-      console.log('[Parakeet] Joiner input names:', this.joinerInputs);
-      console.log('[Parakeet] Encoder inputs:', Object.keys(this.encoderSession.inputMetadata ?? {}));
-      console.log('[Parakeet] Encoder outputs:', Object.keys(this.encoderSession.outputMetadata ?? {}));
-    }
-  }
-
-  static async fromDirectory(baseDir, { ortOptions, preferFloat32 = false } = {}) {
-    const modelFormat = 'transformers.js';
-
-    // Helper to select model path
-    const selectModelPath = (name) => {
-      const floatPath = path.join(baseDir, `${name}.onnx`);
-      const int8Path = path.join(baseDir, `${name}.int8.onnx`);
-
-      if (preferFloat32) {
-        if (fs.existsSync(floatPath)) return floatPath;
-        if (fs.existsSync(int8Path)) {
-          console.warn(`[Parakeet] preferFloat32=true but only found ${int8Path}`);
-          return int8Path;
-        }
-      } else {
-        // Default behavior: prefer int8
-        if (fs.existsSync(int8Path)) return int8Path;
-        if (fs.existsSync(floatPath)) return floatPath;
-      }
-      return null; // Not found
-    };
-
-    const encoderPath = selectModelPath('encoder-model');
-    const decoderJointPath = selectModelPath('decoder_joint-model');
-
-    if (isNode()) {
-      console.log(`[Parakeet] Selected encoder model: ${path.basename(encoderPath)}`);
-      console.log(`[Parakeet] Selected decoder/joiner model: ${path.basename(decoderJointPath)}`);
-    }
-
-    if (!encoderPath || !decoderJointPath) {
-      throw new Error('Model directory must contain encoder/decoder_joint ONNX files.');
-    }
-
-    // Try vocab.txt first, then tokens.txt
-    let tokensPath = path.join(baseDir, 'vocab.txt');
-    if (!fs.existsSync(tokensPath)) {
-      tokensPath = path.join(baseDir, 'tokens.txt');
-    }
-
-    console.log(`[Parakeet] Loading ${modelFormat} format models from ${baseDir}`);
-
-    const [tokenizer, encoderSession, joinerSession] = await Promise.all([
-      ParakeetTokenizer.fromTokensFile(tokensPath),
-      ort.InferenceSession.create(encoderPath, ortOptions),
-      ort.InferenceSession.create(decoderJointPath, ortOptions),
-    ]);
-
-    // Load text normalizer rules if present
-    let normalizer = (s)=>s;
-
-    const normPath = path.join(baseDir, 'normalizer.json');
-    if (fs.existsSync(normPath)) {
-      try {
-        const rules = JSON.parse(await fs.promises.readFile(normPath, 'utf8'));
-        normalizer = (text)=>{
-          let out = text;
-          for (const [pattern, repl] of rules) {
-            out = out.replace(new RegExp(pattern, 'g'), repl);
-          }
-          return out;
-        };
-      } catch(e) { console.warn('[Parakeet] Failed to load normalizer.json', e); }
-    }
-
-    // Extract prediction network metadata (layer & hidden size) if provided by Sherpa-ONNX.
-    let predLayers = 2, predHidden = 640;
-    try {
-      const meta = encoderSession.metadata?.custom_metadata_map ?? {};
-      if (meta.pred_rnn_layers) predLayers = parseInt(meta.pred_rnn_layers, 10);
-      if (meta.pred_hidden) predHidden = parseInt(meta.pred_hidden, 10);
-    } catch (_) { /* ignore */ }
-
-    const configPath = path.join(baseDir, 'config.json');
-    let cfg = {};
-    if (fs.existsSync(configPath)) {
-      try { cfg = JSON.parse(await fs.promises.readFile(configPath,'utf8')); }
-      catch { cfg = {}; }
-    }
-    const maxTokensPerStep = cfg.max_tokens_per_step ?? 10;
-
-    const featureSize = cfg.features_size ?? 128;
-    const subsamplingFactor = cfg.subsampling_factor ?? 8;
-    const windowStride = cfg.window_stride ?? 0.01; // Read from config, default 0.01
-
-    // Create ONNX preprocessor (try nemo128 first, fallback to nemo80)
-    const __filename = new URL(import.meta.url).pathname;
-    const __dirname = path.dirname(__filename);
-
-    // Get the directory of the current script and find assets
-    let assetsDir = path.join(__dirname, '..', 'assets');
-    let preprocessorPath = path.join(assetsDir, featureSize === 80 ? 'nemo80.onnx' : 'nemo128.onnx');
-
-    // Fix Windows path issues - remove leading slash on Windows
-    if (process.platform === 'win32' && __dirname.startsWith('/')) {
-      assetsDir = path.join(__dirname.substring(1), '..', 'assets');
-      preprocessorPath = path.join(assetsDir, featureSize === 80 ? 'nemo80.onnx' : 'nemo128.onnx');
-    }
-
-    if (!fs.existsSync(preprocessorPath)) {
-      // fallback try the other size asset
-      preprocessorPath = path.join(assetsDir, featureSize === 80 ? 'nemo128.onnx' : 'nemo80.onnx');
-    }
-
-    // Final fallback - try relative to current working directory
-    if (!fs.existsSync(preprocessorPath)) {
-      console.warn('[Parakeet] Preprocessor not found at:', preprocessorPath);
-      preprocessorPath = path.resolve('./assets/nemo128.onnx');
-      if (!fs.existsSync(preprocessorPath)) {
-        preprocessorPath = path.resolve('./assets/nemo80.onnx');
-      }
-    }
-
-    const preprocessor = new OnnxPreprocessor(preprocessorPath);
-
-    // store subsampling factor for timestamp stride
-    const subsampling = subsamplingFactor;
-
-    return new ParakeetModel({
-      tokenizer,
-      encoderSession,
-      decoderSession: null,
-      joinerSession,
-      predLayers,
-      predHidden,
-      normalizer,
-      modelFormat,
-      maxTokensPerStep,
-      preprocessor,
-      subsampling,
-      windowStride
-    });
-  }
-
-  /**
-   * Compute 80-dim log-mel spectrogram using ONNX preprocessor.
-   * @param {Float32Array} audio PCM normalized -1..1 @ sampleRate Hz
-   * @param {number} sampleRate
-   * @param {{skipCMVN?:boolean, debug?:boolean}} opts
-   * @returns {Promise<Float32Array>} [T, 80]
-   */
-  async computeFeatures(audio, sampleRate, {skipCMVN=false, debug=false}={}) {
-    if (!this.preprocessor) {
-      throw new Error('ONNX preprocessor not initialized');
-    }
-
-    const result = await this.preprocessor.process(audio);
-    const features = result.features;
-    const T = result.length;
-    const MEL = MEL_BINS;
-
-    if (debug && isNode()) {
-      console.log(`[Debug] ONNX preprocessor: ${T} frames x ${MEL} mel bins`);
-      console.log('[Debug] first-frame mel (20 bins):', Array.from(features.slice(0,20)).map(v=>v.toFixed(3)).join(' '));
-    }
-
-    return features;
-  }
-
-  async _runDecoderStep(token, state) {
-    const prev = new ort.Tensor('int32', new Int32Array([token]), [1, 1]);
-    const decInputs = { targets: prev, target_length: new ort.Tensor('int32', new Int32Array([1]), [1]) };
-    Object.assign(decInputs, state, this.constInputs);
-
-    while (true) {
-      try {
-        const decOut = await this.decoderSession.run(decInputs);
-        // Reshape decoder output to [1, 640, 1] for the joiner
-        const decTensor = decOut['outputs'] ?? Object.values(decOut)[0];
-        const reshapedData = new Float32Array(decTensor.data);
-        const reshapedTensor = new ort.Tensor('float32', reshapedData, [1, this.predHidden ?? 640, 1]);
-
-        const newState = Object.fromEntries(Object.entries(decOut).filter(([k]) => k.startsWith('states')));
-
-        return { dec: reshapedTensor, state: newState };
-      } catch (e) {
-        let handled = false;
-        // Missing input
-        let m = /input '([^']+)' is missing/.exec(e.message);
-        if (m) {
-          const missing = m[1];
-          const meta = this.decoderSession.inputMetadata?.[missing];
-          let dims = meta?.dimensions ?? [1];
-          dims = dims.map(d => (typeof d === 'number' && d > 0) ? d : 1);
-          const size = dims.reduce((a, b) => a * b, 1);
-          const type = meta?.type ?? 'float32';
-          let tensor;
-          switch (type) {
-            case 'int32':
-              tensor = new ort.Tensor('int32', new Int32Array(size), dims);
-              break;
-            case 'int64':
-              tensor = new ort.Tensor('int64', new BigInt64Array(size), dims);
-              break;
-            default:
-              tensor = new ort.Tensor('float32', new Float32Array(size), dims);
-          }
-          this.constInputs[missing] = tensor;
-          decInputs[missing] = tensor;
-          handled = true;
-        }
-
-        // Dimension mismatch (rank ok but specific axes wrong)
-        const dimLines = [...e.message.matchAll(/index:\s*(\d+)\s*Got:\s*\d+\s*Expected:\s*(\d+)/g)];
-        if (dimLines.length > 0) {
-          const nameMatch = /input: ([^ ]+)/.exec(e.message);
-          const tensorName = nameMatch ? nameMatch[1] : null;
-          if (tensorName) {
-            const dimsMap = {};
-            let maxIdx = 0;
-            for (const [, idxStr, expStr] of dimLines) {
-              const idx = parseInt(idxStr, 10);
-              const exp = parseInt(expStr, 10);
-              dimsMap[idx] = exp;
-              if (idx > maxIdx) maxIdx = idx;
-            }
-            const newDims = Array(maxIdx + 1).fill(1);
-            for (const k in dimsMap) newDims[k] = dimsMap[k];
-            const size = newDims.reduce((a, b) => a * b, 1);
-            this.constInputs[tensorName] = new ort.Tensor('float32', new Float32Array(size), newDims);
-            decInputs[tensorName] = this.constInputs[tensorName];
-            handled = true;
-          }
-        }
-
-        // Invalid rank mismatch
-        const rankMatch = /Invalid rank for input:?\s*([^ ]+)\s*Got:\s*(\d+)\s*Expected:\s*(\d+)/.exec(e.message);
-        if (rankMatch) {
-          const tensorName = rankMatch[1];
-          const expectedRank = parseInt(rankMatch[3], 10);
-          const meta = this.decoderSession.inputMetadata?.[tensorName];
-          let dims = meta?.dimensions ?? Array(expectedRank).fill(1);
-          if (dims.length !== expectedRank) {
-            dims = Array.from({ length: expectedRank }, (_, i) => (dims[i] && typeof dims[i] === 'number' && dims[i] > 0) ? dims[i] : 1);
-          } else {
-            dims = dims.map(d => (typeof d === 'number' && d > 0) ? d : 1);
-          }
-          const size = dims.reduce((a, b) => a * b, 1);
-          this.constInputs[tensorName] = new ort.Tensor('float32', new Float32Array(size), dims);
-          decInputs[tensorName] = this.constInputs[tensorName];
-          handled = true;
-        }
-
-        if (!handled) throw e;
-      }
-    }
-  }
-
-  async _runCombinedStep(encTensor, token, currentState = null) {
-    // Always pass a single token, matching Python onnx-asr behavior
-    const singleToken = typeof token === 'number' ? token : this.blankId;
-
-    const targetTensor = new ort.Tensor('int32', new Int32Array([singleToken]), [1, 1]);
-    const lenTensor = new ort.Tensor('int32', new Int32Array([1]), [1]);
-
-    // Use provided state or default internal state
-    const state1 = currentState?.state1 || this._combState1;
-    const state2 = currentState?.state2 || this._combState2;
-
-    const feeds = {
-      encoder_outputs: encTensor,
-      targets: targetTensor,
-      target_length: lenTensor,
-      input_states_1: state1,
-      input_states_2: state2,
-    };
-
-    const out = await this.joinerSession.run(feeds);
-    const logits = out['outputs'];
-
-    // The output tensor shape is [B, 1, 1, D] since we're passing a single token.
-    // We can directly use the logits without slicing.
-    const [_B, _, N, D] = logits.dims;
-    const lastLogits = logits.data.subarray(0, D);
-
-    // Split token logits / duration logits
-    const vocab = this.tokenizer.id2token.length;
-    const tokenLogits = lastLogits.slice(0, vocab);
-    const durLogits = lastLogits.slice(vocab);
-    let step = 0;
-    if (durLogits.length > 0) {
-      let max = -Infinity;
-      for (let i = 0; i < durLogits.length; ++i) if (durLogits[i] > max) { max = durLogits[i]; step = i; }
-
-      // Debug: Log duration logits to understand the pattern
-      if (isNode() && Math.random() < 0.1) { // Log 10% of the time to avoid spam
-        console.log('[DEBUG] Duration logits:', Array.from(durLogits).map((v, i) => `${i}:${v.toFixed(2)}`).join(' '));
-        console.log('[DEBUG] Duration argmax:', step, 'max_value:', max.toFixed(2));
-      }
-    }
-
-    // Return new state for external management
-    const newState = {
-      state1: out['output_states_1'] || state1,
-      state2: out['output_states_2'] || state2
-    };
-
-    return { tokenLogits, step, newState };
-  }
-
-  /**
-   * Transcribe audio.
-   * @param {Float32Array} audio 16 kHz mono PCM [-1,1]
-   * @param {number} sampleRate input sample rate (must be 16k for now)
-   * @param {{returnTimestamps?:boolean, returnConfidences?:boolean, temperature?:number, skipCMVN?:boolean, debug?:boolean}} opts
-   * @returns {Promise<string|{text:string,tokens:number[],timestamps?:[number,number][],confidences?:number[]}>}
-   */
-  async transcribe(audio, sampleRate = 16000, opts = {}) {
-    const {
-      returnTimestamps = false,
-      returnConfidences = false,
-      temperature = 1.2,
-      debug = false,
-      skipCMVN = false
-    } = opts;
-    // 1. feature extraction
-    console.log(`[Parakeet] Got ${audio.length} audio samples @ ${sampleRate} Hz`);
-    const features = await this.computeFeatures(audio, sampleRate, {skipCMVN,debug});
-    const MEL = MEL_BINS;
-    const T = features.length / MEL;
-
-    // Debug audio preprocessing
-    if (debug && isNode()) {
-      console.log(`[Parakeet] Audio duration: ${(audio.length / sampleRate).toFixed(2)}s`);
-      console.log(`[Parakeet] Feature dims: ${T} frames x ${MEL} mel bins`);
-
-      // Find min/max without spread operator to avoid stack overflow
-      let audioMin = audio[0], audioMax = audio[0];
-      for (let i = 1; i < audio.length; i++) {
-        if (audio[i] < audioMin) audioMin = audio[i];
-        if (audio[i] > audioMax) audioMax = audio[i];
-      }
-      let featMin = features[0], featMax = features[0];
-      for (let i = 1; i < features.length; i++) {
-        if (features[i] < featMin) featMin = features[i];
-        if (features[i] > featMax) featMax = features[i];
-      }
-
-      console.log(`[Parakeet] Audio energy: min=${audioMin.toFixed(3)} max=${audioMax.toFixed(3)}`);
-      console.log(`[Parakeet] Feature range: min=${featMin.toFixed(3)} max=${featMax.toFixed(3)}`);
-    }
-
-    const ids = [];
-    const tokenTimes = [];
-    const tokenConfs = [];
-    const frameConfs = []; // per-frame confidences
-    let overallLogProb = 0;
-
-    // Track decoder state separately for proper state management
-    let decoderState = null;
-
-    // The entire logic is now simplified to process the full feature set at once,
-    // eliminating the need for chunking.
-
-    // 1. Encode the entire feature set
-    const input = new ort.Tensor('float32', features, [1, MEL, T]);
-    const lenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(T)]), [1]);
-    const encOut = await this.encoderSession.run({ 'audio_signal': input, 'length': lenTensor });
-    const enc = encOut['outputs'] ?? Object.values(encOut)[0];
-    if (debug && isNode()) console.log(`[Parakeet] Encoder output dims: ${enc.dims}`);
-
-    // 2. Transpose encoder output to match onnx-asr: [B, D, T] -> [B, T, D]
-    const [B, D, T_enc] = enc.dims;
-    const transposedEncData = new Float32Array(B * T_enc * D);
-    for (let b = 0; b < B; ++b) {
-      for (let t = 0; t < T_enc; ++t) {
-        for (let d = 0; d < D; ++d) {
-          const srcIdx = b * (D * T_enc) + d * T_enc + t;
-          const destIdx = b * (T_enc * D) + t * D + d;
-          transposedEncData[destIdx] = enc.data[srcIdx];
-        }
-      }
-    }
-    const encTransposed = {
-      data: transposedEncData,
-      dims: [B, T_enc, D],
-    };
-
-    // 3. Decode the transposed encoder output frame by frame
-    const hiddenSize = encTransposed.dims[2]; // D
-    const seqLen = encTransposed.dims[1]; // T_enc (downsampled time-steps)
-    let t = 0;
-    let emittedTokens = 0;
-
-    while (t < seqLen) {
-      const currFrameIdx = t; // Absolute index in encoder frames
-
-      // Slice encoder output for timestep t.
-      const frameBuf = new Float32Array(hiddenSize);
-      const frameOffset = t * hiddenSize; // Since B=1, offset is t * D
-      for (let i = 0; i < hiddenSize; i++) {
-        frameBuf[i] = encTransposed.data[frameOffset + i];
-      }
-      const encTensor = new ort.Tensor('float32', frameBuf, [1, hiddenSize, 1]);
-
-      let tokenLogitsData, step; let durLogitsArr=null;
-      let newDecoderState = null;
-      if (this.isCombined) {
-        const lastToken = ids.length > 0 ? ids[ids.length - 1] : this.blankId;
-        const res = await this._runCombinedStep(encTensor, lastToken, decoderState);
-        tokenLogitsData = res.tokenLogits;
-        step = res.step;
-        newDecoderState = res.newState;
-      } else {
-        // This block for separate decoder/joiner is preserved but may need review
-        // if still in use.
-        const joinFeeds = {};
-        if (this.joinerInputs.length > 0) {
-          this.joinerInputs.forEach((name, idx) => {
-            if (/enc/i.test(name)) joinFeeds[name] = encTensor;
-            else if (/dec/i.test(name)) joinFeeds[name] = dec;
-            else joinFeeds[name] = idx === 0 ? encTensor : dec;
-          });
-        }
-        joinFeeds['encoder_outputs'] = encTensor;
-        joinFeeds['decoder_outputs'] = dec;
-        const joinOut = await this.joinerSession.run(joinFeeds);
-        const joinTensor = Object.values(joinOut)[0];
-        const vocabSize = this.tokenizer.id2token.length;
-        tokenLogitsData = joinTensor.data.slice(0, vocabSize);
-        durLogitsArr = joinTensor.data.slice(vocabSize);
-        step = 0;
-        if (durLogitsArr.length > 0) {
-          let max=-Infinity; for (let i=0;i<durLogitsArr.length;++i) if (durLogitsArr[i]>max){max=durLogitsArr[i]; step=i;}
-        }
-      }
-
-      const tokenLogits = tokenLogitsData;
-      // Apply temperature scaling
-      const scaled = new Float32Array(tokenLogits.length);
-      for (let i = 0; i < tokenLogits.length; ++i) scaled[i] = tokenLogits[i] / temperature;
-
-      // Argmax token & confidence
-      let max = -Infinity, maxId = 0;
-      for (let i = 0; i < scaled.length; ++i) if (scaled[i] > max) { max = scaled[i]; maxId = i; }
-
-      let confVal = 0;
-      if (returnConfidences) {
-        let sum = 0;
-        const maxLogit = max;
-        for (let i = 0; i < scaled.length; ++i) sum += Math.exp(scaled[i] - maxLogit);
-        confVal = 1 / sum;
-        frameConfs.push(confVal);
-        overallLogProb += Math.log(confVal);
-      }
-
-      // Use onnx-asr algorithm: first process token, then decide advancement
-      if (maxId !== this.blankId) {
-        ids.push(maxId);
-        if (returnTimestamps) {
-          const TIME_STRIDE = this.subsampling * this.windowStride;
-          const durationFrames = step > 0 ? step : 1; // at least 1 frame
-          const start = currFrameIdx * TIME_STRIDE;
-          const end = (currFrameIdx + durationFrames) * TIME_STRIDE;
-          tokenTimes.push([start,end]);
-        }
-        if (returnConfidences) tokenConfs.push(confVal);
-
-        if (!this.isCombined) {
-          ({dec, state: newState} = await this._runDecoderStep(maxId,state));
-          state = newState;
-        } else {
-          decoderState = newDecoderState;
-        }
-        emittedTokens += 1;
-      }
-
-      // Now decide advancement based on onnx-asr algorithm
-      const shouldAdvance = maxId === this.blankId || emittedTokens >= this.maxTokensPerStep;
-
-      if (step > 0) {
-        t += step;
-        emittedTokens = 0;
-      } else if (shouldAdvance) {
-        t += 1;
-        emittedTokens = 0;
-      }
-    }
-
-    if (debug && isNode()) {
-      console.log(`[Parakeet] Processed ${t} feature frames (${T} total)`);
-    }
-
-    const text = this._normalizer(this.tokenizer.decode(ids));
-
-    if (!returnTimestamps && !returnConfidences) {
-      return { utterance_text: text, words: [], metrics: {}, is_final: true };
-    }
-
-    const words = [];
-    let avgWordConfidence = 0;
-    // Prepare arrays for per-token information regardless of timestamp/confidence flags so that
-    // they are in scope for the final return object.
-    const tokensDetailed = [];
-    let avgTokenConfidence = 0;
-
-    if (returnTimestamps) {
-      let currentWord = '';
-      let wordStartTime = 0;
-      let wordEndTime = 0;
-      let wordConfidences = [];
-
-      ids.forEach((tokenId, i) => {
-        const tokenText = this.tokenizer.id2token[tokenId];
-        if (tokenText === this.tokenizer.blankToken) return;
-
-        const tokenTime = tokenTimes[i];
-        const tokenConf = returnConfidences ? tokenConfs[i] : 0;
-
-        // SentencePiece uses a special character '\u2581' (visible as "▁") to mark the
-        // beginning of a new word. Detect this character instead of a normal space.
-        const isWordStart = tokenText.startsWith('▁');
-
-        if (isWordStart) {
-          // A new word is starting. First, push the old one if it exists.
-          if (currentWord) {
-            const avgConfidence = wordConfidences.length > 0 ? wordConfidences.reduce((a, b) => a + b, 0) / wordConfidences.length : 0;
-            words.push({
-              text: currentWord,
-              start_time: Number(wordStartTime.toFixed(3)),
-              end_time: Number(wordEndTime.toFixed(3)),
-              confidence: Number(avgConfidence.toFixed(4))
-            });
-          }
-
-          // Now, start the new word
-          currentWord = tokenText.slice(1);
-          wordStartTime = tokenTime[0];
-          wordEndTime = tokenTime[1];
-          wordConfidences = [tokenConf];
-
-        } else {
-          // Not a word start, so it's a continuation of the current word.
-          // This also handles the very first token if it has no leading space.
-          if (!currentWord) {
-            wordStartTime = tokenTime[0];
-          }
-          currentWord += tokenText;
-          wordEndTime = tokenTime[1]; // just update the end time
-          if (returnConfidences) {
-            wordConfidences.push(tokenConf);
-          }
-        }
-      });
-
-      // Add the very last word after the loop finishes
-      if (currentWord) {
-        const avgConfidence = wordConfidences.length > 0 ? wordConfidences.reduce((a, b) => a + b, 0) / wordConfidences.length : 0;
-        words.push({
-          text: currentWord,
-          start_time: Number(wordStartTime.toFixed(3)),
-          end_time: Number(wordEndTime.toFixed(3)),
-          confidence: Number(avgConfidence.toFixed(4))
-        });
-      }
-
-      if (words.length > 0) {
-        avgWordConfidence = words.map(w => w.confidence).reduce((a,b) => a + b, 0) / words.length;
-      }
-
-      // ------------------------------------------------------------------
-      // Build per-token information array similar to NeMo output structure
-      // ------------------------------------------------------------------
-      ids.forEach((tokenId, i) => {
-        const rawToken = this.tokenizer.id2token[tokenId];
-        if (rawToken === this.tokenizer.blankToken) return; // skip blanks
-
-        const cleanToken = rawToken.startsWith('▁') ? rawToken.slice(1) : rawToken;
-
-        const tokEntry = { token: [cleanToken] };
-
-        if (returnTimestamps && tokenTimes[i]) {
-          const [s, e] = tokenTimes[i];
-          tokEntry.start_time = Number(s.toFixed(3));
-          tokEntry.end_time = Number(e.toFixed(3));
-        }
-
-        if (returnConfidences && tokenConfs[i] !== undefined) {
-          const conf = tokenConfs[i];
-          tokEntry.confidence = Number(conf.toFixed(4));
-        }
-
-        tokensDetailed.push(tokEntry);
-      });
-
-      if (tokensDetailed.length > 0 && returnConfidences) {
-        avgTokenConfidence = tokensDetailed.map(t => t.confidence || 0).reduce((a, b) => a + b, 0) / tokensDetailed.length;
-      }
-    }
-
-    return {
-      utterance_text: text,
-      words,
-      tokens: tokensDetailed,
-      confidence_scores: {
-        ...(returnConfidences ? {
-          token: tokenConfs.map(c=>Number(c.toFixed(4))),
-          token_avg: Number(avgTokenConfidence.toFixed(4)),
-          word: words.map(w=>w.confidence),
-          word_avg: Number(avgWordConfidence.toFixed(4)),
-          frame: frameConfs.map(f=>Number(f.toFixed(4))),
-          frame_avg: frameConfs.length ? Number((frameConfs.reduce((a,b)=>a+b,0)/frameConfs.length).toFixed(4)) : null,
-          overall_log_prob: Number(overallLogProb.toFixed(6))
-        } : {
-          overall_log_prob: null,
-          frame_avg: null,
-          frame: null,
-        }),
-      },
-      is_final: true,
-      ...(debug ? { raw_tokens: ids, raw_token_timestamps: tokenTimes, raw_token_confidences: tokenConfs } : {})
-    };
-  }
-}
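For context, the public surface of the removed module was `ParakeetModel.fromDirectory()` plus `transcribe()`, with `ParakeetTokenizer` and `OnnxPreprocessor` as supporting exports. Below is a minimal usage sketch reconstructed only from the deleted source above; the model directory path is hypothetical, and the expected file names (`encoder-model.onnx` or `encoder-model.int8.onnx`, `decoder_joint-model.onnx`, `vocab.txt` or `tokens.txt`) are the ones `fromDirectory()` itself probes for.

```js
// Sketch of the removed API in docs/parakeet-transformers-js/src/parakeet.js.
// Assumes a Node ESM context (the module uses top-level await) and a model
// directory laid out as fromDirectory() expects. './models/parakeet' is a
// hypothetical path used only for illustration.
import { ParakeetModel } from './src/parakeet.js';

const model = await ParakeetModel.fromDirectory('./models/parakeet', {
  preferFloat32: false, // default behavior picks the .int8.onnx files when present
});

// transcribe() takes 16 kHz mono PCM in [-1, 1] as a Float32Array.
const audio = new Float32Array(16000); // one second of silence, for illustration
const result = await model.transcribe(audio, 16000, {
  returnTimestamps: true,
  returnConfidences: true,
});

console.log(result.utterance_text);    // decoded text after normalization
console.log(result.words);             // [{ text, start_time, end_time, confidence }, ...]
console.log(result.confidence_scores); // token/word/frame confidence summaries
```

Per the changed-file list above, this standalone entry point was dropped along with the rest of the vendored `docs/parakeet-transformers-js/` tree in 0.0.3, in favor of the trimmed `package/src/parakeet.js`.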