parakeet.js 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/README.md +240 -239
- package/examples/hf-spaces-demo/README.md +6 -9
- package/examples/hf-spaces-demo/package.json +1 -1
- package/examples/hf-spaces-demo/src/App.js +307 -316
- package/examples/react-demo/package.json +19 -19
- package/examples/react-demo/src/App.jsx +324 -326
- package/examples/react-demo-dev/src/App.jsx +23 -24
- package/package.json +1 -1
- package/publish.ps1 +65 -0
- package/src/hub.js +235 -241
- package/src/parakeet.js +15 -8
- package/src/preprocessor.js +75 -68
- package/docs/parakeet-transformers-js/.gitattributes +0 -2
- package/docs/parakeet-transformers-js/.prettierignore +0 -8
- package/docs/parakeet-transformers-js/.prettierrc +0 -10
- package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
- package/docs/parakeet-transformers-js/LICENSE +0 -202
- package/docs/parakeet-transformers-js/README.md +0 -448
- package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
- package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
- package/docs/parakeet-transformers-js/debug_test.js +0 -84
- package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
- package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
- package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
- package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
- package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
- package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
- package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
- package/docs/parakeet-transformers-js/js_steps.json +0 -821
- package/docs/parakeet-transformers-js/package-lock.json +0 -12251
- package/docs/parakeet-transformers-js/package.json +0 -96
- package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
- package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
- package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
- package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
- package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
- package/docs/parakeet-transformers-js/src/configs.js +0 -455
- package/docs/parakeet-transformers-js/src/env.js +0 -167
- package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
- package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
- package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
- package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
- package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
- package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
- package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
- package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
- package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
- package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
- package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
- package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
- package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
- package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
- package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
- package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
- package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
- package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
- package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
- package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
- package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
- package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
- package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
- package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
- package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
- package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
- package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
- package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
- package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
- package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
- package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
- package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
- package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
- package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
- package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
- package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
- package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
- package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
- package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
- package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
- package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
- package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
- package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
- package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
- package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
- package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
- package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
- package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
- package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
- package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
- package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
- package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
- package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
- package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
- package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
- package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
- package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
- package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
- package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
- package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
- package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
- package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
- package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
- package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
- package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
- package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
- package/docs/parakeet-transformers-js/src/models.js +0 -8644
- package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
- package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
- package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
- package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
- package/docs/parakeet-transformers-js/src/processors.js +0 -16
- package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
- package/docs/parakeet-transformers-js/src/transformers.js +0 -50
- package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
- package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
- package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
- package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
- package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
- package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
- package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
- package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
- package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
- package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
- package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
- package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
- package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
- package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
- package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
- package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
- package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
- package/docs/parakeet-transformers-js/tsconfig.json +0 -21
- package/docs/parakeet-transformers-js/webpack.config.js +0 -223
|
@@ -1,128 +0,0 @@
|
|
|
1
|
-
import { RawImage } from "./image.js";
|
|
2
|
-
import { apis } from "../env.js";
|
|
3
|
-
|
|
4
|
-
/**
 * One video frame: an image together with the timestamp it was taken at.
 */
export class RawVideoFrame {

    /**
     * @param {RawImage} image The frame's image; stored by reference.
     * @param {number} timestamp Position of the frame in the video; stored as given.
     */
    constructor(image, timestamp) {
        Object.assign(this, { image, timestamp });
    }
}
|
|
15
|
-
|
|
16
|
-
/**
 * An ordered list of timestamped frames plus the total duration of the clip.
 */
export class RawVideo {
    /**
     * @param {RawVideoFrame[]|RawImage[]} frames Frames of the video. When the first
     * element is a bare `RawImage`, every element is wrapped in a `RawVideoFrame`
     * with evenly spaced timestamps strictly inside `(0, duration)`.
     * @param {number} duration Total duration of the video.
     */
    constructor(frames, duration) {
        let timedFrames = frames;
        if (frames.length > 0 && frames[0] instanceof RawImage) {
            // Assume uniform timestamps
            const total = frames.length;
            timedFrames = frames.map((image, i) => new RawVideoFrame(image, (i + 1) / (total + 1) * duration));
        }
        this.frames = /** @type {RawVideoFrame[]} */ (timedFrames);
        this.duration = duration;
    }

    /**
     * Width of the first frame, or 0 when the video holds no frames.
     * @returns {number}
     */
    get width() {
        // An empty video has no first frame to read from; report 0 instead of throwing.
        return this.frames.length > 0 ? this.frames[0].image.width : 0;
    }

    /**
     * Height of the first frame, or 0 when the video holds no frames.
     * @returns {number}
     */
    get height() {
        return this.frames.length > 0 ? this.frames[0].image.height : 0;
    }

    /**
     * Number of stored frames divided by the duration.
     * @returns {number}
     */
    get fps() {
        return this.frames.length / this.duration;
    }
}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
/**
 * Loads a video and samples frames from it.
 *
 * A detached `<video>` element is pointed at the source, seeked to each sample
 * time, drawn onto a 2D canvas and read back as RGBA pixel data.
 *
 * @param {string|Blob|HTMLVideoElement} src The video to process.
 * @param {Object} [options] Optional parameters.
 * @param {number} [options.num_frames=null] The number of frames to sample uniformly.
 * Takes precedence over `fps` when both are given.
 * @param {number} [options.fps=null] The number of frames to sample per second.
 *
 * @returns {Promise<RawVideo>} The loaded video.
 * @throws {Error} If called outside a browser, if neither `num_frames` nor `fps`
 * is given, if `src` has an unsupported type, or if the video fails to load,
 * download or seek.
 */
export async function load_video(src, { num_frames = null, fps = null } = {}) {
    if (!apis.IS_BROWSER_ENV) {
        throw new Error("`load_video` is currently only supported in browser environments.");
    }

    // TODO: Support efficiently loading all frames using the WebCodecs API.
    // Specifically, https://developer.mozilla.org/en-US/docs/Web/API/VideoDecoder
    if (num_frames == null && fps == null) {
        throw new Error("Either num_frames or fps must be provided.");
    }

    const video = document.createElement("video");
    video.crossOrigin = "anonymous";
    video.muted = true; // mute to allow autoplay and seeking

    // Every object URL created here is tracked so it can be revoked on exit.
    const objectUrls = [];
    const createObjectUrl = (blob) => {
        const url = URL.createObjectURL(blob);
        objectUrls.push(url);
        return url;
    };

    // Installs the handlers first, then runs `trigger`, so the event cannot be
    // missed. Rejects on the element's error event instead of waiting forever.
    const waitForEvent = (handlerName, trigger, failureMessage) =>
        new Promise((resolve, reject) => {
            video[handlerName] = resolve;
            video.onerror = () => reject(new Error(failureMessage));
            trigger();
        });

    try {
        let source;
        if (typeof src === 'string') {
            source = src;
        } else if (src instanceof Blob) {
            source = createObjectUrl(src);
        } else if (src instanceof HTMLVideoElement) {
            source = src.src;
        } else {
            throw new Error("Invalid URL or video element provided.");
        }

        // Wait for metadata to load to obtain duration
        await waitForEvent("onloadedmetadata", () => { video.src = source; }, "Failed to load video metadata.");

        const { seekable } = video;
        if (seekable.length === 0 || seekable.start(0) === seekable.end(0)) {
            // Fallback: Download entire video if not seekable
            const response = await fetch(video.src);
            if (!response.ok) {
                throw new Error(`Failed to download video: ${response.status} ${response.statusText}`);
            }
            const blob = await response.blob();
            await waitForEvent("onloadedmetadata", () => { video.src = createObjectUrl(blob); }, "Failed to load video metadata.");
        }

        const duration = video.duration;

        let count;
        let step;
        if (num_frames != null) {
            count = num_frames;
            step = num_frames === 1 ? 0 : duration / (num_frames - 1);
        } else {
            step = 1 / fps;
            count = Math.floor(duration / step);
        }

        // Build an array of sample times based on num_frames or fps.
        // A single requested frame is taken from the middle of the video.
        const sampleTimes = [];
        for (let i = 0; i < count; ++i) {
            sampleTimes.push(num_frames === 1 ? duration / 2 : i * step);
        }

        const canvas = document.createElement("canvas");
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        const ctx = canvas.getContext("2d", { willReadFrequently: true });

        const frames = [];
        for (const t of sampleTimes) {
            await waitForEvent("onseeked", () => { video.currentTime = t; }, `Failed to seek video to ${t}s.`);
            ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
            const frameData = new RawImage(imageData.data, canvas.width, canvas.height, 4);
            frames.push(new RawVideoFrame(frameData, t));
        }

        return new RawVideo(frames, duration);
    } finally {
        // Clean up the video element and any blob URLs, on success and on failure.
        video.remove();
        for (const url of objectUrls) {
            URL.revokeObjectURL(url);
        }
    }
}
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { spawnSync } from 'child_process';
|
|
4
|
-
import { ParakeetModel } from '../src/parakeet.js';
|
|
5
|
-
import { MEL_BINS } from '../src/audio_features.js';
|
|
6
|
-
import { fileURLToPath } from 'url';
|
|
7
|
-
|
|
8
|
-
const CHUNK = 128;
|
|
9
|
-
const BLANK_ID = 1024;
|
|
10
|
-
|
|
11
|
-
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
-
|
|
13
|
-
// Runs the first decoder/joiner step in JS (ONNX preprocessor -> encoder -> joiner
// with a blank target and zeroed states) and compares the resulting logits against
// the output of scripts/decoder_step.py. The test returns early (logging a warning)
// when the preprocessor, model files or test WAV are not present on disk.
test('Decoder joint first step logits match python reference (tolerance 1e-3)', async () => {
    const projectRoot = path.resolve(__dirname, '..');
    const assetsDir = path.join(projectRoot, 'assets');
    const preprocPath = path.join(assetsDir, 'nemo128.onnx');
    if (!fs.existsSync(preprocPath)) {
        console.warn('Preprocessor not found, skipping');
        return;
    }

    // Prefer the int8 models and fall back to the full-precision files.
    const modelDir = path.resolve(projectRoot, '..', 'parakeet-tdt-0.6b-v2-onnx');
    let encoderPath = path.join(modelDir, 'encoder-model.int8.onnx');
    if (!fs.existsSync(encoderPath)) encoderPath = path.join(modelDir, 'encoder-model.onnx');
    let decJointPath = path.join(modelDir, 'decoder_joint-model.int8.onnx');
    if (!fs.existsSync(decJointPath)) decJointPath = path.join(modelDir, 'decoder_joint-model.onnx');
    if (!fs.existsSync(encoderPath) || !fs.existsSync(decJointPath)) {
        console.warn('Model files not found, skipping');
        return;
    }

    const wavPath = path.resolve(projectRoot, '..', 'jfk.wav');
    if (!fs.existsSync(wavPath)) {
        console.warn('WAV not found, skipping');
        return;
    }

    // Python reference. The temp file is removed even when the helper fails or
    // writes invalid JSON, so failed runs do not leave it behind.
    const tmpJson = path.join(projectRoot, '.tmp_dec.json');
    const pyScript = path.join(projectRoot, '..', 'scripts', 'decoder_step.py');
    let ref;
    try {
        const pyRes = spawnSync('python', [pyScript, encoderPath, decJointPath, preprocPath, wavPath, tmpJson, BLANK_ID.toString(), CHUNK.toString()], { stdio: 'inherit' });
        expect(pyRes.status).toBe(0);
        ref = JSON.parse(fs.readFileSync(tmpJson, 'utf8'));
    } finally {
        fs.rmSync(tmpJson, { force: true });
    }

    // JS path via ParakeetModel
    const model = await ParakeetModel.fromDirectory(modelDir);
    const wav = await import('wav');
    const { Readable } = await import('stream');

    // Decodes an in-memory 16-bit PCM WAV buffer to mono floats in [-1, 1).
    // Chunks are concatenated before decoding so a sample split across two
    // stream chunks is still read correctly.
    function readWavFloat(file) {
        return new Promise((resolve, reject) => {
            const reader = new wav.Reader();
            const chunks = [];
            let channels = 0;
            reader.on('format', (fmt) => { channels = fmt.channels; });
            reader.on('data', (chunk) => { chunks.push(chunk); });
            reader.on('end', () => {
                const pcm = Buffer.concat(chunks);
                const mono = new Float32Array(Math.floor(Math.floor(pcm.length / 2) / channels));
                for (let i = 0; i < mono.length; ++i) {
                    let sum = 0;
                    for (let c = 0; c < channels; ++c) {
                        sum += pcm.readInt16LE((i * channels + c) * 2);
                    }
                    mono[i] = sum / channels / 32768;
                }
                resolve(mono);
            });
            reader.on('error', reject);
            Readable.from(file).pipe(reader);
        });
    }

    const audioFloat = await readWavFloat(fs.readFileSync(wavPath));
    const featRes = await model.preprocessor.process(audioFloat);
    const T = Math.min(featRes.length, CHUNK);

    // Transpose the first T frames from [t][mel] order into [mel][t] order.
    const bufMel = new Float32Array(MEL_BINS * T);
    for (let t = 0; t < T; ++t) {
        for (let m = 0; m < MEL_BINS; ++m) {
            bufMel[m * T + t] = featRes.features[t * MEL_BINS + m];
        }
    }

    const ortMod = await import('onnxruntime-node');
    const ort = ortMod.default ?? ortMod;
    const encTensor = new ort.Tensor('float32', bufMel, [1, MEL_BINS, T]);
    const lenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(T)]), [1]);
    const encOut = await model.encoderSession.run({ 'audio_signal': encTensor, 'length': lenTensor });
    const encTensorFrame = (encOut['outputs'] || Object.values(encOut)[0]);
    const hidden = encTensorFrame.dims[1];

    // slice first frame (copies the first `hidden` values of the encoder output)
    const frameBuf = new Float32Array(hidden);
    for (let i = 0; i < hidden; ++i) frameBuf[i] = encTensorFrame.data[i];
    const encFrame = new ort.Tensor('float32', frameBuf, [1, hidden, 1]);

    // zero states
    const numLayers = model.predLayers || 2;
    const state1 = new ort.Tensor('float32', new Float32Array(numLayers * 1 * model.predHidden), [numLayers, 1, model.predHidden]);
    const state2 = new ort.Tensor('float32', new Float32Array(numLayers * 1 * model.predHidden), [numLayers, 1, model.predHidden]);
    const out = await model.joinerSession.run({
        encoder_outputs: encFrame,
        targets: new ort.Tensor('int32', Int32Array.from([BLANK_ID]), [1, 1]),
        target_length: new ort.Tensor('int32', Int32Array.from([1]), [1]),
        input_states_1: state1,
        input_states_2: state2,
    });
    const logitsJS = (out['outputs'] || out[0] || Object.values(out)[0]).data;

    expect(logitsJS.length).toBe(ref.logits.length);
    let maxDiff = 0;
    for (let i = 0; i < logitsJS.length; ++i) {
        const diff = Math.abs(logitsJS[i] - ref.logits[i]);
        if (diff > maxDiff) maxDiff = diff;
    }
    expect(maxDiff).toBeLessThan(1e-3);
});
|
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { spawnSync } from 'child_process';
|
|
4
|
-
import { OnnxPreprocessor } from '../src/parakeet.js';
|
|
5
|
-
import * as ort from 'onnxruntime-node';
|
|
6
|
-
import wav from 'wav';
|
|
7
|
-
import { MEL_BINS } from '../src/audio_features.js';
|
|
8
|
-
import { fileURLToPath } from 'url';
|
|
9
|
-
|
|
10
|
-
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
11
|
-
|
|
12
|
-
const CHUNK = 128;
|
|
13
|
-
|
|
14
|
-
/**
 * Reads a 16-bit PCM WAV file and down-mixes it to mono by averaging channels.
 *
 * The raw PCM bytes are collected and decoded once the stream ends, so a sample
 * that straddles two stream chunks is still decoded correctly.
 *
 * @param {string} filePath Path of the WAV file to read.
 * @returns {Promise<{audio: Float32Array, sampleRate: number}>} Mono samples scaled
 * by 1/32768, and the sample rate reported by the file's format header.
 * Rejects on stream errors, when no format header was seen, or when the reported
 * bit depth is not 16.
 */
function readWavFloat32(filePath) {
    return new Promise((resolve, reject) => {
        const file = fs.createReadStream(filePath);
        const reader = new wav.Reader();
        const chunks = [];
        let sampleRate = 0;
        let channels = 0;

        reader.on('format', (format) => {
            sampleRate = format.sampleRate;
            channels = format.channels;
            // The decoder below reads little-endian int16 only; fail loudly rather
            // than returning garbage. A missing bitDepth field is tolerated.
            if (format.bitDepth != null && format.bitDepth !== 16) {
                file.destroy();
                reject(new Error(`Unsupported WAV bit depth ${format.bitDepth}; only 16-bit PCM is supported.`));
            }
        });

        reader.on('data', (chunk) => {
            chunks.push(chunk);
        });

        reader.on('end', () => {
            if (!(channels > 0)) {
                reject(new Error('WAV format header was not found.'));
                return;
            }
            const pcm = Buffer.concat(chunks);
            // A trailing odd byte or an incomplete final frame is dropped.
            const frameCount = Math.floor(Math.floor(pcm.length / 2) / channels);
            const mono = new Float32Array(frameCount);
            for (let i = 0; i < frameCount; ++i) {
                let sum = 0;
                for (let c = 0; c < channels; ++c) {
                    sum += pcm.readInt16LE((i * channels + c) * 2);
                }
                mono[i] = (sum / channels) / 32768;
            }
            resolve({ audio: mono, sampleRate });
        });

        reader.on('error', reject);
        file.on('error', reject);

        file.pipe(reader);
    });
}
|
|
51
|
-
|
|
52
|
-
// Runs the ONNX preprocessor and encoder in JS on the first CHUNK feature frames
// and compares the first `ref.hidden` encoder output values against the output of
// scripts/encode_chunk.py. The test returns early (logging a warning) when the
// preprocessor, encoder model or test WAV are not present on disk.
test('Encoder first vector matches python reference (tolerance 1e-3)', async () => {
    const projectRoot = path.resolve(__dirname, '..');
    const assetsDir = path.join(projectRoot, 'assets');
    const preprocPath = path.join(assetsDir, 'nemo128.onnx');
    if (!fs.existsSync(preprocPath)) {
        console.warn('Preprocessor not found, skipping');
        return;
    }

    // Prefer the int8 encoder and fall back to the full-precision file.
    const modelDir = path.resolve(projectRoot, '..', 'parakeet-tdt-0.6b-v2-onnx');
    let encoderPath = path.join(modelDir, 'encoder-model.int8.onnx');
    if (!fs.existsSync(encoderPath)) encoderPath = path.join(modelDir, 'encoder-model.onnx');
    if (!fs.existsSync(encoderPath)) {
        console.warn('Encoder model not found, skipping');
        return;
    }

    const wavPath = path.resolve(projectRoot, '..', 'jfk.wav');
    if (!fs.existsSync(wavPath)) {
        console.warn('Test WAV not found, skipping');
        return;
    }

    // Run python helper. The temp file is removed even when the helper fails or
    // writes invalid JSON, so failed runs do not leave it behind.
    const tmpJson = path.join(projectRoot, '.tmp_enc.json');
    const pyScript = path.join(projectRoot, '..', 'scripts', 'encode_chunk.py');
    let ref;
    try {
        const pyRes = spawnSync('python', [pyScript, encoderPath, preprocPath, wavPath, tmpJson, CHUNK.toString()], { stdio: 'inherit' });
        expect(pyRes.status).toBe(0);
        ref = JSON.parse(fs.readFileSync(tmpJson, 'utf8'));
    } finally {
        fs.rmSync(tmpJson, { force: true });
    }

    // JS pipeline
    const { audio } = await readWavFloat32(wavPath);
    const preproc = new OnnxPreprocessor(preprocPath);
    const procRes = await preproc.process(audio);
    const T = Math.min(procRes.length, CHUNK);

    // Transpose the first T frames from [t][mel] order into [mel][t] order.
    const buf = new Float32Array(MEL_BINS * T);
    for (let t = 0; t < T; ++t) {
        for (let m = 0; m < MEL_BINS; ++m) {
            buf[m * T + t] = procRes.features[t * MEL_BINS + m];
        }
    }
    const inputTensor = new ort.Tensor('float32', buf, [1, MEL_BINS, T]);
    const lenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(T)]), [1]);
    const encSession = await ort.InferenceSession.create(encoderPath);
    const encOut = await encSession.run({ 'audio_signal': inputTensor, 'length': lenTensor });
    const enc = encOut['outputs'] || Object.values(encOut)[0];
    const jsVec = enc.data.slice(0, ref.hidden);

    expect(jsVec.length).toBe(ref.vector.length);
    let maxDiff = 0;
    for (let i = 0; i < jsVec.length; ++i) {
        const diff = Math.abs(jsVec[i] - ref.vector[i]);
        if (diff > maxDiff) maxDiff = diff;
    }
    expect(maxDiff).toBeLessThan(1e-3);
});
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { spawnSync } from 'child_process';
|
|
4
|
-
import { OnnxPreprocessor } from '../src/parakeet.js';
|
|
5
|
-
import wav from 'wav';
|
|
6
|
-
import { fileURLToPath } from 'url';
|
|
7
|
-
|
|
8
|
-
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
9
|
-
|
|
10
|
-
/**
 * Reads a 16-bit PCM WAV file and down-mixes it to mono by averaging channels.
 *
 * The raw PCM bytes are collected and decoded once the stream ends, so a sample
 * that straddles two stream chunks is still decoded correctly.
 *
 * @param {string} filePath Path of the WAV file to read.
 * @returns {Promise<{audio: Float32Array, sampleRate: number}>} Mono samples scaled
 * by 1/32768, and the sample rate reported by the file's format header.
 * Rejects on stream errors, when no format header was seen, or when the reported
 * bit depth is not 16.
 */
function readWavFloat32(filePath) {
    return new Promise((resolve, reject) => {
        const file = fs.createReadStream(filePath);
        const reader = new wav.Reader();
        const chunks = [];
        let sampleRate = 0;
        let channels = 0;

        reader.on('format', (format) => {
            sampleRate = format.sampleRate;
            channels = format.channels;
            // The decoder below reads little-endian int16 only; fail loudly rather
            // than returning garbage. A missing bitDepth field is tolerated.
            if (format.bitDepth != null && format.bitDepth !== 16) {
                file.destroy();
                reject(new Error(`Unsupported WAV bit depth ${format.bitDepth}; only 16-bit PCM is supported.`));
            }
        });

        reader.on('data', (chunk) => {
            chunks.push(chunk);
        });

        reader.on('end', () => {
            if (!(channels > 0)) {
                reject(new Error('WAV format header was not found.'));
                return;
            }
            const pcm = Buffer.concat(chunks);
            // A trailing odd byte or an incomplete final frame is dropped.
            const frameCount = Math.floor(Math.floor(pcm.length / 2) / channels);
            const mono = new Float32Array(frameCount);
            for (let i = 0; i < frameCount; ++i) {
                let sum = 0;
                for (let c = 0; c < channels; ++c) {
                    sum += pcm.readInt16LE((i * channels + c) * 2);
                }
                mono[i] = (sum / channels) / 32768;
            }
            resolve({ audio: mono, sampleRate });
        });

        reader.on('error', reject);
        file.on('error', reject);

        file.pipe(reader);
    });
}
|
|
47
|
-
|
|
48
|
-
// Runs the ONNX preprocessor in JS on the test WAV and compares the feature count
// and values against the output of scripts/compute_features.py. The test returns
// early (logging a warning) when the preprocessor model or test WAV are missing.
test('ONNX preprocessor matches python reference output', async () => {
    const projectRoot = path.resolve(__dirname, '..');
    const assetsDir = path.join(projectRoot, 'assets');
    const preprocPath = path.join(assetsDir, 'nemo128.onnx');
    if (!fs.existsSync(preprocPath)) {
        console.warn('Preprocessor model not found, skipping test');
        return;
    }

    const wavPath = path.resolve(projectRoot, '..', 'jfk.wav');
    if (!fs.existsSync(wavPath)) {
        console.warn('Test WAV not found, skipping test');
        return;
    }

    // Python reference. The temp file is removed even when the helper fails or
    // writes invalid JSON, so failed runs do not leave it behind.
    const tmpJson = path.join(projectRoot, '.tmp_features.json');
    const pythonScript = path.join(projectRoot, '..', 'scripts', 'compute_features.py');
    let ref;
    try {
        const pyRes = spawnSync('python', [pythonScript, preprocPath, wavPath, tmpJson], { stdio: 'inherit' });
        expect(pyRes.status).toBe(0);
        ref = JSON.parse(fs.readFileSync(tmpJson, 'utf8'));
    } finally {
        fs.rmSync(tmpJson, { force: true });
    }

    const { audio } = await readWavFloat32(wavPath);
    const jsPreproc = new OnnxPreprocessor(preprocPath);
    const jsRes = await jsPreproc.process(audio);

    expect(jsRes.length).toBe(ref.length);
    const refFeat = Float32Array.from(ref.features);
    const jsFeat = jsRes.features;
    expect(jsFeat.length).toBe(refFeat.length);

    // Largest element-wise absolute difference between JS and reference features.
    let maxDiff = 0;
    for (let i = 0; i < jsFeat.length; ++i) {
        const diff = Math.abs(jsFeat[i] - refFeat[i]);
        if (diff > maxDiff) maxDiff = diff;
    }
    expect(maxDiff).toBeLessThan(1e-4);
});
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import { ParakeetTokenizer } from '../src/parakeet.js';
|
|
2
|
-
|
|
3
|
-
// Unit tests for ParakeetTokenizer.decode, covering word joining, blank-token
// skipping, and ids that are not present in the vocabulary.
describe('ParakeetTokenizer', () => {
  // Builds a tokenizer over `vocab` and decodes `ids` in a single step.
  const decodeWith = (vocab, ids) => {
    const tokenizer = new ParakeetTokenizer(vocab);
    return tokenizer.decode(ids);
  };

  test('decode joins tokens with spaces correctly', () => {
    const decoded = decodeWith(['<blk>', '▁hello', '▁world', '!'], [1, 2, 3]);
    expect(decoded).toBe('hello world!');
  });

  test('blank tokens are skipped', () => {
    const decoded = decodeWith(['<blk>', '▁test'], [0, 1, 0]);
    expect(decoded).toBe('test');
  });

  test('unknown ids are ignored', () => {
    // Id 99 has no entry in the two-token vocabulary.
    const decoded = decodeWith(['<blk>', '▁foo'], [1, 99, 0]);
    expect(decoded).toBe('foo');
  });
});
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
// Quick smoke test to transcribe sherpa-onnx-parakeet-model/test_wavs/0.wav
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import path from 'path';
|
|
4
|
-
import { ParakeetModel } from '../src/parakeet.js';
|
|
5
|
-
import wav from 'wav';
|
|
6
|
-
const { Reader } = wav;
|
|
7
|
-
import { Writable } from 'stream';
|
|
8
|
-
|
|
9
|
-
/**
 * Reads a 16-bit WAV file and returns its audio down-mixed to mono.
 *
 * Samples are decoded as little-endian signed 16-bit integers, averaged across
 * channels, and scaled by 1/32768.
 *
 * @param {string} filePath - Path of the WAV file to read.
 * @returns {Promise<{audio: Float32Array, sampleRate: number}>} Mono samples
 *   and the sample rate reported by the file's format header.
 * @throws Rejects when a stream errors, when the file is not 16-bit, or when
 *   no usable format information was received.
 */
function readWav(filePath) {
  return new Promise((resolve, reject) => {
    const file = fs.createReadStream(filePath);
    const reader = new Reader();
    const samples = [];
    let sampleRate = 0;
    let channels = 0;
    // Holds a trailing byte when a chunk splits a 16-bit sample in two.
    let leftover = null;

    const writer = new Writable({
      write(chunk, encoding, callback) {
        let data = chunk;
        if (leftover) {
          data = Buffer.concat([leftover, chunk]);
          leftover = null;
        }
        // Only decode whole samples; readInt16LE throws if it runs past the
        // end of the buffer.
        const usable = data.length - (data.length % 2);
        for (let i = 0; i < usable; i += 2) {
          samples.push(data.readInt16LE(i));
        }
        if (usable < data.length) {
          leftover = Buffer.from(data.subarray(usable));
        }
        callback();
      },
    });

    reader.on('format', (format) => {
      sampleRate = format.sampleRate;
      channels = format.channels;
      console.log(`[Test] WAV format: ${sampleRate} Hz, ${channels} channels, ${format.bitDepth}-bit`);
      // The decoder above only understands 16-bit samples; fail fast rather
      // than silently producing garbage audio for other bit depths.
      if (format.bitDepth !== 16) {
        reject(new Error(`Unsupported WAV bit depth: ${format.bitDepth} (expected 16)`));
        file.destroy();
      }
    });

    file.pipe(reader).pipe(writer);

    writer.on('finish', () => {
      // Without a valid channel count the down-mix below would divide by zero.
      if (!Number.isInteger(channels) || channels < 1) {
        reject(new Error('WAV format information missing or invalid'));
        return;
      }
      const monoSamples = new Float32Array(Math.floor(samples.length / channels));
      for (let i = 0; i < monoSamples.length; i++) {
        let sum = 0;
        for (let c = 0; c < channels; c++) {
          sum += samples[i * channels + c];
        }
        monoSamples[i] = (sum / channels) / 32768;
      }
      resolve({ audio: monoSamples, sampleRate });
    });

    writer.on('error', reject);
    reader.on('error', reject);
    file.on('error', reject);
  });
}
|
|
51
|
-
|
|
52
|
-
/**
 * Smoke test: loads the model from `../parakeet-transformers-js-model`,
 * transcribes `../jfk.wav`, and prints the result as JSON.
 *
 * Both paths are resolved against the current working directory. Audio that is
 * not already at 16 kHz is resampled with linear interpolation, and two
 * seconds of silence are appended before transcription.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const modelDir = path.resolve('../parakeet-transformers-js-model');
  const model = await ParakeetModel.fromDirectory(modelDir);
  const wavPath = path.resolve('../jfk.wav');
  let { audio, sampleRate } = await readWav(wavPath);

  // Resample if necessary (simple linear interpolation)
  if (sampleRate !== 16000) {
    console.log(`[Test] Resampling from ${sampleRate} Hz to 16000 Hz...`);
    const ratio = sampleRate / 16000;
    const newLength = Math.floor(audio.length / ratio);
    const resampled = new Float32Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const index = i * ratio;
      const low = Math.floor(index);
      const high = Math.ceil(index);
      if (high >= audio.length) {
        // No right-hand neighbour to interpolate with; reuse the last sample.
        resampled[i] = audio[low];
        continue;
      }
      const weight = index - low;
      resampled[i] = audio[low] * (1 - weight) + audio[high] * weight;
    }
    audio = resampled;
    sampleRate = 16000;
  }

  // Tail padding 2 s. A new Float32Array is zero-filled, so allocating the
  // longer buffer and copying the audio in is all that is required.
  const padded = new Float32Array(audio.length + sampleRate * 2);
  padded.set(audio);

  const result = await model.transcribe(padded, sampleRate, { returnTimestamps: true, returnConfidences: true, temperature: 2.5, debug: true, skipCMVN: true });
  console.log(JSON.stringify(result, null, 2));
}

// Report failures explicitly and exit non-zero instead of leaving the promise
// returned by main() without a rejection handler.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
// Only include files in the src directory
|
|
3
|
-
"include": ["src/**/*"],
|
|
4
|
-
"compilerOptions": {
|
|
5
|
-
// Tells the compiler to check JS files
|
|
6
|
-
"checkJs": true,
|
|
7
|
-
"target": "esnext",
|
|
8
|
-
"module": "nodenext",
|
|
9
|
-
"moduleResolution": "nodenext",
|
|
10
|
-
"outDir": "types",
|
|
11
|
-
"strict": false,
|
|
12
|
-
"skipLibCheck": true,
|
|
13
|
-
"declaration": true,
|
|
14
|
-
"declarationMap": true,
|
|
15
|
-
"noEmit": false,
|
|
16
|
-
"emitDeclarationOnly": true
|
|
17
|
-
},
|
|
18
|
-
"typeAcquisition": {
|
|
19
|
-
"include": ["jest"]
|
|
20
|
-
}
|
|
21
|
-
}
|