@huggingface/transformers 3.1.1 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -4
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +1062 -183
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +2239 -1232
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -358
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -421
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -358
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +1082 -181
- package/dist/transformers.mjs.map +1 -1
- package/package.json +11 -16
- package/src/backends/onnx.js +2 -7
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +11 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/paligemma/processing_paligemma.js +82 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +3 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +233 -35
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +30 -0
- package/src/tokenizers.js +12 -1
- package/src/utils/core.js +39 -9
- package/src/utils/hub.js +8 -12
- package/src/utils/image.js +40 -0
- package/src/utils/tensor.js +51 -1
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/base/feature_extraction_utils.d.ts +1 -1
- package/types/base/feature_extraction_utils.d.ts.map +1 -1
- package/types/base/image_processors_utils.d.ts +4 -4
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +4 -4
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts +7 -7
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +1 -1
- package/types/env.d.ts.map +1 -1
- package/types/generation/configuration_utils.d.ts +2 -2
- package/types/generation/logits_process.d.ts +2 -2
- package/types/generation/logits_process.d.ts.map +1 -1
- package/types/generation/logits_sampler.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +5 -5
- package/types/generation/stopping_criteria.d.ts +1 -1
- package/types/generation/stopping_criteria.d.ts.map +1 -1
- package/types/generation/streamers.d.ts +2 -2
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +11 -11
- package/types/models/detr/image_processing_detr.d.ts.map +1 -1
- package/types/models/donut/image_processing_donut.d.ts +1 -1
- package/types/models/donut/image_processing_donut.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/janus/image_processing_janus.d.ts +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/janus/processing_janus.d.ts.map +1 -1
- package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts +12 -0
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +3 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts.map +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models/whisper/processing_whisper.d.ts.map +1 -1
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
- package/types/models.d.ts +61 -5
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +31 -51
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +10 -6
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +87 -22
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/devices.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/generic.d.ts.map +1 -1
- package/types/utils/hub.d.ts +3 -3
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +10 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +10 -10
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +22 -6
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.1.1",
|
|
3
|
+
"version": "3.2.0",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -21,12 +21,6 @@
|
|
|
21
21
|
"default": "./dist/transformers.js"
|
|
22
22
|
}
|
|
23
23
|
},
|
|
24
|
-
"imports": {
|
|
25
|
-
"#onnxruntime-webgpu": {
|
|
26
|
-
"node": "onnxruntime-web",
|
|
27
|
-
"default": "onnxruntime-web/webgpu"
|
|
28
|
-
}
|
|
29
|
-
},
|
|
30
24
|
"scripts": {
|
|
31
25
|
"format": "prettier --write .",
|
|
32
26
|
"format:check": "prettier --check .",
|
|
@@ -63,22 +57,23 @@
|
|
|
63
57
|
"dependencies": {
|
|
64
58
|
"@huggingface/jinja": "^0.3.2",
|
|
65
59
|
"onnxruntime-node": "1.20.1",
|
|
66
|
-
"onnxruntime-web": "1.
|
|
60
|
+
"onnxruntime-web": "1.21.0-dev.20241205-d27fecd3d3",
|
|
67
61
|
"sharp": "^0.33.5"
|
|
68
62
|
},
|
|
69
63
|
"devDependencies": {
|
|
70
64
|
"@types/jest": "^29.5.14",
|
|
71
|
-
"@
|
|
65
|
+
"@types/node": "^22.10.1",
|
|
66
|
+
"@webgpu/types": "^0.1.51",
|
|
72
67
|
"catharsis": "github:xenova/catharsis",
|
|
73
68
|
"jest": "^30.0.0-alpha.6",
|
|
74
69
|
"jest-environment-node": "^30.0.0-alpha.6",
|
|
75
|
-
"jsdoc-to-markdown": "^
|
|
76
|
-
"prettier": "3.
|
|
77
|
-
"typescript": "^5.
|
|
78
|
-
"wavefile": "
|
|
79
|
-
"webpack": "^5.
|
|
80
|
-
"webpack-cli": "^5.
|
|
81
|
-
"webpack-dev-server": "^
|
|
70
|
+
"jsdoc-to-markdown": "^9.1.1",
|
|
71
|
+
"prettier": "3.4.2",
|
|
72
|
+
"typescript": "^5.7.2",
|
|
73
|
+
"wavefile": "11.0.0",
|
|
74
|
+
"webpack": "^5.97.1",
|
|
75
|
+
"webpack-cli": "^5.1.4",
|
|
76
|
+
"webpack-dev-server": "^5.1.0"
|
|
82
77
|
},
|
|
83
78
|
"files": [
|
|
84
79
|
"src",
|
package/src/backends/onnx.js
CHANGED
|
@@ -21,12 +21,7 @@ import { env, apis } from '../env.js';
|
|
|
21
21
|
// NOTE: Import order matters here. We need to import `onnxruntime-node` before `onnxruntime-web`.
|
|
22
22
|
// In either case, we select the default export if it exists, otherwise we use the named export.
|
|
23
23
|
import * as ONNX_NODE from 'onnxruntime-node';
|
|
24
|
-
|
|
25
|
-
// Use subpath-imports to ensure Node.js and browser interoperability.
|
|
26
|
-
// See package.json and https://nodejs.org/api/packages.html#subpath-imports
|
|
27
|
-
// for more information.
|
|
28
|
-
// @ts-ignore
|
|
29
|
-
import * as ONNX_WEB from '#onnxruntime-webgpu';
|
|
24
|
+
import * as ONNX_WEB from 'onnxruntime-web';
|
|
30
25
|
|
|
31
26
|
export { Tensor } from 'onnxruntime-common';
|
|
32
27
|
|
|
@@ -68,7 +63,7 @@ if (ORT_SYMBOL in globalThis) {
|
|
|
68
63
|
} else if (apis.IS_NODE_ENV) {
|
|
69
64
|
ONNX = ONNX_NODE.default ?? ONNX_NODE;
|
|
70
65
|
|
|
71
|
-
// Updated as of ONNX Runtime 1.
|
|
66
|
+
// Updated as of ONNX Runtime 1.20.1
|
|
72
67
|
// The following table lists the supported versions of ONNX Runtime Node.js binding provided with pre-built binaries.
|
|
73
68
|
// | EPs/Platforms | Windows x64 | Windows arm64 | Linux x64 | Linux arm64 | MacOS x64 | MacOS arm64 |
|
|
74
69
|
// | ------------- | ----------- | ------------- | ----------------- | ----------- | --------- | ----------- |
|
|
@@ -699,7 +699,7 @@ export class ImageProcessor extends Callable {
|
|
|
699
699
|
* Pad the image by a certain amount.
|
|
700
700
|
* @param {Float32Array} pixelData The pixel data to pad.
|
|
701
701
|
* @param {number[]} imgDims The dimensions of the image (height, width, channels).
|
|
702
|
-
* @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
|
|
702
|
+
* @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
|
|
703
703
|
* @param {Object} options The options for padding.
|
|
704
704
|
* @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
|
|
705
705
|
* @param {boolean} [options.center=false] Whether to center the image.
|
|
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable {
|
|
|
717
717
|
if (typeof padSize === 'number') {
|
|
718
718
|
paddedImageWidth = padSize;
|
|
719
719
|
paddedImageHeight = padSize;
|
|
720
|
+
} else if (padSize === 'square') {
|
|
721
|
+
paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
|
|
720
722
|
} else {
|
|
721
723
|
paddedImageWidth = padSize.width;
|
|
722
724
|
paddedImageHeight = padSize.height;
|
package/src/configs.js
CHANGED
|
@@ -95,8 +95,6 @@ function getNormalizedConfig(config) {
|
|
|
95
95
|
case 'gpt_neox':
|
|
96
96
|
case 'stablelm':
|
|
97
97
|
case 'opt':
|
|
98
|
-
case 'phi':
|
|
99
|
-
case 'phi3':
|
|
100
98
|
case 'falcon':
|
|
101
99
|
mapping['num_heads'] = 'num_attention_heads';
|
|
102
100
|
mapping['num_layers'] = 'num_hidden_layers';
|
|
@@ -104,6 +102,7 @@ function getNormalizedConfig(config) {
|
|
|
104
102
|
break;
|
|
105
103
|
case 'llama':
|
|
106
104
|
case 'olmo':
|
|
105
|
+
case 'olmo2':
|
|
107
106
|
case 'mobilellm':
|
|
108
107
|
case 'granite':
|
|
109
108
|
case 'cohere':
|
|
@@ -111,6 +110,9 @@ function getNormalizedConfig(config) {
|
|
|
111
110
|
case 'starcoder2':
|
|
112
111
|
case 'qwen2':
|
|
113
112
|
case 'qwen2_vl':
|
|
113
|
+
case 'phi':
|
|
114
|
+
case 'phi3':
|
|
115
|
+
case 'phi3_v':
|
|
114
116
|
mapping['num_heads'] = 'num_key_value_heads';
|
|
115
117
|
mapping['num_layers'] = 'num_hidden_layers';
|
|
116
118
|
mapping['hidden_size'] = 'hidden_size';
|
|
@@ -143,6 +145,12 @@ function getNormalizedConfig(config) {
|
|
|
143
145
|
mapping['num_layers'] = 'n_layers';
|
|
144
146
|
mapping['hidden_size'] = 'd_model';
|
|
145
147
|
break;
|
|
148
|
+
case 'exaone':
|
|
149
|
+
mapping['num_heads'] = 'num_key_value_heads';
|
|
150
|
+
mapping['num_layers'] = 'num_layers';
|
|
151
|
+
mapping['dim_kv'] = 'head_dim';
|
|
152
|
+
mapping['num_attention_heads'] = 'num_attention_heads';
|
|
153
|
+
break;
|
|
146
154
|
|
|
147
155
|
// Encoder-decoder models
|
|
148
156
|
case 't5':
|
|
@@ -184,6 +192,7 @@ function getNormalizedConfig(config) {
|
|
|
184
192
|
mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
|
|
185
193
|
break;
|
|
186
194
|
case 'musicgen_decoder':
|
|
195
|
+
case 'moonshine':
|
|
187
196
|
mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
|
|
188
197
|
mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
|
|
189
198
|
mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.1.1';
|
|
29
|
+
const VERSION = '3.2.0';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
32
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
|
|
3
3
|
export * from './clap/feature_extraction_clap.js';
|
|
4
|
+
export * from './moonshine/feature_extraction_moonshine.js';
|
|
4
5
|
export * from './pyannote/feature_extraction_pyannote.js';
|
|
5
6
|
export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
|
|
6
7
|
export * from './speecht5/feature_extraction_speecht5.js';
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import {
|
|
4
4
|
ImageProcessor,
|
|
5
5
|
} from "../../base/image_processors_utils.js";
|
|
6
|
-
import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
|
|
6
|
+
import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";
|
|
7
7
|
|
|
8
8
|
export class Idefics3ImageProcessor extends ImageProcessor {
|
|
9
9
|
constructor(config) {
|
|
@@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor {
|
|
|
186
186
|
const optimal_width = Math.ceil(width / num_splits_w);
|
|
187
187
|
|
|
188
188
|
// Iterate through each row and column
|
|
189
|
-
for (let r = 0; r < num_splits_h; r
|
|
190
|
-
for (let c = 0; c < num_splits_w; c
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
//
|
|
200
|
-
|
|
189
|
+
for (let r = 0; r < num_splits_h; ++r) {
|
|
190
|
+
for (let c = 0; c < num_splits_w; ++c) {
|
|
191
|
+
let start_x, start_y, end_x, end_y;
|
|
192
|
+
if (r === num_splits_h - 1) { // At bottom
|
|
193
|
+
start_y = height - optimal_height;
|
|
194
|
+
end_y = height;
|
|
195
|
+
} else {
|
|
196
|
+
start_y = r * optimal_height;
|
|
197
|
+
end_y = (r + 1) * optimal_height;
|
|
198
|
+
}
|
|
199
|
+
if (c === num_splits_w - 1) { // At right
|
|
200
|
+
start_x = width - optimal_width;
|
|
201
|
+
end_x = width;
|
|
202
|
+
} else {
|
|
203
|
+
start_x = c * optimal_width;
|
|
204
|
+
end_x = (c + 1) * optimal_width;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
const starts = [start_y, start_x];
|
|
208
|
+
const ends = [end_y, end_x];
|
|
209
|
+
|
|
210
|
+
const patch = await slice(pixel_values, starts, ends, [2, 3]);
|
|
211
|
+
frames.push(patch);
|
|
201
212
|
}
|
|
202
213
|
}
|
|
203
214
|
|
|
@@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js'
|
|
|
24
24
|
export * from './nougat/image_processing_nougat.js'
|
|
25
25
|
export * from './owlv2/image_processing_owlv2.js'
|
|
26
26
|
export * from './owlvit/image_processing_owlvit.js'
|
|
27
|
+
export * from './phi3_v/image_processing_phi3_v.js'
|
|
27
28
|
export * from './pvt/image_processing_pvt.js'
|
|
28
29
|
export * from './qwen2_vl/image_processing_qwen2_vl.js'
|
|
29
30
|
export * from './rt_detr/image_processing_rt_detr.js'
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
export class MoonshineFeatureExtractor extends FeatureExtractor {
|
|
6
|
+
/**
|
|
7
|
+
* Asynchronously extracts input values from a given audio using the provided configuration.
|
|
8
|
+
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
9
|
+
* @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
|
|
10
|
+
*/
|
|
11
|
+
async _call(audio) {
|
|
12
|
+
validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
|
|
13
|
+
|
|
14
|
+
if (audio instanceof Float64Array) {
|
|
15
|
+
audio = new Float32Array(audio);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
const shape = [
|
|
19
|
+
1, /* batch_size */
|
|
20
|
+
audio.length, /* num_samples */
|
|
21
|
+
];
|
|
22
|
+
return {
|
|
23
|
+
input_values: new Tensor('float32', audio, shape),
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
|
|
2
|
+
import { AutoTokenizer } from "../../tokenizers.js"
|
|
3
|
+
import { Processor } from "../../base/processing_utils.js"
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Represents a MoonshineProcessor that extracts features from an audio input.
|
|
7
|
+
*/
|
|
8
|
+
export class MoonshineProcessor extends Processor {
|
|
9
|
+
static tokenizer_class = AutoTokenizer
|
|
10
|
+
static feature_extractor_class = AutoFeatureExtractor
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Calls the feature_extractor function with the given audio input.
|
|
14
|
+
* @param {any} audio The audio input to extract features from.
|
|
15
|
+
* @returns {Promise<any>} A Promise that resolves with the extracted features.
|
|
16
|
+
*/
|
|
17
|
+
async _call(audio) {
|
|
18
|
+
return await this.feature_extractor(audio);
|
|
19
|
+
}
|
|
20
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
+
|
|
5
|
+
const IMAGE_TOKEN = "<image>";
|
|
6
|
+
|
|
7
|
+
function build_string_from_input(
|
|
8
|
+
prompt,
|
|
9
|
+
bos_token,
|
|
10
|
+
image_seq_len,
|
|
11
|
+
image_token,
|
|
12
|
+
num_images,
|
|
13
|
+
) {
|
|
14
|
+
return `${image_token.repeat(image_seq_len * num_images)}${bos_token}${prompt}\n`
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export class PaliGemmaProcessor extends Processor {
|
|
18
|
+
static tokenizer_class = AutoTokenizer
|
|
19
|
+
static image_processor_class = AutoImageProcessor
|
|
20
|
+
static uses_processor_config = false;
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
// `images` is required, `text` is optional
|
|
27
|
+
async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
|
|
28
|
+
if (!text) {
|
|
29
|
+
console.warn(
|
|
30
|
+
"You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
|
|
31
|
+
)
|
|
32
|
+
text = ""
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
if (!Array.isArray(images)) {
|
|
36
|
+
images = [images]
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
if (!Array.isArray(text)) {
|
|
40
|
+
text = [text]
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
const bos_token = this.tokenizer.bos_token;
|
|
44
|
+
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
45
|
+
let input_strings;
|
|
46
|
+
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
47
|
+
input_strings = text.map(
|
|
48
|
+
sample => {
|
|
49
|
+
const expanded_sample = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length));
|
|
50
|
+
const bos_rfind_index = expanded_sample.lastIndexOf(IMAGE_TOKEN);
|
|
51
|
+
const bos_index = bos_rfind_index === -1 ? 0 : bos_rfind_index + IMAGE_TOKEN.length;
|
|
52
|
+
return expanded_sample.slice(0, bos_index) + bos_token + expanded_sample.slice(bos_index) + "\n";
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
} else {
|
|
56
|
+
console.warn(
|
|
57
|
+
"You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
|
|
58
|
+
"image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
|
|
59
|
+
"add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
|
|
60
|
+
"each text has and add special tokens."
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
input_strings = text.map(
|
|
64
|
+
sample => build_string_from_input(
|
|
65
|
+
sample,
|
|
66
|
+
bos_token,
|
|
67
|
+
image_seq_length,
|
|
68
|
+
IMAGE_TOKEN,
|
|
69
|
+
images.length,
|
|
70
|
+
)
|
|
71
|
+
)
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
const text_inputs = this.tokenizer(input_strings, kwargs);
|
|
75
|
+
const image_inputs = await this.image_processor(images, kwargs);
|
|
76
|
+
|
|
77
|
+
return {
|
|
78
|
+
...image_inputs,
|
|
79
|
+
...text_inputs,
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
} from "../../base/image_processors_utils.js";
|
|
4
|
+
import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js";
|
|
5
|
+
|
|
6
|
+
const IMAGE_SIZE = 336;
|
|
7
|
+
const SLICE_AXES = [2, 3]; // axes to slice on
|
|
8
|
+
const { ceil, floor, sqrt } = Math;
|
|
9
|
+
|
|
10
|
+
export class Phi3VImageProcessor extends ImageProcessor {
|
|
11
|
+
constructor(config) {
|
|
12
|
+
super({
|
|
13
|
+
...config,
|
|
14
|
+
do_normalize: true,
|
|
15
|
+
do_pad: true,
|
|
16
|
+
pad_size: 'custom',
|
|
17
|
+
do_convert_rgb: true,
|
|
18
|
+
do_resize: true, // Smart resizing "hd_transform"
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
this._num_crops = config.num_crops;
|
|
22
|
+
}
|
|
23
|
+
calc_num_image_tokens_from_image_size(width, height) {
|
|
24
|
+
// @ts-expect-error
|
|
25
|
+
const { num_img_tokens } = this.config;
|
|
26
|
+
return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
30
|
+
get_resize_output_image_size(image, size) {
|
|
31
|
+
const hd_num = this._num_crops;
|
|
32
|
+
const [width, height] = image.size
|
|
33
|
+
|
|
34
|
+
let ratio = width / height;
|
|
35
|
+
let scale = 1;
|
|
36
|
+
|
|
37
|
+
// Calculate the scaling factor
|
|
38
|
+
while (scale * Math.ceil(scale / ratio) <= hd_num) {
|
|
39
|
+
scale += 1;
|
|
40
|
+
}
|
|
41
|
+
scale -= 1;
|
|
42
|
+
|
|
43
|
+
// Compute the new dimensions
|
|
44
|
+
const new_w = Math.floor(scale * 336);
|
|
45
|
+
const new_h = Math.floor(new_w / ratio);
|
|
46
|
+
|
|
47
|
+
return [new_w, new_h]
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
/** @type {ImageProcessor['pad_image']} */
|
|
52
|
+
pad_image(pixelData, imgDims, padSize, options = {}) {
|
|
53
|
+
// Phi3V uses a custom padding strategy:
|
|
54
|
+
// - Pad to a multiple of 336
|
|
55
|
+
// - Pad with white pixels
|
|
56
|
+
const [imageHeight, imageWidth] = imgDims;
|
|
57
|
+
const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
|
|
58
|
+
const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
|
|
59
|
+
|
|
60
|
+
// NOTE: Since padding is done after normalization, we need to fill with the normalized values
|
|
61
|
+
const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]);
|
|
62
|
+
return super.pad_image(pixelData, imgDims, { width, height }, {
|
|
63
|
+
center: true,
|
|
64
|
+
constant_values,
|
|
65
|
+
...options,
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
async _call(images, {
|
|
70
|
+
num_crops = null,
|
|
71
|
+
} = {}) {
|
|
72
|
+
// @ts-expect-error
|
|
73
|
+
this._num_crops = num_crops ??= this.config.num_crops;
|
|
74
|
+
if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
|
|
75
|
+
throw new Error("num_crops must be a square number >= 4");
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (!Array.isArray(images)) {
|
|
79
|
+
images = [images];
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
const num_images = images.length;
|
|
83
|
+
const imageData = await Promise.all(images.map(x => this.preprocess(x)));
|
|
84
|
+
|
|
85
|
+
const original_sizes = imageData.map(x => x.original_size);
|
|
86
|
+
const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
|
|
87
|
+
|
|
88
|
+
// Process each image in batch
|
|
89
|
+
const all_pixel_values = [];
|
|
90
|
+
for (const { pixel_values } of imageData) {
|
|
91
|
+
pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
|
|
92
|
+
|
|
93
|
+
const [height, width] = pixel_values.dims.slice(-2);
|
|
94
|
+
|
|
95
|
+
// Global image (Tensor of shape [num_channels, height, width])
|
|
96
|
+
const batch_pixel_values = await interpolate_4d(pixel_values, {
|
|
97
|
+
size: [IMAGE_SIZE, IMAGE_SIZE],
|
|
98
|
+
mode: 'bicubic',
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
if (num_crops > 0) {
|
|
102
|
+
const patches = [];
|
|
103
|
+
const sqrt_patches = sqrt(num_crops);
|
|
104
|
+
const patch_width = floor(width / sqrt_patches);
|
|
105
|
+
const patch_height = floor(height / sqrt_patches);
|
|
106
|
+
for (let y = 0; y < sqrt_patches; ++y) {
|
|
107
|
+
for (let x = 0; x < sqrt_patches; ++x) {
|
|
108
|
+
let start_x, start_y, end_x, end_y;
|
|
109
|
+
if (y === sqrt_patches - 1) { // At bottom
|
|
110
|
+
start_y = height - patch_height;
|
|
111
|
+
end_y = height;
|
|
112
|
+
} else {
|
|
113
|
+
start_y = y * patch_height;
|
|
114
|
+
end_y = (y + 1) * patch_height;
|
|
115
|
+
}
|
|
116
|
+
if (x === sqrt_patches - 1) { // At right
|
|
117
|
+
start_x = width - patch_width;
|
|
118
|
+
end_x = width;
|
|
119
|
+
} else {
|
|
120
|
+
start_x = x * patch_width;
|
|
121
|
+
end_x = (x + 1) * patch_width;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
const starts = [start_y, start_x];
|
|
125
|
+
const ends = [end_y, end_x];
|
|
126
|
+
const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
|
|
127
|
+
patches.push(patch);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
const resized_tensors = await interpolate_4d(cat(patches, 0), {
|
|
132
|
+
size: [IMAGE_SIZE, IMAGE_SIZE],
|
|
133
|
+
mode: 'bicubic',
|
|
134
|
+
}); // [num_crops, 3, 336, 336]
|
|
135
|
+
|
|
136
|
+
// Concatenate the global image with the patches
|
|
137
|
+
all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
|
|
138
|
+
} else {
|
|
139
|
+
// Only use the global image
|
|
140
|
+
// NOTE: Not currently supported in modelling code
|
|
141
|
+
all_pixel_values.push(batch_pixel_values);
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
// [num_images, 1 + num_crops, num_channels=3, height, width]
|
|
146
|
+
const pixel_values = stack(all_pixel_values, 0);
|
|
147
|
+
|
|
148
|
+
// Calculate padded image sizes
|
|
149
|
+
const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
|
|
150
|
+
|
|
151
|
+
const image_sizes = new Tensor(
|
|
152
|
+
'int64',
|
|
153
|
+
sizes.flat(),
|
|
154
|
+
[num_images, 2],
|
|
155
|
+
);
|
|
156
|
+
|
|
157
|
+
const num_img_tokens = sizes.map(
|
|
158
|
+
([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
|
|
159
|
+
);
|
|
160
|
+
|
|
161
|
+
return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
|
|
162
|
+
}
|
|
163
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
+
import { RawImage } from "../../utils/image.js";
|
|
5
|
+
|
|
6
|
+
const IMAGE_TOKEN = "<|image|>";
|
|
7
|
+
const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
|
|
8
|
+
|
|
9
|
+
export class Phi3VProcessor extends Processor {
|
|
10
|
+
static image_processor_class = AutoImageProcessor
|
|
11
|
+
static tokenizer_class = AutoTokenizer
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
*
|
|
15
|
+
* @param {string|string[]} text
|
|
16
|
+
* @param {RawImage|RawImage[]} images
|
|
17
|
+
* @param {...any} args
|
|
18
|
+
* @returns {Promise<any>}
|
|
19
|
+
*/
|
|
20
|
+
async _call(text, images = null, {
|
|
21
|
+
padding = true,
|
|
22
|
+
truncation = true,
|
|
23
|
+
num_crops = null,
|
|
24
|
+
} = {}) {
|
|
25
|
+
|
|
26
|
+
if (!Array.isArray(text)) {
|
|
27
|
+
text = [text];
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
let text_inputs, image_inputs;
|
|
31
|
+
if (images) {
|
|
32
|
+
image_inputs = await this.image_processor(images, { num_crops });
|
|
33
|
+
const { num_img_tokens } = image_inputs;
|
|
34
|
+
|
|
35
|
+
// The original implementation adds a bos_token before the image tokens
|
|
36
|
+
// TODO: Check if this affects performance, since it looks like a bug in the original implementation
|
|
37
|
+
const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));
|
|
38
|
+
|
|
39
|
+
text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
|
|
40
|
+
|
|
41
|
+
// The model expects image tokens to be negative, so we negate the image token ids
|
|
42
|
+
const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
|
|
43
|
+
text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
|
|
44
|
+
} else {
|
|
45
|
+
text_inputs = this.tokenizer(text);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
return {
|
|
49
|
+
...text_inputs,
|
|
50
|
+
...image_inputs,
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
package/src/models/processors.js
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
export * from './florence2/processing_florence2.js';
|
|
2
2
|
export * from './mgp_str/processing_mgp_str.js';
|
|
3
|
+
export * from './moonshine/processing_moonshine.js';
|
|
3
4
|
export * from './idefics3/processing_idefics3.js';
|
|
4
5
|
export * from './janus/processing_janus.js';
|
|
5
6
|
export * from './jina_clip/processing_jina_clip.js';
|
|
6
7
|
export * from './owlvit/processing_owlvit.js';
|
|
8
|
+
export * from './phi3_v/processing_phi3_v.js';
|
|
9
|
+
export * from './paligemma/processing_paligemma.js';
|
|
7
10
|
export * from './pyannote/processing_pyannote.js';
|
|
8
11
|
export * from './qwen2_vl/processing_qwen2_vl.js';
|
|
9
12
|
export * from './sam/processing_sam.js';
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
2
|
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
import { max, softmax } from '../../utils/maths.js';
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
export class PyAnnoteFeatureExtractor extends FeatureExtractor {
|
|
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
|
|
|
25
26
|
};
|
|
26
27
|
}
|
|
27
28
|
|
|
29
|
+
/**
|
|
30
|
+
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
|
|
31
|
+
* @param {number} samples The number of frames in the audio.
|
|
32
|
+
* @returns {number} The number of frames in the audio.
|
|
33
|
+
*/
|
|
34
|
+
samples_to_frames(samples) {
|
|
35
|
+
return ((samples - this.config.offset) / this.config.step);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Post-processes the speaker diarization logits output by the model.
|
|
40
|
+
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
|
|
41
|
+
* @param {number} num_samples Number of samples in the input audio.
|
|
42
|
+
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
|
|
43
|
+
*/
|
|
44
|
+
post_process_speaker_diarization(logits, num_samples) {
|
|
45
|
+
const ratio = (
|
|
46
|
+
num_samples / this.samples_to_frames(num_samples)
|
|
47
|
+
) / this.config.sampling_rate;
|
|
48
|
+
|
|
49
|
+
const results = [];
|
|
50
|
+
for (const scores of logits.tolist()) {
|
|
51
|
+
const accumulated_segments = [];
|
|
52
|
+
|
|
53
|
+
let current_speaker = -1;
|
|
54
|
+
for (let i = 0; i < scores.length; ++i) {
|
|
55
|
+
const probabilities = softmax(scores[i]);
|
|
56
|
+
const [score, id] = max(probabilities);
|
|
57
|
+
const [start, end] = [i, i + 1];
|
|
58
|
+
|
|
59
|
+
if (id !== current_speaker) {
|
|
60
|
+
// Speaker has changed
|
|
61
|
+
current_speaker = id;
|
|
62
|
+
accumulated_segments.push({ id, start, end, score });
|
|
63
|
+
} else {
|
|
64
|
+
// Continue the current segment
|
|
65
|
+
accumulated_segments.at(-1).end = end;
|
|
66
|
+
accumulated_segments.at(-1).score += score;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
results.push(accumulated_segments.map(
|
|
71
|
+
// Convert frame-space to time-space
|
|
72
|
+
// and compute the confidence
|
|
73
|
+
({ id, start, end, score }) => ({
|
|
74
|
+
id,
|
|
75
|
+
start: start * ratio,
|
|
76
|
+
end: end * ratio,
|
|
77
|
+
confidence: score / (end - start),
|
|
78
|
+
})
|
|
79
|
+
));
|
|
80
|
+
}
|
|
81
|
+
return results;
|
|
82
|
+
}
|
|
83
|
+
|
|
28
84
|
}
|