@huggingface/transformers 3.1.2 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/transformers.cjs +835 -144
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +850 -144
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +850 -144
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +10 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +2 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +223 -30
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +31 -1
- package/src/utils/tensor.js +51 -1
- package/types/base/image_processors_utils.d.ts +2 -2
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models.d.ts +64 -1
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +5 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +16 -0
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.1.2",
+  "version": "3.2.1",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
package/src/base/image_processors_utils.js
CHANGED

@@ -699,7 +699,7 @@ export class ImageProcessor extends Callable {
      * Pad the image by a certain amount.
      * @param {Float32Array} pixelData The pixel data to pad.
      * @param {number[]} imgDims The dimensions of the image (height, width, channels).
-     * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
+     * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
      * @param {Object} options The options for padding.
      * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
      * @param {boolean} [options.center=false] Whether to center the image.
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable {
         if (typeof padSize === 'number') {
             paddedImageWidth = padSize;
             paddedImageHeight = padSize;
+        } else if (padSize === 'square') {
+            paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
         } else {
             paddedImageWidth = padSize.width;
             paddedImageHeight = padSize.height;
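The new 'square' value pads both sides to max(height, width). A minimal standalone sketch of the dimension logic added above (resolvePadSize is a hypothetical helper written for illustration, not part of the library):

    // Sketch of the padSize resolution introduced in this release.
    function resolvePadSize(padSize, imageHeight, imageWidth) {
        if (typeof padSize === 'number') {
            return { width: padSize, height: padSize };
        } else if (padSize === 'square') {
            const side = Math.max(imageHeight, imageWidth);
            return { width: side, height: side };
        }
        return { width: padSize.width, height: padSize.height };
    }

    resolvePadSize('square', 480, 640); // { width: 640, height: 640 }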
package/src/configs.js
CHANGED
@@ -95,8 +95,6 @@ function getNormalizedConfig(config) {
         case 'gpt_neox':
         case 'stablelm':
         case 'opt':
-        case 'phi':
-        case 'phi3':
         case 'falcon':
             mapping['num_heads'] = 'num_attention_heads';
             mapping['num_layers'] = 'num_hidden_layers';
@@ -112,6 +110,9 @@ function getNormalizedConfig(config) {
         case 'starcoder2':
         case 'qwen2':
         case 'qwen2_vl':
+        case 'phi':
+        case 'phi3':
+        case 'phi3_v':
             mapping['num_heads'] = 'num_key_value_heads';
             mapping['num_layers'] = 'num_hidden_layers';
             mapping['hidden_size'] = 'hidden_size';
@@ -144,6 +145,12 @@ function getNormalizedConfig(config) {
             mapping['num_layers'] = 'n_layers';
             mapping['hidden_size'] = 'd_model';
             break;
+        case 'exaone':
+            mapping['num_heads'] = 'num_key_value_heads';
+            mapping['num_layers'] = 'num_layers';
+            mapping['dim_kv'] = 'head_dim';
+            mapping['num_attention_heads'] = 'num_attention_heads';
+            break;

         // Encoder-decoder models
         case 't5':
@@ -185,6 +192,7 @@ function getNormalizedConfig(config) {
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
             break;
         case 'musicgen_decoder':
+        case 'moonshine':
             mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
             mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
package/src/env.js
CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';

-const VERSION = '3.1.2';
+const VERSION = '3.2.1';

 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
package/src/models/feature_extractors.js
CHANGED

@@ -1,6 +1,7 @@

 export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
 export * from './clap/feature_extraction_clap.js';
+export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
 export * from './speecht5/feature_extraction_speecht5.js';
package/src/models/idefics3/image_processing_idefics3.js
CHANGED

@@ -3,7 +3,7 @@
 import {
     ImageProcessor,
 } from "../../base/image_processors_utils.js";
-import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
+import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";

 export class Idefics3ImageProcessor extends ImageProcessor {
     constructor(config) {
@@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor {
         const optimal_width = Math.ceil(width / num_splits_w);

         // Iterate through each row and column
-        for (let r = 0; r < num_splits_h; r
-            for (let c = 0; c < num_splits_w; c
-
-
-
-
-
-
-
-
-                //
-
+        for (let r = 0; r < num_splits_h; ++r) {
+            for (let c = 0; c < num_splits_w; ++c) {
+                let start_x, start_y, end_x, end_y;
+                if (r === num_splits_h - 1) { // At bottom
+                    start_y = height - optimal_height;
+                    end_y = height;
+                } else {
+                    start_y = r * optimal_height;
+                    end_y = (r + 1) * optimal_height;
+                }
+                if (c === num_splits_w - 1) { // At right
+                    start_x = width - optimal_width;
+                    end_x = width;
+                } else {
+                    start_x = c * optimal_width;
+                    end_x = (c + 1) * optimal_width;
+                }
+
+                const starts = [start_y, start_x];
+                const ends = [end_y, end_x];
+
+                const patch = await slice(pixel_values, starts, ends, [2, 3]);
+                frames.push(patch);
             }
         }

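The rewrite swaps the old cropping code for the new async slice tensor op, which extracts a sub-tensor given start/end indices along selected axes. A hedged sketch of the op in isolation (this assumes slice is re-exported from the package root alongside Tensor; if not, import it from src/utils/tensor.js):

    import { Tensor, slice } from '@huggingface/transformers';

    // Dummy 1x3x4x4 tensor; crop the top-left 2x2 window of the spatial axes (2 and 3).
    const t = new Tensor('float32', new Float32Array(1 * 3 * 4 * 4), [1, 3, 4, 4]);
    const patch = await slice(t, [0, 0], [2, 2], [2, 3]);
    console.log(patch.dims); // [1, 3, 2, 2]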
package/src/models/image_processors.js
CHANGED

@@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js'
 export * from './nougat/image_processing_nougat.js'
 export * from './owlv2/image_processing_owlv2.js'
 export * from './owlvit/image_processing_owlvit.js'
+export * from './phi3_v/image_processing_phi3_v.js'
 export * from './pvt/image_processing_pvt.js'
 export * from './qwen2_vl/image_processing_qwen2_vl.js'
 export * from './rt_detr/image_processing_rt_detr.js'
package/src/models/moonshine/feature_extraction_moonshine.js
ADDED

@@ -0,0 +1,26 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+
+
+export class MoonshineFeatureExtractor extends FeatureExtractor {
+    /**
+     * Asynchronously extracts input values from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
+
+        if (audio instanceof Float64Array) {
+            audio = new Float32Array(audio);
+        }
+
+        const shape = [
+            1,            /* batch_size */
+            audio.length, /* num_samples */
+        ];
+        return {
+            input_values: new Tensor('float32', audio, shape),
+        };
+    }
+}
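Unlike spectrogram-based extractors, Moonshine's extractor forwards the raw waveform as a [1, num_samples] tensor. A hedged usage sketch (the model id is an assumption; any Moonshine checkpoint with a preprocessor config should behave the same):

    import { AutoFeatureExtractor } from '@huggingface/transformers';

    const extractor = await AutoFeatureExtractor.from_pretrained('onnx-community/moonshine-tiny-ONNX');
    const audio = new Float32Array(16000); // 1 second of silence at 16 kHz
    const { input_values } = await extractor(audio);
    console.log(input_values.dims); // [1, 16000]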
package/src/models/moonshine/processing_moonshine.js
ADDED

@@ -0,0 +1,20 @@
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
+import { AutoTokenizer } from "../../tokenizers.js"
+import { Processor } from "../../base/processing_utils.js"
+
+/**
+ * Represents a MoonshineProcessor that extracts features from an audio input.
+ */
+export class MoonshineProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static feature_extractor_class = AutoFeatureExtractor
+
+    /**
+     * Calls the feature_extractor function with the given audio input.
+     * @param {any} audio The audio input to extract features from.
+     * @returns {Promise<any>} A Promise that resolves with the extracted features.
+     */
+    async _call(audio) {
+        return await this.feature_extractor(audio);
+    }
+}
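Together, these two files wire Moonshine into the automatic-speech-recognition pipeline. A hedged end-to-end sketch (the model id and audio URL are assumptions):

    import { pipeline } from '@huggingface/transformers';

    const transcriber = await pipeline(
        'automatic-speech-recognition',
        'onnx-community/moonshine-tiny-ONNX',
    );
    const output = await transcriber('https://example.com/sample.wav');
    console.log(output.text);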
package/src/models/phi3_v/image_processing_phi3_v.js
ADDED

@@ -0,0 +1,163 @@
+import {
+    ImageProcessor,
+} from "../../base/image_processors_utils.js";
+import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js";
+
+const IMAGE_SIZE = 336;
+const SLICE_AXES = [2, 3]; // axes to slice on
+const { ceil, floor, sqrt } = Math;
+
+export class Phi3VImageProcessor extends ImageProcessor {
+    constructor(config) {
+        super({
+            ...config,
+            do_normalize: true,
+            do_pad: true,
+            pad_size: 'custom',
+            do_convert_rgb: true,
+            do_resize: true, // Smart resizing "hd_transform"
+        });
+
+        this._num_crops = config.num_crops;
+    }
+    calc_num_image_tokens_from_image_size(width, height) {
+        // @ts-expect-error
+        const { num_img_tokens } = this.config;
+        return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
+    }
+
+    /** @type {ImageProcessor['get_resize_output_image_size']} */
+    get_resize_output_image_size(image, size) {
+        const hd_num = this._num_crops;
+        const [width, height] = image.size
+
+        let ratio = width / height;
+        let scale = 1;
+
+        // Calculate the scaling factor
+        while (scale * Math.ceil(scale / ratio) <= hd_num) {
+            scale += 1;
+        }
+        scale -= 1;
+
+        // Compute the new dimensions
+        const new_w = Math.floor(scale * 336);
+        const new_h = Math.floor(new_w / ratio);
+
+        return [new_w, new_h]
+    }
+
+
+    /** @type {ImageProcessor['pad_image']} */
+    pad_image(pixelData, imgDims, padSize, options = {}) {
+        // Phi3V uses a custom padding strategy:
+        // - Pad to a multiple of 336
+        // - Pad with white pixels
+        const [imageHeight, imageWidth] = imgDims;
+        const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
+        const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
+
+        // NOTE: Since padding is done after normalization, we need to fill with the normalized values
+        const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]);
+        return super.pad_image(pixelData, imgDims, { width, height }, {
+            center: true,
+            constant_values,
+            ...options,
+        });
+    }
+
+    async _call(images, {
+        num_crops = null,
+    } = {}) {
+        // @ts-expect-error
+        this._num_crops = num_crops ??= this.config.num_crops;
+        if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
+            throw new Error("num_crops must be a square number >= 4");
+        }
+
+        if (!Array.isArray(images)) {
+            images = [images];
+        }
+
+        const num_images = images.length;
+        const imageData = await Promise.all(images.map(x => this.preprocess(x)));
+
+        const original_sizes = imageData.map(x => x.original_size);
+        const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
+
+        // Process each image in batch
+        const all_pixel_values = [];
+        for (const { pixel_values } of imageData) {
+            pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
+
+            const [height, width] = pixel_values.dims.slice(-2);
+
+            // Global image (Tensor of shape [num_channels, height, width])
+            const batch_pixel_values = await interpolate_4d(pixel_values, {
+                size: [IMAGE_SIZE, IMAGE_SIZE],
+                mode: 'bicubic',
+            });
+
+            if (num_crops > 0) {
+                const patches = [];
+                const sqrt_patches = sqrt(num_crops);
+                const patch_width = floor(width / sqrt_patches);
+                const patch_height = floor(height / sqrt_patches);
+                for (let y = 0; y < sqrt_patches; ++y) {
+                    for (let x = 0; x < sqrt_patches; ++x) {
+                        let start_x, start_y, end_x, end_y;
+                        if (y === sqrt_patches - 1) { // At bottom
+                            start_y = height - patch_height;
+                            end_y = height;
+                        } else {
+                            start_y = y * patch_height;
+                            end_y = (y + 1) * patch_height;
+                        }
+                        if (x === sqrt_patches - 1) { // At right
+                            start_x = width - patch_width;
+                            end_x = width;
+                        } else {
+                            start_x = x * patch_width;
+                            end_x = (x + 1) * patch_width;
+                        }
+
+                        const starts = [start_y, start_x];
+                        const ends = [end_y, end_x];
+                        const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
+                        patches.push(patch);
+                    }
+                }
+
+                const resized_tensors = await interpolate_4d(cat(patches, 0), {
+                    size: [IMAGE_SIZE, IMAGE_SIZE],
+                    mode: 'bicubic',
+                }); // [num_crops, 3, 336, 336]
+
+                // Concatenate the global image with the patches
+                all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
+            } else {
+                // Only use the global image
+                // NOTE: Not currently supported in modelling code
+                all_pixel_values.push(batch_pixel_values);
+            }
+        }
+
+        // [num_images, 1 + num_crops, num_channels=3, height, width]
+        const pixel_values = stack(all_pixel_values, 0);
+
+        // Calculate padded image sizes
+        const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
+
+        const image_sizes = new Tensor(
+            'int64',
+            sizes.flat(),
+            [num_images, 2],
+        );
+
+        const num_img_tokens = sizes.map(
+            ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
+        );
+
+        return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
+    }
+}
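The token-count formula above can be checked by hand. A worked example, assuming config.num_img_tokens = 144 (an assumption, not taken from this diff) and a padded size of 672x1008 (width x height):

    const { floor, sqrt } = Math;
    const IMAGE_SIZE = 336, num_img_tokens = 144; // num_img_tokens is assumed
    const [width, height] = [672, 1008];
    const tokens = floor(
        (floor(height / IMAGE_SIZE) * floor(width / IMAGE_SIZE) + 1) * num_img_tokens
        + 1
        + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens)
    );
    console.log(tokens); // (3*2 + 1)*144 + 1 + (3 + 1)*12 = 1057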
package/src/models/phi3_v/processing_phi3_v.js
ADDED

@@ -0,0 +1,53 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+import { RawImage } from "../../utils/image.js";
+
+const IMAGE_TOKEN = "<|image|>";
+const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
+
+export class Phi3VProcessor extends Processor {
+    static image_processor_class = AutoImageProcessor
+    static tokenizer_class = AutoTokenizer
+
+    /**
+     *
+     * @param {string|string[]} text
+     * @param {RawImage|RawImage[]} images
+     * @param {...any} args
+     * @returns {Promise<any>}
+     */
+    async _call(text, images = null, {
+        padding = true,
+        truncation = true,
+        num_crops = null,
+    } = {}) {
+
+        if (!Array.isArray(text)) {
+            text = [text];
+        }
+
+        let text_inputs, image_inputs;
+        if (images) {
+            image_inputs = await this.image_processor(images, { num_crops });
+            const { num_img_tokens } = image_inputs;
+
+            // The original implementation adds a bos_token before the image tokens
+            // TODO: Check if this affects performance, since it looks like a bug in the original implementation
+            const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));
+
+            text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
+
+            // The model expects image tokens to be negative, so we negate the image token ids
+            const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
+            text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
+        } else {
+            text_inputs = this.tokenizer(text);
+        }
+
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+}
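A hedged sketch of driving the new processor end to end (the model id and image URL are assumptions; the <|image_1|> placeholder follows the pattern matched by IMAGE_TOKEN_PATTERN above):

    import { AutoProcessor, RawImage } from '@huggingface/transformers';

    const processor = await AutoProcessor.from_pretrained('onnx-community/Phi-3.5-vision-instruct');
    const image = await RawImage.fromURL('https://example.com/cat.jpg');
    const inputs = await processor('<|image_1|>\nWhat is shown in this image?', image);
    // inputs now holds input_ids (with image tokens negated), pixel_values,
    // image_sizes and num_img_tokens, ready for the vision-language model.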
package/src/models/processors.js
CHANGED
@@ -1,9 +1,11 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './moonshine/processing_moonshine.js';
 export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';
+export * from './phi3_v/processing_phi3_v.js';
 export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
package/src/models/pyannote/feature_extraction_pyannote.js
CHANGED

@@ -1,5 +1,6 @@
 import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
 import { Tensor } from '../../utils/tensor.js';
+import { max, softmax } from '../../utils/maths.js';


 export class PyAnnoteFeatureExtractor extends FeatureExtractor {
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
         };
     }

+    /**
+     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
+     * @param {number} samples The number of frames in the audio.
+     * @returns {number} The number of frames in the audio.
+     */
+    samples_to_frames(samples) {
+        return ((samples - this.config.offset) / this.config.step);
+    }
+
+    /**
+     * Post-processes the speaker diarization logits output by the model.
+     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
+     * @param {number} num_samples Number of samples in the input audio.
+     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
+     */
+    post_process_speaker_diarization(logits, num_samples) {
+        const ratio = (
+            num_samples / this.samples_to_frames(num_samples)
+        ) / this.config.sampling_rate;
+
+        const results = [];
+        for (const scores of logits.tolist()) {
+            const accumulated_segments = [];
+
+            let current_speaker = -1;
+            for (let i = 0; i < scores.length; ++i) {
+                const probabilities = softmax(scores[i]);
+                const [score, id] = max(probabilities);
+                const [start, end] = [i, i + 1];
+
+                if (id !== current_speaker) {
+                    // Speaker has changed
+                    current_speaker = id;
+                    accumulated_segments.push({ id, start, end, score });
+                } else {
+                    // Continue the current segment
+                    accumulated_segments.at(-1).end = end;
+                    accumulated_segments.at(-1).score += score;
+                }
+            }
+
+            results.push(accumulated_segments.map(
+                // Convert frame-space to time-space
+                // and compute the confidence
+                ({ id, start, end, score }) => ({
+                    id,
+                    start: start * ratio,
+                    end: end * ratio,
+                    confidence: score / (end - start),
+                })
+            ));
+        }
+        return results;
+    }
+
 }
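A worked example of the frame/time conversion used above, with invented config values (offset, step and sampling_rate vary per checkpoint and are assumptions here):

    const config = { offset: 721, step: 270, sampling_rate: 16000 }; // illustrative values
    const num_samples = 160000; // 10 seconds of 16 kHz audio
    const frames = (num_samples - config.offset) / config.step;  // ~589.9 (fractional by design)
    const ratio = (num_samples / frames) / config.sampling_rate; // seconds of audio per frame
    // A segment covering frames [10, 25) maps to [10 * ratio, 25 * ratio) seconds,
    // and its confidence is the accumulated score divided by (25 - 10).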
package/src/models/pyannote/processing_pyannote.js
CHANGED

@@ -1,9 +1,8 @@
 import { Processor } from '../../base/processing_utils.js';
-import {
-import { max, softmax } from '../../utils/maths.js';
+import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';

 export class PyAnnoteProcessor extends Processor {
-    static feature_extractor_class =
+    static feature_extractor_class = PyAnnoteFeatureExtractor

     /**
      * Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
         return await this.feature_extractor(audio)
     }

-    /**
-     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
-     * @param {number} samples The number of frames in the audio.
-     * @returns {number} The number of frames in the audio.
-     */
-    samples_to_frames(samples) {
-        return ((samples - this.config.offset) / this.config.step);
+    /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
+    post_process_speaker_diarization(...args) {
+        return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
     }

-    /**
-     * Post-processes the speaker diarization logits output by the model.
-     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
-     * @param {number} num_samples Number of samples in the input audio.
-     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
-     */
-    post_process_speaker_diarization(logits, num_samples) {
-        const ratio = (
-            num_samples / this.samples_to_frames(num_samples)
-        ) / this.config.sampling_rate;
-
-        const results = [];
-        for (const scores of logits.tolist()) {
-            const accumulated_segments = [];
-
-            let current_speaker = -1;
-            for (let i = 0; i < scores.length; ++i) {
-                const probabilities = softmax(scores[i]);
-                const [score, id] = max(probabilities);
-                const [start, end] = [i, i + 1];
-
-                if (id !== current_speaker) {
-                    // Speaker has changed
-                    current_speaker = id;
-                    accumulated_segments.push({ id, start, end, score });
-                } else {
-                    // Continue the current segment
-                    accumulated_segments.at(-1).end = end;
-                    accumulated_segments.at(-1).score += score;
-                }
-            }
-
-            results.push(accumulated_segments.map(
-                // Convert frame-space to time-space
-                // and compute the confidence
-                ({ id, start, end, score }) => ({
-                    id,
-                    start: start * ratio,
-                    end: end * ratio,
-                    confidence: score / (end - start),
-                })
-            ));
-        }
-        return results;
+    get sampling_rate() {
+        return this.feature_extractor.config.sampling_rate;
     }
 }