@huggingface/transformers 3.2.3 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/ort.bundle.min.mjs +2776 -0
- package/dist/transformers.cjs +792 -330
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +1150 -656
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +798 -331
- package/dist/transformers.mjs.map +1 -1
- package/package.json +3 -3
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +12 -1
- package/src/base/processing_utils.js +24 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -2
- package/src/generation/streamers.js +5 -2
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
- package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/image_processors.js +1 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/processors.js +3 -2
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +72 -20
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +73 -23
- package/src/tokenizers.js +4 -7
- package/src/utils/audio.js +113 -1
- package/src/utils/core.js +26 -0
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/image.js +5 -18
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +134 -114
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts +7 -0
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +25 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/generation/streamers.d.ts +3 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
- package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
- package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -2
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +40 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +7 -12
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/audio.d.ts +25 -0
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/core.d.ts +6 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +22 -6
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.3.0",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"scripts": {
|
|
25
25
|
"format": "prettier --write .",
|
|
26
26
|
"format:check": "prettier --check .",
|
|
27
|
-
"typegen": "tsc
|
|
27
|
+
"typegen": "tsc --build",
|
|
28
28
|
"dev": "webpack serve --no-client-overlay",
|
|
29
29
|
"build": "webpack && npm run typegen",
|
|
30
30
|
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
"dependencies": {
|
|
58
58
|
"@huggingface/jinja": "^0.3.2",
|
|
59
59
|
"onnxruntime-node": "1.20.1",
|
|
60
|
-
"onnxruntime-web": "1.21.0-dev.
|
|
60
|
+
"onnxruntime-web": "1.21.0-dev.20250111-73f5b0c597",
|
|
61
61
|
"sharp": "^0.33.5"
|
|
62
62
|
},
|
|
63
63
|
"devDependencies": {
|
|
@@ -17,23 +17,23 @@ export class FeatureExtractor extends Callable {
|
|
|
17
17
|
}
|
|
18
18
|
|
|
19
19
|
/**
|
|
20
|
-
* Instantiate one of the
|
|
20
|
+
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
21
21
|
*
|
|
22
|
-
* The
|
|
23
|
-
*
|
|
22
|
+
* The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
23
|
+
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
24
24
|
*
|
|
25
25
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
26
|
-
* - A string, the *model id* of a pretrained
|
|
26
|
+
* - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
|
|
27
27
|
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
28
28
|
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
29
|
-
* - A path to a *directory* containing
|
|
30
|
-
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the
|
|
29
|
+
* - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
|
|
30
|
+
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
|
|
31
31
|
*
|
|
32
|
-
* @returns {Promise<FeatureExtractor>} A new instance of the
|
|
32
|
+
* @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
|
|
33
33
|
*/
|
|
34
34
|
static async from_pretrained(pretrained_model_name_or_path, options) {
|
|
35
|
-
const
|
|
36
|
-
return new this(
|
|
35
|
+
const config = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);
|
|
36
|
+
return new this(config);
|
|
37
37
|
}
|
|
38
38
|
}
|
|
39
39
|
|
|
@@ -68,7 +68,7 @@ function enforce_size_divisibility([width, height], divisor) {
|
|
|
68
68
|
* @param {number[]} arr The coordinate for the center of the box and its width, height dimensions (center_x, center_y, width, height)
|
|
69
69
|
* @returns {number[]} The coodinates for the top-left and bottom-right corners of the box (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
|
|
70
70
|
*/
|
|
71
|
-
function center_to_corners_format([centerX, centerY, width, height]) {
|
|
71
|
+
export function center_to_corners_format([centerX, centerY, width, height]) {
|
|
72
72
|
return [
|
|
73
73
|
centerX - width / 2,
|
|
74
74
|
centerY - height / 2,
|
|
@@ -604,14 +604,20 @@ export class ImageProcessor extends Callable {
|
|
|
604
604
|
this.do_thumbnail = config.do_thumbnail;
|
|
605
605
|
this.size = config.size ?? config.image_size;
|
|
606
606
|
this.do_resize = config.do_resize ?? (this.size !== undefined);
|
|
607
|
+
// @ts-expect-error TS2339
|
|
607
608
|
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
|
|
608
609
|
|
|
609
610
|
this.do_center_crop = config.do_center_crop;
|
|
611
|
+
// @ts-expect-error TS2339
|
|
610
612
|
this.crop_size = config.crop_size;
|
|
613
|
+
// @ts-expect-error TS2339
|
|
611
614
|
this.do_convert_rgb = config.do_convert_rgb ?? true;
|
|
615
|
+
// @ts-expect-error TS2339
|
|
612
616
|
this.do_crop_margin = config.do_crop_margin;
|
|
613
617
|
|
|
618
|
+
// @ts-expect-error TS2339
|
|
614
619
|
this.pad_size = config.pad_size;
|
|
620
|
+
// @ts-expect-error TS2339
|
|
615
621
|
this.do_pad = config.do_pad;
|
|
616
622
|
|
|
617
623
|
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
|
|
@@ -820,6 +826,7 @@ export class ImageProcessor extends Callable {
|
|
|
820
826
|
// Support both formats for backwards compatibility
|
|
821
827
|
else if (Number.isInteger(size)) {
|
|
822
828
|
shortest_edge = size;
|
|
829
|
+
// @ts-expect-error TS2339
|
|
823
830
|
longest_edge = this.config.max_size ?? shortest_edge;
|
|
824
831
|
|
|
825
832
|
} else if (size !== undefined) {
|
|
@@ -888,6 +895,7 @@ export class ImageProcessor extends Callable {
|
|
|
888
895
|
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
|
|
889
896
|
// Custom resize logic for Qwen2-VL models
|
|
890
897
|
const { min_pixels, max_pixels } = size;
|
|
898
|
+
// @ts-expect-error TS2339
|
|
891
899
|
const factor = this.config.patch_size * this.config.merge_size;
|
|
892
900
|
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
|
|
893
901
|
} else {
|
|
@@ -903,6 +911,7 @@ export class ImageProcessor extends Callable {
|
|
|
903
911
|
async resize(image) {
|
|
904
912
|
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
|
|
905
913
|
return await image.resize(newWidth, newHeight, {
|
|
914
|
+
// @ts-expect-error TS2322
|
|
906
915
|
resample: this.resample,
|
|
907
916
|
});
|
|
908
917
|
}
|
|
@@ -953,6 +962,7 @@ export class ImageProcessor extends Callable {
|
|
|
953
962
|
|
|
954
963
|
// Resize the image using thumbnail method.
|
|
955
964
|
if (this.do_thumbnail) {
|
|
965
|
+
// @ts-expect-error TS2345
|
|
956
966
|
image = await this.thumbnail(image, this.size, this.resample);
|
|
957
967
|
}
|
|
958
968
|
|
|
@@ -977,6 +987,7 @@ export class ImageProcessor extends Callable {
|
|
|
977
987
|
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
|
|
978
988
|
// occurs with data in the hwc format (height, width, channels),
|
|
979
989
|
// to emulate the behavior of the original Python code (w/ numpy).
|
|
990
|
+
/** @type {Float32Array} */
|
|
980
991
|
let pixelData = Float32Array.from(image.data);
|
|
981
992
|
let imgDims = [image.height, image.width, image.channels];
|
|
982
993
|
|
|
@@ -28,6 +28,7 @@ import { getModelJSON } from '../utils/hub.js';
|
|
|
28
28
|
/**
|
|
29
29
|
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
30
30
|
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
31
|
+
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
|
|
31
32
|
*/
|
|
32
33
|
|
|
33
34
|
|
|
@@ -61,7 +62,7 @@ export class Processor extends Callable {
|
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
/**
|
|
64
|
-
* @returns {
|
|
65
|
+
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
|
|
65
66
|
*/
|
|
66
67
|
get tokenizer() {
|
|
67
68
|
return this.components.tokenizer;
|
|
@@ -74,6 +75,11 @@ export class Processor extends Callable {
|
|
|
74
75
|
return this.components.feature_extractor;
|
|
75
76
|
}
|
|
76
77
|
|
|
78
|
+
/**
|
|
79
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
|
|
80
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
|
|
81
|
+
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
|
|
82
|
+
*/
|
|
77
83
|
apply_chat_template(messages, options = {}) {
|
|
78
84
|
if (!this.tokenizer) {
|
|
79
85
|
throw new Error('Unable to apply chat template without a tokenizer.');
|
|
@@ -84,6 +90,10 @@ export class Processor extends Callable {
|
|
|
84
90
|
});
|
|
85
91
|
}
|
|
86
92
|
|
|
93
|
+
/**
|
|
94
|
+
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
|
|
95
|
+
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
|
|
96
|
+
*/
|
|
87
97
|
batch_decode(...args) {
|
|
88
98
|
if (!this.tokenizer) {
|
|
89
99
|
throw new Error('Unable to decode without a tokenizer.');
|
|
@@ -91,6 +101,17 @@ export class Processor extends Callable {
|
|
|
91
101
|
return this.tokenizer.batch_decode(...args);
|
|
92
102
|
}
|
|
93
103
|
|
|
104
|
+
/**
|
|
105
|
+
* @param {Parameters<PreTrainedTokenizer['decode']>} args
|
|
106
|
+
* @returns {ReturnType<PreTrainedTokenizer['decode']>}
|
|
107
|
+
*/
|
|
108
|
+
decode(...args) {
|
|
109
|
+
if (!this.tokenizer) {
|
|
110
|
+
throw new Error('Unable to decode without a tokenizer.');
|
|
111
|
+
}
|
|
112
|
+
return this.tokenizer.decode(...args);
|
|
113
|
+
}
|
|
114
|
+
|
|
94
115
|
|
|
95
116
|
/**
|
|
96
117
|
* Calls the feature_extractor function with the given input.
|
|
@@ -111,8 +132,8 @@ export class Processor extends Callable {
|
|
|
111
132
|
/**
|
|
112
133
|
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
113
134
|
*
|
|
114
|
-
* The processor class to instantiate is selected based on the `
|
|
115
|
-
* (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
135
|
+
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
136
|
+
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
116
137
|
*
|
|
117
138
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
118
139
|
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
package/src/configs.js
CHANGED
|
@@ -70,15 +70,19 @@ function getNormalizedConfig(config) {
|
|
|
70
70
|
case 'florence2':
|
|
71
71
|
case 'llava_onevision':
|
|
72
72
|
case 'idefics3':
|
|
73
|
+
// @ts-expect-error TS2339
|
|
73
74
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
74
75
|
break;
|
|
75
76
|
case 'moondream1':
|
|
77
|
+
// @ts-expect-error TS2339
|
|
76
78
|
init_normalized_config = getNormalizedConfig(config.phi_config);
|
|
77
79
|
break;
|
|
78
80
|
case 'musicgen':
|
|
81
|
+
// @ts-expect-error TS2339
|
|
79
82
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
80
83
|
break;
|
|
81
84
|
case 'multi_modality':
|
|
85
|
+
// @ts-expect-error TS2339
|
|
82
86
|
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
83
87
|
break;
|
|
84
88
|
|
|
@@ -199,6 +203,7 @@ function getNormalizedConfig(config) {
|
|
|
199
203
|
break;
|
|
200
204
|
|
|
201
205
|
case 'vision-encoder-decoder':
|
|
206
|
+
// @ts-expect-error TS2339
|
|
202
207
|
const decoderConfig = getNormalizedConfig(config.decoder);
|
|
203
208
|
|
|
204
209
|
const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.2.3';
|
|
29
|
+
const VERSION = '3.3.0';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
32
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -160,4 +160,3 @@ export const env = {
|
|
|
160
160
|
function isEmpty(obj) {
|
|
161
161
|
return Object.keys(obj).length === 0;
|
|
162
162
|
}
|
|
163
|
-
|
|
@@ -37,6 +37,7 @@ export class TextStreamer extends BaseStreamer {
|
|
|
37
37
|
* @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
|
|
38
38
|
* @param {Object} options
|
|
39
39
|
* @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
|
|
40
|
+
* @param {boolean} [options.skip_special_tokens=true] Whether to skip special tokens when decoding
|
|
40
41
|
* @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
|
|
41
42
|
* @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
|
|
42
43
|
* @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
|
|
@@ -45,6 +46,7 @@ export class TextStreamer extends BaseStreamer {
|
|
|
45
46
|
skip_prompt = false,
|
|
46
47
|
callback_function = null,
|
|
47
48
|
token_callback_function = null,
|
|
49
|
+
skip_special_tokens = true,
|
|
48
50
|
decode_kwargs = {},
|
|
49
51
|
...kwargs
|
|
50
52
|
} = {}) {
|
|
@@ -53,7 +55,7 @@ export class TextStreamer extends BaseStreamer {
|
|
|
53
55
|
this.skip_prompt = skip_prompt;
|
|
54
56
|
this.callback_function = callback_function ?? stdout_write;
|
|
55
57
|
this.token_callback_function = token_callback_function;
|
|
56
|
-
this.decode_kwargs = { ...decode_kwargs, ...kwargs };
|
|
58
|
+
this.decode_kwargs = { skip_special_tokens, ...decode_kwargs, ...kwargs };
|
|
57
59
|
|
|
58
60
|
// variables used in the streaming process
|
|
59
61
|
this.token_cache = [];
|
|
@@ -169,9 +171,10 @@ export class WhisperTextStreamer extends TextStreamer {
|
|
|
169
171
|
} = {}) {
|
|
170
172
|
super(tokenizer, {
|
|
171
173
|
skip_prompt,
|
|
174
|
+
skip_special_tokens,
|
|
172
175
|
callback_function,
|
|
173
176
|
token_callback_function,
|
|
174
|
-
decode_kwargs
|
|
177
|
+
decode_kwargs,
|
|
175
178
|
});
|
|
176
179
|
this.timestamp_begin = tokenizer.timestamp_begin;
|
|
177
180
|
|
|
@@ -6,22 +6,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
|
6
6
|
|
|
7
7
|
export class AutoFeatureExtractor {
|
|
8
8
|
|
|
9
|
-
/**
|
|
10
|
-
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
11
|
-
*
|
|
12
|
-
* The processor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
13
|
-
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
14
|
-
*
|
|
15
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
17
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
18
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
19
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
20
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
21
|
-
*
|
|
22
|
-
* @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
|
|
23
|
-
*/
|
|
24
|
-
|
|
25
9
|
/** @type {typeof FeatureExtractor.from_pretrained} */
|
|
26
10
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
27
11
|
|
|
@@ -40,22 +40,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
|
40
40
|
*/
|
|
41
41
|
export class AutoProcessor {
|
|
42
42
|
|
|
43
|
-
/**
|
|
44
|
-
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
45
|
-
*
|
|
46
|
-
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
47
|
-
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
48
|
-
*
|
|
49
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
50
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
51
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
52
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
53
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
54
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
55
|
-
*
|
|
56
|
-
* @returns {Promise<Processor>} A new instance of the Processor class.
|
|
57
|
-
*/
|
|
58
|
-
|
|
59
43
|
/** @type {typeof Processor.from_pretrained} */
|
|
60
44
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
61
45
|
|
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
export class EfficientNetImageProcessor extends ImageProcessor {
|
|
6
6
|
constructor(config) {
|
|
7
7
|
super(config);
|
|
8
|
+
// @ts-expect-error TS2339
|
|
8
9
|
this.include_top = this.config.include_top ?? true;
|
|
9
10
|
if (this.include_top) {
|
|
10
11
|
this.image_std = this.image_std.map(x => x * x);
|
|
@@ -10,8 +10,11 @@ export class Florence2Processor extends Processor {
|
|
|
10
10
|
super(config, components);
|
|
11
11
|
|
|
12
12
|
const {
|
|
13
|
+
// @ts-expect-error TS2339
|
|
13
14
|
tasks_answer_post_processing_type,
|
|
15
|
+
// @ts-expect-error TS2339
|
|
14
16
|
task_prompts_without_inputs,
|
|
17
|
+
// @ts-expect-error TS2339
|
|
15
18
|
task_prompts_with_input,
|
|
16
19
|
} = this.image_processor.config;
|
|
17
20
|
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
import {
|
|
3
|
+
ImageProcessor,
|
|
4
|
+
} from "../../base/image_processors_utils.js";
|
|
5
|
+
import { ones } from '../../utils/tensor.js';
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* @typedef {object} GroundingDinoFeatureExtractorResultProps
|
|
10
|
+
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
11
|
+
* @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
export class GroundingDinoImageProcessor extends ImageProcessor {
|
|
15
|
+
/**
|
|
16
|
+
* Calls the feature extraction process on an array of images, preprocesses
|
|
17
|
+
* each image, and concatenates the resulting features into a single Tensor.
|
|
18
|
+
* @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
|
|
19
|
+
* @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
|
|
20
|
+
*/
|
|
21
|
+
async _call(images) {
|
|
22
|
+
const result = await super._call(images);
|
|
23
|
+
|
|
24
|
+
const dims = result.pixel_values.dims;
|
|
25
|
+
const pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
26
|
+
|
|
27
|
+
return { ...result, pixel_mask };
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
+
import { center_to_corners_format } from "../../base/image_processors_utils.js";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Get token ids of phrases from posmaps and input_ids.
|
|
8
|
+
* @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
|
|
9
|
+
* @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
|
|
10
|
+
*/
|
|
11
|
+
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
12
|
+
|
|
13
|
+
const left_idx = 0;
|
|
14
|
+
const right_idx = posmaps.dims.at(-1) - 1;
|
|
15
|
+
|
|
16
|
+
const posmaps_list = posmaps.tolist();
|
|
17
|
+
posmaps_list.fill(false, 0, left_idx + 1);
|
|
18
|
+
posmaps_list.fill(false, right_idx);
|
|
19
|
+
|
|
20
|
+
const input_ids_list = input_ids.tolist();
|
|
21
|
+
return posmaps_list
|
|
22
|
+
.map((val, idx) => val ? idx : null)
|
|
23
|
+
.filter(idx => idx !== null)
|
|
24
|
+
.map(i => input_ids_list[i]);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export class GroundingDinoProcessor extends Processor {
|
|
28
|
+
static tokenizer_class = AutoTokenizer
|
|
29
|
+
static image_processor_class = AutoImageProcessor
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
33
|
+
*/
|
|
34
|
+
/**
|
|
35
|
+
*
|
|
36
|
+
* @param {RawImage|RawImage[]|RawImage[][]} images
|
|
37
|
+
* @param {string|string[]} text
|
|
38
|
+
* @returns {Promise<any>}
|
|
39
|
+
*/
|
|
40
|
+
async _call(images, text, options = {}) {
|
|
41
|
+
|
|
42
|
+
const image_inputs = images ? await this.image_processor(images, options) : {};
|
|
43
|
+
const text_inputs = text ? this.tokenizer(text, options) : {};
|
|
44
|
+
|
|
45
|
+
return {
|
|
46
|
+
...text_inputs,
|
|
47
|
+
...image_inputs,
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
post_process_grounded_object_detection(outputs, input_ids, {
|
|
51
|
+
box_threshold = 0.25,
|
|
52
|
+
text_threshold = 0.25,
|
|
53
|
+
target_sizes = null
|
|
54
|
+
} = {}) {
|
|
55
|
+
const { logits, pred_boxes } = outputs;
|
|
56
|
+
const batch_size = logits.dims[0];
|
|
57
|
+
|
|
58
|
+
if (target_sizes !== null && target_sizes.length !== batch_size) {
|
|
59
|
+
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
|
|
60
|
+
}
|
|
61
|
+
const num_queries = logits.dims.at(1);
|
|
62
|
+
|
|
63
|
+
const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
|
|
64
|
+
const scores = probs.max(-1).tolist(); // (batch_size, num_queries)
|
|
65
|
+
|
|
66
|
+
// Convert to [x0, y0, x1, y1] format
|
|
67
|
+
const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
|
|
68
|
+
.map(batch => batch.map(box => center_to_corners_format(box)));
|
|
69
|
+
|
|
70
|
+
const results = [];
|
|
71
|
+
for (let i = 0; i < batch_size; ++i) {
|
|
72
|
+
const target_size = target_sizes !== null ? target_sizes[i] : null;
|
|
73
|
+
|
|
74
|
+
// Convert from relative [0, 1] to absolute [0, height] coordinates
|
|
75
|
+
if (target_size !== null) {
|
|
76
|
+
boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2]));
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
const batch_scores = scores[i];
|
|
80
|
+
const final_scores = [];
|
|
81
|
+
const final_phrases = [];
|
|
82
|
+
const final_boxes = [];
|
|
83
|
+
for (let j = 0; j < num_queries; ++j) {
|
|
84
|
+
const score = batch_scores[j];
|
|
85
|
+
if (score <= box_threshold) {
|
|
86
|
+
continue;
|
|
87
|
+
}
|
|
88
|
+
const box = boxes[i][j];
|
|
89
|
+
const prob = probs[i][j];
|
|
90
|
+
|
|
91
|
+
final_scores.push(score);
|
|
92
|
+
final_boxes.push(box);
|
|
93
|
+
|
|
94
|
+
const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]);
|
|
95
|
+
final_phrases.push(phrases);
|
|
96
|
+
}
|
|
97
|
+
results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) });
|
|
98
|
+
}
|
|
99
|
+
return results;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
@@ -146,6 +146,8 @@ export class Idefics3ImageProcessor extends ImageProcessor {
|
|
|
146
146
|
|
|
147
147
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
148
148
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
149
|
+
|
|
150
|
+
// @ts-expect-error
|
|
149
151
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
150
152
|
}
|
|
151
153
|
}
|
|
@@ -10,6 +10,7 @@ export * from './donut/image_processing_donut.js'
|
|
|
10
10
|
export * from './dpt/image_processing_dpt.js'
|
|
11
11
|
export * from './efficientnet/image_processing_efficientnet.js'
|
|
12
12
|
export * from './glpn/image_processing_glpn.js'
|
|
13
|
+
export * from './grounding_dino/image_processing_grounding_dino.js'
|
|
13
14
|
export * from './idefics3/image_processing_idefics3.js'
|
|
14
15
|
export * from './janus/image_processing_janus.js'
|
|
15
16
|
export * from './jina_clip/image_processing_jina_clip.js'
|
|
@@ -119,6 +119,8 @@ export class MgpstrProcessor extends Processor {
|
|
|
119
119
|
* - bpe_preds: The list of BPE decoded sentences.
|
|
120
120
|
* - wp_preds: The list of wp decoded sentences.
|
|
121
121
|
*/
|
|
122
|
+
// @ts-expect-error The type of this method is not compatible with the one
|
|
123
|
+
// in the base class. It might be a good idea to fix this.
|
|
122
124
|
batch_decode([char_logits, bpe_logits, wp_logits]) {
|
|
123
125
|
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
|
|
124
126
|
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
|
|
@@ -41,6 +41,7 @@ export class PaliGemmaProcessor extends Processor {
|
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
const bos_token = this.tokenizer.bos_token;
|
|
44
|
+
// @ts-expect-error TS2339
|
|
44
45
|
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
45
46
|
let input_strings;
|
|
46
47
|
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
@@ -14,7 +14,7 @@ export class Phi3VProcessor extends Processor {
|
|
|
14
14
|
*
|
|
15
15
|
* @param {string|string[]} text
|
|
16
16
|
* @param {RawImage|RawImage[]} images
|
|
17
|
-
* @param {
|
|
17
|
+
* @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
|
|
18
18
|
* @returns {Promise<any>}
|
|
19
19
|
*/
|
|
20
20
|
async _call(text, images = null, {
|
package/src/models/processors.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
export * from './florence2/processing_florence2.js';
|
|
2
|
-
export * from './mgp_str/processing_mgp_str.js';
|
|
3
|
-
export * from './moonshine/processing_moonshine.js';
|
|
2
|
+
export * from './grounding_dino/processing_grounding_dino.js';
|
|
4
3
|
export * from './idefics3/processing_idefics3.js';
|
|
5
4
|
export * from './janus/processing_janus.js';
|
|
6
5
|
export * from './jina_clip/processing_jina_clip.js';
|
|
6
|
+
export * from './mgp_str/processing_mgp_str.js';
|
|
7
|
+
export * from './moonshine/processing_moonshine.js';
|
|
7
8
|
export * from './owlvit/processing_owlvit.js';
|
|
8
9
|
export * from './phi3_v/processing_phi3_v.js';
|
|
9
10
|
export * from './paligemma/processing_paligemma.js';
|
|
@@ -52,6 +52,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
|
|
|
52
52
|
|
|
53
53
|
let current_speaker = -1;
|
|
54
54
|
for (let i = 0; i < scores.length; ++i) {
|
|
55
|
+
/** @type {number[]} */
|
|
55
56
|
const probabilities = softmax(scores[i]);
|
|
56
57
|
const [score, id] = max(probabilities);
|
|
57
58
|
const [start, end] = [i, i + 1];
|
|
@@ -133,8 +133,8 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
|
|
|
133
133
|
'int64',
|
|
134
134
|
new BigInt64Array(numPaddedFrames),
|
|
135
135
|
[1, numPaddedFrames],
|
|
136
|
-
)
|
|
137
|
-
padded_attention_mask.data.fill(1n, 0, num_frames);
|
|
136
|
+
);
|
|
137
|
+
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
|
|
138
138
|
}
|
|
139
139
|
}
|
|
140
140
|
}
|
|
@@ -44,7 +44,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
const data = features.data;
|
|
47
|
-
const maxValue = max(data)[0];
|
|
47
|
+
const maxValue = max(/** @type {Float32Array} */(data))[0];
|
|
48
48
|
|
|
49
49
|
for (let i = 0; i < data.length; ++i) {
|
|
50
50
|
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|