@huggingface/transformers 3.2.3 → 3.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/transformers.cjs +203 -92
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +203 -92
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +203 -92
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +11 -0
- package/src/base/processing_utils.js +13 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -1
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +50 -15
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +34 -7
- package/src/tokenizers.js +4 -7
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +42 -10
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +17 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +32 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -2
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +8 -4
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.2.3",
|
|
3
|
+
"version": "3.2.4",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"scripts": {
|
|
25
25
|
"format": "prettier --write .",
|
|
26
26
|
"format:check": "prettier --check .",
|
|
27
|
-
"typegen": "tsc
|
|
27
|
+
"typegen": "tsc --build",
|
|
28
28
|
"dev": "webpack serve --no-client-overlay",
|
|
29
29
|
"build": "webpack && npm run typegen",
|
|
30
30
|
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
|
|
@@ -17,23 +17,23 @@ export class FeatureExtractor extends Callable {
|
|
|
17
17
|
}
|
|
18
18
|
|
|
19
19
|
/**
|
|
20
|
-
* Instantiate one of the
|
|
20
|
+
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
21
21
|
*
|
|
22
|
-
* The
|
|
23
|
-
*
|
|
22
|
+
* The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
23
|
+
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
24
24
|
*
|
|
25
25
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
26
|
-
* - A string, the *model id* of a pretrained
|
|
26
|
+
* - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
|
|
27
27
|
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
28
28
|
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
29
|
-
* - A path to a *directory* containing
|
|
30
|
-
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the
|
|
29
|
+
* - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
|
|
30
|
+
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
|
|
31
31
|
*
|
|
32
|
-
* @returns {Promise<FeatureExtractor>} A new instance of the
|
|
32
|
+
* @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
|
|
33
33
|
*/
|
|
34
34
|
static async from_pretrained(pretrained_model_name_or_path, options) {
|
|
35
|
-
const
|
|
36
|
-
return new this(
|
|
35
|
+
const config = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);
|
|
36
|
+
return new this(config);
|
|
37
37
|
}
|
|
38
38
|
}
|
|
39
39
|
|
|
@@ -604,14 +604,20 @@ export class ImageProcessor extends Callable {
|
|
|
604
604
|
this.do_thumbnail = config.do_thumbnail;
|
|
605
605
|
this.size = config.size ?? config.image_size;
|
|
606
606
|
this.do_resize = config.do_resize ?? (this.size !== undefined);
|
|
607
|
+
// @ts-expect-error TS2339
|
|
607
608
|
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
|
|
608
609
|
|
|
609
610
|
this.do_center_crop = config.do_center_crop;
|
|
611
|
+
// @ts-expect-error TS2339
|
|
610
612
|
this.crop_size = config.crop_size;
|
|
613
|
+
// @ts-expect-error TS2339
|
|
611
614
|
this.do_convert_rgb = config.do_convert_rgb ?? true;
|
|
615
|
+
// @ts-expect-error TS2339
|
|
612
616
|
this.do_crop_margin = config.do_crop_margin;
|
|
613
617
|
|
|
618
|
+
// @ts-expect-error TS2339
|
|
614
619
|
this.pad_size = config.pad_size;
|
|
620
|
+
// @ts-expect-error TS2339
|
|
615
621
|
this.do_pad = config.do_pad;
|
|
616
622
|
|
|
617
623
|
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
|
|
@@ -820,6 +826,7 @@ export class ImageProcessor extends Callable {
|
|
|
820
826
|
// Support both formats for backwards compatibility
|
|
821
827
|
else if (Number.isInteger(size)) {
|
|
822
828
|
shortest_edge = size;
|
|
829
|
+
// @ts-expect-error TS2339
|
|
823
830
|
longest_edge = this.config.max_size ?? shortest_edge;
|
|
824
831
|
|
|
825
832
|
} else if (size !== undefined) {
|
|
@@ -888,6 +895,7 @@ export class ImageProcessor extends Callable {
|
|
|
888
895
|
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
|
|
889
896
|
// Custom resize logic for Qwen2-VL models
|
|
890
897
|
const { min_pixels, max_pixels } = size;
|
|
898
|
+
// @ts-expect-error TS2339
|
|
891
899
|
const factor = this.config.patch_size * this.config.merge_size;
|
|
892
900
|
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
|
|
893
901
|
} else {
|
|
@@ -903,6 +911,7 @@ export class ImageProcessor extends Callable {
|
|
|
903
911
|
async resize(image) {
|
|
904
912
|
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
|
|
905
913
|
return await image.resize(newWidth, newHeight, {
|
|
914
|
+
// @ts-expect-error TS2322
|
|
906
915
|
resample: this.resample,
|
|
907
916
|
});
|
|
908
917
|
}
|
|
@@ -953,6 +962,7 @@ export class ImageProcessor extends Callable {
|
|
|
953
962
|
|
|
954
963
|
// Resize the image using thumbnail method.
|
|
955
964
|
if (this.do_thumbnail) {
|
|
965
|
+
// @ts-expect-error TS2345
|
|
956
966
|
image = await this.thumbnail(image, this.size, this.resample);
|
|
957
967
|
}
|
|
958
968
|
|
|
@@ -977,6 +987,7 @@ export class ImageProcessor extends Callable {
|
|
|
977
987
|
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
|
|
978
988
|
// occurs with data in the hwc format (height, width, channels),
|
|
979
989
|
// to emulate the behavior of the original Python code (w/ numpy).
|
|
990
|
+
/** @type {Float32Array} */
|
|
980
991
|
let pixelData = Float32Array.from(image.data);
|
|
981
992
|
let imgDims = [image.height, image.width, image.channels];
|
|
982
993
|
|
|
@@ -28,6 +28,7 @@ import { getModelJSON } from '../utils/hub.js';
|
|
|
28
28
|
/**
|
|
29
29
|
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
30
30
|
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
31
|
+
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
|
|
31
32
|
*/
|
|
32
33
|
|
|
33
34
|
|
|
@@ -61,7 +62,7 @@ export class Processor extends Callable {
|
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
/**
|
|
64
|
-
* @returns {
|
|
65
|
+
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
|
|
65
66
|
*/
|
|
66
67
|
get tokenizer() {
|
|
67
68
|
return this.components.tokenizer;
|
|
@@ -74,6 +75,11 @@ export class Processor extends Callable {
|
|
|
74
75
|
return this.components.feature_extractor;
|
|
75
76
|
}
|
|
76
77
|
|
|
78
|
+
/**
|
|
79
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
|
|
80
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
|
|
81
|
+
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
|
|
82
|
+
*/
|
|
77
83
|
apply_chat_template(messages, options = {}) {
|
|
78
84
|
if (!this.tokenizer) {
|
|
79
85
|
throw new Error('Unable to apply chat template without a tokenizer.');
|
|
@@ -84,6 +90,10 @@ export class Processor extends Callable {
|
|
|
84
90
|
});
|
|
85
91
|
}
|
|
86
92
|
|
|
93
|
+
/**
|
|
94
|
+
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
|
|
95
|
+
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
|
|
96
|
+
*/
|
|
87
97
|
batch_decode(...args) {
|
|
88
98
|
if (!this.tokenizer) {
|
|
89
99
|
throw new Error('Unable to decode without a tokenizer.');
|
|
@@ -111,8 +121,8 @@ export class Processor extends Callable {
|
|
|
111
121
|
/**
|
|
112
122
|
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
113
123
|
*
|
|
114
|
-
* The processor class to instantiate is selected based on the `
|
|
115
|
-
* (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
124
|
+
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
125
|
+
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
116
126
|
*
|
|
117
127
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
118
128
|
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
package/src/configs.js
CHANGED
|
@@ -70,15 +70,19 @@ function getNormalizedConfig(config) {
|
|
|
70
70
|
case 'florence2':
|
|
71
71
|
case 'llava_onevision':
|
|
72
72
|
case 'idefics3':
|
|
73
|
+
// @ts-expect-error TS2339
|
|
73
74
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
74
75
|
break;
|
|
75
76
|
case 'moondream1':
|
|
77
|
+
// @ts-expect-error TS2339
|
|
76
78
|
init_normalized_config = getNormalizedConfig(config.phi_config);
|
|
77
79
|
break;
|
|
78
80
|
case 'musicgen':
|
|
81
|
+
// @ts-expect-error TS2339
|
|
79
82
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
80
83
|
break;
|
|
81
84
|
case 'multi_modality':
|
|
85
|
+
// @ts-expect-error TS2339
|
|
82
86
|
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
83
87
|
break;
|
|
84
88
|
|
|
@@ -199,6 +203,7 @@ function getNormalizedConfig(config) {
|
|
|
199
203
|
break;
|
|
200
204
|
|
|
201
205
|
case 'vision-encoder-decoder':
|
|
206
|
+
// @ts-expect-error TS2339
|
|
202
207
|
const decoderConfig = getNormalizedConfig(config.decoder);
|
|
203
208
|
|
|
204
209
|
const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.2.3';
|
|
29
|
+
const VERSION = '3.2.4';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
32
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -6,22 +6,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
|
6
6
|
|
|
7
7
|
export class AutoFeatureExtractor {
|
|
8
8
|
|
|
9
|
-
/**
|
|
10
|
-
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
11
|
-
*
|
|
12
|
-
* The processor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
13
|
-
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
14
|
-
*
|
|
15
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
17
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
18
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
19
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
20
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
21
|
-
*
|
|
22
|
-
* @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
|
|
23
|
-
*/
|
|
24
|
-
|
|
25
9
|
/** @type {typeof FeatureExtractor.from_pretrained} */
|
|
26
10
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
27
11
|
|
|
@@ -40,22 +40,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
|
40
40
|
*/
|
|
41
41
|
export class AutoProcessor {
|
|
42
42
|
|
|
43
|
-
/**
|
|
44
|
-
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
45
|
-
*
|
|
46
|
-
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
47
|
-
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
48
|
-
*
|
|
49
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
50
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
51
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
52
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
53
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
54
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
55
|
-
*
|
|
56
|
-
* @returns {Promise<Processor>} A new instance of the Processor class.
|
|
57
|
-
*/
|
|
58
|
-
|
|
59
43
|
/** @type {typeof Processor.from_pretrained} */
|
|
60
44
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
61
45
|
|
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
export class EfficientNetImageProcessor extends ImageProcessor {
|
|
6
6
|
constructor(config) {
|
|
7
7
|
super(config);
|
|
8
|
+
// @ts-expect-error TS2339
|
|
8
9
|
this.include_top = this.config.include_top ?? true;
|
|
9
10
|
if (this.include_top) {
|
|
10
11
|
this.image_std = this.image_std.map(x => x * x);
|
|
@@ -10,8 +10,11 @@ export class Florence2Processor extends Processor {
|
|
|
10
10
|
super(config, components);
|
|
11
11
|
|
|
12
12
|
const {
|
|
13
|
+
// @ts-expect-error TS2339
|
|
13
14
|
tasks_answer_post_processing_type,
|
|
15
|
+
// @ts-expect-error TS2339
|
|
14
16
|
task_prompts_without_inputs,
|
|
17
|
+
// @ts-expect-error TS2339
|
|
15
18
|
task_prompts_with_input,
|
|
16
19
|
} = this.image_processor.config;
|
|
17
20
|
|
|
@@ -146,6 +146,8 @@ export class Idefics3ImageProcessor extends ImageProcessor {
|
|
|
146
146
|
|
|
147
147
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
148
148
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
149
|
+
|
|
150
|
+
// @ts-expect-error
|
|
149
151
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
150
152
|
}
|
|
151
153
|
}
|
|
@@ -119,6 +119,8 @@ export class MgpstrProcessor extends Processor {
|
|
|
119
119
|
* - bpe_preds: The list of BPE decoded sentences.
|
|
120
120
|
* - wp_preds: The list of wp decoded sentences.
|
|
121
121
|
*/
|
|
122
|
+
// @ts-expect-error The type of this method is not compatible with the one
|
|
123
|
+
// in the base class. It might be a good idea to fix this.
|
|
122
124
|
batch_decode([char_logits, bpe_logits, wp_logits]) {
|
|
123
125
|
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
|
|
124
126
|
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
|
|
@@ -41,6 +41,7 @@ export class PaliGemmaProcessor extends Processor {
|
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
const bos_token = this.tokenizer.bos_token;
|
|
44
|
+
// @ts-expect-error TS2339
|
|
44
45
|
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
45
46
|
let input_strings;
|
|
46
47
|
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
@@ -14,7 +14,7 @@ export class Phi3VProcessor extends Processor {
|
|
|
14
14
|
*
|
|
15
15
|
* @param {string|string[]} text
|
|
16
16
|
* @param {RawImage|RawImage[]} images
|
|
17
|
-
* @param {
|
|
17
|
+
* @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
|
|
18
18
|
* @returns {Promise<any>}
|
|
19
19
|
*/
|
|
20
20
|
async _call(text, images = null, {
|
|
@@ -52,6 +52,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
|
|
|
52
52
|
|
|
53
53
|
let current_speaker = -1;
|
|
54
54
|
for (let i = 0; i < scores.length; ++i) {
|
|
55
|
+
/** @type {number[]} */
|
|
55
56
|
const probabilities = softmax(scores[i]);
|
|
56
57
|
const [score, id] = max(probabilities);
|
|
57
58
|
const [start, end] = [i, i + 1];
|
|
@@ -133,8 +133,8 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
|
|
|
133
133
|
'int64',
|
|
134
134
|
new BigInt64Array(numPaddedFrames),
|
|
135
135
|
[1, numPaddedFrames],
|
|
136
|
-
)
|
|
137
|
-
padded_attention_mask.data.fill(1n, 0, num_frames);
|
|
136
|
+
);
|
|
137
|
+
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
|
|
138
138
|
}
|
|
139
139
|
}
|
|
140
140
|
}
|
|
@@ -44,7 +44,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
const data = features.data;
|
|
47
|
-
const maxValue = max(data)[0];
|
|
47
|
+
const maxValue = max(/** @type {Float32Array} */(data))[0];
|
|
48
48
|
|
|
49
49
|
for (let i = 0; i < data.length; ++i) {
|
|
50
50
|
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
package/src/models.js
CHANGED
|
@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
270
270
|
} else if (session_options.externalData !== undefined) {
|
|
271
271
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
272
272
|
// if the external data is a string, fetch the file and replace the string with its content
|
|
273
|
+
// @ts-expect-error TS2339
|
|
273
274
|
if (typeof ext.data === "string") {
|
|
275
|
+
// @ts-expect-error TS2339
|
|
274
276
|
const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
|
|
277
|
+
// @ts-expect-error TS2698
|
|
275
278
|
return { ...ext, data: ext_buffer };
|
|
276
279
|
}
|
|
277
280
|
return ext;
|
|
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
|
|
|
1519
1522
|
if (this.config.model_type === 'musicgen') {
|
|
1520
1523
|
// Custom logic (TODO: move to Musicgen class)
|
|
1521
1524
|
decoder_input_ids = Array.from({
|
|
1525
|
+
// @ts-expect-error TS2339
|
|
1522
1526
|
length: batch_size * this.config.decoder.num_codebooks
|
|
1523
1527
|
}, () => [decoder_start_token_id]);
|
|
1524
1528
|
|
|
@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
|
|
|
1848
1852
|
async encode_image({ pixel_values }) {
|
|
1849
1853
|
// image_inputs === { pixel_values }
|
|
1850
1854
|
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
|
|
1855
|
+
// @ts-expect-error TS2339
|
|
1851
1856
|
if (!this.config.num_image_tokens) {
|
|
1852
1857
|
console.warn(
|
|
1853
1858
|
'The number of image tokens was not set in the model configuration. ' +
|
|
1854
1859
|
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
|
|
1855
1860
|
)
|
|
1861
|
+
// @ts-expect-error TS2339
|
|
1856
1862
|
this.config.num_image_tokens = features.dims[1];
|
|
1857
1863
|
}
|
|
1858
1864
|
return features;
|
|
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3280
3286
|
|
|
3281
3287
|
if (generation_config.return_token_timestamps) {
|
|
3282
3288
|
outputs["token_timestamps"] = this._extract_token_timestamps(
|
|
3289
|
+
// @ts-expect-error TS2345
|
|
3283
3290
|
outputs,
|
|
3284
3291
|
generation_config.alignment_heads,
|
|
3285
3292
|
generation_config.num_frames,
|
|
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3315
3322
|
);
|
|
3316
3323
|
}
|
|
3317
3324
|
|
|
3325
|
+
// @ts-expect-error TS2339
|
|
3318
3326
|
let median_filter_width = this.config.median_filter_width;
|
|
3319
3327
|
if (median_filter_width === undefined) {
|
|
3320
3328
|
console.warn("Model config has no `median_filter_width`, using default value of 7.")
|
|
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3325
3333
|
const batch = generate_outputs.cross_attentions;
|
|
3326
3334
|
// Create a list with `decoder_layers` elements, each a tensor of shape
|
|
3327
3335
|
// (batch size, attention_heads, output length, input length).
|
|
3336
|
+
// @ts-expect-error TS2339
|
|
3328
3337
|
const cross_attentions = Array.from({ length: this.config.decoder_layers },
|
|
3329
3338
|
// Concatenate the cross attentions for each layer across sequence length dimension.
|
|
3330
3339
|
(_, i) => cat(batch.map(x => x[i]), 2)
|
|
@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
3468
3477
|
attention_mask,
|
|
3469
3478
|
}) {
|
|
3470
3479
|
|
|
3480
|
+
// @ts-expect-error TS2339
|
|
3471
3481
|
const image_token_index = this.config.image_token_index;
|
|
3472
3482
|
|
|
3473
3483
|
const idsList = input_ids.tolist();
|
|
@@ -4453,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4453
4463
|
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
|
|
4454
4464
|
const video_nums = vision_tokens.filter(x => x == video_token_id).length;
|
|
4455
4465
|
|
|
4466
|
+
/** @type {number[][]} */
|
|
4456
4467
|
let llm_pos_ids_list = [];
|
|
4457
4468
|
let st = 0;
|
|
4458
4469
|
let remain_images = image_nums;
|
|
@@ -4522,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4522
4533
|
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
4523
4534
|
// meaning to perform concatenation along dim=1, we can do the following:
|
|
4524
4535
|
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
4536
|
+
/** @type {number[]} */
|
|
4525
4537
|
const llm_positions = new Array(num_items);
|
|
4526
4538
|
let index = 0;
|
|
4527
4539
|
for (let x = 0; x < 3; ++x) {
|
|
@@ -4562,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4562
4574
|
{ length: 3 * data.length },
|
|
4563
4575
|
(_, i) => data[i % data.length]
|
|
4564
4576
|
);
|
|
4577
|
+
/** @type {bigint[]} */
|
|
4565
4578
|
const mrope_position_deltas = Array.from(
|
|
4566
4579
|
{ length: dims[0] },
|
|
4567
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] +
|
|
4580
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
4568
4581
|
);
|
|
4569
4582
|
|
|
4570
4583
|
return [
|
|
@@ -5135,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5135
5148
|
*
|
|
5136
5149
|
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
|
|
5137
5150
|
* ```javascript
|
|
5138
|
-
* import { DPTForDepthEstimation, AutoProcessor, RawImage,
|
|
5151
|
+
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
5139
5152
|
*
|
|
5140
5153
|
* // Load model and processor
|
|
5141
5154
|
* const model_id = 'Xenova/dpt-hybrid-midas';
|
|
@@ -5144,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5144
5157
|
*
|
|
5145
5158
|
* // Load image from URL
|
|
5146
5159
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
5147
|
-
* const image = await RawImage.
|
|
5160
|
+
* const image = await RawImage.read(url);
|
|
5148
5161
|
*
|
|
5149
5162
|
* // Prepare image for the model
|
|
5150
5163
|
* const inputs = await processor(image);
|
|
@@ -5153,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5153
5166
|
* const { predicted_depth } = await model(inputs);
|
|
5154
5167
|
*
|
|
5155
5168
|
* // Interpolate to original size
|
|
5156
|
-
* const prediction =
|
|
5169
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
5170
|
+
* size: image.size.reverse(),
|
|
5171
|
+
* mode: 'bilinear',
|
|
5172
|
+
* })).squeeze(1);
|
|
5157
5173
|
*
|
|
5158
5174
|
* // Visualize the prediction
|
|
5159
|
-
* const
|
|
5175
|
+
* const min = prediction.min().item();
|
|
5176
|
+
* const max = prediction.max().item();
|
|
5177
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
5160
5178
|
* const depth = RawImage.fromTensor(formatted);
|
|
5161
5179
|
* // RawImage {
|
|
5162
5180
|
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
@@ -5206,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
|
5206
5224
|
export class GLPNModel extends GLPNPreTrainedModel { }
|
|
5207
5225
|
|
|
5208
5226
|
/**
|
|
5209
|
-
*
|
|
5210
|
-
*
|
|
5211
|
-
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
|
|
5212
|
-
* ```javascript
|
|
5213
|
-
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
|
|
5227
|
+
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
5214
5228
|
*
|
|
5215
5229
|
* // Load model and processor
|
|
5216
5230
|
* const model_id = 'Xenova/glpn-kitti';
|
|
@@ -5219,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
5219
5233
|
*
|
|
5220
5234
|
* // Load image from URL
|
|
5221
5235
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
5222
|
-
* const image = await RawImage.
|
|
5236
|
+
* const image = await RawImage.read(url);
|
|
5223
5237
|
*
|
|
5224
5238
|
* // Prepare image for the model
|
|
5225
5239
|
* const inputs = await processor(image);
|
|
@@ -5228,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
5228
5242
|
* const { predicted_depth } = await model(inputs);
|
|
5229
5243
|
*
|
|
5230
5244
|
* // Interpolate to original size
|
|
5231
|
-
* const prediction =
|
|
5245
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
5246
|
+
* size: image.size.reverse(),
|
|
5247
|
+
* mode: 'bilinear',
|
|
5248
|
+
* })).squeeze(1);
|
|
5232
5249
|
*
|
|
5233
5250
|
* // Visualize the prediction
|
|
5234
|
-
* const
|
|
5251
|
+
* const min = prediction.min().item();
|
|
5252
|
+
* const max = prediction.max().item();
|
|
5253
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
5235
5254
|
* const depth = RawImage.fromTensor(formatted);
|
|
5236
5255
|
* // RawImage {
|
|
5237
|
-
* // data: Uint8Array(307200) [
|
|
5256
|
+
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
5238
5257
|
* // width: 640,
|
|
5239
5258
|
* // height: 480,
|
|
5240
5259
|
* // channels: 1
|
|
@@ -6201,10 +6220,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
|
|
|
6201
6220
|
|
|
6202
6221
|
const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
|
|
6203
6222
|
|
|
6223
|
+
// @ts-expect-error TS2339
|
|
6204
6224
|
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
|
|
6205
6225
|
const maxlen = Math.floor(r * maxlenratio);
|
|
6206
6226
|
const minlen = Math.floor(r * minlenratio);
|
|
6207
6227
|
|
|
6228
|
+
// @ts-expect-error TS2339
|
|
6208
6229
|
const num_mel_bins = this.config.num_mel_bins;
|
|
6209
6230
|
|
|
6210
6231
|
let spectrogramParts = [];
|
|
@@ -6569,11 +6590,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
|
|
|
6569
6590
|
*/
|
|
6570
6591
|
_apply_and_filter_by_delay_pattern_mask(outputs) {
|
|
6571
6592
|
const [bs_x_codebooks, seqLength] = outputs.dims;
|
|
6593
|
+
// @ts-expect-error TS2339
|
|
6572
6594
|
const num_codebooks = this.config.decoder.num_codebooks;
|
|
6573
6595
|
const upperBound = (seqLength - num_codebooks);
|
|
6574
6596
|
|
|
6575
6597
|
let newDataSize = 0;
|
|
6576
6598
|
for (let i = 0; i < outputs.size; ++i) {
|
|
6599
|
+
// @ts-expect-error TS2339
|
|
6577
6600
|
if (outputs.data[i] === this.config.decoder.pad_token_id) {
|
|
6578
6601
|
continue;
|
|
6579
6602
|
}
|
|
@@ -6603,7 +6626,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
|
|
|
6603
6626
|
let clonedInputIds = structuredClone(input_ids);
|
|
6604
6627
|
for (let i = 0; i < clonedInputIds.length; ++i) {
|
|
6605
6628
|
for (let j = 0; j < clonedInputIds[i].length; ++j) {
|
|
6629
|
+
// @ts-expect-error TS2339
|
|
6606
6630
|
if ((i % this.config.decoder.num_codebooks) >= j) {
|
|
6631
|
+
// @ts-expect-error TS2339
|
|
6607
6632
|
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
|
|
6608
6633
|
}
|
|
6609
6634
|
}
|
|
@@ -6760,6 +6785,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
|
|
|
6760
6785
|
'past_key_values',
|
|
6761
6786
|
];
|
|
6762
6787
|
|
|
6788
|
+
/**
|
|
6789
|
+
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
|
|
6790
|
+
*/
|
|
6763
6791
|
constructor(...args) {
|
|
6764
6792
|
super(...args);
|
|
6765
6793
|
|
|
@@ -7728,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
|
|
|
7728
7756
|
/**
|
|
7729
7757
|
* @param {Object} output The output of the model.
|
|
7730
7758
|
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
|
|
7759
|
+
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
|
|
7760
|
+
* Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
|
7731
7761
|
*/
|
|
7732
|
-
constructor({ logits }) {
|
|
7762
|
+
constructor({ logits, ...attentions }) {
|
|
7733
7763
|
super();
|
|
7734
7764
|
this.logits = logits;
|
|
7765
|
+
const attentions_list = Object.values(attentions);
|
|
7766
|
+
if (attentions_list.length > 0) {
|
|
7767
|
+
// Only set attentions if they are not empty
|
|
7768
|
+
this.attentions = attentions_list;
|
|
7769
|
+
}
|
|
7735
7770
|
}
|
|
7736
7771
|
}
|
|
7737
7772
|
|
package/src/ops/registry.js
CHANGED
|
@@ -36,6 +36,16 @@ export class TensorOpRegistry {
|
|
|
36
36
|
// executionProviders: ['webgpu'],
|
|
37
37
|
};
|
|
38
38
|
|
|
39
|
+
static get nearest_interpolate_4d() {
|
|
40
|
+
if (!this._nearest_interpolate_4d) {
|
|
41
|
+
this._nearest_interpolate_4d = wrap(
|
|
42
|
+
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
|
|
43
|
+
this.session_options,
|
|
44
|
+
'y',
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
return this._nearest_interpolate_4d;
|
|
48
|
+
}
|
|
39
49
|
static get bilinear_interpolate_4d() {
|
|
40
50
|
if (!this._bilinear_interpolate_4d) {
|
|
41
51
|
this._bilinear_interpolate_4d = wrap(
|