@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2255 -931
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +2300 -934
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2336 -1012
- package/dist/transformers.web.js +2327 -1003
- package/dist/transformers.web.min.js +17 -17
- package/package.json +4 -4
- package/src/cache_utils.js +62 -0
- package/src/configs.js +45 -24
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +27 -17
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +224 -308
- package/src/models/models.js +14 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +4 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +42 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines.js +1 -0
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +15 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +18 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +14 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +4 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/src/models/models.js
CHANGED
|
@@ -2,7 +2,7 @@ export * from './albert/modeling_albert.js';
|
|
|
2
2
|
export * from './apertus/modeling_apertus.js';
|
|
3
3
|
export * from './afmoe/modeling_afmoe.js';
|
|
4
4
|
export * from './arcee/modeling_arcee.js';
|
|
5
|
-
export * from './ast/modeling_ast.js';
|
|
5
|
+
export * from './audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js';
|
|
6
6
|
export * from './bart/modeling_bart.js';
|
|
7
7
|
export * from './beit/modeling_beit.js';
|
|
8
8
|
export * from './bert/modeling_bert.js';
|
|
@@ -12,6 +12,7 @@ export * from './bloom/modeling_bloom.js';
|
|
|
12
12
|
export * from './camembert/modeling_camembert.js';
|
|
13
13
|
export * from './chatterbox/modeling_chatterbox.js';
|
|
14
14
|
export * from './chinese_clip/modeling_chinese_clip.js';
|
|
15
|
+
export * from './chmv2/modeling_chmv2.js';
|
|
15
16
|
export * from './clap/modeling_clap.js';
|
|
16
17
|
export * from './clip/modeling_clip.js';
|
|
17
18
|
export * from './clipseg/modeling_clipseg.js';
|
|
@@ -24,6 +25,7 @@ export * from './convnextv2/modeling_convnextv2.js';
|
|
|
24
25
|
export * from './d_fine/modeling_d_fine.js';
|
|
25
26
|
export * from './dac/modeling_dac.js';
|
|
26
27
|
export * from './deberta/modeling_deberta.js';
|
|
28
|
+
export * from './deepseek_v3/modeling_deepseek_v3.js';
|
|
27
29
|
export * from './deberta_v2/modeling_deberta_v2.js';
|
|
28
30
|
export * from './decision_transformer/modeling_decision_transformer.js';
|
|
29
31
|
export * from './deit/modeling_deit.js';
|
|
@@ -41,6 +43,7 @@ export * from './efficientnet/modeling_efficientnet.js';
|
|
|
41
43
|
export * from './electra/modeling_electra.js';
|
|
42
44
|
export * from './ernie4_5/modeling_ernie4_5.js';
|
|
43
45
|
export * from './esm/modeling_esm.js';
|
|
46
|
+
export * from './eurobert/modeling_eurobert.js';
|
|
44
47
|
export * from './exaone/modeling_exaone.js';
|
|
45
48
|
export * from './falcon/modeling_falcon.js';
|
|
46
49
|
export * from './falcon_h1/modeling_falcon_h1.js';
|
|
@@ -51,6 +54,8 @@ export * from './gemma2/modeling_gemma2.js';
|
|
|
51
54
|
export * from './gemma3/modeling_gemma3.js';
|
|
52
55
|
export * from './gemma3n/modeling_gemma3n.js';
|
|
53
56
|
export * from './glm/modeling_glm.js';
|
|
57
|
+
export * from './glm_moe_dsa/modeling_glm_moe_dsa.js';
|
|
58
|
+
export * from './glm_ocr/modeling_glm_ocr.js';
|
|
54
59
|
export * from './glpn/modeling_glpn.js';
|
|
55
60
|
export * from './gpt_bigcode/modeling_gpt_bigcode.js';
|
|
56
61
|
export * from './gpt_neo/modeling_gpt_neo.js';
|
|
@@ -60,6 +65,7 @@ export * from './gpt2/modeling_gpt2.js';
|
|
|
60
65
|
export * from './gptj/modeling_gptj.js';
|
|
61
66
|
export * from './granite/modeling_granite.js';
|
|
62
67
|
export * from './granitemoehybrid/modeling_granitemoehybrid.js';
|
|
68
|
+
export * from './granite_speech/modeling_granite_speech.js';
|
|
63
69
|
export * from './grounding_dino/modeling_grounding_dino.js';
|
|
64
70
|
export * from './groupvit/modeling_groupvit.js';
|
|
65
71
|
export * from './helium/modeling_helium.js';
|
|
@@ -71,7 +77,9 @@ export * from './ijepa/modeling_ijepa.js';
|
|
|
71
77
|
export * from './jais/modeling_jais.js';
|
|
72
78
|
export * from './jina_clip/modeling_jina_clip.js';
|
|
73
79
|
export * from './lfm2/modeling_lfm2.js';
|
|
80
|
+
export * from './lighton_ocr/modeling_lighton_ocr.js';
|
|
74
81
|
export * from './lfm2_moe/modeling_lfm2_moe.js';
|
|
82
|
+
export * from './lfm2_vl/modeling_lfm2_vl.js';
|
|
75
83
|
export * from './llama/modeling_llama.js';
|
|
76
84
|
export * from './llama4/modeling_llama4.js';
|
|
77
85
|
export * from './llava/modeling_llava.js';
|
|
@@ -86,6 +94,7 @@ export * from './metric3dv2/modeling_metric3dv2.js';
|
|
|
86
94
|
export * from './mgp_str/modeling_mgp_str.js';
|
|
87
95
|
export * from './mimi/modeling_mimi.js';
|
|
88
96
|
export * from './mistral/modeling_mistral.js';
|
|
97
|
+
export * from './mistral4/modeling_mistral4.js';
|
|
89
98
|
export * from './mobilebert/modeling_mobilebert.js';
|
|
90
99
|
export * from './mobilellm/modeling_mobilellm.js';
|
|
91
100
|
export * from './mobilenet_v1/modeling_mobilenet_v1.js';
|
|
@@ -103,6 +112,7 @@ export * from './mt5/modeling_mt5.js';
|
|
|
103
112
|
export * from './multi_modality/modeling_multi_modality.js';
|
|
104
113
|
export * from './musicgen/modeling_musicgen.js';
|
|
105
114
|
export * from './nanochat/modeling_nanochat.js';
|
|
115
|
+
export * from './nemotron_h/modeling_nemotron_h.js';
|
|
106
116
|
export * from './neobert/modeling_neobert.js';
|
|
107
117
|
export * from './nomic_bert/modeling_nomic_bert.js';
|
|
108
118
|
export * from './olmo/modeling_olmo.js';
|
|
@@ -146,6 +156,7 @@ export * from './segformer/modeling_segformer.js';
|
|
|
146
156
|
export * from './siglip/modeling_siglip.js';
|
|
147
157
|
export * from './smollm3/modeling_smollm3.js';
|
|
148
158
|
export * from './snac/modeling_snac.js';
|
|
159
|
+
export * from './solar_open/modeling_solar_open.js';
|
|
149
160
|
export * from './speecht5/modeling_speecht5.js';
|
|
150
161
|
export * from './squeezebert/modeling_squeezebert.js';
|
|
151
162
|
export * from './stablelm/modeling_stablelm.js';
|
|
@@ -168,6 +179,8 @@ export * from './vit_msn/modeling_vit_msn.js';
|
|
|
168
179
|
export * from './vitmatte/modeling_vitmatte.js';
|
|
169
180
|
export * from './vitpose/modeling_vitpose.js';
|
|
170
181
|
export * from './vits/modeling_vits.js';
|
|
182
|
+
export * from './voxtral/modeling_voxtral.js';
|
|
183
|
+
export * from './voxtral_realtime/modeling_voxtral_realtime.js';
|
|
171
184
|
export * from './wav2vec2/modeling_wav2vec2.js';
|
|
172
185
|
export * from './wav2vec2_bert/modeling_wav2vec2_bert.js';
|
|
173
186
|
export * from './wavlm/modeling_wavlm.js';
|
|
@@ -1,26 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
|
|
2
2
|
|
|
3
|
-
export class
|
|
4
|
-
forward_params = [
|
|
5
|
-
'input_ids',
|
|
6
|
-
// 'inputs_embeds',
|
|
7
|
-
'attention_mask',
|
|
8
|
-
'pixel_values',
|
|
9
|
-
'position_ids',
|
|
10
|
-
'past_key_values',
|
|
11
|
-
];
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
|
|
15
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
16
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
17
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
18
|
-
|
|
19
|
-
return default_merge_input_ids_with_image_features({
|
|
20
|
-
// @ts-ignore
|
|
21
|
-
image_token_id: this.config.image_token_index,
|
|
22
|
-
...kwargs,
|
|
23
|
-
image_features: reshaped_image_hidden_states,
|
|
24
|
-
});
|
|
25
|
-
}
|
|
26
|
-
}
|
|
3
|
+
/**
 * PaliGemma conditional-generation model.
 * Declared as an empty subclass of `LlavaForConditionalGeneration`, so it adds
 * no members of its own and inherits everything from the Llava class.
 */
export class PaliGemmaForConditionalGeneration extends LlavaForConditionalGeneration {}
|
package/src/models/processors.js
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
export * from './chatterbox/processing_chatterbox.js';
|
|
2
2
|
export * from './florence2/processing_florence2.js';
|
|
3
3
|
export * from './gemma3n/processing_gemma3n.js';
|
|
4
|
+
export * from './glm46v/processing_glm46v.js';
|
|
5
|
+
export * from './granite_speech/processing_granite_speech.js';
|
|
4
6
|
export * from './grounding_dino/processing_grounding_dino.js';
|
|
5
7
|
export * from './idefics3/processing_idefics3.js';
|
|
6
8
|
export * from './janus/processing_janus.js';
|
|
7
9
|
export * from './jina_clip/processing_jina_clip.js';
|
|
10
|
+
export * from './lfm2_vl/processing_lfm2_vl.js';
|
|
8
11
|
export * from './llava/processing_llava.js';
|
|
9
12
|
export * from './mgp_str/processing_mgp_str.js';
|
|
10
13
|
export * from './moonshine/processing_moonshine.js';
|
|
@@ -22,6 +25,7 @@ export * from './smolvlm/processing_smolvlm.js';
|
|
|
22
25
|
export * from './speecht5/processing_speecht5.js';
|
|
23
26
|
export * from './ultravox/processing_ultravox.js';
|
|
24
27
|
export * from './voxtral/processing_voxtral.js';
|
|
28
|
+
export * from './voxtral_realtime/processing_voxtral_realtime.js';
|
|
25
29
|
export * from './wav2vec2/processing_wav2vec2.js';
|
|
26
30
|
export * from './wav2vec2_with_lm/processing_wav2vec2_with_lm.js';
|
|
27
31
|
export * from './whisper/processing_whisper.js';
|
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
import { Qwen2VLForConditionalGeneration } from '../qwen2_vl/modeling_qwen2_vl.js';
|
|
1
|
+
import { Qwen2VLForConditionalGeneration, Qwen2VLForCausalLM } from '../qwen2_vl/modeling_qwen2_vl.js';
|
|
2
2
|
|
|
3
3
|
/**
 * Qwen2.5-VL conditional-generation model.
 * Extends `Qwen2VLForConditionalGeneration` and only overrides the
 * `image_grid_thw_name` field, setting it to 'image_grid_thw'.
 */
export class Qwen2_5_VLForConditionalGeneration extends Qwen2VLForConditionalGeneration {
|
|
4
4
|
image_grid_thw_name = 'image_grid_thw';
|
|
5
5
|
}
|
|
6
|
+
|
|
7
|
+
/**
 * Qwen2.5-VL causal-LM variant.
 * Extends `Qwen2VLForCausalLM` and only overrides the `image_grid_thw_name`
 * field, setting it to 'image_grid_thw'.
 */
export class Qwen2_5_VLForCausalLM extends Qwen2VLForCausalLM {
|
|
8
|
+
image_grid_thw_name = 'image_grid_thw';
|
|
9
|
+
}
|
|
@@ -1,46 +1,6 @@
|
|
|
1
|
-
import { ImageProcessor } from '../../image_processors_utils.js';
|
|
1
|
+
import { ImageProcessor, smart_resize } from '../../image_processors_utils.js';
|
|
2
2
|
import { cat, Tensor } from '../../utils/tensor.js';
|
|
3
3
|
|
|
4
|
-
/**
|
|
5
|
-
* Rescales the image so that the following conditions are met:
|
|
6
|
-
*
|
|
7
|
-
* 1. Both dimensions (height and width) are divisible by 'factor'.
|
|
8
|
-
* 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
|
|
9
|
-
* 3. The aspect ratio of the image is maintained as closely as possible.
|
|
10
|
-
*
|
|
11
|
-
* @param {number} height The height of the image.
|
|
12
|
-
* @param {number} width The width of the image.
|
|
13
|
-
* @param {number} [factor=28] The factor to use for resizing.
|
|
14
|
-
* @param {number} [min_pixels=56*56] The minimum number of pixels.
|
|
15
|
-
* @param {number} [max_pixels=14*14*4*1280] The maximum number of pixels.
|
|
16
|
-
* @returns {[number, number]} The new height and width of the image.
|
|
17
|
-
* @throws {Error} If the height or width is smaller than the factor.
|
|
18
|
-
*/
|
|
19
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
20
|
-
if (height < factor || width < factor) {
|
|
21
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
22
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
23
|
-
throw new Error(
|
|
24
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`,
|
|
25
|
-
);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
29
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
30
|
-
|
|
31
|
-
if (h_bar * w_bar > max_pixels) {
|
|
32
|
-
const beta = Math.sqrt((height * width) / max_pixels);
|
|
33
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
34
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
35
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
36
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
37
|
-
h_bar = Math.ceil((height * beta) / factor) * factor;
|
|
38
|
-
w_bar = Math.ceil((width * beta) / factor) * factor;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
return [h_bar, w_bar];
|
|
42
|
-
}
|
|
43
|
-
|
|
44
4
|
export class Qwen2VLImageProcessor extends ImageProcessor {
|
|
45
5
|
constructor(config) {
|
|
46
6
|
super(config);
|
|
@@ -1,9 +1,4 @@
|
|
|
1
|
-
import {
|
|
2
|
-
PreTrainedModel,
|
|
3
|
-
cumsum_masked_fill,
|
|
4
|
-
default_merge_input_ids_with_image_features,
|
|
5
|
-
getPastLength,
|
|
6
|
-
} from '../modeling_utils.js';
|
|
1
|
+
import { PreTrainedModel, cumsum_masked_fill, default_merge_input_ids_with_image_features } from '../modeling_utils.js';
|
|
7
2
|
import { sessionRun } from '../session.js';
|
|
8
3
|
import { stack, Tensor, ones_like, zeros } from '../../utils/tensor.js';
|
|
9
4
|
import { max } from '../../utils/maths.js';
|
|
@@ -22,8 +17,172 @@ export class Qwen2VLPreTrainedModel extends PreTrainedModel {
|
|
|
22
17
|
];
|
|
23
18
|
}
|
|
24
19
|
export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
20
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
21
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
22
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
25
23
|
image_grid_thw_name = 'grid_thw';
|
|
26
24
|
|
|
25
|
+
/**
|
|
26
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
27
|
+
* @param {Tensor} input_ids
|
|
28
|
+
* @param {Tensor} attention_mask
|
|
29
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
30
|
+
*/
|
|
31
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
32
|
+
if (attention_mask) {
|
|
33
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
34
|
+
|
|
35
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
36
|
+
/** @type {bigint[]} */
|
|
37
|
+
const mrope_position_deltas = Array.from(
|
|
38
|
+
{ length: dims[0] },
|
|
39
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]),
|
|
40
|
+
);
|
|
41
|
+
|
|
42
|
+
return [
|
|
43
|
+
new Tensor('int64', position_ids, [3, ...dims]),
|
|
44
|
+
new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
|
|
45
|
+
];
|
|
46
|
+
} else {
|
|
47
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
48
|
+
const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) =>
|
|
49
|
+
BigInt(Math.floor((i % seq_length) / batch_size)),
|
|
50
|
+
);
|
|
51
|
+
|
|
52
|
+
return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
58
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
59
|
+
* respecting attention mask.
|
|
60
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
61
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
62
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
63
|
+
* @param {number} batch_idx Current batch index
|
|
64
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
65
|
+
*/
|
|
66
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
67
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
68
|
+
const llm_positions = new Array(total_len);
|
|
69
|
+
let index = 0;
|
|
70
|
+
for (let x = 0; x < 3; ++x) {
|
|
71
|
+
for (const val of llm_pos_ids_list) {
|
|
72
|
+
const seg_len = val.length / 3;
|
|
73
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
74
|
+
llm_positions[index++] = val[z];
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
let count = 0;
|
|
80
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
81
|
+
if (attn_mask[y] == 1) {
|
|
82
|
+
for (let x = 0; x < 3; ++x) {
|
|
83
|
+
position_ids_list[x][batch_idx][y] = llm_positions[(x * total_len) / 3 + count];
|
|
84
|
+
}
|
|
85
|
+
++count;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
return llm_positions;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/**
|
|
93
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
94
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
95
|
+
* @param {object} params
|
|
96
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
97
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
98
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
99
|
+
* @param {number} params.spatial_merge_size
|
|
100
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
101
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
102
|
+
*/
|
|
103
|
+
_get_multimodal_rope_positions({
|
|
104
|
+
filtered_ids,
|
|
105
|
+
image_grid_thw_list,
|
|
106
|
+
video_grid_thw_list,
|
|
107
|
+
spatial_merge_size,
|
|
108
|
+
state,
|
|
109
|
+
}) {
|
|
110
|
+
// @ts-ignore
|
|
111
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
112
|
+
|
|
113
|
+
const ids = filtered_ids;
|
|
114
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
115
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
116
|
+
return acc;
|
|
117
|
+
}, []);
|
|
118
|
+
|
|
119
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
120
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
121
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
122
|
+
|
|
123
|
+
/** @type {number[][]} */
|
|
124
|
+
const llm_pos_ids_list = [];
|
|
125
|
+
let st = 0;
|
|
126
|
+
let remain_images = image_nums;
|
|
127
|
+
let remain_videos = video_nums;
|
|
128
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
129
|
+
const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
|
|
130
|
+
const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
|
|
131
|
+
|
|
132
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
133
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
134
|
+
|
|
135
|
+
let ed;
|
|
136
|
+
let t, h, w;
|
|
137
|
+
if (ed_image < ed_video) {
|
|
138
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
139
|
+
++state.image_index;
|
|
140
|
+
--remain_images;
|
|
141
|
+
ed = ed_image;
|
|
142
|
+
} else {
|
|
143
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
144
|
+
++state.video_index;
|
|
145
|
+
--remain_videos;
|
|
146
|
+
ed = ed_video;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
150
|
+
Number(t),
|
|
151
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
152
|
+
Math.floor(Number(w) / spatial_merge_size),
|
|
153
|
+
];
|
|
154
|
+
const text_len = ed - st;
|
|
155
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
156
|
+
|
|
157
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
|
|
158
|
+
|
|
159
|
+
const offset = text_len + st_idx;
|
|
160
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
161
|
+
const t_index = Array.from(
|
|
162
|
+
{ length: grid_size },
|
|
163
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)),
|
|
164
|
+
);
|
|
165
|
+
const h_index = Array.from(
|
|
166
|
+
{ length: grid_size },
|
|
167
|
+
(_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h),
|
|
168
|
+
);
|
|
169
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w));
|
|
170
|
+
|
|
171
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
172
|
+
|
|
173
|
+
st = ed + grid_size;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
if (st < ids.length) {
|
|
177
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
178
|
+
const text_len = ids.length - st;
|
|
179
|
+
|
|
180
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
return llm_pos_ids_list;
|
|
184
|
+
}
|
|
185
|
+
|
|
27
186
|
/**
|
|
28
187
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
29
188
|
*
|
|
@@ -53,137 +212,49 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
53
212
|
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
54
213
|
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
55
214
|
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
56
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
57
|
-
*
|
|
58
|
-
* - 0 for tokens that are **masked**.
|
|
59
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
60
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
61
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
215
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
216
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
62
217
|
*/
|
|
63
218
|
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
64
219
|
// @ts-ignore
|
|
65
|
-
const { vision_config
|
|
220
|
+
const { vision_config } = this.config;
|
|
66
221
|
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
67
222
|
|
|
68
|
-
const mrope_position_deltas = [];
|
|
69
223
|
if (image_grid_thw || video_grid_thw) {
|
|
70
|
-
|
|
224
|
+
const total_input_ids = input_ids.tolist();
|
|
71
225
|
if (!attention_mask) {
|
|
72
226
|
attention_mask = ones_like(input_ids);
|
|
73
227
|
}
|
|
74
228
|
|
|
75
229
|
const attention_mask_list = attention_mask.tolist();
|
|
76
|
-
const position_ids_list = Array.from({ length: 3 }, (
|
|
77
|
-
Array.from({ length: input_ids.dims[0] }, (
|
|
230
|
+
const position_ids_list = Array.from({ length: 3 }, () =>
|
|
231
|
+
Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0)),
|
|
78
232
|
);
|
|
79
233
|
|
|
80
234
|
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
81
235
|
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
236
|
+
const state = { image_index: 0, video_index: 0 };
|
|
82
237
|
|
|
83
|
-
|
|
84
|
-
let video_index = 0;
|
|
238
|
+
const mrope_position_deltas = [];
|
|
85
239
|
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
86
|
-
const
|
|
87
|
-
|
|
88
|
-
const
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
103
|
-
const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
|
|
104
|
-
const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
|
|
105
|
-
|
|
106
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
107
|
-
|
|
108
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
109
|
-
|
|
110
|
-
let ed;
|
|
111
|
-
let t, h, w;
|
|
112
|
-
if (ed_image < ed_video) {
|
|
113
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
114
|
-
++image_index;
|
|
115
|
-
--remain_images;
|
|
116
|
-
ed = ed_image;
|
|
117
|
-
} else {
|
|
118
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
119
|
-
++video_index;
|
|
120
|
-
--remain_videos;
|
|
121
|
-
ed = ed_video;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
125
|
-
Number(t),
|
|
126
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
127
|
-
Math.floor(Number(w) / spatial_merge_size),
|
|
128
|
-
];
|
|
129
|
-
const text_len = ed - st;
|
|
130
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
131
|
-
|
|
132
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
|
|
133
|
-
|
|
134
|
-
const offset = text_len + st_idx;
|
|
135
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
136
|
-
const t_index = Array.from(
|
|
137
|
-
{ length: grid_size },
|
|
138
|
-
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)),
|
|
139
|
-
);
|
|
140
|
-
const h_index = Array.from(
|
|
141
|
-
{ length: grid_size },
|
|
142
|
-
(_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h),
|
|
143
|
-
);
|
|
144
|
-
const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w));
|
|
145
|
-
|
|
146
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
147
|
-
|
|
148
|
-
st = ed + grid_size;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
if (st < ids.length) {
|
|
152
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
153
|
-
const text_len = ids.length - st;
|
|
154
|
-
|
|
155
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
159
|
-
// meaning to perform concatenation along dim=1, we can do the following:
|
|
160
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
161
|
-
/** @type {number[]} */
|
|
162
|
-
const llm_positions = new Array(num_items);
|
|
163
|
-
let index = 0;
|
|
164
|
-
for (let x = 0; x < 3; ++x) {
|
|
165
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
166
|
-
const val = llm_pos_ids_list[y];
|
|
167
|
-
const text_len = val.length / 3;
|
|
168
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
169
|
-
llm_positions[index++] = val[z];
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
let count = 0;
|
|
175
|
-
const attn_mask = attention_mask_list[i];
|
|
176
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
177
|
-
if (attn_mask[y] == 1) {
|
|
178
|
-
for (let x = 0; x < 3; ++x) {
|
|
179
|
-
position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count];
|
|
180
|
-
}
|
|
181
|
-
++count;
|
|
182
|
-
}
|
|
183
|
-
}
|
|
240
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
241
|
+
|
|
242
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
243
|
+
filtered_ids,
|
|
244
|
+
image_grid_thw_list,
|
|
245
|
+
video_grid_thw_list,
|
|
246
|
+
spatial_merge_size,
|
|
247
|
+
state,
|
|
248
|
+
});
|
|
249
|
+
|
|
250
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
251
|
+
llm_pos_ids_list,
|
|
252
|
+
attention_mask_list[i],
|
|
253
|
+
position_ids_list,
|
|
254
|
+
i,
|
|
255
|
+
);
|
|
184
256
|
|
|
185
|
-
|
|
186
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
257
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
187
258
|
}
|
|
188
259
|
|
|
189
260
|
return [
|
|
@@ -191,29 +262,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
191
262
|
new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
|
|
192
263
|
];
|
|
193
264
|
} else {
|
|
194
|
-
|
|
195
|
-
if (attention_mask) {
|
|
196
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
197
|
-
|
|
198
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
199
|
-
/** @type {bigint[]} */
|
|
200
|
-
const mrope_position_deltas = Array.from(
|
|
201
|
-
{ length: dims[0] },
|
|
202
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]),
|
|
203
|
-
);
|
|
204
|
-
|
|
205
|
-
return [
|
|
206
|
-
new Tensor('int64', position_ids, [3, ...dims]),
|
|
207
|
-
new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
|
|
208
|
-
];
|
|
209
|
-
} else {
|
|
210
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
211
|
-
const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) =>
|
|
212
|
-
BigInt(Math.floor((i % seq_length) / batch_size)),
|
|
213
|
-
);
|
|
214
|
-
|
|
215
|
-
return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
216
|
-
}
|
|
265
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
217
266
|
}
|
|
218
267
|
}
|
|
219
268
|
|
|
@@ -250,7 +299,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
250
299
|
model_inputs.pixel_values = null;
|
|
251
300
|
// model_inputs.pixel_values_videos = null;
|
|
252
301
|
|
|
253
|
-
const past_length =
|
|
302
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
254
303
|
|
|
255
304
|
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
256
305
|
// Externally provided `past_key_values` with full input_ids:
|
|
@@ -287,3 +336,5 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
287
336
|
return model_inputs;
|
|
288
337
|
}
|
|
289
338
|
}
|
|
339
|
+
|
|
340
|
+
export class Qwen2VLForCausalLM extends Qwen2VLForConditionalGeneration {}
|
|
@@ -6,6 +6,7 @@ import { RawImage } from '../../utils/image.js';
|
|
|
6
6
|
export class Qwen2VLProcessor extends Processor {
|
|
7
7
|
static image_processor_class = AutoImageProcessor;
|
|
8
8
|
static tokenizer_class = AutoTokenizer;
|
|
9
|
+
static image_token = '<|image_pad|>';
|
|
9
10
|
|
|
10
11
|
/**
|
|
11
12
|
*
|
|
@@ -31,13 +32,14 @@ export class Qwen2VLProcessor extends Processor {
|
|
|
31
32
|
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
32
33
|
let index = 0;
|
|
33
34
|
|
|
35
|
+
const image_token = /** @type {typeof Qwen2VLProcessor} */ (this.constructor).image_token;
|
|
34
36
|
const image_grid_thw_list = image_grid_thw.tolist();
|
|
35
37
|
text = text.map((t) => {
|
|
36
|
-
while (t.includes(
|
|
38
|
+
while (t.includes(image_token)) {
|
|
37
39
|
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
38
|
-
t = t.replace(
|
|
40
|
+
t = t.replace(image_token, '<|placeholder|>'.repeat(Math.floor(prod / merge_length)));
|
|
39
41
|
}
|
|
40
|
-
return t.replaceAll('<|placeholder|>',
|
|
42
|
+
return t.replaceAll('<|placeholder|>', image_token);
|
|
41
43
|
});
|
|
42
44
|
}
|
|
43
45
|
|
|
@@ -46,7 +48,6 @@ export class Qwen2VLProcessor extends Processor {
|
|
|
46
48
|
return {
|
|
47
49
|
...text_inputs,
|
|
48
50
|
...image_inputs,
|
|
49
|
-
// TODO: ...videos_inputs,
|
|
50
51
|
};
|
|
51
52
|
}
|
|
52
53
|
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
-
import { Qwen3_5ForConditionalGeneration } from '../qwen3_5/modeling_qwen3_5.js';
|
|
1
|
+
import { Qwen3_5ForConditionalGeneration, Qwen3_5ForCausalLM } from '../qwen3_5/modeling_qwen3_5.js';
|
|
2
2
|
|
|
3
3
|
export class Qwen3_5MoeForConditionalGeneration extends Qwen3_5ForConditionalGeneration {}
|
|
4
|
+
export class Qwen3_5MoeForCausalLM extends Qwen3_5ForCausalLM {}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
-
import { Qwen2_5_VLForConditionalGeneration } from '../qwen2_5_vl/modeling_qwen2_5_vl.js';
|
|
1
|
+
import { Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLForCausalLM } from '../qwen2_5_vl/modeling_qwen2_5_vl.js';
|
|
2
2
|
|
|
3
3
|
export class Qwen3VLForConditionalGeneration extends Qwen2_5_VLForConditionalGeneration {}
|
|
4
|
+
export class Qwen3VLForCausalLM extends Qwen2_5_VLForCausalLM {}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
-
import { Qwen3VLForConditionalGeneration } from '../qwen3_vl/modeling_qwen3_vl.js';
|
|
1
|
+
import { Qwen3VLForConditionalGeneration, Qwen3VLForCausalLM } from '../qwen3_vl/modeling_qwen3_vl.js';
|
|
2
2
|
|
|
3
3
|
export class Qwen3VLMoeForConditionalGeneration extends Qwen3VLForConditionalGeneration {}
|
|
4
|
+
export class Qwen3VLMoeForCausalLM extends Qwen3VLForCausalLM {}
|