@huggingface/transformers 3.1.0 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +965 -195
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +2251 -1360
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -352
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -415
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -352
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +979 -194
- package/dist/transformers.mjs.map +1 -1
- package/package.json +11 -16
- package/src/backends/onnx.js +2 -7
- package/src/configs.js +3 -1
- package/src/env.js +6 -6
- package/src/generation/configuration_utils.js +7 -0
- package/src/generation/logits_process.js +22 -16
- package/src/generation/streamers.js +7 -2
- package/src/models/idefics3/image_processing_idefics3.js +219 -0
- package/src/models/idefics3/processing_idefics3.js +136 -0
- package/src/models/image_processors.js +1 -0
- package/src/models/paligemma/processing_paligemma.js +82 -0
- package/src/models/processors.js +2 -0
- package/src/models.js +169 -39
- package/src/tokenizers.js +12 -1
- package/src/utils/core.js +53 -9
- package/src/utils/dtypes.js +2 -1
- package/src/utils/hub.js +8 -12
- package/src/utils/image.js +59 -16
- package/src/utils/tensor.js +6 -1
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/base/feature_extraction_utils.d.ts +1 -1
- package/types/base/feature_extraction_utils.d.ts.map +1 -1
- package/types/base/image_processors_utils.d.ts +2 -2
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +4 -4
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts +7 -7
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +2 -2
- package/types/env.d.ts.map +1 -1
- package/types/generation/configuration_utils.d.ts +7 -1
- package/types/generation/configuration_utils.d.ts.map +1 -1
- package/types/generation/logits_process.d.ts +32 -22
- package/types/generation/logits_process.d.ts.map +1 -1
- package/types/generation/logits_sampler.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +5 -5
- package/types/generation/stopping_criteria.d.ts +1 -1
- package/types/generation/stopping_criteria.d.ts.map +1 -1
- package/types/generation/streamers.d.ts +15 -10
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +11 -11
- package/types/models/detr/image_processing_detr.d.ts.map +1 -1
- package/types/models/donut/image_processing_donut.d.ts +1 -1
- package/types/models/donut/image_processing_donut.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
- package/types/models/idefics3/processing_idefics3.d.ts +19 -0
- package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/janus/image_processing_janus.d.ts +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/janus/processing_janus.d.ts.map +1 -1
- package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts +12 -0
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts.map +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models/whisper/processing_whisper.d.ts.map +1 -1
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
- package/types/models.d.ts +44 -10
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +26 -51
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +10 -6
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +94 -22
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/devices.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +3 -2
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/generic.d.ts.map +1 -1
- package/types/utils/hub.d.ts +3 -3
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +14 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +10 -10
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +10 -8
- package/types/utils/tensor.d.ts.map +1 -1
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
+
|
|
5
|
+
const IMAGE_TOKEN = "<image>";
|
|
6
|
+
|
|
7
|
+
/**
 * Assemble the text fed to the tokenizer for a single prompt: the image
 * placeholder token repeated `image_seq_len * num_images` times, followed by
 * the BOS token, the prompt itself, and a trailing newline.
 *
 * @param {string} prompt The prompt text for this sample.
 * @param {string} bos_token The beginning-of-sequence token string.
 * @param {number} image_seq_len Number of placeholder tokens emitted per image.
 * @param {string} image_token The image placeholder token string.
 * @param {number} num_images Number of images the prompt refers to.
 * @returns {string} The expanded prompt string.
 */
function build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images) {
    const total_image_tokens = image_seq_len * num_images;
    const image_prefix = image_token.repeat(total_image_tokens);
    return `${image_prefix}${bos_token}${prompt}\n`;
}
|
|
16
|
+
|
|
17
|
+
export class PaliGemmaProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static image_processor_class = AutoImageProcessor;
    static uses_processor_config = false;

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     */

    /**
     * Prepare model inputs from one or more images and optional text.
     *
     * Each text gets its image placeholder tokens expanded (or prepended, when the
     * text contains none), the tokenizer's BOS token inserted after the last
     * placeholder, and a trailing newline appended. The resulting strings are
     * tokenized and the images are run through the image processor.
     *
     * `images` is required, `text` is optional.
     *
     * @param {RawImage|RawImage[]} images A single image or an array of images.
     * @param {string|string[]|null} [text=null] Prompt(s); a falsy value is replaced by an empty prompt.
     * @param {Object} [kwargs={}] Options forwarded to both the tokenizer and the image processor.
     * @returns {Promise<Object>} The image processor outputs merged with the tokenizer outputs
     * (tokenizer keys win on conflict).
     */
    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
        let prompts = text;
        if (!prompts) {
            console.warn(
                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
            );
            prompts = "";
        }

        // Normalize both inputs to arrays.
        const image_list = Array.isArray(images) ? images : [images];
        const prompt_list = Array.isArray(prompts) ? prompts : [prompts];

        const bos_token = this.tokenizer.bos_token;
        const image_seq_length = this.image_processor.config.image_seq_length;

        const has_image_token = prompt_list.some((prompt) => prompt.includes(IMAGE_TOKEN));

        let input_strings;
        if (!has_image_token) {
            console.warn(
                "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
                "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
                "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
                "each text has and add special tokens."
            );

            // No placeholders anywhere: prepend one run of placeholders per image to every prompt.
            input_strings = prompt_list.map((prompt) => build_string_from_input(
                prompt,
                bos_token,
                image_seq_length,
                IMAGE_TOKEN,
                image_list.length,
            ));
        } else {
            const expanded_image_token = IMAGE_TOKEN.repeat(image_seq_length);
            input_strings = prompt_list.map((prompt) => {
                // Each placeholder stands for `image_seq_length` positions.
                const expanded = prompt.replaceAll(IMAGE_TOKEN, expanded_image_token);

                // The BOS token goes right after the last placeholder (or at the very
                // start when this particular prompt has none).
                const last_image_at = expanded.lastIndexOf(IMAGE_TOKEN);
                const split_at = last_image_at < 0 ? 0 : last_image_at + IMAGE_TOKEN.length;

                const head = expanded.slice(0, split_at);
                const tail = expanded.slice(split_at);
                return `${head}${bos_token}${tail}\n`;
            });
        }

        const text_inputs = this.tokenizer(input_strings, kwargs);
        const image_inputs = await this.image_processor(image_list, kwargs);

        return { ...image_inputs, ...text_inputs };
    }
}
|
package/src/models/processors.js
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
export * from './florence2/processing_florence2.js';
|
|
2
2
|
export * from './mgp_str/processing_mgp_str.js';
|
|
3
|
+
export * from './idefics3/processing_idefics3.js';
|
|
3
4
|
export * from './janus/processing_janus.js';
|
|
4
5
|
export * from './jina_clip/processing_jina_clip.js';
|
|
5
6
|
export * from './owlvit/processing_owlvit.js';
|
|
7
|
+
export * from './paligemma/processing_paligemma.js';
|
|
6
8
|
export * from './pyannote/processing_pyannote.js';
|
|
7
9
|
export * from './qwen2_vl/processing_qwen2_vl.js';
|
|
8
10
|
export * from './sam/processing_sam.js';
|
package/src/models.js
CHANGED
|
@@ -182,6 +182,22 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
182
182
|
}
|
|
183
183
|
}
|
|
184
184
|
|
|
185
|
+
if (dtype === DATA_TYPES.auto) {
|
|
186
|
+
// Try to choose the auto dtype based on the custom config
|
|
187
|
+
let config_dtype = custom_config.dtype;
|
|
188
|
+
if (typeof config_dtype !== 'string') {
|
|
189
|
+
config_dtype = config_dtype[fileName];
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) {
|
|
193
|
+
// Defined by the custom config, and is not "auto"
|
|
194
|
+
dtype = config_dtype;
|
|
195
|
+
} else {
|
|
196
|
+
// Choose default dtype based on device, falling back to fp32
|
|
197
|
+
dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32;
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
185
201
|
const selectedDtype = /** @type {import("./utils/dtypes.js").DataType} */(dtype);
|
|
186
202
|
|
|
187
203
|
if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) {
|
|
@@ -387,9 +403,17 @@ async function sessionRun(session, inputs) {
|
|
|
387
403
|
output = replaceTensors(output);
|
|
388
404
|
return output;
|
|
389
405
|
} catch (e) {
|
|
406
|
+
// Error messages can be long (nested) and uninformative. For this reason,
|
|
407
|
+
// we apply minor formatting to show the most important information
|
|
408
|
+
const formatted = Object.fromEntries(Object.entries(checkedInputs)
|
|
409
|
+
.map(([k, { type, dims, data }]) => [k, {
|
|
410
|
+
// Extract these properties from the underlying ORT tensor
|
|
411
|
+
type, dims, data,
|
|
412
|
+
}]));
|
|
413
|
+
|
|
390
414
|
// This usually occurs when the inputs are of the wrong type.
|
|
391
415
|
console.error(`An error occurred during model execution: "${e}".`);
|
|
392
|
-
console.error('Inputs given to model:',
|
|
416
|
+
console.error('Inputs given to model:', formatted);
|
|
393
417
|
throw e;
|
|
394
418
|
}
|
|
395
419
|
}
|
|
@@ -534,7 +558,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
|
|
|
534
558
|
new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
|
|
535
559
|
}
|
|
536
560
|
if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
|
|
537
|
-
|
|
561
|
+
// NOTE: Handle a special case for paligemma models, where positions are 1-indexed
|
|
562
|
+
const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
|
|
563
|
+
new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
|
|
538
564
|
}
|
|
539
565
|
|
|
540
566
|
// Unpack the `past_key_values` object into model inputs
|
|
@@ -546,6 +572,39 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
|
|
|
546
572
|
}
|
|
547
573
|
|
|
548
574
|
|
|
575
|
+
|
|
576
|
+
/**
 * Overwrite, in place, the embedding rows that sit at image-placeholder
 * positions with the rows of `image_features`, consumed in order across the
 * batch (equivalent to performing a masked_scatter).
 *
 * @param {Object} args
 * @param {number} args.image_token_id Id of the image placeholder token.
 * @param {Object} args.inputs_embeds Embeddings indexed as `[batch][position]`; each row exposes a typed-array `data`.
 * @param {Object} args.image_features Image features indexed by row; `dims[0]` is the row count.
 * @param {Object} args.input_ids Token ids; `tolist()` must yield one array of ids per batch item.
 * @param {Object} args.attention_mask Passed through untouched.
 * @returns {{inputs_embeds: Object, attention_mask: Object}} The (mutated) embeddings and the attention mask.
 * @throws {Error} If the number of placeholder tokens differs from the number of feature rows.
 */
function default_merge_input_ids_with_image_features({
    image_token_id,
    inputs_embeds,
    image_features,
    input_ids,
    attention_mask,
}) {
    // Collect, per batch item, the positions that hold the image placeholder.
    const image_positions = [];
    let n_image_tokens = 0;
    for (const ids of input_ids.tolist()) {
        const positions = [];
        ids.forEach((id, position) => {
            // NOTE(review): loose equality is kept on purpose — presumably the ids can be
            // BigInt while `image_token_id` is a Number; confirm before tightening to `===`.
            if (id == image_token_id) positions.push(position);
        });
        image_positions.push(positions);
        n_image_tokens += positions.length;
    }

    const n_image_features = image_features.dims[0];
    if (n_image_tokens !== n_image_features) {
        throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
    }

    // Copy feature rows into the placeholder slots, in batch order.
    let next_feature = 0;
    image_positions.forEach((positions, batch_index) => {
        const embeds = inputs_embeds[batch_index];
        for (const position of positions) {
            embeds[position].data.set(image_features[next_feature].data);
            next_feature += 1;
        }
    });

    return { inputs_embeds, attention_mask };
}
|
|
606
|
+
|
|
607
|
+
|
|
549
608
|
/**
|
|
550
609
|
* Forward pass of an image-text-to-text model.
|
|
551
610
|
* @param {Object} self The image-text-to-text model model.
|
|
@@ -637,14 +696,14 @@ async function imageTextToTextForward(self, {
|
|
|
637
696
|
* @param {Tensor} attention_mask
|
|
638
697
|
* @returns {{data: BigInt64Array, dims: number[]}}
|
|
639
698
|
*/
|
|
640
|
-
function cumsum_masked_fill(attention_mask) {
|
|
699
|
+
function cumsum_masked_fill(attention_mask, start_index = 0) {
|
|
641
700
|
const [bz, seq_len] = attention_mask.dims;
|
|
642
701
|
const attn_mask_data = attention_mask.data;
|
|
643
702
|
|
|
644
703
|
const data = new BigInt64Array(attn_mask_data.length);
|
|
645
704
|
for (let i = 0; i < bz; ++i) {
|
|
646
705
|
const start = i * seq_len;
|
|
647
|
-
let sum = BigInt(
|
|
706
|
+
let sum = BigInt(start_index);
|
|
648
707
|
for (let j = 0; j < seq_len; ++j) {
|
|
649
708
|
const index = start + j;
|
|
650
709
|
if (attn_mask_data[index] === 0n) {
|
|
@@ -671,10 +730,10 @@ function cumsum_masked_fill(attention_mask) {
|
|
|
671
730
|
* position_ids = position_ids[:, -input_ids.shape[1] :]
|
|
672
731
|
* ```
|
|
673
732
|
*/
|
|
674
|
-
function createPositionIds(model_inputs, past_key_values = null) {
|
|
733
|
+
function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
|
|
675
734
|
const { input_ids, inputs_embeds, attention_mask } = model_inputs;
|
|
676
735
|
|
|
677
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
736
|
+
const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
|
|
678
737
|
let position_ids = new Tensor('int64', data, dims);
|
|
679
738
|
if (past_key_values) {
|
|
680
739
|
const offset = -(input_ids ?? inputs_embeds).dims.at(1);
|
|
@@ -1013,7 +1072,10 @@ export class PreTrainedModel extends Callable {
|
|
|
1013
1072
|
|
|
1014
1073
|
} else { // should be MODEL_TYPES.EncoderOnly
|
|
1015
1074
|
if (modelType !== MODEL_TYPES.EncoderOnly) {
|
|
1016
|
-
|
|
1075
|
+
const type = modelName ?? config?.model_type;
|
|
1076
|
+
if (type !== 'custom') {
|
|
1077
|
+
console.warn(`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`)
|
|
1078
|
+
}
|
|
1017
1079
|
}
|
|
1018
1080
|
info = await Promise.all([
|
|
1019
1081
|
constructSessions(pretrained_model_name_or_path, {
|
|
@@ -1757,7 +1819,7 @@ export class PreTrainedModel extends Callable {
|
|
|
1757
1819
|
const dtype = session?.config?.kv_cache_dtype ?? 'float32';
|
|
1758
1820
|
const empty = (dtype === 'float16') ? new Uint16Array() : [];
|
|
1759
1821
|
|
|
1760
|
-
const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)
|
|
1822
|
+
const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1;
|
|
1761
1823
|
const shapes = getKeyValueShapes(this.config, { batch_size });
|
|
1762
1824
|
|
|
1763
1825
|
for (const name in shapes) {
|
|
@@ -3304,8 +3366,8 @@ export class VisionEncoderDecoderModel extends PreTrainedModel {
|
|
|
3304
3366
|
export class LlavaPreTrainedModel extends PreTrainedModel {
|
|
3305
3367
|
forward_params = [
|
|
3306
3368
|
'input_ids',
|
|
3307
|
-
'pixel_values',
|
|
3308
3369
|
'attention_mask',
|
|
3370
|
+
'pixel_values',
|
|
3309
3371
|
'position_ids',
|
|
3310
3372
|
'past_key_values',
|
|
3311
3373
|
];
|
|
@@ -3487,6 +3549,70 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
|
|
|
3487
3549
|
return decoder_outputs;
|
|
3488
3550
|
}
|
|
3489
3551
|
}
|
|
3552
|
+
|
|
3553
|
+
/**
 * Base class for PaliGemma models.
 */
export class PaliGemmaPreTrainedModel extends PreTrainedModel {
    // Input names declared for this model family.
    // NOTE(review): presumably read by the shared forward/generation code in `PreTrainedModel` — confirm there.
    // 'inputs_embeds' is deliberately left out (it was commented out in the original list).
    forward_params = ['input_ids', 'attention_mask', 'pixel_values', 'position_ids', 'past_key_values'];
}
|
|
3563
|
+
|
|
3564
|
+
export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
    /**
     * Flatten the image features to two dimensions (`[-1, hidden_size]`) and
     * scatter them into the text embeddings at the positions of the image
     * placeholder token (`config.image_token_index`).
     *
     * @param {Object} kwargs Arguments for `default_merge_input_ids_with_image_features`;
     * must include `image_features`.
     * @returns {Object} Whatever `default_merge_input_ids_with_image_features` returns.
     */
    _merge_input_ids_with_image_features(kwargs) {
        const hidden_size = kwargs.image_features.dims.at(-1);
        const flat_image_features = kwargs.image_features.view(-1, hidden_size);

        const merge_args = {
            // @ts-ignore
            image_token_id: this.config.image_token_index,
            ...kwargs,
            // Listed last so the flattened tensor replaces the one carried in `kwargs`.
            image_features: flat_image_features,
        };
        return default_merge_input_ids_with_image_features(merge_args);
    }
}
|
|
3577
|
+
|
|
3578
|
+
//////////////////////////////////////////////////
|
|
3579
|
+
// Idefics3 Models
|
|
3580
|
+
/**
 * Base class for Idefics3 models.
 */
export class Idefics3PreTrainedModel extends PreTrainedModel {
    // Input names declared for this model family.
    // NOTE(review): presumably read by the shared forward/generation code in `PreTrainedModel` — confirm there.
    forward_params = [
        'input_ids', 'attention_mask',
        'pixel_values', 'pixel_attention_mask',
        'position_ids', 'past_key_values',
    ];
}
|
|
3590
|
+
|
|
3591
|
+
/**
|
|
3592
|
+
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
3593
|
+
*/
|
|
3594
|
+
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {

    /**
     * Run the `vision_encoder` session on the given pixel inputs.
     *
     * @param {Object} inputs
     * @param {Object} inputs.pixel_values Pixel values passed to the vision encoder.
     * @param {Object} inputs.pixel_attention_mask Pixel attention mask passed to the vision encoder.
     * @returns {Promise<Object>} The `image_features` output of the session.
     */
    async encode_image({ pixel_values, pixel_attention_mask }) {
        const vision_inputs = { pixel_values, pixel_attention_mask };
        const { image_features } = await sessionRun(this.sessions['vision_encoder'], vision_inputs);
        return image_features;
    }

    /**
     * Flatten the image features to two dimensions (`[-1, hidden_size]`) and
     * scatter them into the text embeddings at the positions of the image
     * placeholder token (`config.image_token_id`).
     *
     * @param {Object} kwargs Arguments for `default_merge_input_ids_with_image_features`;
     * must include `image_features`.
     * @returns {Object} Whatever `default_merge_input_ids_with_image_features` returns.
     */
    _merge_input_ids_with_image_features(kwargs) {
        const hidden_size = kwargs.image_features.dims.at(-1);
        const flat_image_features = kwargs.image_features.view(-1, hidden_size);

        const merge_args = {
            // @ts-ignore
            image_token_id: this.config.image_token_id,
            ...kwargs,
            // Listed last so the flattened tensor replaces the one carried in `kwargs`.
            image_features: flat_image_features,
        };
        return default_merge_input_ids_with_image_features(merge_args);
    }
}
|
|
3613
|
+
//////////////////////////////////////////////////
|
|
3614
|
+
|
|
3615
|
+
//////////////////////////////////////////////////
|
|
3490
3616
|
export class CLIPPreTrainedModel extends PreTrainedModel { }
|
|
3491
3617
|
|
|
3492
3618
|
/**
|
|
@@ -3986,6 +4112,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
|
|
|
3986
4112
|
export class OlmoForCausalLM extends OlmoPreTrainedModel { }
|
|
3987
4113
|
//////////////////////////////////////////////////
|
|
3988
4114
|
|
|
4115
|
+
//////////////////////////////////////////////////
|
|
4116
|
+
// OLMo2 models
|
|
4117
|
+
export class Olmo2PreTrainedModel extends PreTrainedModel { }
|
|
4118
|
+
export class Olmo2Model extends Olmo2PreTrainedModel { }
|
|
4119
|
+
export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
|
|
4120
|
+
//////////////////////////////////////////////////
|
|
4121
|
+
|
|
3989
4122
|
|
|
3990
4123
|
//////////////////////////////////////////////////
|
|
3991
4124
|
// Granite models
|
|
@@ -4280,36 +4413,12 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4280
4413
|
return features;
|
|
4281
4414
|
}
|
|
4282
4415
|
|
|
4283
|
-
_merge_input_ids_with_image_features({
|
|
4284
|
-
|
|
4285
|
-
|
|
4286
|
-
|
|
4287
|
-
|
|
4288
|
-
|
|
4289
|
-
// @ts-ignore
|
|
4290
|
-
const { image_token_id } = this.config;
|
|
4291
|
-
const image_tokens = input_ids.tolist().map(ids =>
|
|
4292
|
-
ids.reduce((acc, x, idx) => {
|
|
4293
|
-
if (x == image_token_id) acc.push(idx);
|
|
4294
|
-
return acc;
|
|
4295
|
-
}, [])
|
|
4296
|
-
);
|
|
4297
|
-
const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
|
|
4298
|
-
const n_image_features = image_features.dims[0];
|
|
4299
|
-
if (n_image_tokens !== n_image_features) {
|
|
4300
|
-
throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
|
|
4301
|
-
}
|
|
4302
|
-
|
|
4303
|
-
// Equivalent to performing a masked_scatter
|
|
4304
|
-
let img = 0;
|
|
4305
|
-
for (let i = 0; i < image_tokens.length; ++i) {
|
|
4306
|
-
const tokens = image_tokens[i];
|
|
4307
|
-
const embeds = inputs_embeds[i];
|
|
4308
|
-
for (let j = 0; j < tokens.length; ++j) {
|
|
4309
|
-
embeds[tokens[j]].data.set(image_features[img++].data)
|
|
4310
|
-
}
|
|
4311
|
-
}
|
|
4312
|
-
return { inputs_embeds, attention_mask }
|
|
4416
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
4417
|
+
return default_merge_input_ids_with_image_features({
|
|
4418
|
+
// @ts-ignore
|
|
4419
|
+
image_token_id: this.config.image_token_id,
|
|
4420
|
+
...kwargs
|
|
4421
|
+
})
|
|
4313
4422
|
}
|
|
4314
4423
|
|
|
4315
4424
|
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
@@ -4426,6 +4535,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
|
|
|
4426
4535
|
//////////////////////////////////////////////////
|
|
4427
4536
|
|
|
4428
4537
|
|
|
4538
|
+
//////////////////////////////////////////////////
|
|
4539
|
+
export class IJepaPreTrainedModel extends PreTrainedModel { }
|
|
4540
|
+
export class IJepaModel extends IJepaPreTrainedModel { }
|
|
4541
|
+
export class IJepaForImageClassification extends IJepaPreTrainedModel {
    /**
     * Run the inherited `_call` and wrap its result in a `SequenceClassifierOutput`.
     * @param {any} model_inputs Inputs forwarded unchanged to the parent `_call`.
     * @returns {Promise<SequenceClassifierOutput>} The wrapped model output.
     */
    async _call(model_inputs) {
        const outputs = await super._call(model_inputs);
        return new SequenceClassifierOutput(outputs);
    }
}
|
|
4549
|
+
//////////////////////////////////////////////////
|
|
4550
|
+
|
|
4551
|
+
|
|
4429
4552
|
//////////////////////////////////////////////////
|
|
4430
4553
|
export class VitPosePreTrainedModel extends PreTrainedModel { }
|
|
4431
4554
|
|
|
@@ -6696,6 +6819,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
|
6696
6819
|
['rt_detr', ['RTDetrModel', RTDetrModel]],
|
|
6697
6820
|
['table-transformer', ['TableTransformerModel', TableTransformerModel]],
|
|
6698
6821
|
['vit', ['ViTModel', ViTModel]],
|
|
6822
|
+
['ijepa', ['IJepaModel', IJepaModel]],
|
|
6699
6823
|
['pvt', ['PvtModel', PvtModel]],
|
|
6700
6824
|
['vit_msn', ['ViTMSNModel', ViTMSNModel]],
|
|
6701
6825
|
['vit_mae', ['ViTMAEModel', ViTMAEModel]],
|
|
@@ -6760,6 +6884,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
6760
6884
|
['codegen', ['CodeGenModel', CodeGenModel]],
|
|
6761
6885
|
['llama', ['LlamaModel', LlamaModel]],
|
|
6762
6886
|
['olmo', ['OlmoModel', OlmoModel]],
|
|
6887
|
+
['olmo2', ['Olmo2Model', Olmo2Model]],
|
|
6763
6888
|
['mobilellm', ['MobileLLMModel', MobileLLMModel]],
|
|
6764
6889
|
['granite', ['GraniteModel', GraniteModel]],
|
|
6765
6890
|
['cohere', ['CohereModel', CohereModel]],
|
|
@@ -6851,6 +6976,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
6851
6976
|
['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
|
|
6852
6977
|
['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
|
|
6853
6978
|
['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
|
|
6979
|
+
['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
|
|
6854
6980
|
['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
|
|
6855
6981
|
['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
|
|
6856
6982
|
['cohere', ['CohereForCausalLM', CohereForCausalLM]],
|
|
@@ -6914,6 +7040,7 @@ const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
|
6914
7040
|
|
|
6915
7041
|
const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([
|
|
6916
7042
|
['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
|
|
7043
|
+
['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
|
|
6917
7044
|
]);
|
|
6918
7045
|
|
|
6919
7046
|
const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
|
|
@@ -6922,6 +7049,8 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
|
|
|
6922
7049
|
['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]],
|
|
6923
7050
|
['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
|
|
6924
7051
|
['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
|
|
7052
|
+
['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
|
|
7053
|
+
['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
|
|
6925
7054
|
]);
|
|
6926
7055
|
|
|
6927
7056
|
const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
@@ -6930,6 +7059,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
|
6930
7059
|
|
|
6931
7060
|
const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
6932
7061
|
['vit', ['ViTForImageClassification', ViTForImageClassification]],
|
|
7062
|
+
['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
|
|
6933
7063
|
['pvt', ['PvtForImageClassification', PvtForImageClassification]],
|
|
6934
7064
|
['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
|
|
6935
7065
|
['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
|
package/src/tokenizers.js
CHANGED
|
@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
|
|
|
2605
2605
|
this.unk_token = this.getToken('unk_token');
|
|
2606
2606
|
this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
|
|
2607
2607
|
|
|
2608
|
+
this.bos_token = this.getToken('bos_token');
|
|
2609
|
+
this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
|
|
2610
|
+
|
|
2611
|
+
this.eos_token = this.getToken('eos_token');
|
|
2612
|
+
this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
|
|
2613
|
+
|
|
2608
2614
|
this.model_max_length = tokenizerConfig.model_max_length;
|
|
2609
2615
|
|
|
2610
2616
|
/** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
|
|
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
|
|
|
3577
3583
|
let chunk = new_chunk();
|
|
3578
3584
|
let time_offset = 0.0;
|
|
3579
3585
|
const timestamp_begin = this.timestamp_begin;
|
|
3586
|
+
// Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
|
|
3587
|
+
// We can calculate the last time stamp token as timestamp_begin plus the number of tokens
|
|
3588
|
+
// from 0.00 to 30.00, which is 1500.
|
|
3589
|
+
const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
|
|
3590
|
+
const timestamp_end = timestamp_begin + total_timestamp_tokens;
|
|
3580
3591
|
|
|
3581
3592
|
let previous_tokens = [];
|
|
3582
3593
|
let previous_token_timestamps = [];
|
|
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
|
|
|
3664
3675
|
} else {
|
|
3665
3676
|
// 2/ This is a regular special token, ignoring it
|
|
3666
3677
|
}
|
|
3667
|
-
} else if (token >= timestamp_begin) {
|
|
3678
|
+
} else if (token >= timestamp_begin && token <= timestamp_end) {
|
|
3668
3679
|
// 3/ Timestamp token
|
|
3669
3680
|
const time = (token - timestamp_begin) * time_precision + time_offset;
|
|
3670
3681
|
const rounded_time = round(time, 2);
|
package/src/utils/core.js
CHANGED
|
@@ -9,15 +9,45 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
* @typedef {Object}
|
|
13
|
-
* @property {'initiate'
|
|
14
|
-
* @property {string} name
|
|
15
|
-
*
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
* @
|
|
20
|
-
* @property {
|
|
12
|
+
* @typedef {Object} InitiateProgressInfo
|
|
13
|
+
* @property {'initiate'} status
|
|
14
|
+
* @property {string} name The model id or directory path.
|
|
15
|
+
* @property {string} file The name of the file.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* @typedef {Object} DownloadProgressInfo
|
|
20
|
+
* @property {'download'} status
|
|
21
|
+
* @property {string} name The model id or directory path.
|
|
22
|
+
* @property {string} file The name of the file.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* @typedef {Object} ProgressStatusInfo
|
|
27
|
+
* @property {'progress'} status
|
|
28
|
+
* @property {string} name The model id or directory path.
|
|
29
|
+
* @property {string} file The name of the file.
|
|
30
|
+
* @property {number} progress A number between 0 and 100.
|
|
31
|
+
* @property {number} loaded The number of bytes loaded.
|
|
32
|
+
* @property {number} total The total number of bytes to be loaded.
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* @typedef {Object} DoneProgressInfo
|
|
37
|
+
* @property {'done'} status
|
|
38
|
+
* @property {string} name The model id or directory path.
|
|
39
|
+
* @property {string} file The name of the file.
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* @typedef {Object} ReadyProgressInfo
|
|
44
|
+
* @property {'ready'} status
|
|
45
|
+
* @property {string} task The loaded task.
|
|
46
|
+
* @property {string} model The loaded model.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* @typedef {InitiateProgressInfo | DownloadProgressInfo | ProgressStatusInfo | DoneProgressInfo | ReadyProgressInfo} ProgressInfo
|
|
21
51
|
*/
|
|
22
52
|
|
|
23
53
|
/**
|
|
@@ -187,3 +217,17 @@ export function len(s) {
|
|
|
187
217
|
for (const c of s) ++length;
|
|
188
218
|
return length;
|
|
189
219
|
}
|
|
220
|
+
|
|
221
|
+
/**
 * Count the occurrences of a value in an array or string.
 * This mimics the behavior of Python's `count` method:
 *  - For arrays (or any other iterable), elements are compared with strict equality (`===`).
 *  - For a string searched with a string value, non-overlapping occurrences of the
 *    substring are counted, so multi-character values such as `'<image>'` work.
 *    An empty search string yields the number of code points plus one, as in Python.
 * @param {any[]|string} arr The array or string to search.
 * @param {any} value The value to count.
 * @returns {number} The number of times `value` occurs in `arr`.
 */
export function count(arr, value) {
    if (typeof arr === 'string' && typeof value === 'string') {
        if (value === '') {
            // Python: 'abc'.count('') === len('abc') + 1 (length measured in code points).
            return [...arr].length + 1;
        }

        let occurrences = 0;
        let position = arr.indexOf(value);
        while (position !== -1) {
            ++occurrences;
            // Resume after the current match so that occurrences never overlap.
            position = arr.indexOf(value, position + value.length);
        }
        return occurrences;
    }

    // Generic path: arrays, and strings searched with a non-string value (which never matches).
    let occurrences = 0;
    for (const item of arr) {
        if (item === value) ++occurrences;
    }
    return occurrences;
}
|
package/src/utils/dtypes.js
CHANGED
|
@@ -31,6 +31,7 @@ export const isWebGpuFp16Supported = (function () {
|
|
|
31
31
|
})();
|
|
32
32
|
|
|
33
33
|
export const DATA_TYPES = Object.freeze({
|
|
34
|
+
auto: 'auto', // Auto-detect based on environment
|
|
34
35
|
fp32: 'fp32',
|
|
35
36
|
fp16: 'fp16',
|
|
36
37
|
q8: 'q8',
|
|
@@ -47,7 +48,7 @@ export const DEFAULT_DEVICE_DTYPE_MAPPING = Object.freeze({
|
|
|
47
48
|
[DEVICE_TYPES.wasm]: DATA_TYPES.q8,
|
|
48
49
|
});
|
|
49
50
|
|
|
50
|
-
/** @type {Record<DataType, string>} */
|
|
51
|
+
/** @type {Record<Exclude<DataType, "auto">, string>} */
|
|
51
52
|
export const DEFAULT_DTYPE_SUFFIX_MAPPING = Object.freeze({
|
|
52
53
|
[DATA_TYPES.fp32]: '',
|
|
53
54
|
[DATA_TYPES.fp16]: '_fp16',
|
package/src/utils/hub.js
CHANGED
|
@@ -504,13 +504,6 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
|
|
|
504
504
|
file: filename
|
|
505
505
|
})
|
|
506
506
|
|
|
507
|
-
/** @type {import('./core.js').ProgressInfo} */
|
|
508
|
-
const progressInfo = {
|
|
509
|
-
status: 'progress',
|
|
510
|
-
name: path_or_repo_id,
|
|
511
|
-
file: filename
|
|
512
|
-
}
|
|
513
|
-
|
|
514
507
|
/** @type {Uint8Array} */
|
|
515
508
|
let buffer;
|
|
516
509
|
|
|
@@ -530,7 +523,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
|
|
|
530
523
|
|
|
531
524
|
// For completeness, we still fire the final progress callback
|
|
532
525
|
dispatchCallback(options.progress_callback, {
|
|
533
|
-
|
|
526
|
+
status: 'progress',
|
|
527
|
+
name: path_or_repo_id,
|
|
528
|
+
file: filename,
|
|
534
529
|
progress: 100,
|
|
535
530
|
loaded: buffer.length,
|
|
536
531
|
total: buffer.length,
|
|
@@ -538,7 +533,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
|
|
|
538
533
|
} else {
|
|
539
534
|
buffer = await readResponse(response, data => {
|
|
540
535
|
dispatchCallback(options.progress_callback, {
|
|
541
|
-
|
|
536
|
+
status: 'progress',
|
|
537
|
+
name: path_or_repo_id,
|
|
538
|
+
file: filename,
|
|
542
539
|
...data,
|
|
543
540
|
})
|
|
544
541
|
})
|
|
@@ -595,12 +592,11 @@ export async function getModelJSON(modelPath, fileName, fatal = true, options =
|
|
|
595
592
|
|
|
596
593
|
return JSON.parse(jsonData);
|
|
597
594
|
}
|
|
598
|
-
|
|
599
595
|
/**
|
|
600
596
|
* Read and track progress when reading a Response object
|
|
601
597
|
*
|
|
602
|
-
* @param {
|
|
603
|
-
* @param {
|
|
598
|
+
* @param {Response|FileResponse} response The Response object to read
|
|
599
|
+
* @param {(data: {progress: number, loaded: number, total: number}) => void} progress_callback The function to call with progress updates
|
|
604
600
|
* @returns {Promise<Uint8Array>} A Promise that resolves with the Uint8Array buffer
|
|
605
601
|
*/
|
|
606
602
|
async function readResponse(response, progress_callback) {
|