@huggingface/transformers 3.1.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/transformers.cjs +678 -153
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +682 -154
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +24 -18
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +19 -13
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +30 -24
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +682 -154
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/configs.js +2 -1
- package/src/env.js +6 -6
- package/src/generation/configuration_utils.js +7 -0
- package/src/generation/logits_process.js +22 -16
- package/src/generation/streamers.js +7 -2
- package/src/models/idefics3/image_processing_idefics3.js +219 -0
- package/src/models/idefics3/processing_idefics3.js +136 -0
- package/src/models/image_processors.js +1 -0
- package/src/models/processors.js +1 -0
- package/src/models.js +112 -34
- package/src/utils/core.js +14 -0
- package/src/utils/dtypes.js +2 -1
- package/src/utils/image.js +19 -16
- package/src/utils/tensor.js +6 -1
- package/types/configs.d.ts +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +1 -1
- package/types/env.d.ts.map +1 -1
- package/types/generation/configuration_utils.d.ts +6 -0
- package/types/generation/configuration_utils.d.ts.map +1 -1
- package/types/generation/logits_process.d.ts +30 -20
- package/types/generation/logits_process.d.ts.map +1 -1
- package/types/generation/streamers.d.ts +13 -8
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
- package/types/models/idefics3/processing_idefics3.d.ts +19 -0
- package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/processors.d.ts +1 -0
- package/types/models.d.ts +16 -6
- package/types/models.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +3 -2
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/image.d.ts +4 -0
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +5 -3
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.1.0",
|
|
3
|
+
"version": "3.1.1",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
package/src/configs.js
CHANGED
|
@@ -69,6 +69,7 @@ function getNormalizedConfig(config) {
|
|
|
69
69
|
case 'paligemma':
|
|
70
70
|
case 'florence2':
|
|
71
71
|
case 'llava_onevision':
|
|
72
|
+
case 'idefics3':
|
|
72
73
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
73
74
|
break;
|
|
74
75
|
case 'moondream1':
|
|
@@ -382,6 +383,6 @@ export class AutoConfig {
|
|
|
382
383
|
* See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
|
|
383
384
|
* for more information.
|
|
384
385
|
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
385
|
-
* @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
|
|
386
|
+
* @property {import('./utils/dtypes.js').DataType|Record<string, import('./utils/dtypes.js').DataType>} [dtype] The default data type to use for the model.
|
|
386
387
|
* @property {boolean|Record<string, boolean>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
|
|
387
388
|
*/
|
package/src/env.js
CHANGED
|
@@ -26,12 +26,12 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.1.0';
|
|
29
|
+
const VERSION = '3.1.1';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
|
-
const IS_BROWSER_ENV = typeof
|
|
33
|
-
const IS_WEBWORKER_ENV =
|
|
34
|
-
const IS_WEB_CACHE_AVAILABLE =
|
|
32
|
+
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
33
|
+
const IS_WEBWORKER_ENV = typeof self !== "undefined" && self.constructor?.name === 'DedicatedWorkerGlobalScope';
|
|
34
|
+
const IS_WEB_CACHE_AVAILABLE = typeof self !== "undefined" && 'caches' in self;
|
|
35
35
|
const IS_WEBGPU_AVAILABLE = typeof navigator !== 'undefined' && 'gpu' in navigator;
|
|
36
36
|
const IS_WEBNN_AVAILABLE = typeof navigator !== 'undefined' && 'ml' in navigator;
|
|
37
37
|
|
|
@@ -44,7 +44,7 @@ const IS_PATH_AVAILABLE = !isEmpty(path);
|
|
|
44
44
|
* A read-only object containing information about the APIs available in the current environment.
|
|
45
45
|
*/
|
|
46
46
|
export const apis = Object.freeze({
|
|
47
|
-
/** Whether we are running in a browser environment */
|
|
47
|
+
/** Whether we are running in a browser environment (and not a web worker) */
|
|
48
48
|
IS_BROWSER_ENV,
|
|
49
49
|
|
|
50
50
|
/** Whether we are running in a web worker environment */
|
|
@@ -137,7 +137,7 @@ export const env = {
|
|
|
137
137
|
remoteHost: 'https://huggingface.co/',
|
|
138
138
|
remotePathTemplate: '{model}/resolve/{revision}/',
|
|
139
139
|
|
|
140
|
-
allowLocalModels: !IS_BROWSER_ENV,
|
|
140
|
+
allowLocalModels: !(IS_BROWSER_ENV || IS_WEBWORKER_ENV),
|
|
141
141
|
localModelPath: localModelPath,
|
|
142
142
|
useFS: IS_FS_AVAILABLE,
|
|
143
143
|
|
|
@@ -259,6 +259,13 @@ export class GenerationConfig {
|
|
|
259
259
|
*/
|
|
260
260
|
suppress_tokens = null;
|
|
261
261
|
|
|
262
|
+
/**
|
|
263
|
+
* A streamer that will be used to stream the generation.
|
|
264
|
+
* @type {import('./streamers.js').TextStreamer}
|
|
265
|
+
* @default null
|
|
266
|
+
*/
|
|
267
|
+
streamer = null;
|
|
268
|
+
|
|
262
269
|
/**
|
|
263
270
|
* A list of tokens that will be suppressed at the beginning of the generation.
|
|
264
271
|
* The `SuppressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
|
|
@@ -151,7 +151,7 @@ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor {
|
|
|
151
151
|
* Apply the BOS token forcing to the logits.
|
|
152
152
|
* @param {bigint[][]} input_ids The input IDs.
|
|
153
153
|
* @param {Tensor} logits The logits.
|
|
154
|
-
* @returns {
|
|
154
|
+
* @returns {Tensor} The logits with BOS token forcing.
|
|
155
155
|
*/
|
|
156
156
|
_call(input_ids, logits) {
|
|
157
157
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -221,7 +221,7 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor {
|
|
|
221
221
|
* Apply the BOS token forcing to the logits.
|
|
222
222
|
* @param {bigint[][]} input_ids The input IDs.
|
|
223
223
|
* @param {Tensor} logits The logits.
|
|
224
|
-
* @returns {
|
|
224
|
+
* @returns {Tensor} The logits with BOS token forcing.
|
|
225
225
|
*/
|
|
226
226
|
_call(input_ids, logits) {
|
|
227
227
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -391,7 +391,7 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
|
|
|
391
391
|
* Apply the no-repeat-ngram processor to the logits.
|
|
392
392
|
* @param {bigint[][]} input_ids The input IDs.
|
|
393
393
|
* @param {Tensor} logits The logits.
|
|
394
|
-
* @returns {
|
|
394
|
+
* @returns {Tensor} The logits with no-repeat-ngram processing.
|
|
395
395
|
*/
|
|
396
396
|
_call(input_ids, logits) {
|
|
397
397
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -406,12 +406,22 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
|
|
|
406
406
|
}
|
|
407
407
|
|
|
408
408
|
/**
|
|
409
|
-
* A logits processor that
|
|
409
|
+
* A logits processor that prevents the repetition of previous tokens through a penalty.
|
|
410
|
+
* This penalty is applied at most once per token. Note that, for decoder-only models like most LLMs,
|
|
411
|
+
* the considered tokens include the prompt.
|
|
412
|
+
*
|
|
413
|
+
* In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a
|
|
414
|
+
* penalty of around 1.2 to achieve a good balance between truthful generation and lack of repetition.
|
|
415
|
+
* To penalize and reduce repetition, use `penalty` values above 1.0, where a higher value penalizes
|
|
416
|
+
* more strongly. To reward and encourage repetition, use `penalty` values between 0.0 and 1.0, where
|
|
417
|
+
* a lower value rewards more strongly.
|
|
410
418
|
*/
|
|
411
419
|
export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
|
|
412
420
|
/**
|
|
413
421
|
* Create a RepetitionPenaltyLogitsProcessor.
|
|
414
|
-
* @param {number} penalty The
|
|
422
|
+
* @param {number} penalty The parameter for repetition penalty.
|
|
423
|
+
* - 1.0 means no penalty. Above 1.0 penalizes previously generated tokens.
|
|
424
|
+
* - Between 0.0 and 1.0 rewards previously generated tokens.
|
|
415
425
|
*/
|
|
416
426
|
constructor(penalty) {
|
|
417
427
|
super();
|
|
@@ -422,16 +432,12 @@ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
|
|
|
422
432
|
* Apply the repetition penalty to the logits.
|
|
423
433
|
* @param {bigint[][]} input_ids The input IDs.
|
|
424
434
|
* @param {Tensor} logits The logits.
|
|
425
|
-
* @returns {
|
|
435
|
+
* @returns {Tensor} The logits with repetition penalty processing.
|
|
426
436
|
*/
|
|
427
437
|
_call(input_ids, logits) {
|
|
428
|
-
// Modify the logits corresponding to each element in `input_ids`.
|
|
429
|
-
// As a consequence, the logits corresponding to tokens that appear
|
|
430
|
-
// many times in the output will be penalised more.
|
|
431
|
-
|
|
432
438
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
433
439
|
const batch_logits_data = /** @type {Float32Array} */(logits[i].data);
|
|
434
|
-
for (const input_id of input_ids[i]) {
|
|
440
|
+
for (const input_id of new Set(input_ids[i])) {
|
|
435
441
|
const token = Number(input_id);
|
|
436
442
|
if (batch_logits_data[token] < 0) {
|
|
437
443
|
batch_logits_data[token] *= this.penalty;
|
|
@@ -464,7 +470,7 @@ export class MinLengthLogitsProcessor extends LogitsProcessor {
|
|
|
464
470
|
* Apply logit processor.
|
|
465
471
|
* @param {bigint[][]} input_ids The input IDs.
|
|
466
472
|
* @param {Tensor} logits The logits.
|
|
467
|
-
* @returns {
|
|
473
|
+
* @returns {Tensor} The processed logits.
|
|
468
474
|
*/
|
|
469
475
|
_call(input_ids, logits) {
|
|
470
476
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -502,7 +508,7 @@ export class MinNewTokensLengthLogitsProcessor extends LogitsProcessor {
|
|
|
502
508
|
* Apply logit processor.
|
|
503
509
|
* @param {bigint[][]} input_ids The input IDs.
|
|
504
510
|
* @param {Tensor} logits The logits.
|
|
505
|
-
* @returns {
|
|
511
|
+
* @returns {Tensor} The processed logits.
|
|
506
512
|
*/
|
|
507
513
|
_call(input_ids, logits) {
|
|
508
514
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -535,7 +541,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor {
|
|
|
535
541
|
* Apply logit processor.
|
|
536
542
|
* @param {bigint[][]} input_ids The input IDs.
|
|
537
543
|
* @param {Tensor} logits The logits.
|
|
538
|
-
* @returns {
|
|
544
|
+
* @returns {Tensor} The processed logits.
|
|
539
545
|
*/
|
|
540
546
|
_call(input_ids, logits) {
|
|
541
547
|
for (let i = 0; i < input_ids.length; ++i) {
|
|
@@ -596,7 +602,7 @@ export class ClassifierFreeGuidanceLogitsProcessor extends LogitsProcessor {
|
|
|
596
602
|
* Apply logit processor.
|
|
597
603
|
* @param {bigint[][]} input_ids The input IDs.
|
|
598
604
|
* @param {Tensor} logits The logits.
|
|
599
|
-
* @returns {
|
|
605
|
+
* @returns {Tensor} The processed logits.
|
|
600
606
|
*/
|
|
601
607
|
_call(input_ids, logits) {
|
|
602
608
|
if (logits.dims[0] !== 2 * input_ids.length) {
|
|
@@ -650,7 +656,7 @@ export class TemperatureLogitsWarper extends LogitsWarper {
|
|
|
650
656
|
* Apply logit warper.
|
|
651
657
|
* @param {bigint[][]} input_ids The input IDs.
|
|
652
658
|
* @param {Tensor} logits The logits.
|
|
653
|
-
* @returns {
|
|
659
|
+
* @returns {Tensor} The processed logits.
|
|
654
660
|
*/
|
|
655
661
|
_call(input_ids, logits) {
|
|
656
662
|
const batch_logits_data = /** @type {Float32Array} */(logits.data);
|
|
@@ -34,7 +34,12 @@ const stdout_write = apis.IS_PROCESS_AVAILABLE
|
|
|
34
34
|
export class TextStreamer extends BaseStreamer {
|
|
35
35
|
/**
|
|
36
36
|
*
|
|
37
|
-
* @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
|
|
37
|
+
* @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
|
|
38
|
+
* @param {Object} options
|
|
39
|
+
* @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
|
|
40
|
+
* @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
|
|
41
|
+
* @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
|
|
42
|
+
* @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
|
|
38
43
|
*/
|
|
39
44
|
constructor(tokenizer, {
|
|
40
45
|
skip_prompt = false,
|
|
@@ -143,7 +148,7 @@ export class WhisperTextStreamer extends TextStreamer {
|
|
|
143
148
|
* @param {Object} options
|
|
144
149
|
* @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
|
|
145
150
|
* @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
|
|
146
|
-
* @param {function(
|
|
151
|
+
* @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
|
|
147
152
|
* @param {function(number): void} [options.on_chunk_start=null] Function to call when a new chunk starts
|
|
148
153
|
* @param {function(number): void} [options.on_chunk_end=null] Function to call when a chunk ends
|
|
149
154
|
* @param {function(): void} [options.on_finalize=null] Function to call when the stream is finalized
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
ImageProcessor,
|
|
5
|
+
} from "../../base/image_processors_utils.js";
|
|
6
|
+
import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
|
|
7
|
+
|
|
8
|
+
export class Idefics3ImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config Image processor configuration. `do_image_splitting` falls back to
     * `true` when nullish; `max_image_size` is stored as-is (its `longest_edge` is read later).
     */
    constructor(config) {
        super(config);

        const { do_image_splitting, max_image_size } = config;
        this.do_image_splitting = do_image_splitting ?? true;
        this.max_image_size = max_image_size;
    }

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     * @typedef {import('../../utils/tensor.js').Tensor} Tensor
     */

    /**
     * Compute the size an image should be resized to so that both sides are multiples of
     * `vision_encoder_max_size`. The longer side is rounded up first, the shorter side is then
     * derived from the original aspect ratio and rounded up as well.
     * @param {Tensor} pixel_values Tensor of the image to resize (height and width are its last two dims).
     * @param {number} vision_encoder_max_size Side length that both output dimensions must be a multiple of.
     * @returns {{height: number, width: number}} The target size.
     */
    get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
        const [source_height, source_width] = pixel_values.dims.slice(-2);
        const aspect_ratio = source_width / source_height;
        const round_up = (value) => Math.ceil(value / vision_encoder_max_size) * vision_encoder_max_size;

        if (source_width >= source_height) {
            const width = round_up(source_width);
            const height = round_up(Math.floor(width / aspect_ratio));
            return { height, width };
        }

        const height = round_up(source_height);
        const width = round_up(Math.floor(height * aspect_ratio));
        return { height, width };
    }

    /**
     * Preprocess one image, a list of images, or a list of per-sample image lists.
     * @param {RawImage|RawImage[]|RawImage[][]} images
     * @param {Object} [options]
     * @param {boolean|null} [options.do_image_splitting=null] Overrides the instance setting when not nullish.
     * @param {boolean} [options.return_row_col_info=false] Whether to also return the per-image split counts as `rows`/`cols`.
     */
    async _call(images, {
        do_image_splitting = null,
        return_row_col_info = false,
    } = {}) {

        // Normalise the input to a list (samples) of lists (images of that sample).
        /** @type {RawImage[][]} */
        let nested_images;
        if (!Array.isArray(images)) {
            nested_images = [[images]];
        } else if (images.length === 0 || !images[0]) {
            throw new Error("No images provided.");
        } else if (Array.isArray(images[0])) {
            nested_images = /** @type {RawImage[][]} */(images);
        } else {
            nested_images = [/** @type {RawImage[]} */(images)];
        }

        // One tensor per sample, each with shape [patches, channels, height, width]
        const per_sample_pixel_values = [];
        const rows_per_sample = [];
        const cols_per_sample = [];

        const original_sizes = [];
        const reshaped_input_sizes = [];

        for (const sample_images of nested_images) {
            const preprocessed = await Promise.all(sample_images.map(image => this.preprocess(image)));

            for (const item of preprocessed) {
                // Size of the image as it was given
                original_sizes.push(item.original_size);

                // Size after reshaping, before padding or cropping
                reshaped_input_sizes.push(item.reshaped_input_size);

                // Work with 4D tensors from here on
                item.pixel_values.unsqueeze_(0);
            }

            const longest_edge = this.max_image_size.longest_edge;

            /** @type {Tensor[]} */
            let sample_tensors;
            if (do_image_splitting ?? this.do_image_splitting) {
                // Resize each image so both sides are multiples of `longest_edge`, then cut it into tiles.
                const split_results = await Promise.all(preprocessed.map(async ({ pixel_values }) => {
                    const target = this.get_resize_for_vision_encoder(pixel_values, longest_edge);
                    const resized = await interpolate_4d(pixel_values, {
                        size: [target.height, target.width],
                    });

                    const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
                    return { patches: cat(frames, 0), num_splits_h, num_splits_w };
                }));

                sample_tensors = split_results.map(result => result.patches);
                rows_per_sample.push(split_results.map(result => result.num_splits_h));
                cols_per_sample.push(split_results.map(result => result.num_splits_w));
            } else {
                // No splitting: every image becomes a single square of side `longest_edge`.
                /** @type {[number, number]} */
                const size = [longest_edge, longest_edge];
                sample_tensors = await Promise.all(
                    preprocessed.map(({ pixel_values }) => interpolate_4d(pixel_values, { size }))
                );

                rows_per_sample.push(Array.from({ length: preprocessed.length }, () => 0));
                cols_per_sample.push(Array.from({ length: preprocessed.length }, () => 0));
            }

            per_sample_pixel_values.push(cat(sample_tensors, 0));
        }

        const batch_size = per_sample_pixel_values.length;
        // Read before the in-place unsqueeze below changes the dims of the first tensor.
        const [first_num_patches, channels, patch_height, patch_width] = per_sample_pixel_values[0].dims;

        let pixel_values;
        let pixel_attention_mask;
        if (batch_size === 1) {
            pixel_values = per_sample_pixel_values[0].unsqueeze_(0);
            pixel_attention_mask = full([batch_size, first_num_patches, patch_height, patch_width], true);
        } else {
            // Samples with fewer patches than the largest one are padded with zero patches,
            // and the padded region of the attention mask is cleared.
            const max_num_patches = Math.max(...per_sample_pixel_values.map(tensor => tensor.dims.at(0)));

            pixel_attention_mask = full([batch_size, max_num_patches, patch_height, patch_width], true);
            const mask_data = pixel_attention_mask.data;
            const patch_area = patch_height * patch_width;
            const sample_stride = max_num_patches * patch_area;

            const padded = per_sample_pixel_values.map((tensor, index) => {
                const num_patches = tensor.dims[0];
                if (!(num_patches < max_num_patches)) {
                    return tensor;
                }

                mask_data.fill(
                    false,
                    index * sample_stride + num_patches * patch_area,
                    (index + 1) * sample_stride,
                );
                return cat([
                    tensor,
                    full([max_num_patches - num_patches, channels, patch_height, patch_width], 0),
                ], 0);
            });

            pixel_values = stack(padded, 0);
        }

        const result = {
            pixel_values,
            pixel_attention_mask,

            original_sizes,
            reshaped_input_sizes,
        };
        if (return_row_col_info) {
            result.rows = rows_per_sample;
            result.cols = cols_per_sample;
        }
        return result;
    }

    /**
     * Cut an image into a grid of tiles whenever one of its sides exceeds `longest_edge`.
     * The returned `frames` hold the tiles in row-major order followed by the whole image
     * resized to `longest_edge` x `longest_edge`. An image that already fits is returned
     * as the single frame, with both split counts set to 0.
     * @param {Tensor} pixel_values 4D image tensor (height and width are its last two dims).
     * @param {Object} max_image_size
     * @param {number} max_image_size.longest_edge Maximum side length of a tile.
     * @returns {Promise<{frames: Tensor[], num_splits_h: number, num_splits_w: number}>}
     */
    async split_image(pixel_values, { longest_edge }) {
        const [height, width] = pixel_values.dims.slice(-2);

        if (!(height > longest_edge || width > longest_edge)) {
            return { frames: [pixel_values], num_splits_h: 0, num_splits_w: 0 };
        }

        // Number of tiles along each axis
        const num_splits_h = Math.ceil(height / longest_edge);
        const num_splits_w = Math.ceil(width / longest_edge);

        // Size of each tile (the last tile of a row/column is clamped to the image bounds)
        const tile_height = Math.ceil(height / num_splits_h);
        const tile_width = Math.ceil(width / num_splits_w);

        const frames = [];
        for (let row = 0; row < num_splits_h; row++) {
            const top = row * tile_height;
            const bottom = Math.min(top + tile_height, height);

            for (let col = 0; col < num_splits_w; col++) {
                const left = col * tile_width;
                const right = Math.min(left + tile_width, width);

                frames.push(pixel_values.slice(null, null, [top, bottom], [left, right]));
            }
        }

        // At least one side exceeds `longest_edge` on this path, so the global image
        // always needs to be resized down to the maximum dimensions.
        const global_image = await interpolate_4d(pixel_values, {
            size: [longest_edge, longest_edge],
        });
        frames.push(global_image);

        return { frames, num_splits_h, num_splits_w };
    }
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
|
|
2
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
3
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
4
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
5
|
+
import { RawImage } from "../../utils/image.js";
|
|
6
|
+
import { count } from "../../utils/core.js";
|
|
7
|
+
|
|
8
|
+
/**
 * Build the prompt for an image that was split into a grid of patches: one
 * `<row_R_col_C>` tagged run of image tokens per patch (a newline after each row),
 * followed by a blank line and the global-image run.
 * @private
 */
function _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token) {
    const expanded_image_tokens = image_token.repeat(image_seq_len);

    let prompt = "";
    for (let row = 1; row <= image_rows; ++row) {
        for (let col = 1; col <= image_cols; ++col) {
            prompt += `${fake_token_around_image}<row_${row}_col_${col}>${expanded_image_tokens}`;
        }
        prompt += "\n";
    }

    return `${prompt}\n${fake_token_around_image}${global_img_token}${expanded_image_tokens}${fake_token_around_image}`;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Build the prompt for an image that was not split: the global-image marker and
 * `image_seq_len` image tokens, wrapped in the fake-token delimiter on both sides.
 * @private
 */
function _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token) {
    return `${fake_token_around_image}${global_img_token}${image_token.repeat(image_seq_len)}${fake_token_around_image}`;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Pick the prompt layout for one image: the single-image layout when both split
 * counts are exactly 0, the split (grid) layout otherwise.
 */
function get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token) {
    const was_split = image_rows !== 0 || image_cols !== 0;
    return was_split
        ? _prompt_split_image(
            image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token
        )
        : _prompt_single_image(
            image_seq_len, fake_token_around_image, image_token, global_img_token
        );
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
export class Idefics3Processor extends Processor {
    static image_processor_class = AutoImageProcessor;
    static tokenizer_class = AutoTokenizer;
    static uses_processor_config = true;

    fake_image_token = "<fake_token_around_image>";
    image_token = "<image>";
    global_img_token = "<global-img>";

    /**
     * Expand every image token in the text into the full image prompt (per-patch and
     * global-image token runs), tokenize the result, and merge it with the image inputs.
     * When no images are given, the text is tokenized unchanged.
     *
     * @param {string|string[]} text
     * @param {RawImage|RawImage[]|RawImage[][]} images
     * @param {Object} [options] Options forwarded to the image processor.
     * `return_row_col_info` defaults to `true` when not set.
     * @returns {Promise<any>}
     * @throws {Error} If the number of image tokens in a text sample differs from the
     * number of images available for that sample.
     */
    async _call(text, images = null, options = {}) {
        // NOTE: We assume text is present
        const texts = Array.isArray(text) ? text : [text];

        if (!images) {
            // Text-only input: there is nothing to expand and no image inputs to merge.
            return { ...this.tokenizer(texts) };
        }

        // Work on a copy so the caller's options object is never modified.
        const image_inputs = await this.image_processor(images, {
            ...options,
            return_row_col_info: options?.return_row_col_info ?? true,
        });

        const image_rows = image_inputs.rows ?? [new Array(texts.length).fill(0)];
        const image_cols = image_inputs.cols ?? [new Array(texts.length).fill(0)];

        const image_seq_len = this.config.image_seq_len;
        const prompt_strings = [];
        for (let i = 0; i < texts.length; ++i) {
            const sample = texts[i];
            const sample_rows = image_rows[i] ?? [];
            const sample_cols = image_cols[i] ?? [];

            // Splitting on the image token yields one more piece than there are tokens.
            const split_sample = sample.split(this.image_token);
            const num_image_tokens = split_sample.length - 1;
            if (num_image_tokens !== sample_rows.length) {
                // Without this check a mismatch would either append "undefined" to the
                // prompt or silently drop the text after the surplus image tokens.
                throw new Error(
                    `The number of image tokens in the text (${num_image_tokens}) does not match ` +
                    `the number of images (${sample_rows.length}) for sample ${i}.`
                );
            }

            // Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
            const image_prompt_strings = sample_rows.map(
                (n_rows, j) => get_image_prompt_string(
                    n_rows,
                    sample_cols[j],
                    image_seq_len,
                    this.fake_image_token,
                    this.image_token,
                    this.global_img_token,
                )
            );

            // Place in the image prompt strings where the image tokens are
            let new_sample = split_sample[0];
            for (let j = 0; j < image_prompt_strings.length; ++j) {
                new_sample += image_prompt_strings[j] + split_sample[j + 1];
            }
            prompt_strings.push(new_sample);
        }

        const text_inputs = this.tokenizer(prompt_strings);

        return {
            ...text_inputs,
            ...image_inputs,
        };
    }
}
|
|
@@ -10,6 +10,7 @@ export * from './donut/image_processing_donut.js'
|
|
|
10
10
|
export * from './dpt/image_processing_dpt.js'
|
|
11
11
|
export * from './efficientnet/image_processing_efficientnet.js'
|
|
12
12
|
export * from './glpn/image_processing_glpn.js'
|
|
13
|
+
export * from './idefics3/image_processing_idefics3.js'
|
|
13
14
|
export * from './janus/image_processing_janus.js'
|
|
14
15
|
export * from './jina_clip/image_processing_jina_clip.js'
|
|
15
16
|
export * from './llava_onevision/image_processing_llava_onevision.js'
|
package/src/models/processors.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export * from './florence2/processing_florence2.js';
|
|
2
2
|
export * from './mgp_str/processing_mgp_str.js';
|
|
3
|
+
export * from './idefics3/processing_idefics3.js';
|
|
3
4
|
export * from './janus/processing_janus.js';
|
|
4
5
|
export * from './jina_clip/processing_jina_clip.js';
|
|
5
6
|
export * from './owlvit/processing_owlvit.js';
|