@huggingface/transformers 3.1.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +3 -2
  2. package/dist/transformers.cjs +678 -153
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +682 -154
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +24 -18
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +19 -13
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +30 -24
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +682 -154
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +1 -1
  15. package/src/configs.js +2 -1
  16. package/src/env.js +6 -6
  17. package/src/generation/configuration_utils.js +7 -0
  18. package/src/generation/logits_process.js +22 -16
  19. package/src/generation/streamers.js +7 -2
  20. package/src/models/idefics3/image_processing_idefics3.js +219 -0
  21. package/src/models/idefics3/processing_idefics3.js +136 -0
  22. package/src/models/image_processors.js +1 -0
  23. package/src/models/processors.js +1 -0
  24. package/src/models.js +112 -34
  25. package/src/utils/core.js +14 -0
  26. package/src/utils/dtypes.js +2 -1
  27. package/src/utils/image.js +19 -16
  28. package/src/utils/tensor.js +6 -1
  29. package/types/configs.d.ts +1 -1
  30. package/types/configs.d.ts.map +1 -1
  31. package/types/env.d.ts +1 -1
  32. package/types/env.d.ts.map +1 -1
  33. package/types/generation/configuration_utils.d.ts +6 -0
  34. package/types/generation/configuration_utils.d.ts.map +1 -1
  35. package/types/generation/logits_process.d.ts +30 -20
  36. package/types/generation/logits_process.d.ts.map +1 -1
  37. package/types/generation/streamers.d.ts +13 -8
  38. package/types/generation/streamers.d.ts.map +1 -1
  39. package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
  40. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
  41. package/types/models/idefics3/processing_idefics3.d.ts +19 -0
  42. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
  43. package/types/models/image_processors.d.ts +1 -0
  44. package/types/models/processors.d.ts +1 -0
  45. package/types/models.d.ts +16 -6
  46. package/types/models.d.ts.map +1 -1
  47. package/types/utils/core.d.ts +7 -0
  48. package/types/utils/core.d.ts.map +1 -1
  49. package/types/utils/dtypes.d.ts +3 -2
  50. package/types/utils/dtypes.d.ts.map +1 -1
  51. package/types/utils/image.d.ts +4 -0
  52. package/types/utils/image.d.ts.map +1 -1
  53. package/types/utils/tensor.d.ts +5 -3
  54. package/types/utils/tensor.d.ts.map +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/transformers",
3
- "version": "3.1.0",
3
+ "version": "3.1.1",
4
4
  "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
5
5
  "main": "./src/transformers.js",
6
6
  "types": "./types/transformers.d.ts",
package/src/configs.js CHANGED
@@ -69,6 +69,7 @@ function getNormalizedConfig(config) {
69
69
  case 'paligemma':
70
70
  case 'florence2':
71
71
  case 'llava_onevision':
72
+ case 'idefics3':
72
73
  init_normalized_config = getNormalizedConfig(config.text_config);
73
74
  break;
74
75
  case 'moondream1':
@@ -382,6 +383,6 @@ export class AutoConfig {
382
383
  * See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
383
384
  * for more information.
384
385
  * @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
385
- * @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
386
+ * @property {import('./utils/dtypes.js').DataType|Record<string, import('./utils/dtypes.js').DataType>} [dtype] The default data type to use for the model.
386
387
  * @property {boolean|Record<string, boolean>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
387
388
  */
package/src/env.js CHANGED
@@ -26,12 +26,12 @@ import fs from 'fs';
26
26
  import path from 'path';
27
27
  import url from 'url';
28
28
 
29
- const VERSION = '3.1.0';
29
+ const VERSION = '3.1.1';
30
30
 
31
31
  // Check if various APIs are available (depends on environment)
32
- const IS_BROWSER_ENV = typeof self !== 'undefined';
33
- const IS_WEBWORKER_ENV = IS_BROWSER_ENV && self.constructor.name === 'DedicatedWorkerGlobalScope';
34
- const IS_WEB_CACHE_AVAILABLE = IS_BROWSER_ENV && 'caches' in self;
32
+ const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
33
+ const IS_WEBWORKER_ENV = typeof self !== "undefined" && self.constructor?.name === 'DedicatedWorkerGlobalScope';
34
+ const IS_WEB_CACHE_AVAILABLE = typeof self !== "undefined" && 'caches' in self;
35
35
  const IS_WEBGPU_AVAILABLE = typeof navigator !== 'undefined' && 'gpu' in navigator;
36
36
  const IS_WEBNN_AVAILABLE = typeof navigator !== 'undefined' && 'ml' in navigator;
37
37
 
@@ -44,7 +44,7 @@ const IS_PATH_AVAILABLE = !isEmpty(path);
44
44
  * A read-only object containing information about the APIs available in the current environment.
45
45
  */
46
46
  export const apis = Object.freeze({
47
- /** Whether we are running in a browser environment */
47
+ /** Whether we are running in a browser environment (and not a web worker) */
48
48
  IS_BROWSER_ENV,
49
49
 
50
50
  /** Whether we are running in a web worker environment */
@@ -137,7 +137,7 @@ export const env = {
137
137
  remoteHost: 'https://huggingface.co/',
138
138
  remotePathTemplate: '{model}/resolve/{revision}/',
139
139
 
140
- allowLocalModels: !IS_BROWSER_ENV,
140
+ allowLocalModels: !(IS_BROWSER_ENV || IS_WEBWORKER_ENV),
141
141
  localModelPath: localModelPath,
142
142
  useFS: IS_FS_AVAILABLE,
143
143
 
@@ -259,6 +259,13 @@ export class GenerationConfig {
259
259
  */
260
260
  suppress_tokens = null;
261
261
 
262
+ /**
263
+ * A streamer that will be used to stream the generation.
264
+ * @type {import('./streamers.js').TextStreamer}
265
+ * @default null
266
+ */
267
+ streamer = null;
268
+
262
269
  /**
263
270
  * A list of tokens that will be suppressed at the beginning of the generation.
264
271
  * The `SuppressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
@@ -151,7 +151,7 @@ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor {
151
151
  * Apply the BOS token forcing to the logits.
152
152
  * @param {bigint[][]} input_ids The input IDs.
153
153
  * @param {Tensor} logits The logits.
154
- * @returns {Object} The logits with BOS token forcing.
154
+ * @returns {Tensor} The logits with BOS token forcing.
155
155
  */
156
156
  _call(input_ids, logits) {
157
157
  for (let i = 0; i < input_ids.length; ++i) {
@@ -221,7 +221,7 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor {
221
221
  * Apply the BOS token forcing to the logits.
222
222
  * @param {bigint[][]} input_ids The input IDs.
223
223
  * @param {Tensor} logits The logits.
224
- * @returns {Object} The logits with BOS token forcing.
224
+ * @returns {Tensor} The logits with BOS token forcing.
225
225
  */
226
226
  _call(input_ids, logits) {
227
227
  for (let i = 0; i < input_ids.length; ++i) {
@@ -391,7 +391,7 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
391
391
  * Apply the no-repeat-ngram processor to the logits.
392
392
  * @param {bigint[][]} input_ids The input IDs.
393
393
  * @param {Tensor} logits The logits.
394
- * @returns {Object} The logits with no-repeat-ngram processing.
394
+ * @returns {Tensor} The logits with no-repeat-ngram processing.
395
395
  */
396
396
  _call(input_ids, logits) {
397
397
  for (let i = 0; i < input_ids.length; ++i) {
@@ -406,12 +406,22 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
406
406
  }
407
407
 
408
408
  /**
409
- * A logits processor that penalises repeated output tokens.
409
+ * A logits processor that prevents the repetition of previous tokens through a penalty.
410
+ * This penalty is applied at most once per token. Note that, for decoder-only models like most LLMs,
411
+ * the considered tokens include the prompt.
412
+ *
413
+ * In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a
414
+ * penalty of around 1.2 to achieve a good balance between truthful generation and lack of repetition.
415
+ * To penalize and reduce repetition, use `penalty` values above 1.0, where a higher value penalizes
416
+ * more strongly. To reward and encourage repetition, use `penalty` values between 0.0 and 1.0, where
417
+ * a lower value rewards more strongly.
410
418
  */
411
419
  export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
412
420
  /**
413
421
  * Create a RepetitionPenaltyLogitsProcessor.
414
- * @param {number} penalty The penalty to apply for repeated tokens.
422
+ * @param {number} penalty The parameter for repetition penalty.
423
+ * - 1.0 means no penalty. Above 1.0 penalizes previously generated tokens.
424
+ * - Between 0.0 and 1.0 rewards previously generated tokens.
415
425
  */
416
426
  constructor(penalty) {
417
427
  super();
@@ -422,16 +432,12 @@ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
422
432
  * Apply the repetition penalty to the logits.
423
433
  * @param {bigint[][]} input_ids The input IDs.
424
434
  * @param {Tensor} logits The logits.
425
- * @returns {Object} The logits with repetition penalty processing.
435
+ * @returns {Tensor} The logits with repetition penalty processing.
426
436
  */
427
437
  _call(input_ids, logits) {
428
- // Modify the logits corresponding to each element in `input_ids`.
429
- // As a consequence, the logits corresponding to tokens that appear
430
- // many times in the output will be penalised more.
431
-
432
438
  for (let i = 0; i < input_ids.length; ++i) {
433
439
  const batch_logits_data = /** @type {Float32Array} */(logits[i].data);
434
- for (const input_id of input_ids[i]) {
440
+ for (const input_id of new Set(input_ids[i])) {
435
441
  const token = Number(input_id);
436
442
  if (batch_logits_data[token] < 0) {
437
443
  batch_logits_data[token] *= this.penalty;
@@ -464,7 +470,7 @@ export class MinLengthLogitsProcessor extends LogitsProcessor {
464
470
  * Apply logit processor.
465
471
  * @param {bigint[][]} input_ids The input IDs.
466
472
  * @param {Tensor} logits The logits.
467
- * @returns {Object} The processed logits.
473
+ * @returns {Tensor} The processed logits.
468
474
  */
469
475
  _call(input_ids, logits) {
470
476
  for (let i = 0; i < input_ids.length; ++i) {
@@ -502,7 +508,7 @@ export class MinNewTokensLengthLogitsProcessor extends LogitsProcessor {
502
508
  * Apply logit processor.
503
509
  * @param {bigint[][]} input_ids The input IDs.
504
510
  * @param {Tensor} logits The logits.
505
- * @returns {Object} The processed logits.
511
+ * @returns {Tensor} The processed logits.
506
512
  */
507
513
  _call(input_ids, logits) {
508
514
  for (let i = 0; i < input_ids.length; ++i) {
@@ -535,7 +541,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor {
535
541
  * Apply logit processor.
536
542
  * @param {bigint[][]} input_ids The input IDs.
537
543
  * @param {Tensor} logits The logits.
538
- * @returns {Object} The processed logits.
544
+ * @returns {Tensor} The processed logits.
539
545
  */
540
546
  _call(input_ids, logits) {
541
547
  for (let i = 0; i < input_ids.length; ++i) {
@@ -596,7 +602,7 @@ export class ClassifierFreeGuidanceLogitsProcessor extends LogitsProcessor {
596
602
  * Apply logit processor.
597
603
  * @param {bigint[][]} input_ids The input IDs.
598
604
  * @param {Tensor} logits The logits.
599
- * @returns {Object} The processed logits.
605
+ * @returns {Tensor} The processed logits.
600
606
  */
601
607
  _call(input_ids, logits) {
602
608
  if (logits.dims[0] !== 2 * input_ids.length) {
@@ -650,7 +656,7 @@ export class TemperatureLogitsWarper extends LogitsWarper {
650
656
  * Apply logit warper.
651
657
  * @param {bigint[][]} input_ids The input IDs.
652
658
  * @param {Tensor} logits The logits.
653
- * @returns {Object} The processed logits.
659
+ * @returns {Tensor} The processed logits.
654
660
  */
655
661
  _call(input_ids, logits) {
656
662
  const batch_logits_data = /** @type {Float32Array} */(logits.data);
@@ -34,7 +34,12 @@ const stdout_write = apis.IS_PROCESS_AVAILABLE
34
34
  export class TextStreamer extends BaseStreamer {
35
35
  /**
36
36
  *
37
- * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
37
+ * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
38
+ * @param {Object} options
39
+ * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
40
+ * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
41
+ * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
42
+ * @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
38
43
  */
39
44
  constructor(tokenizer, {
40
45
  skip_prompt = false,
@@ -143,7 +148,7 @@ export class WhisperTextStreamer extends TextStreamer {
143
148
  * @param {Object} options
144
149
  * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
145
150
  * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
146
- * @param {function(string): void} [options.token_callback_function=null] Function to call when a new token is generated
151
+ * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
147
152
  * @param {function(number): void} [options.on_chunk_start=null] Function to call when a new chunk starts
148
153
  * @param {function(number): void} [options.on_chunk_end=null] Function to call when a chunk ends
149
154
  * @param {function(): void} [options.on_finalize=null] Function to call when the stream is finalized
@@ -0,0 +1,219 @@
1
+
2
+
3
+ import {
4
+ ImageProcessor,
5
+ } from "../../base/image_processors_utils.js";
6
+ import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
7
+
8
+ export class Idefics3ImageProcessor extends ImageProcessor {
9
+ constructor(config) {
10
+ super(config);
11
+
12
+ this.do_image_splitting = config.do_image_splitting ?? true;
13
+ this.max_image_size = config.max_image_size;
14
+ }
15
+
16
+ /**
17
+ * @typedef {import('../../utils/image.js').RawImage} RawImage
18
+ * @typedef {import('../../utils/tensor.js').Tensor} Tensor
19
+ */
20
+
21
+ /**
22
+ * Calculate size to resize images to, to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
23
+ * @param {Tensor} pixel_values Tensor of the image to resize.
24
+ * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
25
+ * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
26
+ */
27
+ get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
28
+ let [height, width] = pixel_values.dims.slice(-2);
29
+
30
+ const aspect_ratio = width / height;
31
+ if (width >= height) {
32
+ width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
33
+ height = Math.floor(width / aspect_ratio);
34
+ height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
35
+ } else {
36
+ height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
37
+ width = Math.floor(height * aspect_ratio);
38
+ width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
39
+ }
40
+ return { height, width };
41
+ }
42
+
43
+ /** @param {RawImage|RawImage[]|RawImage[][]} images */
44
+ async _call(images, {
45
+ do_image_splitting = null,
46
+ return_row_col_info = false,
47
+ } = {}) {
48
+
49
+ /** @type {RawImage[][]} */
50
+ let batched_2d_images;
51
+ if (!Array.isArray(images)) {
52
+ batched_2d_images = [[images]];
53
+ } else {
54
+ if (images.length === 0 || !images[0]) {
55
+ throw new Error("No images provided.");
56
+ }
57
+ if (!Array.isArray(images[0])) {
58
+ batched_2d_images = [/** @type {RawImage[]} */(images)];
59
+ } else {
60
+ batched_2d_images = /** @type {RawImage[][]} */(images);
61
+ }
62
+ }
63
+
64
+ // List of tensors, each with shape [patches, channels, height, width]
65
+ let all_pixel_values = [];
66
+ let images_list_rows = [];
67
+ let images_list_cols = [];
68
+
69
+ const original_sizes = [];
70
+ const reshaped_input_sizes = [];
71
+ for (const image_batch of batched_2d_images) {
72
+
73
+ let images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));
74
+
75
+ // Original sizes of images
76
+ original_sizes.push(...images_list.map(x => x.original_size));
77
+
78
+ // Reshaped sizes of images, before padding or cropping
79
+ reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));
80
+
81
+ // Convert images to 4D tensors for easier processing
82
+ images_list.forEach(x => x.pixel_values.unsqueeze_(0));
83
+
84
+ const { longest_edge } = this.max_image_size;
85
+
86
+ /** @type {Tensor[]} */
87
+ let images_tensor;
88
+ if (do_image_splitting ?? this.do_image_splitting) {
89
+ let image_rows = new Array(images_list.length);
90
+ let image_cols = new Array(images_list.length);
91
+
92
+ // We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
93
+ images_tensor = await Promise.all(images_list.map(async (x, i) => {
94
+ const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);
95
+
96
+ const resized = await interpolate_4d(x.pixel_values, {
97
+ size: [new_size.height, new_size.width],
98
+ });
99
+
100
+ const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
101
+ image_rows[i] = num_splits_h;
102
+ image_cols[i] = num_splits_w;
103
+ return cat(frames, 0);
104
+ }));
105
+
106
+ images_list_rows.push(image_rows);
107
+ images_list_cols.push(image_cols);
108
+
109
+ } else {
110
+ /** @type {[number, number]} */
111
+ const size = [longest_edge, longest_edge];
112
+ images_tensor = await Promise.all(
113
+ images_list.map(x => interpolate_4d(x.pixel_values, { size }))
114
+ );
115
+
116
+ images_list_rows.push(new Array(images_list.length).fill(0));
117
+ images_list_cols.push(new Array(images_list.length).fill(0));
118
+ }
119
+
120
+ all_pixel_values.push(cat(images_tensor, 0));
121
+ }
122
+
123
+ const batch_size = all_pixel_values.length;
124
+ const [n, c, h, w] = all_pixel_values[0].dims;
125
+
126
+ // Stack pixel values
127
+ let pixel_values;
128
+ let pixel_attention_mask;
129
+ if (batch_size === 1) {
130
+ pixel_values = all_pixel_values[0].unsqueeze_(0);
131
+ pixel_attention_mask = full([batch_size, n, h, w], true);
132
+ } else {
133
+ // Add padding (if necessary) to images with less patches than the maximum number of patches
134
+ const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));
135
+
136
+ pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
137
+ const pixel_attention_mask_data = pixel_attention_mask.data;
138
+ const pixel_attention_mask_stride = max_num_patches * h * w;
139
+ for (let i = 0; i < batch_size; ++i) {
140
+ const num_patches = all_pixel_values[i].dims[0];
141
+ if (num_patches < max_num_patches) {
142
+ all_pixel_values[i] = cat([
143
+ all_pixel_values[i],
144
+ full([max_num_patches - num_patches, c, h, w], 0),
145
+ ], 0);
146
+
147
+ const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
148
+ const end_offset = (i + 1) * pixel_attention_mask_stride;
149
+ pixel_attention_mask_data.fill(false, start_offset, end_offset);
150
+ }
151
+ }
152
+ pixel_values = stack(all_pixel_values, 0);
153
+ }
154
+
155
+ return {
156
+ pixel_values,
157
+ pixel_attention_mask,
158
+
159
+ original_sizes,
160
+ reshaped_input_sizes,
161
+ ...(
162
+ return_row_col_info
163
+ ? { rows: images_list_rows, cols: images_list_cols }
164
+ : {}
165
+ ),
166
+ }
167
+ }
168
+
169
+ async split_image(pixel_values, { longest_edge }) {
170
+ const max_height = longest_edge;
171
+ const max_width = longest_edge;
172
+
173
+ const frames = [];
174
+
175
+ const [height, width] = pixel_values.dims.slice(-2);
176
+
177
+ let num_splits_h = 0, num_splits_w = 0;
178
+
179
+ if (height > max_height || width > max_width) {
180
+ // Calculate the number of splits
181
+ num_splits_h = Math.ceil(height / max_height);
182
+ num_splits_w = Math.ceil(width / max_width);
183
+
184
+ // Calculate the optimal width and height for the sub-images
185
+ const optimal_height = Math.ceil(height / num_splits_h);
186
+ const optimal_width = Math.ceil(width / num_splits_w);
187
+
188
+ // Iterate through each row and column
189
+ for (let r = 0; r < num_splits_h; r++) {
190
+ for (let c = 0; c < num_splits_w; c++) {
191
+ // Calculate the starting point of the crop
192
+ const start_x = c * optimal_width;
193
+ const start_y = r * optimal_height;
194
+
195
+ // Calculate the ending point of the crop
196
+ const end_x = Math.min(start_x + optimal_width, width);
197
+ const end_y = Math.min(start_y + optimal_height, height);
198
+
199
+ // Crop the image
200
+ frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x]));
201
+ }
202
+ }
203
+
204
+ // Resize the global image to match max dimensions for memory efficiency
205
+ const global_image_height = max_height;
206
+ const global_image_width = max_width;
207
+
208
+ if (height !== global_image_height || width !== global_image_width) {
209
+ pixel_values = await interpolate_4d(pixel_values, {
210
+ size: [global_image_height, global_image_width],
211
+ })
212
+ }
213
+ }
214
+
215
+ frames.push(pixel_values);
216
+
217
+ return { frames, num_splits_h, num_splits_w };
218
+ }
219
+ }
@@ -0,0 +1,136 @@
1
+
2
+ import { Processor } from "../../base/processing_utils.js";
3
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
4
+ import { AutoTokenizer } from "../../tokenizers.js";
5
+ import { RawImage } from "../../utils/image.js";
6
+ import { count } from "../../utils/core.js";
7
+
8
+ /**
9
+ * Prompt with expanded image tokens for when the image is split into patches.
10
+ * @private
11
+ */
12
+ function _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token) {
13
+ let text_split_images = "";
14
+ for (let n_h = 0; n_h < image_rows; ++n_h) {
15
+ for (let n_w = 0; n_w < image_cols; ++n_w) {
16
+ text_split_images += (
17
+ fake_token_around_image +
18
+ `<row_${n_h + 1}_col_${n_w + 1}>` +
19
+ image_token.repeat(image_seq_len)
20
+ );
21
+ }
22
+ text_split_images += "\n";
23
+ }
24
+
25
+ text_split_images += (
26
+ `\n${fake_token_around_image}` +
27
+ `${global_img_token}` +
28
+ image_token.repeat(image_seq_len) +
29
+ `${fake_token_around_image}`
30
+ );
31
+ return text_split_images;
32
+ }
33
+
34
+ /**
35
+ * Prompt with expanded image tokens for a single image.
36
+ * @private
37
+ */
38
+ function _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token) {
39
+ return (
40
+ `${fake_token_around_image}` +
41
+ `${global_img_token}` +
42
+ image_token.repeat(image_seq_len) +
43
+ `${fake_token_around_image}`
44
+ );
45
+ }
46
+
47
+ function get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token) {
48
+ if (image_rows === 0 && image_cols === 0) {
49
+ return _prompt_single_image(
50
+ image_seq_len,
51
+ fake_token_around_image,
52
+ image_token,
53
+ global_img_token
54
+ );
55
+ }
56
+ return _prompt_split_image(
57
+ image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token
58
+ );
59
+ }
60
+
61
+
62
+ export class Idefics3Processor extends Processor {
63
+ static image_processor_class = AutoImageProcessor
64
+ static tokenizer_class = AutoTokenizer
65
+ static uses_processor_config = true;
66
+
67
+ fake_image_token = "<fake_token_around_image>";
68
+ image_token = "<image>";
69
+ global_img_token = "<global-img>";
70
+
71
+ /**
72
+ *
73
+ * @param {string|string[]} text
74
+ * @param {RawImage|RawImage[]|RawImage[][]} images
75
+ * @returns {Promise<any>}
76
+ */
77
+ async _call(text, images = null, options = {}) {
78
+ options.return_row_col_info ??= true;
79
+
80
+ let image_inputs;
81
+
82
+ if (images) {
83
+ image_inputs = await this.image_processor(images, options);
84
+ }
85
+
86
+ // NOTE: We assume text is present
87
+ if (!Array.isArray(text)) {
88
+ text = [text];
89
+ }
90
+
91
+ const image_rows = image_inputs.rows ?? [new Array(text.length).fill(0)];
92
+ const image_cols = image_inputs.cols ?? [new Array(text.length).fill(0)];
93
+
94
+ const image_seq_len = this.config.image_seq_len;
95
+ const n_images_in_text = []
96
+ const prompt_strings = [];
97
+ for (let i = 0; i < text.length; ++i) {
98
+ const sample = text[i];
99
+ const sample_rows = image_rows[i];
100
+ const sample_cols = image_cols[i];
101
+
102
+ n_images_in_text.push(count(sample, this.image_token));
103
+
104
+ // Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
105
+ const image_prompt_strings = sample_rows.map(
106
+ (n_rows, j) => get_image_prompt_string(
107
+ n_rows,
108
+ sample_cols[j],
109
+ image_seq_len,
110
+ this.fake_image_token,
111
+ this.image_token,
112
+ this.global_img_token,
113
+ )
114
+ );
115
+
116
+ const split_sample = sample.split(this.image_token);
117
+ if (split_sample.length === 0) {
118
+ throw new Error("The image token should be present in the text.");
119
+ }
120
+
121
+ // Place in the image prompt strings where the image tokens are
122
+ let new_sample = split_sample[0];
123
+ for (let j = 0; j < image_prompt_strings.length; ++j) {
124
+ new_sample += image_prompt_strings[j] + split_sample[j + 1];
125
+ }
126
+ prompt_strings.push(new_sample);
127
+ }
128
+
129
+ const text_inputs = this.tokenizer(prompt_strings);
130
+
131
+ return {
132
+ ...text_inputs,
133
+ ...image_inputs,
134
+ }
135
+ }
136
+ }
@@ -10,6 +10,7 @@ export * from './donut/image_processing_donut.js'
10
10
  export * from './dpt/image_processing_dpt.js'
11
11
  export * from './efficientnet/image_processing_efficientnet.js'
12
12
  export * from './glpn/image_processing_glpn.js'
13
+ export * from './idefics3/image_processing_idefics3.js'
13
14
  export * from './janus/image_processing_janus.js'
14
15
  export * from './jina_clip/image_processing_jina_clip.js'
15
16
  export * from './llava_onevision/image_processing_llava_onevision.js'
@@ -1,5 +1,6 @@
1
1
  export * from './florence2/processing_florence2.js';
2
2
  export * from './mgp_str/processing_mgp_str.js';
3
+ export * from './idefics3/processing_idefics3.js';
3
4
  export * from './janus/processing_janus.js';
4
5
  export * from './jina_clip/processing_jina_clip.js';
5
6
  export * from './owlvit/processing_owlvit.js';