@huggingface/transformers 3.0.2 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258):
  1. package/README.md +13 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16655 -13040
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +17095 -13468
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +244 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +235 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +246 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16818 -13202
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +4 -4
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -4
  20. package/src/env.js +6 -6
  21. package/src/generation/configuration_utils.js +7 -0
  22. package/src/generation/logits_process.js +22 -16
  23. package/src/generation/streamers.js +7 -2
  24. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  25. package/src/models/auto/feature_extraction_auto.js +41 -0
  26. package/src/models/auto/image_processing_auto.js +29 -0
  27. package/src/models/auto/processing_auto.js +100 -0
  28. package/src/models/beit/image_processing_beit.js +5 -0
  29. package/src/models/bit/image_processing_bit.js +5 -0
  30. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  31. package/src/models/clap/feature_extraction_clap.js +159 -0
  32. package/src/models/clip/image_processing_clip.js +6 -0
  33. package/src/models/convnext/image_processing_convnext.js +45 -0
  34. package/src/models/deit/image_processing_deit.js +6 -0
  35. package/src/models/detr/image_processing_detr.js +52 -0
  36. package/src/models/donut/image_processing_donut.js +31 -0
  37. package/src/models/dpt/image_processing_dpt.js +6 -0
  38. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  39. package/src/models/feature_extractors.js +12 -0
  40. package/src/models/florence2/processing_florence2.js +128 -0
  41. package/src/models/glpn/image_processing_glpn.js +5 -0
  42. package/src/models/idefics3/image_processing_idefics3.js +219 -0
  43. package/src/models/idefics3/processing_idefics3.js +136 -0
  44. package/src/models/image_processors.js +37 -0
  45. package/src/models/janus/image_processing_janus.js +26 -0
  46. package/src/models/janus/processing_janus.js +123 -0
  47. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  48. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  49. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  50. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  51. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  52. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  53. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  54. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  55. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  56. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  57. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  58. package/src/models/nougat/image_processing_nougat.js +5 -0
  59. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  60. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  61. package/src/models/owlvit/processing_owlvit.js +7 -0
  62. package/src/models/processors.js +12 -0
  63. package/src/models/pvt/image_processing_pvt.js +5 -0
  64. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  65. package/src/models/pyannote/processing_pyannote.js +71 -0
  66. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  67. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  68. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  69. package/src/models/sam/image_processing_sam.js +242 -0
  70. package/src/models/sam/processing_sam.js +20 -0
  71. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  72. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  73. package/src/models/segformer/image_processing_segformer.js +13 -0
  74. package/src/models/siglip/image_processing_siglip.js +5 -0
  75. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  76. package/src/models/speecht5/processing_speecht5.js +17 -0
  77. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  78. package/src/models/vit/image_processing_vit.js +7 -0
  79. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  80. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  81. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  82. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  83. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  84. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  85. package/src/models/whisper/processing_whisper.js +21 -0
  86. package/src/models/yolos/image_processing_yolos.js +12 -0
  87. package/src/models.js +755 -34
  88. package/src/pipelines.js +8 -8
  89. package/src/tokenizers.js +5 -0
  90. package/src/transformers.js +15 -2
  91. package/src/utils/constants.js +8 -1
  92. package/src/utils/core.js +51 -9
  93. package/src/utils/dtypes.js +2 -1
  94. package/src/utils/hub.js +2 -1
  95. package/src/utils/image.js +87 -33
  96. package/src/utils/tensor.js +39 -2
  97. package/types/base/feature_extraction_utils.d.ts +41 -0
  98. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  99. package/types/base/image_processors_utils.d.ts +323 -0
  100. package/types/base/image_processors_utils.d.ts.map +1 -0
  101. package/types/base/processing_utils.d.ts +80 -0
  102. package/types/base/processing_utils.d.ts.map +1 -0
  103. package/types/configs.d.ts +5 -2
  104. package/types/configs.d.ts.map +1 -1
  105. package/types/env.d.ts +1 -1
  106. package/types/env.d.ts.map +1 -1
  107. package/types/generation/configuration_utils.d.ts +6 -0
  108. package/types/generation/configuration_utils.d.ts.map +1 -1
  109. package/types/generation/logits_process.d.ts +30 -20
  110. package/types/generation/logits_process.d.ts.map +1 -1
  111. package/types/generation/streamers.d.ts +13 -8
  112. package/types/generation/streamers.d.ts.map +1 -1
  113. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  114. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  115. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  116. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  117. package/types/models/auto/image_processing_auto.d.ts +5 -0
  118. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  119. package/types/models/auto/processing_auto.d.ts +35 -0
  120. package/types/models/auto/processing_auto.d.ts.map +1 -0
  121. package/types/models/beit/image_processing_beit.d.ts +4 -0
  122. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  123. package/types/models/bit/image_processing_bit.d.ts +4 -0
  124. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  125. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  126. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  127. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  128. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  129. package/types/models/clip/image_processing_clip.d.ts +6 -0
  130. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  131. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  132. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  133. package/types/models/deit/image_processing_deit.d.ts +6 -0
  134. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  135. package/types/models/detr/image_processing_detr.d.ts +42 -0
  136. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  137. package/types/models/donut/image_processing_donut.d.ts +7 -0
  138. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  139. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  140. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  141. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  142. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  143. package/types/models/feature_extractors.d.ts +10 -0
  144. package/types/models/feature_extractors.d.ts.map +1 -0
  145. package/types/models/florence2/processing_florence2.d.ts +39 -0
  146. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  147. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  148. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  149. package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
  150. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
  151. package/types/models/idefics3/processing_idefics3.d.ts +19 -0
  152. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
  153. package/types/models/image_processors.d.ts +37 -0
  154. package/types/models/image_processors.d.ts.map +1 -0
  155. package/types/models/janus/image_processing_janus.d.ts +7 -0
  156. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  157. package/types/models/janus/processing_janus.d.ts +77 -0
  158. package/types/models/janus/processing_janus.d.ts.map +1 -0
  159. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  160. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  161. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  162. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  163. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  164. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  165. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  166. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  167. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  168. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  169. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  170. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  171. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  172. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  173. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  174. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  175. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  176. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  177. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  178. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  179. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  180. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  181. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  182. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  183. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  184. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  185. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  186. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  187. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  188. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  189. package/types/models/processors.d.ts +13 -0
  190. package/types/models/processors.d.ts.map +1 -0
  191. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  192. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  193. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  194. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  195. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  196. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  197. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  198. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  199. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  200. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  201. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  202. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  203. package/types/models/sam/image_processing_sam.d.ts +103 -0
  204. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  205. package/types/models/sam/processing_sam.d.ts +9 -0
  206. package/types/models/sam/processing_sam.d.ts.map +1 -0
  207. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  208. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  209. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  210. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  211. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  212. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  213. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  214. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  215. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  216. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  217. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  218. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  219. package/types/models/vit/image_processing_vit.d.ts +6 -0
  220. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  221. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  222. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  223. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  224. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  225. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  226. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  227. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  228. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  229. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  230. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  231. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  232. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  233. package/types/models/whisper/processing_whisper.d.ts +17 -0
  234. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  235. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  236. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  237. package/types/models.d.ts +150 -0
  238. package/types/models.d.ts.map +1 -1
  239. package/types/pipelines.d.ts +2 -3
  240. package/types/pipelines.d.ts.map +1 -1
  241. package/types/tokenizers.d.ts +3 -0
  242. package/types/tokenizers.d.ts.map +1 -1
  243. package/types/transformers.d.ts +10 -1
  244. package/types/utils/constants.d.ts +6 -0
  245. package/types/utils/constants.d.ts.map +1 -1
  246. package/types/utils/core.d.ts +65 -3
  247. package/types/utils/core.d.ts.map +1 -1
  248. package/types/utils/dtypes.d.ts +3 -2
  249. package/types/utils/dtypes.d.ts.map +1 -1
  250. package/types/utils/hub.d.ts +1 -1
  251. package/types/utils/hub.d.ts.map +1 -1
  252. package/types/utils/image.d.ts +14 -2
  253. package/types/utils/image.d.ts.map +1 -1
  254. package/types/utils/tensor.d.ts +39 -4
  255. package/types/utils/tensor.d.ts.map +1 -1
  256. package/src/processors.js +0 -2655
  257. package/types/processors.d.ts +0 -924
  258. package/types/processors.d.ts.map +0 -1
@@ -0,0 +1,219 @@
1
+
2
+
3
+ import {
4
+ ImageProcessor,
5
+ } from "../../base/image_processors_utils.js";
6
+ import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
7
+
8
/**
 * Image processor for Idefics3 models.
 *
 * Each image is preprocessed by the base `ImageProcessor`, optionally split into
 * tiles of at most `max_image_size.longest_edge` per side (followed by a resized
 * global view), and then batched together with a pixel attention mask.
 */
export class Idefics3ImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config The image processor configuration. Reads
     * `do_image_splitting` (default `true`) and `max_image_size`.
     */
    constructor(config) {
        super(config);

        this.do_image_splitting = config.do_image_splitting ?? true;

        // `_call` destructures `longest_edge` from this object, so a config without
        // `max_image_size` used to crash there. Fall back to 364, which mirrors the
        // default of the Python reference implementation.
        this.max_image_size = config.max_image_size ?? { longest_edge: 364 };
    }

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     * @typedef {import('../../utils/tensor.js').Tensor} Tensor
     */

    /**
     * Calculate size to resize images to, to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
     * @param {Tensor} pixel_values Tensor of the image to resize. Only its last two dims (height, width) are read.
     * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
     * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
     * @returns {{height: number, width: number}} The target size; both values are multiples of `vision_encoder_max_size`.
     */
    get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
        let [height, width] = pixel_values.dims.slice(-2);

        const aspect_ratio = width / height;
        if (width >= height) {
            // Round the longer side up first, derive the other side from the aspect
            // ratio, then round that up to a multiple as well.
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
            height = Math.floor(width / aspect_ratio);
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
        } else {
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
            width = Math.floor(height * aspect_ratio);
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
        }
        return { height, width };
    }

    /**
     * Preprocess a single image, a list of images (one sample), or a list of lists
     * of images (one inner list per sample).
     * @param {RawImage|RawImage[]|RawImage[][]} images
     * @param {Object} [options]
     * @param {boolean|null} [options.do_image_splitting=null] Overrides `this.do_image_splitting` when not nullish.
     * @param {boolean} [options.return_row_col_info=false] Whether to add `rows` and `cols`
     * (per-sample, per-image split counts) to the result.
     * @returns {Promise<Object>} `pixel_values`, `pixel_attention_mask`, `original_sizes`,
     * `reshaped_input_sizes`, and optionally `rows`/`cols`.
     * @throws {Error} If an empty array (or an array whose first entry is falsy) is passed.
     */
    async _call(images, {
        do_image_splitting = null,
        return_row_col_info = false,
    } = {}) {

        // Normalize the input to a list (samples) of lists (images per sample).
        /** @type {RawImage[][]} */
        let batched_2d_images;
        if (!Array.isArray(images)) {
            batched_2d_images = [[images]];
        } else {
            if (images.length === 0 || !images[0]) {
                throw new Error("No images provided.");
            }
            if (!Array.isArray(images[0])) {
                batched_2d_images = [/** @type {RawImage[]} */(images)];
            } else {
                batched_2d_images = /** @type {RawImage[][]} */(images);
            }
        }

        // List of tensors, each with shape [patches, channels, height, width]
        const all_pixel_values = [];
        const images_list_rows = [];
        const images_list_cols = [];

        const original_sizes = [];
        const reshaped_input_sizes = [];
        for (const image_batch of batched_2d_images) {

            const images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));

            // Original sizes of images
            original_sizes.push(...images_list.map(x => x.original_size));

            // Reshaped sizes of images, before padding or cropping
            reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));

            // Convert images to 4D tensors for easier processing
            images_list.forEach(x => x.pixel_values.unsqueeze_(0));

            const { longest_edge } = this.max_image_size;

            /** @type {Tensor[]} */
            let images_tensor;
            if (do_image_splitting ?? this.do_image_splitting) {
                const image_rows = new Array(images_list.length);
                const image_cols = new Array(images_list.length);

                // We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
                images_tensor = await Promise.all(images_list.map(async (x, i) => {
                    const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);

                    const resized = await interpolate_4d(x.pixel_values, {
                        size: [new_size.height, new_size.width],
                    });

                    const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
                    image_rows[i] = num_splits_h;
                    image_cols[i] = num_splits_w;
                    return cat(frames, 0);
                }));

                images_list_rows.push(image_rows);
                images_list_cols.push(image_cols);

            } else {
                /** @type {[number, number]} */
                const size = [longest_edge, longest_edge];
                images_tensor = await Promise.all(
                    images_list.map(x => interpolate_4d(x.pixel_values, { size }))
                );

                // A row/col count of 0 marks an image that was not split.
                images_list_rows.push(new Array(images_list.length).fill(0));
                images_list_cols.push(new Array(images_list.length).fill(0));
            }

            all_pixel_values.push(cat(images_tensor, 0));
        }

        const batch_size = all_pixel_values.length;
        // Channel/height/width are taken from the first sample and reused for all samples.
        const [n, c, h, w] = all_pixel_values[0].dims;

        // Stack pixel values
        let pixel_values;
        let pixel_attention_mask;
        if (batch_size === 1) {
            pixel_values = all_pixel_values[0].unsqueeze_(0);
            pixel_attention_mask = full([batch_size, n, h, w], true);
        } else {
            // Add padding (if necessary) to images with less patches than the maximum number of patches
            const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));

            pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
            const pixel_attention_mask_data = pixel_attention_mask.data;
            const pixel_attention_mask_stride = max_num_patches * h * w;
            for (let i = 0; i < batch_size; ++i) {
                const num_patches = all_pixel_values[i].dims[0];
                if (num_patches < max_num_patches) {
                    // Append zero-filled patches so every sample has `max_num_patches` patches.
                    all_pixel_values[i] = cat([
                        all_pixel_values[i],
                        full([max_num_patches - num_patches, c, h, w], 0),
                    ], 0);

                    // Mark the padded patches of sample `i` as masked out.
                    const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                    const end_offset = (i + 1) * pixel_attention_mask_stride;
                    pixel_attention_mask_data.fill(false, start_offset, end_offset);
                }
            }
            pixel_values = stack(all_pixel_values, 0);
        }

        return {
            pixel_values,
            pixel_attention_mask,

            original_sizes,
            reshaped_input_sizes,
            ...(
                return_row_col_info
                    ? { rows: images_list_rows, cols: images_list_cols }
                    : {}
            ),
        }
    }

    /**
     * Split an image tensor into a grid of tiles and append a global view.
     *
     * If either spatial dimension exceeds `longest_edge`, the image is cropped into
     * `num_splits_h * num_splits_w` tiles (row-major order) and the full image is
     * resized to `longest_edge` x `longest_edge`. Otherwise the image is returned
     * as the only frame and both split counts are 0.
     * @param {Tensor} pixel_values 4D image tensor; the last two dims are height and width.
     * @param {{longest_edge: number}} max_image_size Maximum tile size.
     * @returns {Promise<{frames: Tensor[], num_splits_h: number, num_splits_w: number}>}
     * The tiles followed by the global image, plus the grid dimensions.
     */
    async split_image(pixel_values, { longest_edge }) {
        const max_height = longest_edge;
        const max_width = longest_edge;

        const frames = [];

        const [height, width] = pixel_values.dims.slice(-2);

        let num_splits_h = 0;
        let num_splits_w = 0;

        if (height > max_height || width > max_width) {
            // Calculate the number of splits
            num_splits_h = Math.ceil(height / max_height);
            num_splits_w = Math.ceil(width / max_width);

            // Calculate the optimal width and height for the sub-images
            const optimal_height = Math.ceil(height / num_splits_h);
            const optimal_width = Math.ceil(width / num_splits_w);

            // Iterate through each row and column
            for (let r = 0; r < num_splits_h; r++) {
                for (let c = 0; c < num_splits_w; c++) {
                    // Calculate the starting point of the crop
                    const start_x = c * optimal_width;
                    const start_y = r * optimal_height;

                    // Calculate the ending point of the crop
                    const end_x = Math.min(start_x + optimal_width, width);
                    const end_y = Math.min(start_y + optimal_height, height);

                    // Crop the image
                    frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x]));
                }
            }

            // Resize the global image to match max dimensions for memory efficiency
            const global_image_height = max_height;
            const global_image_width = max_width;

            if (height !== global_image_height || width !== global_image_width) {
                pixel_values = await interpolate_4d(pixel_values, {
                    size: [global_image_height, global_image_width],
                })
            }
        }

        // The (possibly resized) global image is always the last frame.
        frames.push(pixel_values);

        return { frames, num_splits_h, num_splits_w };
    }
}
@@ -0,0 +1,136 @@
1
+
2
+ import { Processor } from "../../base/processing_utils.js";
3
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
4
+ import { AutoTokenizer } from "../../tokenizers.js";
5
+ import { RawImage } from "../../utils/image.js";
6
+ import { count } from "../../utils/core.js";
7
+
8
+ /**
9
+ * Prompt with expanded image tokens for when the image is split into patches.
10
+ * @private
11
+ */
12
+ function _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token) {
13
+ let text_split_images = "";
14
+ for (let n_h = 0; n_h < image_rows; ++n_h) {
15
+ for (let n_w = 0; n_w < image_cols; ++n_w) {
16
+ text_split_images += (
17
+ fake_token_around_image +
18
+ `<row_${n_h + 1}_col_${n_w + 1}>` +
19
+ image_token.repeat(image_seq_len)
20
+ );
21
+ }
22
+ text_split_images += "\n";
23
+ }
24
+
25
+ text_split_images += (
26
+ `\n${fake_token_around_image}` +
27
+ `${global_img_token}` +
28
+ image_token.repeat(image_seq_len) +
29
+ `${fake_token_around_image}`
30
+ );
31
+ return text_split_images;
32
+ }
33
+
34
+ /**
35
+ * Prompt with expanded image tokens for a single image.
36
+ * @private
37
+ */
38
+ function _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token) {
39
+ return (
40
+ `${fake_token_around_image}` +
41
+ `${global_img_token}` +
42
+ image_token.repeat(image_seq_len) +
43
+ `${fake_token_around_image}`
44
+ );
45
+ }
46
+
47
/**
 * Build the prompt fragment that replaces one image token in the text.
 *
 * An image whose row and column counts are both 0 gets the single-image prompt;
 * any other image gets the split-image prompt.
 * @param {number} image_rows Number of tile rows.
 * @param {number} image_cols Number of tile columns.
 * @param {number} image_seq_len Number of image tokens per image segment.
 * @param {string} fake_token_around_image Token placed around image segments.
 * @param {string} image_token Image placeholder token that gets repeated.
 * @param {string} global_img_token Token that introduces the global image.
 * @returns {string} The expanded prompt fragment.
 */
function get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token) {
    const was_split = image_rows !== 0 || image_cols !== 0;
    return was_split
        ? _prompt_split_image(
            image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token
        )
        : _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token);
}
60
+
61
+
62
/**
 * Processor for Idefics3 models. Runs the image processor on the images and
 * expands every image token in the text into the full image prompt before
 * tokenizing.
 */
export class Idefics3Processor extends Processor {
    static image_processor_class = AutoImageProcessor
    static tokenizer_class = AutoTokenizer
    static uses_processor_config = true;

    fake_image_token = "<fake_token_around_image>";
    image_token = "<image>";
    global_img_token = "<global-img>";

    /**
     * Expand the image tokens in `text` and combine the tokenized text with the
     * image processor outputs.
     * @param {string|string[]} text One prompt or a batch of prompts. Each prompt must
     * contain exactly one image token per image supplied for that prompt.
     * @param {RawImage|RawImage[]|RawImage[][]} images The images. When omitted, the text is
     * tokenized unchanged.
     * @param {Object} [options] Options forwarded to the image processor.
     * `return_row_col_info` defaults to `true`.
     * @returns {Promise<any>} The tokenizer outputs merged with the image processor outputs.
     * @throws {Error} If the number of image tokens in a prompt differs from the number of
     * images for that prompt.
     */
    async _call(text, images = null, options = {}) {
        // Apply the default on a copy so the caller's options object is left untouched.
        const image_options = {
            ...options,
            return_row_col_info: options.return_row_col_info ?? true,
        };

        let image_inputs;

        if (images) {
            image_inputs = await this.image_processor(images, image_options);
        }

        // NOTE: We assume text is present
        if (!Array.isArray(text)) {
            text = [text];
        }

        if (!image_inputs) {
            // No images: there is nothing to expand. Previously this path crashed
            // while reading `rows` from an undefined image result.
            const text_only_inputs = this.tokenizer(text);
            return { ...text_only_inputs };
        }

        const image_rows = image_inputs.rows ?? [new Array(text.length).fill(0)];
        const image_cols = image_inputs.cols ?? [new Array(text.length).fill(0)];

        const image_seq_len = this.config.image_seq_len;
        const prompt_strings = [];
        for (let i = 0; i < text.length; ++i) {
            const sample = text[i];
            // A prompt without a matching image batch is treated as having no images.
            const sample_rows = image_rows[i] ?? [];
            const sample_cols = image_cols[i] ?? [];

            // Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
            const image_prompt_strings = sample_rows.map(
                (n_rows, j) => get_image_prompt_string(
                    n_rows,
                    sample_cols[j],
                    image_seq_len,
                    this.fake_image_token,
                    this.image_token,
                    this.global_img_token,
                )
            );

            // Splitting on the image token yields one more piece than there are tokens.
            const split_sample = sample.split(this.image_token);
            const n_images_in_text = split_sample.length - 1;
            if (n_images_in_text !== image_prompt_strings.length) {
                // Without this check a mismatch silently appended "undefined" to the
                // prompt (too few tokens) or dropped trailing text (too many tokens).
                throw new Error(
                    `The number of image tokens in the text (${n_images_in_text}) and the number of images (${image_prompt_strings.length}) should be the same.`
                );
            }

            // Place in the image prompt strings where the image tokens are
            let new_sample = split_sample[0];
            for (let j = 0; j < image_prompt_strings.length; ++j) {
                new_sample += image_prompt_strings[j] + split_sample[j + 1];
            }
            prompt_strings.push(new_sample);
        }

        const text_inputs = this.tokenizer(prompt_strings);

        return {
            ...text_inputs,
            ...image_inputs,
        }
    }
}
@@ -0,0 +1,37 @@
1
+
2
+ export * from './beit/image_processing_beit.js'
3
+ export * from './bit/image_processing_bit.js'
4
+ export * from './chinese_clip/image_processing_chinese_clip.js'
5
+ export * from './clip/image_processing_clip.js'
6
+ export * from './convnext/image_processing_convnext.js'
7
+ export * from './deit/image_processing_deit.js'
8
+ export * from './detr/image_processing_detr.js'
9
+ export * from './donut/image_processing_donut.js'
10
+ export * from './dpt/image_processing_dpt.js'
11
+ export * from './efficientnet/image_processing_efficientnet.js'
12
+ export * from './glpn/image_processing_glpn.js'
13
+ export * from './idefics3/image_processing_idefics3.js'
14
+ export * from './janus/image_processing_janus.js'
15
+ export * from './jina_clip/image_processing_jina_clip.js'
16
+ export * from './llava_onevision/image_processing_llava_onevision.js'
17
+ export * from './mask2former/image_processing_mask2former.js'
18
+ export * from './maskformer/image_processing_maskformer.js'
19
+ export * from './mobilenet_v1/image_processing_mobilenet_v1.js'
20
+ export * from './mobilenet_v2/image_processing_mobilenet_v2.js'
21
+ export * from './mobilenet_v3/image_processing_mobilenet_v3.js'
22
+ export * from './mobilenet_v4/image_processing_mobilenet_v4.js'
23
+ export * from './mobilevit/image_processing_mobilevit.js'
24
+ export * from './nougat/image_processing_nougat.js'
25
+ export * from './owlv2/image_processing_owlv2.js'
26
+ export * from './owlvit/image_processing_owlvit.js'
27
+ export * from './pvt/image_processing_pvt.js'
28
+ export * from './qwen2_vl/image_processing_qwen2_vl.js'
29
+ export * from './rt_detr/image_processing_rt_detr.js'
30
+ export * from './sam/image_processing_sam.js'
31
+ export * from './segformer/image_processing_segformer.js'
32
+ export * from './siglip/image_processing_siglip.js'
33
+ export * from './swin2sr/image_processing_swin2sr.js'
34
+ export * from './vit/image_processing_vit.js'
35
+ export * from './vitmatte/image_processing_vitmatte.js'
36
+ export * from './vitpose/image_processing_vitpose.js'
37
+ export * from './yolos/image_processing_yolos.js'
@@ -0,0 +1,26 @@
1
+
2
+ import {
3
+ ImageProcessor,
4
+ } from "../../base/image_processors_utils.js";
5
+
6
/**
 * Image processor that pads images to a square of `config.image_size`, centering
 * the image and filling the border with the rescaled `background_color`.
 */
export class VLMImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config The image processor configuration. `do_pad` and `pad_size`
     * are defaulted here and can be overridden by the same keys in `config`.
     */
    constructor(config) {
        const { image_size } = config;
        super({
            do_pad: true,
            pad_size: { width: image_size, height: image_size },
            ...config,
        });

        // Express the background colour in the same scale as the rescaled pixel data.
        const scale = this.rescale_factor;
        this.constant_values = this.config.background_color.map((channel) => channel * scale);
    }

    /**
     * Pad an image via the base implementation, defaulting to centered padding with
     * the background colour. Keys in `options` take precedence over these defaults.
     */
    pad_image(pixelData, imgDims, padSize, options) {
        const merged_options = {
            constant_values: this.constant_values,
            center: true,
            ...options,
        };
        return super.pad_image(pixelData, imgDims, padSize, merged_options);
    }
}
@@ -0,0 +1,123 @@
1
+
2
+ import { Processor } from "../../base/processing_utils.js";
3
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
4
+ import { AutoTokenizer } from "../../tokenizers.js";
5
+ import { mergeArrays } from "../../utils/core.js";
6
+ import { Tensor } from "../../utils/tensor.js";
7
+ import { RawImage } from "../../utils/image.js";
8
+
9
/**
 * Chat processor that renders a conversation with the tokenizer's chat template,
 * expands every image tag into a start tag, `num_image_tokens` placeholder tokens and an end tag,
 * and (when images are present) attaches the image processor's outputs.
 */
export class VLChatProcessor extends Processor {
    static image_processor_class = AutoImageProcessor
    static tokenizer_class = AutoTokenizer
    static uses_processor_config = true;

    /**
     * @param {Object} config The processor configuration (provides the image tags and token count).
     * @param {Object} components The sub-components forwarded to the base `Processor`.
     */
    constructor(config, components) {
        super(config, components);

        // Cache the image-related settings from the processor config on the instance.
        const {
            image_tag,
            image_start_tag,
            image_end_tag,
            num_image_tokens,
        } = this.config;
        this.image_tag = image_tag;
        this.image_start_tag = image_start_tag;
        this.image_end_tag = image_end_tag;
        this.num_image_tokens = num_image_tokens;
    }

    /**
     * @typedef {Object} MultimodalMessageProperties Additional properties for multimodal messages.
     * @property {(RawImage | string | URL)[]} [images] The images in the message.
     * @typedef {(import('../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation The conversation possibly containing multimodal inputs.
     */

    /**
     * @typedef {Object} VLCChatProcessorResult The processed input.
     * @property {Tensor} input_ids The input IDs.
     * @property {Tensor} attention_mask The attention mask.
     * @property {Tensor} images_seq_mask The image sequence mask.
     * @property {Tensor} images_emb_mask The image embedding mask.
     */

    /**
     * Builds model inputs from a (possibly multimodal) conversation.
     *
     * @param {MultimodalConversation} conversation The chat messages to process.
     * @param {Object} options Additional options for processing.
     * @param {RawImage|RawImage[]} [options.images] The images to process, if not set in the conversation.
     * @param {string} [options.chat_template="default"] The chat template to use.
     * @returns {Promise<VLCChatProcessorResult | VLCChatProcessorResult & import('../../base/image_processors_utils.js').ImageProcessorResult>} The processed input.
     * @throws {Error} If the number of images differs from the number of image tags in the rendered prompt.
     */
    async _call(conversation, {
        images = null,
        chat_template = "default",
    } = {}) {
        if (images) {
            // Accept a single image as shorthand for a one-element list.
            if (!Array.isArray(images)) {
                images = [images];
            }
        } else {
            // No images passed explicitly: collect and load those attached to the messages.
            const messages_with_images = conversation.filter((msg) => msg.images);
            const image_sources = messages_with_images.flatMap((msg) => msg.images);
            images = await Promise.all(image_sources.map((img) => RawImage.read(img)));
        }

        const { tokenizer } = this;
        const rendered = tokenizer.apply_chat_template(conversation, {
            tokenize: false,
            add_generation_prompt: true,
            chat_template,
        });

        // Text segments surrounding each image tag; N tags yield N + 1 segments.
        const segments = (/** @type {string} */(rendered)).split(this.image_tag);
        const num_images = segments.length - 1;
        if (images.length !== num_images) {
            throw new Error(`Number of images provided (${images.length}) does not match number of "${this.image_tag}" image tags (${num_images})`);
        }

        const special_tokens = [
            this.image_tag,
            this.image_start_tag,
            this.image_end_tag,
        ];
        const [
            image_placeholder_tag_id,
            image_start_tag_id,
            image_end_tag_id,
        ] = tokenizer.model.convert_tokens_to_ids(special_tokens);

        const encode = (text) => tokenizer.encode(text, { add_special_tokens: false });

        let input_ids = encode(segments[0]);
        let images_seq_mask = new Array(input_ids.length).fill(false);
        for (const segment of segments.slice(1)) {
            // Each image becomes: <start> + num_image_tokens placeholders + <end>, followed by the next text segment.
            const placeholder_ids = new Array(this.num_image_tokens).fill(image_placeholder_tag_id);
            const segment_ids = encode(segment);
            input_ids = mergeArrays(
                input_ids,
                [image_start_tag_id],
                placeholder_ids,
                [image_end_tag_id],
                segment_ids,
            );

            // Only the placeholder positions are flagged in the sequence mask.
            const placeholder_mask = new Array(this.num_image_tokens).fill(true);
            const segment_mask = new Array(segment_ids.length).fill(false);
            images_seq_mask = mergeArrays(
                images_seq_mask,
                [false],
                placeholder_mask,
                [false],
                segment_mask,
            );
        }

        const sequence_length = input_ids.length;
        const dims = [1, sequence_length];
        const attention = new Array(sequence_length).fill(1);
        const embedding_mask = new Array(num_images * this.num_image_tokens).fill(true);

        const final = {
            input_ids: new Tensor('int64', input_ids, dims),
            attention_mask: new Tensor('int64', attention, dims),
            images_seq_mask: new Tensor('bool', images_seq_mask, dims),
            images_emb_mask: new Tensor(
                'bool',
                embedding_mask,
                [1, num_images, this.num_image_tokens],
            ),
        };

        // Text-only conversation: nothing to run through the image processor.
        if (!images || !(images.length > 0)) {
            return final;
        }

        const image_inputs = await this.image_processor(images);
        // Set the batch_size dimension to 1
        image_inputs.pixel_values.unsqueeze_(0);
        return { ...final, ...image_inputs };
    }
}
@@ -0,0 +1,26 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+
5
/**
 * Image processor that translates a custom preprocessor config
 * (`resize_mode`, `interpolation`, scalar `size`) into the options understood by `ImageProcessor`.
 */
export class JinaCLIPImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config The custom preprocessor configuration.
     */
    constructor(config) {
        // JinaCLIPImageProcessor uses a custom preprocessor_config.json, so we configure it here.
        // `fill_color` is pulled out only so that it is not forwarded to the base class.
        const { resize_mode, fill_color, interpolation, size, ...other } = config;

        // Map the resize mode onto the corresponding size specification.
        let new_size;
        if (resize_mode === 'squash') {
            new_size = { width: size, height: size };
        } else if (resize_mode === 'shortest') {
            new_size = { shortest_edge: size };
        } else {
            new_size = { longest_edge: size };
        }

        // Resampling code: 3 for 'bicubic', 2 for anything else.
        let resample = 2;
        if (interpolation === 'bicubic') {
            resample = 3;
        }

        super({
            ...other,
            size: new_size,
            resample,
            do_center_crop: true,
            crop_size: size,
            do_normalize: true,
        });
    }
}
@@ -0,0 +1,24 @@
1
+
2
+ import { Processor } from "../../base/processing_utils.js";
3
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
4
+ import { AutoTokenizer } from "../../tokenizers.js";
5
+
6
/**
 * Processor that tokenizes text and/or preprocesses images and merges both results into one object.
 */
export class JinaCLIPProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    /**
     * @param {any} [text=null] The text input(s) passed to the tokenizer, if any.
     * @param {any} [images=null] The image input(s) passed to the image processor, if any.
     * @param {Object} [kwargs={}] Options forwarded to both the tokenizer and the image processor.
     * @returns {Promise<Object>} The tokenizer outputs merged with the image processor outputs
     * (image processor keys win on collision).
     * @throws {Error} If neither `text` nor `images` is provided.
     */
    async _call(text = null, images = null, kwargs = {}) {
        const has_text = Boolean(text);
        const has_images = Boolean(images);

        if (!(has_text || has_images)) {
            throw new Error('Either text or images must be provided');
        }

        let text_inputs = {};
        if (has_text) {
            text_inputs = this.tokenizer(text, kwargs);
        }

        let image_inputs = {};
        if (has_images) {
            image_inputs = await this.image_processor(images, kwargs);
        }

        return { ...text_inputs, ...image_inputs };
    }
}
@@ -0,0 +1,5 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+
5
/**
 * Image processor exported under the LLaVA-OneVision name.
 * Adds no members of its own; all behavior is inherited from `ImageProcessor`.
 */
export class LlavaOnevisionImageProcessor extends ImageProcessor { }
@@ -0,0 +1,5 @@
1
+
2
+ import { MaskFormerImageProcessor } from "../maskformer/image_processing_maskformer.js";
3
+
4
/**
 * Image processor exported under the Mask2Former name.
 * NOTE: extends MaskFormerImageProcessor and adds no members of its own.
 */
export class Mask2FormerImageProcessor extends MaskFormerImageProcessor {}
@@ -0,0 +1,18 @@
1
+ import {
2
+ ImageProcessor,
3
+ post_process_panoptic_segmentation,
4
+ post_process_instance_segmentation,
5
+ } from "../../base/image_processors_utils.js";
6
+
7
/**
 * Image processor that exposes the shared segmentation post-processing helpers as instance methods.
 */
export class MaskFormerImageProcessor extends ImageProcessor {

    /**
     * Forwards all arguments unchanged to the module-level `post_process_panoptic_segmentation` helper.
     * @type {typeof post_process_panoptic_segmentation}
     */
    post_process_panoptic_segmentation(...params) {
        const segmentation = post_process_panoptic_segmentation(...params);
        return segmentation;
    }

    /**
     * Forwards all arguments unchanged to the module-level `post_process_instance_segmentation` helper.
     * @type {typeof post_process_instance_segmentation}
     */
    post_process_instance_segmentation(...params) {
        const segmentation = post_process_instance_segmentation(...params);
        return segmentation;
    }
}
18
/**
 * Alternative name for `MaskFormerImageProcessor`; adds no members of its own.
 * NOTE(review): presumably kept for backward compatibility with the older "feature extractor" naming — confirm.
 */
export class MaskFormerFeatureExtractor extends MaskFormerImageProcessor {}