@huggingface/transformers 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. package/README.md +14 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16607 -13472
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +16601 -13451
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +238 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +229 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +240 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16017 -12878
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +7 -7
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -3
  20. package/src/env.js +15 -4
  21. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  22. package/src/models/auto/feature_extraction_auto.js +41 -0
  23. package/src/models/auto/image_processing_auto.js +29 -0
  24. package/src/models/auto/processing_auto.js +100 -0
  25. package/src/models/beit/image_processing_beit.js +5 -0
  26. package/src/models/bit/image_processing_bit.js +5 -0
  27. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  28. package/src/models/clap/feature_extraction_clap.js +159 -0
  29. package/src/models/clip/image_processing_clip.js +6 -0
  30. package/src/models/convnext/image_processing_convnext.js +45 -0
  31. package/src/models/deit/image_processing_deit.js +6 -0
  32. package/src/models/detr/image_processing_detr.js +52 -0
  33. package/src/models/donut/image_processing_donut.js +31 -0
  34. package/src/models/dpt/image_processing_dpt.js +6 -0
  35. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  36. package/src/models/feature_extractors.js +12 -0
  37. package/src/models/florence2/processing_florence2.js +128 -0
  38. package/src/models/glpn/image_processing_glpn.js +5 -0
  39. package/src/models/image_processors.js +36 -0
  40. package/src/models/janus/image_processing_janus.js +26 -0
  41. package/src/models/janus/processing_janus.js +123 -0
  42. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  43. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  44. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  45. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  46. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  47. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  48. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  49. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  50. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  51. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  52. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  53. package/src/models/nougat/image_processing_nougat.js +5 -0
  54. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  55. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  56. package/src/models/owlvit/processing_owlvit.js +7 -0
  57. package/src/models/processors.js +11 -0
  58. package/src/models/pvt/image_processing_pvt.js +5 -0
  59. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  60. package/src/models/pyannote/processing_pyannote.js +71 -0
  61. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  62. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  63. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  64. package/src/models/sam/image_processing_sam.js +242 -0
  65. package/src/models/sam/processing_sam.js +20 -0
  66. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  67. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  68. package/src/models/segformer/image_processing_segformer.js +13 -0
  69. package/src/models/siglip/image_processing_siglip.js +5 -0
  70. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  71. package/src/models/speecht5/processing_speecht5.js +17 -0
  72. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  73. package/src/models/vit/image_processing_vit.js +7 -0
  74. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  75. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  76. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  77. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  78. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  79. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  80. package/src/models/whisper/processing_whisper.js +21 -0
  81. package/src/models/yolos/image_processing_yolos.js +12 -0
  82. package/src/models.js +695 -32
  83. package/src/pipelines.js +8 -8
  84. package/src/tokenizers.js +5 -0
  85. package/src/transformers.js +15 -2
  86. package/src/utils/constants.js +8 -1
  87. package/src/utils/core.js +37 -9
  88. package/src/utils/hub.js +2 -1
  89. package/src/utils/image.js +68 -17
  90. package/src/utils/tensor.js +33 -1
  91. package/types/base/feature_extraction_utils.d.ts +41 -0
  92. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  93. package/types/base/image_processors_utils.d.ts +323 -0
  94. package/types/base/image_processors_utils.d.ts.map +1 -0
  95. package/types/base/processing_utils.d.ts +80 -0
  96. package/types/base/processing_utils.d.ts.map +1 -0
  97. package/types/configs.d.ts +4 -1
  98. package/types/configs.d.ts.map +1 -1
  99. package/types/env.d.ts.map +1 -1
  100. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  101. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  102. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  103. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  104. package/types/models/auto/image_processing_auto.d.ts +5 -0
  105. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  106. package/types/models/auto/processing_auto.d.ts +35 -0
  107. package/types/models/auto/processing_auto.d.ts.map +1 -0
  108. package/types/models/beit/image_processing_beit.d.ts +4 -0
  109. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  110. package/types/models/bit/image_processing_bit.d.ts +4 -0
  111. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  112. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  113. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  114. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  115. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  116. package/types/models/clip/image_processing_clip.d.ts +6 -0
  117. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  118. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  119. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  120. package/types/models/deit/image_processing_deit.d.ts +6 -0
  121. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  122. package/types/models/detr/image_processing_detr.d.ts +42 -0
  123. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  124. package/types/models/donut/image_processing_donut.d.ts +7 -0
  125. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  126. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  127. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  128. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  129. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  130. package/types/models/feature_extractors.d.ts +10 -0
  131. package/types/models/feature_extractors.d.ts.map +1 -0
  132. package/types/models/florence2/processing_florence2.d.ts +39 -0
  133. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  134. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  135. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  136. package/types/models/image_processors.d.ts +36 -0
  137. package/types/models/image_processors.d.ts.map +1 -0
  138. package/types/models/janus/image_processing_janus.d.ts +7 -0
  139. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  140. package/types/models/janus/processing_janus.d.ts +77 -0
  141. package/types/models/janus/processing_janus.d.ts.map +1 -0
  142. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  143. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  144. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  145. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  146. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  147. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  148. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  149. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  150. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  151. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  152. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  153. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  154. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  155. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  156. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  157. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  158. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  159. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  160. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  161. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  162. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  163. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  164. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  165. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  166. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  167. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  168. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  169. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  170. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  171. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  172. package/types/models/processors.d.ts +12 -0
  173. package/types/models/processors.d.ts.map +1 -0
  174. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  175. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  176. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  177. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  178. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  179. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  180. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  181. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  182. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  183. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  184. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  185. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  186. package/types/models/sam/image_processing_sam.d.ts +103 -0
  187. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  188. package/types/models/sam/processing_sam.d.ts +9 -0
  189. package/types/models/sam/processing_sam.d.ts.map +1 -0
  190. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  191. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  192. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  193. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  194. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  195. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  196. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  197. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  198. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  199. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  200. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  201. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  202. package/types/models/vit/image_processing_vit.d.ts +6 -0
  203. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  204. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  205. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  206. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  207. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  208. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  209. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  210. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  211. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  212. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  213. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  214. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  215. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  216. package/types/models/whisper/processing_whisper.d.ts +17 -0
  217. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  218. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  219. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  220. package/types/models.d.ts +152 -0
  221. package/types/models.d.ts.map +1 -1
  222. package/types/pipelines.d.ts +2 -3
  223. package/types/pipelines.d.ts.map +1 -1
  224. package/types/tokenizers.d.ts +3 -0
  225. package/types/tokenizers.d.ts.map +1 -1
  226. package/types/transformers.d.ts +10 -1
  227. package/types/utils/constants.d.ts +6 -0
  228. package/types/utils/constants.d.ts.map +1 -1
  229. package/types/utils/core.d.ts +58 -3
  230. package/types/utils/core.d.ts.map +1 -1
  231. package/types/utils/hub.d.ts +1 -1
  232. package/types/utils/hub.d.ts.map +1 -1
  233. package/types/utils/image.d.ts +10 -2
  234. package/types/utils/image.d.ts.map +1 -1
  235. package/types/utils/tensor.d.ts +34 -1
  236. package/types/utils/tensor.d.ts.map +1 -1
  237. package/src/processors.js +0 -2655
  238. package/types/processors.d.ts +0 -924
  239. package/types/processors.d.ts.map +0 -1
@@ -0,0 +1,26 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for JinaCLIP models.
 *
 * JinaCLIP ships a custom `preprocessor_config.json`, so this class translates
 * its fields (`resize_mode`, `interpolation`, scalar `size`, ...) into the
 * options understood by the base `ImageProcessor` before delegating to it.
 */
export class JinaCLIPImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config The raw JinaCLIP preprocessor configuration.
     */
    constructor(config) {
        const { resize_mode, fill_color, interpolation, size, ...rest } = config;

        // Map the JinaCLIP resize mode onto the base processor's size schema.
        let mapped_size;
        if (resize_mode === 'squash') {
            mapped_size = { width: size, height: size };
        } else if (resize_mode === 'shortest') {
            mapped_size = { shortest_edge: size };
        } else {
            mapped_size = { longest_edge: size };
        }

        // 3 = bicubic, 2 = bilinear (PIL-style resampling codes).
        const resample = interpolation === 'bicubic' ? 3 : 2;

        super({
            ...rest,
            size: mapped_size,
            resample,
            do_center_crop: true,
            crop_size: size,
            do_normalize: true,
        });
    }
}
@@ -0,0 +1,24 @@
1
+
2
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";

/**
 * Processor for JinaCLIP: tokenizes text and/or preprocesses images, then
 * merges both result dictionaries into a single model-input object.
 */
export class JinaCLIPProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    /**
     * @param {string|string[]|null} text Text input(s) to tokenize.
     * @param {any} images Image input(s) to preprocess.
     * @param {Object} kwargs Extra options forwarded to both sub-processors.
     * @returns {Promise<Object>} Combined tokenizer and image-processor outputs.
     */
    async _call(text = null, images = null, kwargs = {}) {
        if (!text && !images) {
            throw new Error('Either text or images must be provided');
        }

        let text_inputs = {};
        if (text) {
            text_inputs = this.tokenizer(text, kwargs);
        }

        let image_inputs = {};
        if (images) {
            image_inputs = await this.image_processor(images, kwargs);
        }

        return { ...text_inputs, ...image_inputs };
    }
}
@@ -0,0 +1,5 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// LLaVA-OneVision uses the default image-preprocessing pipeline unchanged;
// this subclass only supplies the model-specific class name for auto-loading.
export class LlavaOnevisionImageProcessor extends ImageProcessor {}
@@ -0,0 +1,5 @@
1
+
2

import { MaskFormerImageProcessor } from "../maskformer/image_processing_maskformer.js";

// NOTE: extends MaskFormerImageProcessor — Mask2Former reuses MaskFormer's
// preprocessing and segmentation post-processing wholesale.
export class Mask2FormerImageProcessor extends MaskFormerImageProcessor { }
@@ -0,0 +1,18 @@
1
import {
    ImageProcessor,
    post_process_panoptic_segmentation,
    post_process_instance_segmentation,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for MaskFormer. In addition to the standard preprocessing,
 * it exposes the shared segmentation post-processing helpers as instance
 * methods so callers can invoke them directly on the processor object.
 */
export class MaskFormerImageProcessor extends ImageProcessor {

    /** @type {typeof post_process_panoptic_segmentation} */
    post_process_panoptic_segmentation(...args) {
        return post_process_panoptic_segmentation(...args);
    }
    /** @type {typeof post_process_instance_segmentation} */
    post_process_instance_segmentation(...args) {
        return post_process_instance_segmentation(...args);
    }
}
// Alias — presumably kept so configs referencing the legacy
// "MaskFormerFeatureExtractor" name keep working; verify against processing_auto.
export class MaskFormerFeatureExtractor extends MaskFormerImageProcessor { }
@@ -0,0 +1,170 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { max, softmax } from "../../utils/maths.js";

// Maps a prediction format to [decode-method name, EOS token id for that stream].
const DECODE_TYPE_MAPPING = {
    'char': ['char_decode', 1],
    'bpe': ['bpe_decode', 2],
    'wp': ['wp_decode', 102],
}

/**
 * Processor for MGP-STR (scene-text recognition). The model emits three
 * parallel logit streams — character, BPE, and WordPiece — which are decoded
 * independently and fused by picking, per sample, the prediction with the
 * highest confidence.
 */
export class MgpstrProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    /**
     * @returns {import('../../tokenizers.js').MgpstrTokenizer} The character tokenizer.
     */
    get char_tokenizer() {
        return this.components.char_tokenizer;
    }

    /**
     * @returns {import('../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer.
     */
    get bpe_tokenizer() {
        return this.components.bpe_tokenizer;
    }

    /**
     * @returns {import('../../tokenizers.js').BertTokenizer} The WordPiece tokenizer.
     */
    get wp_tokenizer() {
        return this.components.wp_tokenizer;
    }

    /**
     * Helper function to decode the model prediction logits.
     * @param {import('../../utils/tensor.js').Tensor} pred_logits Model prediction logits.
     * @param {string} format Type of model prediction. Must be one of ['char', 'bpe', 'wp'].
     * @returns {[string[], number[]]} The decoded sentences and their confidence scores.
     */
    _decode_helper(pred_logits, format) {
        if (!Object.hasOwn(DECODE_TYPE_MAPPING, format)) {
            throw new Error(`Format ${format} is not supported.`);
        }

        const [decoder_name, eos_token] = DECODE_TYPE_MAPPING[format];
        const decode_fn = this[decoder_name].bind(this);

        const [batch_size, batch_max_length] = pred_logits.dims;
        const all_ids = [];
        const conf_scores = [];

        /** @type {number[][][]} */
        const logits_list = pred_logits.tolist();
        for (let b = 0; b < batch_size; ++b) {
            const sample_logits = logits_list[b];
            const token_ids = [];
            const token_scores = [];

            // Start at index 1 to skip the first token.
            for (let t = 1; t < batch_max_length; ++t) {
                const [prob, index] = max(softmax(sample_logits[t]));
                // The EOS score is still counted towards the confidence product.
                token_scores.push(prob);
                // NOTE: == to match bigint and number
                if (index == eos_token) {
                    break;
                }
                token_ids.push(index);
            }

            // Confidence is the product of per-token max probabilities.
            const confidence = token_scores.length > 0
                ? token_scores.reduce((acc, s) => acc * s, 1)
                : 0;

            all_ids.push(token_ids);
            conf_scores.push(confidence);
        }

        return [decode_fn(all_ids), conf_scores];
    }

    /**
     * Convert a list of lists of char token ids into a list of strings by calling char tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of char decoded sentences.
     */
    char_decode(sequences) {
        const decoded = this.char_tokenizer.batch_decode(sequences);
        return decoded.map(str => str.replaceAll(' ', ''));
    }

    /**
     * Convert a list of lists of BPE token ids into a list of strings by calling BPE tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of BPE decoded sentences.
     */
    bpe_decode(sequences) {
        return this.bpe_tokenizer.batch_decode(sequences)
    }

    /**
     * Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of wp decoded sentences.
     */
    wp_decode(sequences) {
        const decoded = this.wp_tokenizer.batch_decode(sequences);
        return decoded.map(str => str.replaceAll(' ', ''));
    }

    /**
     * Convert a list of lists of token ids into a list of strings by calling decode.
     * @param {import('../../utils/tensor.js').Tensor[]} sequences List of tokenized input ids.
     * @returns {{generated_text: string[], scores: number[], char_preds: string[], bpe_preds: string[], wp_preds: string[]}}
     * Dictionary of all the outputs of the decoded results.
     * - generated_text: The final results after fusion of char, bpe, and wp.
     * - scores: The final scores after fusion of char, bpe, and wp.
     * - char_preds: The list of character decoded sentences.
     * - bpe_preds: The list of BPE decoded sentences.
     * - wp_preds: The list of wp decoded sentences.
     */
    batch_decode([char_logits, bpe_logits, wp_logits]) {
        const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
        const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
        const [wp_preds, wp_scores] = this._decode_helper(wp_logits, 'wp');

        const generated_text = [];
        const scores = [];
        for (let i = 0; i < char_preds.length; ++i) {
            // Per sample, keep whichever stream decoded with the highest confidence.
            const candidates = [char_preds[i], bpe_preds[i], wp_preds[i]];
            const [best_score, best_index] = max([char_scores[i], bpe_scores[i], wp_scores[i]]);
            generated_text.push(candidates[best_index]);
            scores.push(best_score);
        }

        return {
            generated_text,
            scores,
            char_preds,
            bpe_preds,
            wp_preds,
        }
    }

    /** @type {typeof Processor.from_pretrained} */
    static async from_pretrained(...args) {
        const processor = await super.from_pretrained(...args);

        // Load Transformers.js-compatible versions of the BPE and WordPiece tokenizers
        const bpe_tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2") // openai-community/gpt2
        const wp_tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased") // google-bert/bert-base-uncased

        // Rewire components: the loaded tokenizer becomes the char tokenizer.
        processor.components = {
            image_processor: processor.image_processor,
            char_tokenizer: processor.tokenizer,
            bpe_tokenizer,
            wp_tokenizer,
        }
        return processor;
    }

    /**
     * Preprocess images and, optionally, tokenize label text.
     * @param {any} images Image input(s) to preprocess.
     * @param {string|string[]|null} text Optional label text.
     * @returns {Promise<any>} Image-processor outputs, with `labels` added when text is given.
     */
    async _call(images, text = null) {
        const result = await this.image_processor(images);

        if (text) {
            result.labels = this.tokenizer(text).input_ids
        }

        return result;
    }
}
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV1 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV1ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV1FeatureExtractor extends MobileNetV1ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV2 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV2ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV2FeatureExtractor extends MobileNetV2ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV3 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV3ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV3FeatureExtractor extends MobileNetV3ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV4 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV4ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV4FeatureExtractor extends MobileNetV4ImageProcessor { }
@@ -0,0 +1,6 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// MobileViT uses the default preprocessing pipeline; only the class name differs.
export class MobileViTImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileViTFeatureExtractor extends MobileViTImageProcessor { }
@@ -0,0 +1,5 @@
1
+
2

import { DonutImageProcessor } from "../donut/image_processing_donut.js";

// NOTE: extends DonutImageProcessor — Nougat reuses Donut's preprocessing unchanged.
export class NougatImageProcessor extends DonutImageProcessor { }
@@ -0,0 +1,5 @@
1
+
2

import { OwlViTImageProcessor } from "../owlvit/image_processing_owlvit.js";

// NOTE: extends OwlViTImageProcessor — OWLv2 reuses OWL-ViT's preprocessing
// and object-detection post-processing unchanged.
export class Owlv2ImageProcessor extends OwlViTImageProcessor { }
@@ -0,0 +1,12 @@
1
import {
    ImageProcessor,
    post_process_object_detection,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for OWL-ViT. Exposes the shared object-detection
 * post-processing helper as an instance method.
 */
export class OwlViTImageProcessor extends ImageProcessor {
    /** @type {typeof post_process_object_detection} */
    post_process_object_detection(...args) {
        return post_process_object_detection(...args);
    }
}
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class OwlViTFeatureExtractor extends OwlViTImageProcessor { }
@@ -0,0 +1,7 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
// OWL-ViT pairs a tokenizer with an image processor; call behavior is
// inherited unchanged from the base `Processor`.
export class OwlViTProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor
}
@@ -0,0 +1,11 @@
1
// Barrel module re-exporting every model-specific processor class.
export * from './florence2/processing_florence2.js';
export * from './mgp_str/processing_mgp_str.js';
export * from './janus/processing_janus.js';
export * from './jina_clip/processing_jina_clip.js';
export * from './owlvit/processing_owlvit.js';
export * from './pyannote/processing_pyannote.js';
export * from './qwen2_vl/processing_qwen2_vl.js';
export * from './sam/processing_sam.js';
export * from './speecht5/processing_speecht5.js';
export * from './wav2vec2/processing_wav2vec2.js';
export * from './whisper/processing_whisper.js';
@@ -0,0 +1,5 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// PVT uses the default preprocessing pipeline; only the class name differs.
export class PvtImageProcessor extends ImageProcessor { }
@@ -0,0 +1,28 @@
1
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
import { Tensor } from '../../utils/tensor.js';


/**
 * Feature extractor for PyAnnote speaker-diarization models. The raw
 * waveform is passed through unchanged, wrapped into a
 * (batch_size=1, num_channels=1, num_samples) float32 tensor.
 */
export class PyAnnoteFeatureExtractor extends FeatureExtractor {
    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_values: Tensor; }>} The extracted input features.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'PyAnnoteFeatureExtractor');

        // The tensor is float32; down-convert 64-bit input first.
        const samples = audio instanceof Float64Array
            ? new Float32Array(audio)
            : audio;

        const input_values = new Tensor(
            'float32',
            samples,
            [1, 1, samples.length], // [batch_size, num_channels, num_samples]
        );
        return { input_values };
    }

}
@@ -0,0 +1,71 @@
1
import { Processor } from '../../base/processing_utils.js';
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
import { max, softmax } from '../../utils/maths.js';

/**
 * Processor for PyAnnote speaker-diarization models. Wraps the feature
 * extractor and converts frame-level logits into per-speaker time segments.
 */
export class PyAnnoteProcessor extends Processor {
    static feature_extractor_class = AutoFeatureExtractor

    /**
     * Calls the feature_extractor function with the given audio input.
     * @param {any} audio The audio input to extract features from.
     * @returns {Promise<any>} A Promise that resolves with the extracted features.
     */
    async _call(audio) {
        return await this.feature_extractor(audio)
    }

    /**
     * Converts a sample count to the model's output-frame count.
     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
     * @param {number} samples The number of audio samples.
     * @returns {number} The (possibly fractional) number of frames.
     */
    samples_to_frames(samples) {
        const { offset, step } = this.config;
        return (samples - offset) / step;
    }

    /**
     * Post-processes the speaker diarization logits output by the model.
     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
     * @param {number} num_samples Number of samples in the input audio.
     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
     */
    post_process_speaker_diarization(logits, num_samples) {
        // Seconds of audio represented by a single output frame.
        const ratio = (
            num_samples / this.samples_to_frames(num_samples)
        ) / this.config.sampling_rate;

        const results = [];
        for (const scores of logits.tolist()) {
            /** @type {Array<{ id: number, start: number, end: number, score: number }>} */
            const segments = [];
            let active_speaker = -1;

            for (let frame = 0; frame < scores.length; ++frame) {
                const [score, id] = max(softmax(scores[frame]));

                if (id === active_speaker) {
                    // Same speaker: extend the open segment and accumulate its score.
                    const last = segments[segments.length - 1];
                    last.end = frame + 1;
                    last.score += score;
                } else {
                    // Speaker changed: open a new one-frame segment.
                    active_speaker = id;
                    segments.push({ id, start: frame, end: frame + 1, score });
                }
            }

            // Convert frame-space to time-space and average the accumulated
            // score over the segment length to get a confidence.
            results.push(segments.map(({ id, start, end, score }) => ({
                id,
                start: start * ratio,
                end: end * ratio,
                confidence: score / (end - start),
            })));
        }
        return results;
    }
}
@@ -0,0 +1,52 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";
import { cat, Tensor } from "../../utils/tensor.js";

/**
 * Image processor for Qwen2-VL. After standard preprocessing, pixel values
 * are re-arranged into flattened spatio-temporal patches, returned together
 * with an `image_grid_thw` tensor describing the (temporal, height, width)
 * patch grid.
 */
export class Qwen2VLImageProcessor extends ImageProcessor {
    async _call(images, ...args) {
        const preprocessed = await super._call(images, ...args);
        const { original_sizes, reshaped_input_sizes } = preprocessed;

        // @ts-ignore
        const { temporal_patch_size, merge_size, patch_size } = this.config;

        let patches = preprocessed.pixel_values;
        if (patches.dims[0] === 1) {
            // Single frame: repeat along the temporal axis.
            // Equivalent to np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
            const copies = Array.from({ length: temporal_patch_size }, () => patches);
            patches = cat(copies, 0);
        }

        const [frames, channel, height, width] = patches.dims;
        const grid_t = frames / temporal_patch_size;
        const grid_h = Math.floor(height / patch_size);
        const grid_w = Math.floor(width / patch_size);

        // Split the image into patches and flatten each patch's features.
        const flatten_patches = patches
            .view(
                grid_t,
                temporal_patch_size,
                channel,
                Math.floor(grid_h / merge_size),
                merge_size,
                patch_size,
                Math.floor(grid_w / merge_size),
                merge_size,
                patch_size,
            )
            .permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
            .view(
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

        const image_grid_thw = new Tensor('int64', [grid_t, grid_h, grid_w], [1, 3]);

        return {
            pixel_values: flatten_patches,
            image_grid_thw,
            original_sizes,
            reshaped_input_sizes,
        }
    }
}

@@ -0,0 +1,52 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";

/**
 * Processor for Qwen2-VL. Preprocesses images and expands each
 * `<|image_pad|>` placeholder in the text into one pad token per merged
 * visual patch before tokenizing.
 */
export class Qwen2VLProcessor extends Processor {
    static image_processor_class = AutoImageProcessor
    static tokenizer_class = AutoTokenizer

    /**
     *
     * @param {string|string[]} text
     * @param {RawImage|RawImage[]} images
     * @param {...any} args
     * @returns {Promise<any>}
     */
    async _call(text, images = null, ...args) {
        const texts = Array.isArray(text) ? text : [text];

        let image_inputs;
        let image_grid_thw;
        if (images) {
            image_inputs = await this.image_processor(images);
            image_grid_thw = image_inputs.image_grid_thw;
        }

        let expanded = texts;
        if (image_grid_thw) {
            // Number of patches that get merged into one visual token.
            const merge_length = this.image_processor.config.merge_size ** 2;
            const grids = image_grid_thw.tolist();
            let grid_index = 0;

            expanded = texts.map((t) => {
                while (t.includes("<|image_pad|>")) {
                    // Grid entries are bigint: total patches = t * h * w.
                    const num_patches = Number(grids[grid_index++].reduce((a, b) => a * b, 1n));
                    const num_tokens = Math.floor(num_patches / merge_length);
                    // Use a temporary marker so already-expanded pads are not re-matched.
                    t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(num_tokens));
                }
                return t.replaceAll("<|placeholder|>", "<|image_pad|>");
            });
        }

        const text_inputs = this.tokenizer(expanded);

        return {
            ...text_inputs,
            ...image_inputs,
            // TODO: ...videos_inputs,
        }
    }
}
@@ -0,0 +1,12 @@
1
+ import {
2
+ ImageProcessor,
3
+ post_process_object_detection,
4
+ } from "../../base/image_processors_utils.js";
5
+
6
+
7
+ export class RTDetrImageProcessor extends ImageProcessor {
8
+ /** @type {typeof post_process_object_detection} */
9
+ post_process_object_detection(...args) {
10
+ return post_process_object_detection(...args);
11
+ }
12
+ }