@huggingface/transformers 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. package/README.md +10 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +1062 -183
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +2239 -1232
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +1 -358
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +1 -421
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +1 -358
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +1082 -181
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +11 -16
  16. package/src/backends/onnx.js +2 -7
  17. package/src/base/image_processors_utils.js +3 -1
  18. package/src/configs.js +11 -2
  19. package/src/env.js +1 -1
  20. package/src/models/feature_extractors.js +1 -0
  21. package/src/models/idefics3/image_processing_idefics3.js +24 -13
  22. package/src/models/image_processors.js +1 -0
  23. package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
  24. package/src/models/moonshine/processing_moonshine.js +20 -0
  25. package/src/models/paligemma/processing_paligemma.js +82 -0
  26. package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
  27. package/src/models/phi3_v/processing_phi3_v.js +53 -0
  28. package/src/models/processors.js +3 -0
  29. package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
  30. package/src/models/pyannote/processing_pyannote.js +7 -54
  31. package/src/models.js +233 -35
  32. package/src/ops/registry.js +11 -0
  33. package/src/pipelines.js +30 -0
  34. package/src/tokenizers.js +12 -1
  35. package/src/utils/core.js +39 -9
  36. package/src/utils/hub.js +8 -12
  37. package/src/utils/image.js +40 -0
  38. package/src/utils/tensor.js +51 -1
  39. package/types/backends/onnx.d.ts +2 -2
  40. package/types/backends/onnx.d.ts.map +1 -1
  41. package/types/base/feature_extraction_utils.d.ts +1 -1
  42. package/types/base/feature_extraction_utils.d.ts.map +1 -1
  43. package/types/base/image_processors_utils.d.ts +4 -4
  44. package/types/base/image_processors_utils.d.ts.map +1 -1
  45. package/types/base/processing_utils.d.ts +4 -4
  46. package/types/base/processing_utils.d.ts.map +1 -1
  47. package/types/configs.d.ts +7 -7
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/env.d.ts +1 -1
  50. package/types/env.d.ts.map +1 -1
  51. package/types/generation/configuration_utils.d.ts +2 -2
  52. package/types/generation/logits_process.d.ts +2 -2
  53. package/types/generation/logits_process.d.ts.map +1 -1
  54. package/types/generation/logits_sampler.d.ts.map +1 -1
  55. package/types/generation/parameters.d.ts +5 -5
  56. package/types/generation/stopping_criteria.d.ts +1 -1
  57. package/types/generation/stopping_criteria.d.ts.map +1 -1
  58. package/types/generation/streamers.d.ts +2 -2
  59. package/types/generation/streamers.d.ts.map +1 -1
  60. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
  61. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
  62. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  63. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  64. package/types/models/auto/processing_auto.d.ts +1 -1
  65. package/types/models/auto/processing_auto.d.ts.map +1 -1
  66. package/types/models/clap/feature_extraction_clap.d.ts +1 -1
  67. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  68. package/types/models/detr/image_processing_detr.d.ts +11 -11
  69. package/types/models/detr/image_processing_detr.d.ts.map +1 -1
  70. package/types/models/donut/image_processing_donut.d.ts +1 -1
  71. package/types/models/donut/image_processing_donut.d.ts.map +1 -1
  72. package/types/models/feature_extractors.d.ts +1 -0
  73. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  74. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  75. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
  76. package/types/models/image_processors.d.ts +1 -0
  77. package/types/models/janus/image_processing_janus.d.ts +1 -1
  78. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  79. package/types/models/janus/processing_janus.d.ts.map +1 -1
  80. package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
  81. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
  82. package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
  83. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  84. package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
  85. package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
  86. package/types/models/moonshine/processing_moonshine.d.ts +17 -0
  87. package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
  88. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
  89. package/types/models/paligemma/processing_paligemma.d.ts +12 -0
  90. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
  91. package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
  92. package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
  93. package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
  94. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
  95. package/types/models/processors.d.ts +3 -0
  96. package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
  97. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  98. package/types/models/pyannote/processing_pyannote.d.ts +4 -15
  99. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  100. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  101. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
  102. package/types/models/sam/image_processing_sam.d.ts.map +1 -1
  103. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
  104. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
  105. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
  106. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
  107. package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
  108. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
  109. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
  110. package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
  111. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
  112. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
  113. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
  114. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
  115. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
  116. package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
  117. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  118. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  119. package/types/models/whisper/processing_whisper.d.ts.map +1 -1
  120. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
  121. package/types/models.d.ts +61 -5
  122. package/types/models.d.ts.map +1 -1
  123. package/types/ops/registry.d.ts +1 -0
  124. package/types/ops/registry.d.ts.map +1 -1
  125. package/types/pipelines.d.ts +31 -51
  126. package/types/pipelines.d.ts.map +1 -1
  127. package/types/tokenizers.d.ts +10 -6
  128. package/types/tokenizers.d.ts.map +1 -1
  129. package/types/utils/audio.d.ts.map +1 -1
  130. package/types/utils/constants.d.ts.map +1 -1
  131. package/types/utils/core.d.ts +87 -22
  132. package/types/utils/core.d.ts.map +1 -1
  133. package/types/utils/data-structures.d.ts.map +1 -1
  134. package/types/utils/devices.d.ts.map +1 -1
  135. package/types/utils/dtypes.d.ts.map +1 -1
  136. package/types/utils/generic.d.ts.map +1 -1
  137. package/types/utils/hub.d.ts +3 -3
  138. package/types/utils/hub.d.ts.map +1 -1
  139. package/types/utils/image.d.ts +10 -1
  140. package/types/utils/image.d.ts.map +1 -1
  141. package/types/utils/maths.d.ts +10 -10
  142. package/types/utils/maths.d.ts.map +1 -1
  143. package/types/utils/tensor.d.ts +22 -6
  144. package/types/utils/tensor.d.ts.map +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/transformers",
3
- "version": "3.1.1",
3
+ "version": "3.2.0",
4
4
  "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
5
5
  "main": "./src/transformers.js",
6
6
  "types": "./types/transformers.d.ts",
@@ -21,12 +21,6 @@
21
21
  "default": "./dist/transformers.js"
22
22
  }
23
23
  },
24
- "imports": {
25
- "#onnxruntime-webgpu": {
26
- "node": "onnxruntime-web",
27
- "default": "onnxruntime-web/webgpu"
28
- }
29
- },
30
24
  "scripts": {
31
25
  "format": "prettier --write .",
32
26
  "format:check": "prettier --check .",
@@ -63,22 +57,23 @@
63
57
  "dependencies": {
64
58
  "@huggingface/jinja": "^0.3.2",
65
59
  "onnxruntime-node": "1.20.1",
66
- "onnxruntime-web": "1.20.1",
60
+ "onnxruntime-web": "1.21.0-dev.20241205-d27fecd3d3",
67
61
  "sharp": "^0.33.5"
68
62
  },
69
63
  "devDependencies": {
70
64
  "@types/jest": "^29.5.14",
71
- "@webgpu/types": "^0.1.44",
65
+ "@types/node": "^22.10.1",
66
+ "@webgpu/types": "^0.1.51",
72
67
  "catharsis": "github:xenova/catharsis",
73
68
  "jest": "^30.0.0-alpha.6",
74
69
  "jest-environment-node": "^30.0.0-alpha.6",
75
- "jsdoc-to-markdown": "^8.0.1",
76
- "prettier": "3.3.3",
77
- "typescript": "^5.2.2",
78
- "wavefile": "^11.0.0",
79
- "webpack": "^5.80.0",
80
- "webpack-cli": "^5.0.2",
81
- "webpack-dev-server": "^4.13.3"
70
+ "jsdoc-to-markdown": "^9.1.1",
71
+ "prettier": "3.4.2",
72
+ "typescript": "^5.7.2",
73
+ "wavefile": "11.0.0",
74
+ "webpack": "^5.97.1",
75
+ "webpack-cli": "^5.1.4",
76
+ "webpack-dev-server": "^5.1.0"
82
77
  },
83
78
  "files": [
84
79
  "src",
@@ -21,12 +21,7 @@ import { env, apis } from '../env.js';
21
21
  // NOTE: Import order matters here. We need to import `onnxruntime-node` before `onnxruntime-web`.
22
22
  // In either case, we select the default export if it exists, otherwise we use the named export.
23
23
  import * as ONNX_NODE from 'onnxruntime-node';
24
-
25
- // Use subpath-imports to ensure Node.js and browser interoperability.
26
- // See package.json and https://nodejs.org/api/packages.html#subpath-imports
27
- // for more information.
28
- // @ts-ignore
29
- import * as ONNX_WEB from '#onnxruntime-webgpu';
24
+ import * as ONNX_WEB from 'onnxruntime-web';
30
25
 
31
26
  export { Tensor } from 'onnxruntime-common';
32
27
 
@@ -68,7 +63,7 @@ if (ORT_SYMBOL in globalThis) {
68
63
  } else if (apis.IS_NODE_ENV) {
69
64
  ONNX = ONNX_NODE.default ?? ONNX_NODE;
70
65
 
71
- // Updated as of ONNX Runtime 1.18.0
66
+ // Updated as of ONNX Runtime 1.20.1
72
67
  // The following table lists the supported versions of ONNX Runtime Node.js binding provided with pre-built binaries.
73
68
  // | EPs/Platforms | Windows x64 | Windows arm64 | Linux x64 | Linux arm64 | MacOS x64 | MacOS arm64 |
74
69
  // | ------------- | ----------- | ------------- | ----------------- | ----------- | --------- | ----------- |
@@ -699,7 +699,7 @@ export class ImageProcessor extends Callable {
699
699
  * Pad the image by a certain amount.
700
700
  * @param {Float32Array} pixelData The pixel data to pad.
701
701
  * @param {number[]} imgDims The dimensions of the image (height, width, channels).
702
- * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
702
+ * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
703
703
  * @param {Object} options The options for padding.
704
704
  * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
705
705
  * @param {boolean} [options.center=false] Whether to center the image.
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable {
717
717
  if (typeof padSize === 'number') {
718
718
  paddedImageWidth = padSize;
719
719
  paddedImageHeight = padSize;
720
+ } else if (padSize === 'square') {
721
+ paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
720
722
  } else {
721
723
  paddedImageWidth = padSize.width;
722
724
  paddedImageHeight = padSize.height;
package/src/configs.js CHANGED
@@ -95,8 +95,6 @@ function getNormalizedConfig(config) {
95
95
  case 'gpt_neox':
96
96
  case 'stablelm':
97
97
  case 'opt':
98
- case 'phi':
99
- case 'phi3':
100
98
  case 'falcon':
101
99
  mapping['num_heads'] = 'num_attention_heads';
102
100
  mapping['num_layers'] = 'num_hidden_layers';
@@ -104,6 +102,7 @@ function getNormalizedConfig(config) {
104
102
  break;
105
103
  case 'llama':
106
104
  case 'olmo':
105
+ case 'olmo2':
107
106
  case 'mobilellm':
108
107
  case 'granite':
109
108
  case 'cohere':
@@ -111,6 +110,9 @@ function getNormalizedConfig(config) {
111
110
  case 'starcoder2':
112
111
  case 'qwen2':
113
112
  case 'qwen2_vl':
113
+ case 'phi':
114
+ case 'phi3':
115
+ case 'phi3_v':
114
116
  mapping['num_heads'] = 'num_key_value_heads';
115
117
  mapping['num_layers'] = 'num_hidden_layers';
116
118
  mapping['hidden_size'] = 'hidden_size';
@@ -143,6 +145,12 @@ function getNormalizedConfig(config) {
143
145
  mapping['num_layers'] = 'n_layers';
144
146
  mapping['hidden_size'] = 'd_model';
145
147
  break;
148
+ case 'exaone':
149
+ mapping['num_heads'] = 'num_key_value_heads';
150
+ mapping['num_layers'] = 'num_layers';
151
+ mapping['dim_kv'] = 'head_dim';
152
+ mapping['num_attention_heads'] = 'num_attention_heads';
153
+ break;
146
154
 
147
155
  // Encoder-decoder models
148
156
  case 't5':
@@ -184,6 +192,7 @@ function getNormalizedConfig(config) {
184
192
  mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
185
193
  break;
186
194
  case 'musicgen_decoder':
195
+ case 'moonshine':
187
196
  mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
188
197
  mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
189
198
  mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
package/src/env.js CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
26
26
  import path from 'path';
27
27
  import url from 'url';
28
28
 
29
- const VERSION = '3.1.1';
29
+ const VERSION = '3.2.0';
30
30
 
31
31
  // Check if various APIs are available (depends on environment)
32
32
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -1,6 +1,7 @@
1
1
 
2
2
  export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
3
3
  export * from './clap/feature_extraction_clap.js';
4
+ export * from './moonshine/feature_extraction_moonshine.js';
4
5
  export * from './pyannote/feature_extraction_pyannote.js';
5
6
  export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
6
7
  export * from './speecht5/feature_extraction_speecht5.js';
@@ -3,7 +3,7 @@
3
3
  import {
4
4
  ImageProcessor,
5
5
  } from "../../base/image_processors_utils.js";
6
- import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
6
+ import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";
7
7
 
8
8
  export class Idefics3ImageProcessor extends ImageProcessor {
9
9
  constructor(config) {
@@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor {
186
186
  const optimal_width = Math.ceil(width / num_splits_w);
187
187
 
188
188
  // Iterate through each row and column
189
- for (let r = 0; r < num_splits_h; r++) {
190
- for (let c = 0; c < num_splits_w; c++) {
191
- // Calculate the starting point of the crop
192
- const start_x = c * optimal_width;
193
- const start_y = r * optimal_height;
194
-
195
- // Calculate the ending point of the crop
196
- const end_x = Math.min(start_x + optimal_width, width);
197
- const end_y = Math.min(start_y + optimal_height, height);
198
-
199
- // Crop the image
200
- frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x]));
189
+ for (let r = 0; r < num_splits_h; ++r) {
190
+ for (let c = 0; c < num_splits_w; ++c) {
191
+ let start_x, start_y, end_x, end_y;
192
+ if (r === num_splits_h - 1) { // At bottom
193
+ start_y = height - optimal_height;
194
+ end_y = height;
195
+ } else {
196
+ start_y = r * optimal_height;
197
+ end_y = (r + 1) * optimal_height;
198
+ }
199
+ if (c === num_splits_w - 1) { // At right
200
+ start_x = width - optimal_width;
201
+ end_x = width;
202
+ } else {
203
+ start_x = c * optimal_width;
204
+ end_x = (c + 1) * optimal_width;
205
+ }
206
+
207
+ const starts = [start_y, start_x];
208
+ const ends = [end_y, end_x];
209
+
210
+ const patch = await slice(pixel_values, starts, ends, [2, 3]);
211
+ frames.push(patch);
201
212
  }
202
213
  }
203
214
 
@@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js'
24
24
  export * from './nougat/image_processing_nougat.js'
25
25
  export * from './owlv2/image_processing_owlv2.js'
26
26
  export * from './owlvit/image_processing_owlvit.js'
27
+ export * from './phi3_v/image_processing_phi3_v.js'
27
28
  export * from './pvt/image_processing_pvt.js'
28
29
  export * from './qwen2_vl/image_processing_qwen2_vl.js'
29
30
  export * from './rt_detr/image_processing_rt_detr.js'
@@ -0,0 +1,26 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
+ import { Tensor } from '../../utils/tensor.js';
3
+
4
+
5
+ export class MoonshineFeatureExtractor extends FeatureExtractor {
6
+ /**
7
+ * Asynchronously extracts input values from a given audio using the provided configuration.
8
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
9
+ * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
10
+ */
11
+ async _call(audio) {
12
+ validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
13
+
14
+ if (audio instanceof Float64Array) {
15
+ audio = new Float32Array(audio);
16
+ }
17
+
18
+ const shape = [
19
+ 1, /* batch_size */
20
+ audio.length, /* num_samples */
21
+ ];
22
+ return {
23
+ input_values: new Tensor('float32', audio, shape),
24
+ };
25
+ }
26
+ }
@@ -0,0 +1,20 @@
1
+ import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
2
+ import { AutoTokenizer } from "../../tokenizers.js"
3
+ import { Processor } from "../../base/processing_utils.js"
4
+
5
+ /**
6
+ * Represents a MoonshineProcessor that extracts features from an audio input.
7
+ */
8
+ export class MoonshineProcessor extends Processor {
9
+ static tokenizer_class = AutoTokenizer
10
+ static feature_extractor_class = AutoFeatureExtractor
11
+
12
+ /**
13
+ * Calls the feature_extractor function with the given audio input.
14
+ * @param {any} audio The audio input to extract features from.
15
+ * @returns {Promise<any>} A Promise that resolves with the extracted features.
16
+ */
17
+ async _call(audio) {
18
+ return await this.feature_extractor(audio);
19
+ }
20
+ }
@@ -0,0 +1,82 @@
1
+ import { Processor } from "../../base/processing_utils.js";
2
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
+ import { AutoTokenizer } from "../../tokenizers.js";
4
+
5
+ const IMAGE_TOKEN = "<image>";
6
+
7
+ function build_string_from_input(
8
+ prompt,
9
+ bos_token,
10
+ image_seq_len,
11
+ image_token,
12
+ num_images,
13
+ ) {
14
+ return `${image_token.repeat(image_seq_len * num_images)}${bos_token}${prompt}\n`
15
+ }
16
+
17
+ export class PaliGemmaProcessor extends Processor {
18
+ static tokenizer_class = AutoTokenizer
19
+ static image_processor_class = AutoImageProcessor
20
+ static uses_processor_config = false;
21
+
22
+ /**
23
+ * @typedef {import('../../utils/image.js').RawImage} RawImage
24
+ */
25
+
26
+ // `images` is required, `text` is optional
27
+ async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
28
+ if (!text) {
29
+ console.warn(
30
+ "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
31
+ )
32
+ text = ""
33
+ }
34
+
35
+ if (!Array.isArray(images)) {
36
+ images = [images]
37
+ }
38
+
39
+ if (!Array.isArray(text)) {
40
+ text = [text]
41
+ }
42
+
43
+ const bos_token = this.tokenizer.bos_token;
44
+ const image_seq_length = this.image_processor.config.image_seq_length;
45
+ let input_strings;
46
+ if (text.some((t) => t.includes(IMAGE_TOKEN))) {
47
+ input_strings = text.map(
48
+ sample => {
49
+ const expanded_sample = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length));
50
+ const bos_rfind_index = expanded_sample.lastIndexOf(IMAGE_TOKEN);
51
+ const bos_index = bos_rfind_index === -1 ? 0 : bos_rfind_index + IMAGE_TOKEN.length;
52
+ return expanded_sample.slice(0, bos_index) + bos_token + expanded_sample.slice(bos_index) + "\n";
53
+ }
54
+ )
55
+ } else {
56
+ console.warn(
57
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
58
+ "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
59
+ "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
60
+ "each text has and add special tokens."
61
+ )
62
+
63
+ input_strings = text.map(
64
+ sample => build_string_from_input(
65
+ sample,
66
+ bos_token,
67
+ image_seq_length,
68
+ IMAGE_TOKEN,
69
+ images.length,
70
+ )
71
+ )
72
+ }
73
+
74
+ const text_inputs = this.tokenizer(input_strings, kwargs);
75
+ const image_inputs = await this.image_processor(images, kwargs);
76
+
77
+ return {
78
+ ...image_inputs,
79
+ ...text_inputs,
80
+ }
81
+ }
82
+ }
@@ -0,0 +1,163 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+ import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js";
5
+
6
+ const IMAGE_SIZE = 336;
7
+ const SLICE_AXES = [2, 3]; // axes to slice on
8
+ const { ceil, floor, sqrt } = Math;
9
+
10
+ export class Phi3VImageProcessor extends ImageProcessor {
11
+ constructor(config) {
12
+ super({
13
+ ...config,
14
+ do_normalize: true,
15
+ do_pad: true,
16
+ pad_size: 'custom',
17
+ do_convert_rgb: true,
18
+ do_resize: true, // Smart resizing "hd_transform"
19
+ });
20
+
21
+ this._num_crops = config.num_crops;
22
+ }
23
+ calc_num_image_tokens_from_image_size(width, height) {
24
+ // @ts-expect-error
25
+ const { num_img_tokens } = this.config;
26
+ return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
27
+ }
28
+
29
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
30
+ get_resize_output_image_size(image, size) {
31
+ const hd_num = this._num_crops;
32
+ const [width, height] = image.size
33
+
34
+ let ratio = width / height;
35
+ let scale = 1;
36
+
37
+ // Calculate the scaling factor
38
+ while (scale * Math.ceil(scale / ratio) <= hd_num) {
39
+ scale += 1;
40
+ }
41
+ scale -= 1;
42
+
43
+ // Compute the new dimensions
44
+ const new_w = Math.floor(scale * 336);
45
+ const new_h = Math.floor(new_w / ratio);
46
+
47
+ return [new_w, new_h]
48
+ }
49
+
50
+
51
+ /** @type {ImageProcessor['pad_image']} */
52
+ pad_image(pixelData, imgDims, padSize, options = {}) {
53
+ // Phi3V uses a custom padding strategy:
54
+ // - Pad to a multiple of 336
55
+ // - Pad with white pixels
56
+ const [imageHeight, imageWidth] = imgDims;
57
+ const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
58
+ const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
59
+
60
+ // NOTE: Since padding is done after normalization, we need to fill with the normalized values
61
+ const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]);
62
+ return super.pad_image(pixelData, imgDims, { width, height }, {
63
+ center: true,
64
+ constant_values,
65
+ ...options,
66
+ });
67
+ }
68
+
69
+ async _call(images, {
70
+ num_crops = null,
71
+ } = {}) {
72
+ // @ts-expect-error
73
+ this._num_crops = num_crops ??= this.config.num_crops;
74
+ if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
75
+ throw new Error("num_crops must be a square number >= 4");
76
+ }
77
+
78
+ if (!Array.isArray(images)) {
79
+ images = [images];
80
+ }
81
+
82
+ const num_images = images.length;
83
+ const imageData = await Promise.all(images.map(x => this.preprocess(x)));
84
+
85
+ const original_sizes = imageData.map(x => x.original_size);
86
+ const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
87
+
88
+ // Process each image in batch
89
+ const all_pixel_values = [];
90
+ for (const { pixel_values } of imageData) {
91
+ pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
92
+
93
+ const [height, width] = pixel_values.dims.slice(-2);
94
+
95
+ // Global image (Tensor of shape [num_channels, height, width])
96
+ const batch_pixel_values = await interpolate_4d(pixel_values, {
97
+ size: [IMAGE_SIZE, IMAGE_SIZE],
98
+ mode: 'bicubic',
99
+ });
100
+
101
+ if (num_crops > 0) {
102
+ const patches = [];
103
+ const sqrt_patches = sqrt(num_crops);
104
+ const patch_width = floor(width / sqrt_patches);
105
+ const patch_height = floor(height / sqrt_patches);
106
+ for (let y = 0; y < sqrt_patches; ++y) {
107
+ for (let x = 0; x < sqrt_patches; ++x) {
108
+ let start_x, start_y, end_x, end_y;
109
+ if (y === sqrt_patches - 1) { // At bottom
110
+ start_y = height - patch_height;
111
+ end_y = height;
112
+ } else {
113
+ start_y = y * patch_height;
114
+ end_y = (y + 1) * patch_height;
115
+ }
116
+ if (x === sqrt_patches - 1) { // At right
117
+ start_x = width - patch_width;
118
+ end_x = width;
119
+ } else {
120
+ start_x = x * patch_width;
121
+ end_x = (x + 1) * patch_width;
122
+ }
123
+
124
+ const starts = [start_y, start_x];
125
+ const ends = [end_y, end_x];
126
+ const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
127
+ patches.push(patch);
128
+ }
129
+ }
130
+
131
+ const resized_tensors = await interpolate_4d(cat(patches, 0), {
132
+ size: [IMAGE_SIZE, IMAGE_SIZE],
133
+ mode: 'bicubic',
134
+ }); // [num_crops, 3, 336, 336]
135
+
136
+ // Concatenate the global image with the patches
137
+ all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
138
+ } else {
139
+ // Only use the global image
140
+ // NOTE: Not currently supported in modelling code
141
+ all_pixel_values.push(batch_pixel_values);
142
+ }
143
+ }
144
+
145
+ // [num_images, 1 + num_crops, num_channels=3, height, width]
146
+ const pixel_values = stack(all_pixel_values, 0);
147
+
148
+ // Calculate padded image sizes
149
+ const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
150
+
151
+ const image_sizes = new Tensor(
152
+ 'int64',
153
+ sizes.flat(),
154
+ [num_images, 2],
155
+ );
156
+
157
+ const num_img_tokens = sizes.map(
158
+ ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
159
+ );
160
+
161
+ return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
162
+ }
163
+ }
@@ -0,0 +1,53 @@
1
+ import { Processor } from "../../base/processing_utils.js";
2
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
+ import { AutoTokenizer } from "../../tokenizers.js";
4
+ import { RawImage } from "../../utils/image.js";
5
+
6
+ const IMAGE_TOKEN = "<|image|>";
7
+ const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
8
+
9
+ export class Phi3VProcessor extends Processor {
10
+ static image_processor_class = AutoImageProcessor
11
+ static tokenizer_class = AutoTokenizer
12
+
13
+ /**
14
+ *
15
+ * @param {string|string[]} text
16
+ * @param {RawImage|RawImage[]} images
17
+ * @param {...any} args
18
+ * @returns {Promise<any>}
19
+ */
20
+ async _call(text, images = null, {
21
+ padding = true,
22
+ truncation = true,
23
+ num_crops = null,
24
+ } = {}) {
25
+
26
+ if (!Array.isArray(text)) {
27
+ text = [text];
28
+ }
29
+
30
+ let text_inputs, image_inputs;
31
+ if (images) {
32
+ image_inputs = await this.image_processor(images, { num_crops });
33
+ const { num_img_tokens } = image_inputs;
34
+
35
+ // The original implementation adds a bos_token before the image tokens
36
+ // TODO: Check if this affects performance, since it looks like a bug in the original implementation
37
+ const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));
38
+
39
+ text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
40
+
41
+ // The model expects image tokens to be negative, so we negate the image token ids
42
+ const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
43
+ text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
44
+ } else {
45
+ text_inputs = this.tokenizer(text);
46
+ }
47
+
48
+ return {
49
+ ...text_inputs,
50
+ ...image_inputs,
51
+ }
52
+ }
53
+ }
@@ -1,9 +1,12 @@
1
1
  export * from './florence2/processing_florence2.js';
2
2
  export * from './mgp_str/processing_mgp_str.js';
3
+ export * from './moonshine/processing_moonshine.js';
3
4
  export * from './idefics3/processing_idefics3.js';
4
5
  export * from './janus/processing_janus.js';
5
6
  export * from './jina_clip/processing_jina_clip.js';
6
7
  export * from './owlvit/processing_owlvit.js';
8
+ export * from './phi3_v/processing_phi3_v.js';
9
+ export * from './paligemma/processing_paligemma.js';
7
10
  export * from './pyannote/processing_pyannote.js';
8
11
  export * from './qwen2_vl/processing_qwen2_vl.js';
9
12
  export * from './sam/processing_sam.js';
@@ -1,5 +1,6 @@
1
1
  import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
2
  import { Tensor } from '../../utils/tensor.js';
3
+ import { max, softmax } from '../../utils/maths.js';
3
4
 
4
5
 
5
6
  export class PyAnnoteFeatureExtractor extends FeatureExtractor {
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
25
26
  };
26
27
  }
27
28
 
29
+ /**
30
+ * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
31
+ * @param {number} samples The number of frames in the audio.
32
+ * @returns {number} The number of frames in the audio.
33
+ */
34
+ samples_to_frames(samples) {
35
+ return ((samples - this.config.offset) / this.config.step);
36
+ }
37
+
38
+ /**
39
+ * Post-processes the speaker diarization logits output by the model.
40
+ * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
41
+ * @param {number} num_samples Number of samples in the input audio.
42
+ * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
43
+ */
44
+ post_process_speaker_diarization(logits, num_samples) {
45
+ const ratio = (
46
+ num_samples / this.samples_to_frames(num_samples)
47
+ ) / this.config.sampling_rate;
48
+
49
+ const results = [];
50
+ for (const scores of logits.tolist()) {
51
+ const accumulated_segments = [];
52
+
53
+ let current_speaker = -1;
54
+ for (let i = 0; i < scores.length; ++i) {
55
+ const probabilities = softmax(scores[i]);
56
+ const [score, id] = max(probabilities);
57
+ const [start, end] = [i, i + 1];
58
+
59
+ if (id !== current_speaker) {
60
+ // Speaker has changed
61
+ current_speaker = id;
62
+ accumulated_segments.push({ id, start, end, score });
63
+ } else {
64
+ // Continue the current segment
65
+ accumulated_segments.at(-1).end = end;
66
+ accumulated_segments.at(-1).score += score;
67
+ }
68
+ }
69
+
70
+ results.push(accumulated_segments.map(
71
+ // Convert frame-space to time-space
72
+ // and compute the confidence
73
+ ({ id, start, end, score }) => ({
74
+ id,
75
+ start: start * ratio,
76
+ end: end * ratio,
77
+ confidence: score / (end - start),
78
+ })
79
+ ));
80
+ }
81
+ return results;
82
+ }
83
+
28
84
  }