npm - @huggingface/transformers - Versions diffs - 3.1.1 → 3.2.0 - Mend

@huggingface/transformers 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144) hide show

package/README.md +10 -4
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/transformers.cjs +1062 -183
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +2239 -1232
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +1 -358
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +1 -421
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +1 -358
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +1082 -181
package/dist/transformers.mjs.map +1 -1
package/package.json +11 -16
package/src/backends/onnx.js +2 -7
package/src/base/image_processors_utils.js +3 -1
package/src/configs.js +11 -2
package/src/env.js +1 -1
package/src/models/feature_extractors.js +1 -0
package/src/models/idefics3/image_processing_idefics3.js +24 -13
package/src/models/image_processors.js +1 -0
package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
package/src/models/moonshine/processing_moonshine.js +20 -0
package/src/models/paligemma/processing_paligemma.js +82 -0
package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
package/src/models/phi3_v/processing_phi3_v.js +53 -0
package/src/models/processors.js +3 -0
package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
package/src/models/pyannote/processing_pyannote.js +7 -54
package/src/models.js +233 -35
package/src/ops/registry.js +11 -0
package/src/pipelines.js +30 -0
package/src/tokenizers.js +12 -1
package/src/utils/core.js +39 -9
package/src/utils/hub.js +8 -12
package/src/utils/image.js +40 -0
package/src/utils/tensor.js +51 -1
package/types/backends/onnx.d.ts +2 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/base/feature_extraction_utils.d.ts +1 -1
package/types/base/feature_extraction_utils.d.ts.map +1 -1
package/types/base/image_processors_utils.d.ts +4 -4
package/types/base/image_processors_utils.d.ts.map +1 -1
package/types/base/processing_utils.d.ts +4 -4
package/types/base/processing_utils.d.ts.map +1 -1
package/types/configs.d.ts +7 -7
package/types/configs.d.ts.map +1 -1
package/types/env.d.ts +1 -1
package/types/env.d.ts.map +1 -1
package/types/generation/configuration_utils.d.ts +2 -2
package/types/generation/logits_process.d.ts +2 -2
package/types/generation/logits_process.d.ts.map +1 -1
package/types/generation/logits_sampler.d.ts.map +1 -1
package/types/generation/parameters.d.ts +5 -5
package/types/generation/stopping_criteria.d.ts +1 -1
package/types/generation/stopping_criteria.d.ts.map +1 -1
package/types/generation/streamers.d.ts +2 -2
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
package/types/models/auto/image_processing_auto.d.ts.map +1 -1
package/types/models/auto/processing_auto.d.ts +1 -1
package/types/models/auto/processing_auto.d.ts.map +1 -1
package/types/models/clap/feature_extraction_clap.d.ts +1 -1
package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
package/types/models/detr/image_processing_detr.d.ts +11 -11
package/types/models/detr/image_processing_detr.d.ts.map +1 -1
package/types/models/donut/image_processing_donut.d.ts +1 -1
package/types/models/donut/image_processing_donut.d.ts.map +1 -1
package/types/models/feature_extractors.d.ts +1 -0
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
package/types/models/image_processors.d.ts +1 -0
package/types/models/janus/image_processing_janus.d.ts +1 -1
package/types/models/janus/image_processing_janus.d.ts.map +1 -1
package/types/models/janus/processing_janus.d.ts.map +1 -1
package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
package/types/models/moonshine/processing_moonshine.d.ts +17 -0
package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
package/types/models/paligemma/processing_paligemma.d.ts +12 -0
package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
package/types/models/processors.d.ts +3 -0
package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
package/types/models/pyannote/processing_pyannote.d.ts +4 -15
package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
package/types/models/sam/image_processing_sam.d.ts.map +1 -1
package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
package/types/models/whisper/generation_whisper.d.ts.map +1 -1
package/types/models/whisper/processing_whisper.d.ts.map +1 -1
package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
package/types/models.d.ts +61 -5
package/types/models.d.ts.map +1 -1
package/types/ops/registry.d.ts +1 -0
package/types/ops/registry.d.ts.map +1 -1
package/types/pipelines.d.ts +31 -51
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts +10 -6
package/types/tokenizers.d.ts.map +1 -1
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/constants.d.ts.map +1 -1
package/types/utils/core.d.ts +87 -22
package/types/utils/core.d.ts.map +1 -1
package/types/utils/data-structures.d.ts.map +1 -1
package/types/utils/devices.d.ts.map +1 -1
package/types/utils/dtypes.d.ts.map +1 -1
package/types/utils/generic.d.ts.map +1 -1
package/types/utils/hub.d.ts +3 -3
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +10 -1
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +10 -10
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +22 -6
package/types/utils/tensor.d.ts.map +1 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.1.1",
+  "version": "3.2.0",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -21,12 +21,6 @@
       "default": "./dist/transformers.js"
     }
   },
-  "imports": {
-    "#onnxruntime-webgpu": {
-      "node": "onnxruntime-web",
-      "default": "onnxruntime-web/webgpu"
-    }
-  },
   "scripts": {
     "format": "prettier --write .",
     "format:check": "prettier --check .",
@@ -63,22 +57,23 @@
   "dependencies": {
     "@huggingface/jinja": "^0.3.2",
     "onnxruntime-node": "1.20.1",
-    "onnxruntime-web": "1.20.1",
+    "onnxruntime-web": "1.21.0-dev.20241205-d27fecd3d3",
     "sharp": "^0.33.5"
   },
   "devDependencies": {
     "@types/jest": "^29.5.14",
-    "@webgpu/types": "^0.1.44",
+    "@types/node": "^22.10.1",
+    "@webgpu/types": "^0.1.51",
     "catharsis": "github:xenova/catharsis",
     "jest": "^30.0.0-alpha.6",
     "jest-environment-node": "^30.0.0-alpha.6",
-    "jsdoc-to-markdown": "^8.0.1",
-    "prettier": "3.3.3",
-    "typescript": "^5.2.2",
-    "wavefile": "^11.0.0",
-    "webpack": "^5.80.0",
-    "webpack-cli": "^5.0.2",
-    "webpack-dev-server": "^4.13.3"
+    "jsdoc-to-markdown": "^9.1.1",
+    "prettier": "3.4.2",
+    "typescript": "^5.7.2",
+    "wavefile": "11.0.0",
+    "webpack": "^5.97.1",
+    "webpack-cli": "^5.1.4",
+    "webpack-dev-server": "^5.1.0"
   },
   "files": [
     "src",

package/src/backends/onnx.js CHANGED Viewed

@@ -21,12 +21,7 @@ import { env, apis } from '../env.js';
 // NOTE: Import order matters here. We need to import `onnxruntime-node` before `onnxruntime-web`.
 // In either case, we select the default export if it exists, otherwise we use the named export.
 import * as ONNX_NODE from 'onnxruntime-node';
-// Use subpath-imports to ensure Node.js and browser interoperability.
-// See package.json and https://nodejs.org/api/packages.html#subpath-imports
-// for more information.
-// @ts-ignore
-import * as ONNX_WEB from '#onnxruntime-webgpu';
+import * as ONNX_WEB from 'onnxruntime-web';
 export { Tensor } from 'onnxruntime-common';
@@ -68,7 +63,7 @@ if (ORT_SYMBOL in globalThis) {
 } else if (apis.IS_NODE_ENV) {
     ONNX = ONNX_NODE.default ?? ONNX_NODE;
-    // Updated as of ONNX Runtime 1.18.0
+    // Updated as of ONNX Runtime 1.20.1
     // The following table lists the supported versions of ONNX Runtime Node.js binding provided with pre-built binaries.
     // | EPs/Platforms | Windows x64 | Windows arm64 | Linux x64         | Linux arm64 | MacOS x64 | MacOS arm64 |
     // | ------------- | ----------- | ------------- | ----------------- | ----------- | --------- | ----------- |

package/src/base/image_processors_utils.js CHANGED Viewed

@@ -699,7 +699,7 @@ export class ImageProcessor extends Callable {
      * Pad the image by a certain amount.
      * @param {Float32Array} pixelData The pixel data to pad.
      * @param {number[]} imgDims The dimensions of the image (height, width, channels).
-     * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
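+     * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image. A number pads to a square with that side length; 'square' pads to a square whose side is the larger of the image's height and width.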
      * @param {Object} options The options for padding.
      * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
      * @param {boolean} [options.center=false] Whether to center the image.
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable {
         if (typeof padSize === 'number') {
             paddedImageWidth = padSize;
             paddedImageHeight = padSize;
+        } else if (padSize === 'square') {
+            paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
         } else {
             paddedImageWidth = padSize.width;
             paddedImageHeight = padSize.height;

package/src/configs.js CHANGED Viewed

@@ -95,8 +95,6 @@ function getNormalizedConfig(config) {
         case 'gpt_neox':
         case 'stablelm':
         case 'opt':
-        case 'phi':
-        case 'phi3':
         case 'falcon':
             mapping['num_heads'] = 'num_attention_heads';
             mapping['num_layers'] = 'num_hidden_layers';
@@ -104,6 +102,7 @@ function getNormalizedConfig(config) {
             break;
         case 'llama':
         case 'olmo':
+        case 'olmo2':
         case 'mobilellm':
         case 'granite':
         case 'cohere':
@@ -111,6 +110,9 @@ function getNormalizedConfig(config) {
         case 'starcoder2':
         case 'qwen2':
         case 'qwen2_vl':
+        case 'phi':
+        case 'phi3':
+        case 'phi3_v':
             mapping['num_heads'] = 'num_key_value_heads';
             mapping['num_layers'] = 'num_hidden_layers';
             mapping['hidden_size'] = 'hidden_size';
@@ -143,6 +145,12 @@ function getNormalizedConfig(config) {
             mapping['num_layers'] = 'n_layers';
             mapping['hidden_size'] = 'd_model';
             break;
+        case 'exaone':
+            mapping['num_heads'] = 'num_key_value_heads';
+            mapping['num_layers'] = 'num_layers';
+            mapping['dim_kv'] = 'head_dim';
+            mapping['num_attention_heads'] = 'num_attention_heads';
+            break;
         // Encoder-decoder models
         case 't5':
@@ -184,6 +192,7 @@ function getNormalizedConfig(config) {
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
             break;
         case 'musicgen_decoder':
+        case 'moonshine':
             mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
             mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';

package/src/env.js CHANGED Viewed

@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
-const VERSION = '3.1.1';
+const VERSION = '3.2.0';
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";

package/src/models/feature_extractors.js CHANGED Viewed

@@ -1,6 +1,7 @@
 export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
 export * from './clap/feature_extraction_clap.js';
+export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
 export * from './speecht5/feature_extraction_speecht5.js';

package/src/models/idefics3/image_processing_idefics3.js CHANGED Viewed

@@ -3,7 +3,7 @@
 import {
     ImageProcessor,
 } from "../../base/image_processors_utils.js";
-import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
+import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";
 export class Idefics3ImageProcessor extends ImageProcessor {
     constructor(config) {
@@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor {
             const optimal_width = Math.ceil(width / num_splits_w);
             // Iterate through each row and column
-            for (let r = 0; r < num_splits_h; r++) {
-                for (let c = 0; c < num_splits_w; c++) {
-                    // Calculate the starting point of the crop
-                    const start_x = c * optimal_width;
-                    const start_y = r * optimal_height;
-                    // Calculate the ending point of the crop
-                    const end_x = Math.min(start_x + optimal_width, width);
-                    const end_y = Math.min(start_y + optimal_height, height);
-                    // Crop the image
-                    frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x]));
+            for (let r = 0; r < num_splits_h; ++r) {
+                for (let c = 0; c < num_splits_w; ++c) {
+                    let start_x, start_y, end_x, end_y;
+                    if (r === num_splits_h - 1) { // At bottom
+                        start_y = height - optimal_height;
+                        end_y = height;
+                    } else {
+                        start_y = r * optimal_height;
+                        end_y = (r + 1) * optimal_height;
+                    }
+                    if (c === num_splits_w - 1) { // At right
+                        start_x = width - optimal_width;
+                        end_x = width;
+                    } else {
+                        start_x = c * optimal_width;
+                        end_x = (c + 1) * optimal_width;
+                    }
+                    const starts = [start_y, start_x];
+                    const ends = [end_y, end_x];
+                    const patch = await slice(pixel_values, starts, ends, [2, 3]);
+                    frames.push(patch);
                 }
             }

package/src/models/image_processors.js CHANGED Viewed

@@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js'
 export * from './nougat/image_processing_nougat.js'
 export * from './owlv2/image_processing_owlv2.js'
 export * from './owlvit/image_processing_owlvit.js'
+export * from './phi3_v/image_processing_phi3_v.js'
 export * from './pvt/image_processing_pvt.js'
 export * from './qwen2_vl/image_processing_qwen2_vl.js'
 export * from './rt_detr/image_processing_rt_detr.js'

package/src/models/moonshine/feature_extraction_moonshine.js ADDED Viewed

@@ -0,0 +1,26 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+export class MoonshineFeatureExtractor extends FeatureExtractor {
+    /**
+     * Wraps a raw audio waveform in a single-item batch tensor. The samples are passed
+     * through as-is: this method computes no spectrogram or other transformation, and it
+     * does not read any configuration value.
+     *
+     * The input is first passed to `validate_audio_inputs` (presumably rejecting unsupported
+     * input types; see that helper). A `Float64Array` is then copied into a new
+     * `Float32Array` so that the tensor is always built from float32 data; a `Float32Array`
+     * is handed to the `Tensor` constructor directly.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} A `float32` tensor constructed with
+     * shape `[1, audio.length]` (batch_size, num_samples) holding the samples.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
+        if (audio instanceof Float64Array) {
+            audio = new Float32Array(audio); // copy into a float32 buffer
+        }
+        const shape = [
+            1,            /* batch_size */
+            audio.length, /* num_samples */
+        ];
+        return {
+            input_values: new Tensor('float32', audio, shape),
+        };
+    }
+}

package/src/models/moonshine/processing_moonshine.js ADDED Viewed

@@ -0,0 +1,20 @@
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
+import { AutoTokenizer } from "../../tokenizers.js"
+import { Processor } from "../../base/processing_utils.js"
+/**
+ * Represents a MoonshineProcessor that extracts features from an audio input.
+ *
+ * The class declares both a tokenizer class and a feature extractor class, but `_call`
+ * only uses the feature extractor; the tokenizer is not touched when processing audio.
+ */
+export class MoonshineProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer // declared here; not used by `_call`
+    static feature_extractor_class = AutoFeatureExtractor // provides `this.feature_extractor` (presumably instantiated by the base Processor)
+    /**
+     * Calls the feature_extractor function with the given audio input and returns its
+     * result unchanged.
+     * @param {any} audio The audio input to extract features from.
+     * @returns {Promise<any>} A Promise that resolves with the extracted features.
+     */
+    async _call(audio) {
+        return await this.feature_extractor(audio);
+    }
+}

package/src/models/paligemma/processing_paligemma.js ADDED Viewed

@@ -0,0 +1,82 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+const IMAGE_TOKEN = "<image>"; // placeholder string that stands for image positions in a prompt
+function build_string_from_input( // Builds one prompt string: image tokens, then BOS, then the prompt, then "\n"
+    prompt, // the user text, inserted verbatim after the BOS token
+    bos_token, // beginning-of-sequence token string, placed between the image tokens and the prompt
+    image_seq_len, // how many times the image token is repeated per image
+    image_token, // the image placeholder string to repeat
+    num_images, // number of images; total repeats = image_seq_len * num_images
+) {
+    return `${image_token.repeat(image_seq_len * num_images)}${bos_token}${prompt}\n`
+}
+export class PaliGemmaProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static image_processor_class = AutoImageProcessor
+    static uses_processor_config = false;
+    /**
+     * @typedef {import('../../utils/image.js').RawImage} RawImage
+     */
+    // `images` is required, `text` is optional (a falsy `text` becomes an empty prompt); `kwargs` is forwarded unchanged to both the tokenizer and the image processor. Returns the merged image and text inputs.
+    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
+        if (!text) { // null, undefined and "" all take this path
+            console.warn(
+                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
+            )
+            text = ""
+        }
+        if (!Array.isArray(images)) { // normalize a single image to a one-element array
+            images = [images]
+        }
+        if (!Array.isArray(text)) { // normalize a single prompt to a one-element array
+            text = [text]
+        }
+        const bos_token = this.tokenizer.bos_token;
+        const image_seq_length = this.image_processor.config.image_seq_length; // repeats of the image token per image
+        let input_strings;
+        if (text.some((t) => t.includes(IMAGE_TOKEN))) { // at least one prompt already contains an explicit image placeholder
+            input_strings = text.map(
+                sample => {
+                    const expanded_sample = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length)); // each placeholder becomes image_seq_length placeholders
+                    const bos_rfind_index = expanded_sample.lastIndexOf(IMAGE_TOKEN);
+                    const bos_index = bos_rfind_index === -1 ? 0 : bos_rfind_index + IMAGE_TOKEN.length; // just after the last image token, or the start if this prompt has none
+                    return expanded_sample.slice(0, bos_index) + bos_token + expanded_sample.slice(bos_index) + "\n"; // insert BOS at that position and terminate with a newline
+                }
+            )
+        } else { // no prompt contains a placeholder; this includes the empty prompt substituted above, so both warnings are printed when `text` is omitted
+            console.warn(
+                "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
+                "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
+                "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
+                "each text has and add special tokens."
+            )
+            input_strings = text.map(
+                sample => build_string_from_input(
+                    sample,
+                    bos_token,
+                    image_seq_length,
+                    IMAGE_TOKEN,
+                    images.length, // every prompt is prefixed with tokens for ALL images passed in this call
+                )
+            )
+        }
+        const text_inputs = this.tokenizer(input_strings, kwargs);
+        const image_inputs = await this.image_processor(images, kwargs);
+        return {
+            ...image_inputs,
+            ...text_inputs, // spread last: on a key collision the tokenizer's value wins
+        }
+    }
+}

package/src/models/phi3_v/image_processing_phi3_v.js ADDED Viewed

@@ -0,0 +1,163 @@
+import {
+    ImageProcessor,
+} from "../../base/image_processors_utils.js";
+import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js";
+const IMAGE_SIZE = 336; // side length of the square tiles: padding targets are multiples of it and every crop is resized to IMAGE_SIZE x IMAGE_SIZE
+const SLICE_AXES = [2, 3]; // axes to slice on (the last two axes of the 4D pixel tensor)
+const { ceil, floor, sqrt } = Math;
+export class Phi3VImageProcessor extends ImageProcessor {
+    constructor(config) { // forces the preprocessing flags below regardless of what `config` contains (they are spread after it)
+        super({
+            ...config,
+            do_normalize: true,
+            do_pad: true,
+            pad_size: 'custom', // the pad_image override in this class computes its own target and never reads this value
+            do_convert_rgb: true,
+            do_resize: true, // Smart resizing "hd_transform"
+        });
+        this._num_crops = config.num_crops; // read by get_resize_output_image_size; overwritten at the start of every _call
+    }
+    calc_num_image_tokens_from_image_size(width, height) { // with h = floor(height / IMAGE_SIZE), w = floor(width / IMAGE_SIZE) and t = config.num_img_tokens: floor((h * w + 1) * t + 1 + (h + 1) * sqrt(t))
+        // @ts-expect-error
+        const { num_img_tokens } = this.config;
+        return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
+    }
+    /**
+     * Chooses the resize target from the image's aspect ratio and the crop budget
+     * `this._num_crops`: finds the largest integer `scale` for which
+     * `scale * ceil(scale / ratio) <= this._num_crops` (with `ratio = width / height`), then
+     * returns a width of `scale * 336` and a height that keeps the aspect ratio (floored).
+     * The `size` argument is not used. The literal 336 equals the module constant IMAGE_SIZE.
+     *
+     * NOTE(review): if `ceil(1 / ratio)` already exceeds the crop budget (an extremely tall
+     * image), the loop body never runs and `scale` ends at 0, which yields `[0, 0]`. Confirm
+     * whether such inputs can reach this method.
+     * @type {ImageProcessor['get_resize_output_image_size']}
+     */
+    get_resize_output_image_size(image, size) {
+        const hd_num = this._num_crops;
+        const [width, height] = image.size
+        let ratio = width / height;
+        let scale = 1;
+        // Calculate the scaling factor: grow `scale` until the tile count would exceed the budget
+        while (scale * Math.ceil(scale / ratio) <= hd_num) {
+            scale += 1;
+        }
+        scale -= 1; // step back to the last value that satisfied the condition
+        // Compute the new dimensions
+        const new_w = Math.floor(scale * 336);
+        const new_h = Math.floor(new_w / ratio);
+        return [new_w, new_h]
+    }
+    /**
+     * Pads the image so that both sides become multiples of IMAGE_SIZE (each rounded up),
+     * delegating to the base implementation with `center: true` and a white fill. The
+     * `padSize` argument is ignored; the target size is always derived from `imgDims`, which
+     * is read as `[height, width, ...]`. Caller-supplied `options` are spread last, so they
+     * override the `center` and `constant_values` defaults set here.
+     * @type {ImageProcessor['pad_image']}
+     */
+    pad_image(pixelData, imgDims, padSize, options = {}) {
+        // Phi3V uses a custom padding strategy:
+        // - Pad to a multiple of 336
+        // - Pad with white pixels
+        const [imageHeight, imageWidth] = imgDims;
+        const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
+        const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
+        // NOTE: Since padding is done after normalization, we need to fill with the normalized values
+        const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]); // per-channel (1 - mean) / std, i.e. the value 1 ("white") after normalization
+        return super.pad_image(pixelData, imgDims, { width, height }, {
+            center: true,
+            constant_values,
+            ...options,
+        });
+    }
+    async _call(images, { // preprocesses each image, then builds a global view plus a sqrt(num_crops) x sqrt(num_crops) grid of crops, all resized to IMAGE_SIZE x IMAGE_SIZE
+        num_crops = null, // crop budget; when null/undefined it falls back to this.config.num_crops
+    } = {}) {
+        // @ts-expect-error
+        this._num_crops = num_crops ??= this.config.num_crops;
+        if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) { // rejects anything below 4 and anything whose square root is not an integer
+            throw new Error("num_crops must be a square number >= 4");
+        }
+        if (!Array.isArray(images)) {
+            images = [images];
+        }
+        const num_images = images.length;
+        const imageData = await Promise.all(images.map(x => this.preprocess(x))); // presumably applies the resize/pad overrides defined in this class (confirm in the base ImageProcessor)
+        const original_sizes = imageData.map(x => x.original_size);
+        const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
+        // Process each image in batch
+        const all_pixel_values = [];
+        for (const { pixel_values } of imageData) {
+            pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
+            const [height, width] = pixel_values.dims.slice(-2);
+            // Global image: the whole image resized to IMAGE_SIZE x IMAGE_SIZE (presumably [1, num_channels, 336, 336], since the input was just made 4D)
+            const batch_pixel_values = await interpolate_4d(pixel_values, {
+                size: [IMAGE_SIZE, IMAGE_SIZE],
+                mode: 'bicubic',
+            });
+            if (num_crops > 0) { // always true at this point: values below 4 were rejected above
+                const patches = [];
+                const sqrt_patches = sqrt(num_crops); // crops per row and per column
+                const patch_width = floor(width / sqrt_patches);
+                const patch_height = floor(height / sqrt_patches);
+                for (let y = 0; y < sqrt_patches; ++y) {
+                    for (let x = 0; x < sqrt_patches; ++x) {
+                        let start_x, start_y, end_x, end_y;
+                        if (y === sqrt_patches - 1) { // At bottom: anchor the last row to the bottom edge so it keeps the full patch height
+                            start_y = height - patch_height;
+                            end_y = height;
+                        } else {
+                            start_y = y * patch_height;
+                            end_y = (y + 1) * patch_height;
+                        }
+                        if (x === sqrt_patches - 1) { // At right: anchor the last column to the right edge so it keeps the full patch width
+                            start_x = width - patch_width;
+                            end_x = width;
+                        } else {
+                            start_x = x * patch_width;
+                            end_x = (x + 1) * patch_width;
+                        }
+                        const starts = [start_y, start_x];
+                        const ends = [end_y, end_x];
+                        const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
+                        patches.push(patch);
+                    }
+                }
+                const resized_tensors = await interpolate_4d(cat(patches, 0), {
+                    size: [IMAGE_SIZE, IMAGE_SIZE],
+                    mode: 'bicubic',
+                }); // [num_crops, 3, 336, 336]
+                // Concatenate the global image with the patches
+                all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
+            } else {
+                // Only use the global image (unreachable with the num_crops validation above)
+                // NOTE: Not currently supported in modelling code
+                all_pixel_values.push(batch_pixel_values);
+            }
+        }
+        // [num_images, 1 + num_crops, num_channels=3, height, width]
+        const pixel_values = stack(all_pixel_values, 0);
+        // Calculate padded image sizes: each reshaped dimension rounded up to a multiple of IMAGE_SIZE
+        const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
+        const image_sizes = new Tensor(
+            'int64',
+            sizes.flat(),
+            [num_images, 2],
+        );
+        const num_img_tokens = sizes.map(
+            ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height), // treats each size entry as a [height, width] pair
+        );
+        return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
+    }
+}

package/src/models/phi3_v/processing_phi3_v.js ADDED Viewed

@@ -0,0 +1,53 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+import { RawImage } from "../../utils/image.js";
+const IMAGE_TOKEN = "<|image|>"; // token string the numbered placeholders are expanded into
+const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g; // matches numbered placeholders in the prompt such as <|image_1|>
+export class Phi3VProcessor extends Processor {
+    static image_processor_class = AutoImageProcessor
+    static tokenizer_class = AutoTokenizer
+    /**
+     * Prepares model inputs from one or more prompts and, optionally, images.
+     *
+     * With images: the images are run through the image processor, every numbered
+     * placeholder (e.g. `<|image_1|>`) in prompt `i` is replaced by `num_img_tokens[i]`
+     * copies of `<|image|>`, the prompts are tokenized, and the ids of the image token are
+     * negated in place. Because the count is looked up by prompt index, this assumes the
+     * i-th prompt corresponds to the i-th image, and every placeholder within one prompt
+     * receives the same number of tokens.
+     *
+     * Without images: the prompts are tokenized with the tokenizer's defaults; `padding`
+     * and `truncation` are not forwarded on this path.
+     *
+     * @param {string|string[]} text The prompt or prompts; a single string is wrapped in an array.
+     * @param {RawImage|RawImage[]} images The image or images; a falsy value selects the text-only path.
+     * @param {Object} [options] Optional settings.
+     * @param {boolean} [options.padding=true] Forwarded to the tokenizer (image path only).
+     * @param {boolean} [options.truncation=true] Forwarded to the tokenizer (image path only).
+     * @param {number|null} [options.num_crops=null] Forwarded to the image processor.
+     * @returns {Promise<any>} The tokenizer outputs merged with the image processor outputs
+     * (image processor keys win on a collision, since they are spread last).
+     */
+    async _call(text, images = null, {
+        padding = true,
+        truncation = true,
+        num_crops = null,
+    } = {}) {
+        if (!Array.isArray(text)) {
+            text = [text];
+        }
+        let text_inputs, image_inputs;
+        if (images) {
+            image_inputs = await this.image_processor(images, { num_crops });
+            const { num_img_tokens } = image_inputs;
+            // The original implementation adds a bos_token before the image tokens
+            // TODO: Check if this affects performance, since it looks like a bug in the original implementation
+            const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i]))); // split on the placeholders and re-join with the expanded image tokens
+            text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
+            // The model expects image tokens to be negative, so we negate the image token ids
+            const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
+            text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id); // NOTE(review): the loose == looks deliberate (ids are presumably BigInt while image_token_id is a number); confirm before tightening
+        } else {
+            text_inputs = this.tokenizer(text);
+        }
+        return {
+            ...text_inputs,
+            ...image_inputs, // undefined on the text-only path, where spreading it adds nothing
+        }
+    }
+}

package/src/models/processors.js CHANGED Viewed

@@ -1,9 +1,12 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './moonshine/processing_moonshine.js';
 export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';
+export * from './phi3_v/processing_phi3_v.js';
+export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
 export * from './sam/processing_sam.js';

package/src/models/pyannote/feature_extraction_pyannote.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
 import { Tensor } from '../../utils/tensor.js';
+import { max, softmax } from '../../utils/maths.js';
 export class PyAnnoteFeatureExtractor extends FeatureExtractor {
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
         };
     }
+    /**
+     * Converts a number of audio samples to the corresponding number of model frames,
+     * computed as `(samples - config.offset) / config.step`.
+     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
+     * @param {number} samples The number of samples in the audio.
+     * @returns {number} The (possibly fractional) number of frames in the audio.
+     */
+    samples_to_frames(samples) {
+        return ((samples - this.config.offset) / this.config.step);
+    }
+    /**
+     * Post-processes the speaker diarization logits output by the model.
+     *
+     * For every item in the batch, each frame's logits are turned into probabilities with
+     * softmax and the highest-probability class is taken as that frame's speaker id.
+     * Consecutive frames with the same id are merged into one segment. Frame indices are
+     * then converted to time with `ratio`, the duration of one frame, computed as
+     * `(num_samples / samples_to_frames(num_samples)) / config.sampling_rate` (seconds,
+     * assuming `sampling_rate` is in samples per second). A segment's `confidence` is the
+     * mean of the winning probabilities over the frames it covers.
+     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
+     * Iterated as (batch, frames, classes) via `tolist()`.
+     * @param {number} num_samples Number of samples in the input audio.
+     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results:
+     * one array of segments per batch item, in chronological order.
+     */
+    post_process_speaker_diarization(logits, num_samples) {
+        const ratio = (
+            num_samples / this.samples_to_frames(num_samples)
+        ) / this.config.sampling_rate;
+        const results = [];
+        for (const scores of logits.tolist()) {
+            const accumulated_segments = [];
+            let current_speaker = -1; // sentinel: the first frame always opens a new segment
+            for (let i = 0; i < scores.length; ++i) {
+                const probabilities = softmax(scores[i]);
+                const [score, id] = max(probabilities); // destructured as [highest probability, its index]
+                const [start, end] = [i, i + 1]; // this frame covers [i, i + 1) in frame units
+                if (id !== current_speaker) {
+                    // Speaker has changed
+                    current_speaker = id;
+                    accumulated_segments.push({ id, start, end, score });
+                } else {
+                    // Continue the current segment: extend its end and accumulate the score for averaging
+                    accumulated_segments.at(-1).end = end;
+                    accumulated_segments.at(-1).score += score;
+                }
+            }
+            results.push(accumulated_segments.map(
+                // Convert frame-space to time-space
+                // and compute the confidence
+                ({ id, start, end, score }) => ({
+                    id,
+                    start: start * ratio,
+                    end: end * ratio,
+                    confidence: score / (end - start), // summed score divided by the number of frames in the segment
+                })
+            ));
+        }
+        return results;
+    }
 }