npm - @huggingface/transformers - Versions diffs - 3.3.3 → 3.4.1 - Mend

@huggingface/transformers 3.3.3 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

package/README.md +13 -3
package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/transformers.js +2778 -1592
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.js +1 -1
package/dist/transformers.min.js.map +1 -1
package/dist/{transformers.cjs → transformers.node.cjs} +1699 -2530
package/dist/transformers.node.cjs.map +1 -0
package/dist/transformers.node.min.cjs +2 -0
package/dist/transformers.node.min.cjs.map +1 -0
package/dist/transformers.node.min.mjs +2 -0
package/dist/transformers.node.min.mjs.map +1 -0
package/dist/{transformers.mjs → transformers.node.mjs} +1738 -2510
package/dist/transformers.node.mjs.map +1 -0
package/dist/transformers.web.js +35876 -0
package/dist/transformers.web.js.map +1 -0
package/dist/transformers.web.min.js +2 -0
package/dist/transformers.web.min.js.map +1 -0
package/package.json +6 -6
package/src/backends/onnx.js +14 -15
package/src/configs.js +6 -1
package/src/env.js +1 -1
package/src/generation/streamers.js +4 -3
package/src/models/dac/feature_extraction_dac.js +3 -0
package/src/models/encodec/feature_extraction_encodec.js +32 -0
package/src/models/feature_extractors.js +3 -0
package/src/models/idefics3/image_processing_idefics3.js +1 -1
package/src/models/image_processors.js +1 -0
package/src/models/processors.js +2 -0
package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
package/src/models/smolvlm/processing_smolvlm.js +2 -0
package/src/models/snac/feature_extraction_snac.js +3 -0
package/src/models/ultravox/processing_ultravox.js +54 -0
package/src/models/whisper/common_whisper.js +7 -1
package/src/models/whisper/feature_extraction_whisper.js +18 -10
package/src/models.js +546 -78
package/src/pipelines.js +246 -137
package/src/tokenizers.js +42 -28
package/src/transformers.js +1 -0
package/src/utils/audio.js +2 -0
package/src/utils/hub.js +140 -80
package/src/utils/image.js +9 -1
package/src/utils/maths.js +1 -1
package/src/utils/tensor.js +12 -5
package/src/utils/video.js +128 -0
package/types/backends/onnx.d.ts +2 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/configs.d.ts +1 -1
package/types/configs.d.ts.map +1 -1
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/dac/feature_extraction_dac.d.ts +4 -0
package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
package/types/models/feature_extractors.d.ts +3 -0
package/types/models/florence2/processing_florence2.d.ts +1 -1
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/image_processors.d.ts +1 -0
package/types/models/processors.d.ts +2 -0
package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
package/types/models/snac/feature_extraction_snac.d.ts +4 -0
package/types/models/snac/feature_extraction_snac.d.ts.map +1 -0
package/types/models/ultravox/processing_ultravox.d.ts +16 -0
package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
package/types/models/whisper/common_whisper.d.ts.map +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
package/types/models.d.ts +180 -4
package/types/models.d.ts.map +1 -1
package/types/pipelines.d.ts +51 -5
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts.map +1 -1
package/types/transformers.d.ts +1 -0
package/types/tsconfig.tsbuildinfo +1 -1
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/hub.d.ts +19 -7
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +2 -2
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +2 -2
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +17 -18
package/types/utils/tensor.d.ts.map +1 -1
package/types/utils/video.d.ts +37 -0
package/types/utils/video.d.ts.map +1 -0
package/dist/transformers.cjs.map +0 -1
package/dist/transformers.min.cjs +0 -2
package/dist/transformers.min.cjs.map +0 -1
package/dist/transformers.min.mjs +0 -2
package/dist/transformers.min.mjs.map +0 -1
package/dist/transformers.mjs.map +0 -1

package/src/utils/video.js ADDED Viewed

@@ -0,0 +1,128 @@
+import { RawImage } from "./image.js";
+import { apis } from "../env.js";
+export class RawVideoFrame {
+    /**
+     * Stores a single video frame together with the time it was sampled at.
+     *
+     * @param {RawImage} image The frame's image; kept as `this.image`.
+     * @param {number} timestamp The frame's position in the video; kept as `this.timestamp`.
+     * `load_video` passes the `currentTime` value it seeked to before capturing the frame.
+     */
+    constructor(image, timestamp) {
+        this.image = image;
+        this.timestamp = timestamp;
+    }
+}
+export class RawVideo {
+    /**
+     * Stores a list of frames together with the duration of the video they belong to.
+     *
+     * If the first element of `frames` is a `RawImage`, every element is wrapped in a
+     * `RawVideoFrame` whose timestamp is `(i + 1) / (frames.length + 1) * duration`,
+     * i.e. the frames are spread evenly over the duration without touching either endpoint.
+     * Only the first element is inspected, so the array must not mix the two types.
+     *
+     * @param {RawVideoFrame[]|RawImage[]} frames The frames of the video; kept as `this.frames`.
+     * @param {number} duration The duration of the video, in the same unit as the frame
+     * timestamps; kept as `this.duration`.
+     */
+    constructor(frames, duration) {
+        if (frames.length > 0 && frames[0] instanceof RawImage) {
+            // Bare images carry no timestamps, so assign evenly spaced ones.
+            frames = frames.map((image, i) => new RawVideoFrame(image, (i + 1) / (frames.length + 1) * duration));
+        }
+        this.frames = /** @type {RawVideoFrame[]} */ (frames);
+        this.duration = duration;
+    }
+    get width() {
+        return this.frames[0].image.width; // width of the first frame's image; throws a TypeError if `frames` is empty
+    }
+    get height() {
+        return this.frames[0].image.height; // height of the first frame's image; throws a TypeError if `frames` is empty
+    }
+    get fps() {
+        return this.frames.length / this.duration; // stored frames per unit of `duration`, i.e. the sampling rate of this object
+    }
+}
+/**
+ * Loads a video.
+ *
+ * @param {string|Blob|HTMLVideoElement} src The video to process.
+ * @param {Object} [options] Optional parameters.
+ * @param {number} [options.num_frames=null] The number of frames to sample uniformly.
+ * @param {number} [options.fps=null] The number of frames to sample per second.
+ *
+ * @returns {Promise<RawVideo>} The loaded video.
+ */
+export async function load_video(src, { num_frames = null, fps = null } = {}) {
+    if (!apis.IS_BROWSER_ENV) {
+        throw new Error("`load_video` is currently only supported in browser environments.");
+    }
+    // TODO: Support efficiently loading all frames using the WebCodecs API.
+    // Specfically, https://developer.mozilla.org/en-US/docs/Web/API/VideoDecoder
+    if (num_frames == null && fps == null) {
+        throw new Error("Either num_frames or fps must be provided.");
+    }
+    const frames = [];
+    const video = document.createElement("video");
+    video.crossOrigin = "anonymous";
+    video.muted = true; // mute to allow autoplay and seeking
+    if (typeof src === 'string') {
+        video.src = src;
+    } else if (src instanceof Blob) {
+        video.src = URL.createObjectURL(src);
+    } else if (src instanceof HTMLVideoElement) {
+        video.src = src.src;
+    } else {
+        throw new Error("Invalid URL or video element provided.");
+    }
+    // Wait for metadata to load to obtain duration
+    await new Promise((resolve) => video.onloadedmetadata = resolve);
+    if (video.seekable.start(0) === video.seekable.end(0)) {
+        // Fallback: Download entire video if not seekable
+        const response = await fetch(video.src);
+        const blob = await response.blob();
+        video.src = URL.createObjectURL(blob);
+        await new Promise((resolve) => video.onloadedmetadata = resolve);
+    }
+    const duration = video.duration;
+    let count, step;
+    if (num_frames != null) {
+        count = num_frames;
+        step = num_frames === 1 ? 0 : duration / (num_frames - 1);
+    } else {
+        step = 1 / fps;
+        count = Math.floor(duration / step);
+    }
+    // Build an array of sample times based on num_frames or fps
+    let sampleTimes = [];
+    for (let i = 0; i < count; ++i) {
+        sampleTimes.push(num_frames === 1 ? duration / 2 : i * step);
+    }
+    const canvas = document.createElement("canvas");
+    canvas.width = video.videoWidth;
+    canvas.height = video.videoHeight;
+    const ctx = canvas.getContext("2d", { willReadFrequently: true });
+    for (const t of sampleTimes) {
+        video.currentTime = t;
+        await new Promise((resolve) => {
+            video.onseeked = resolve;
+        });
+        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+        const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+        const frameData = new RawImage(imageData.data, canvas.width, canvas.height, 4);
+        const frame = new RawVideoFrame(frameData, t);
+        frames.push(frame);
+    }
+    // Clean up video element.
+    video.remove();
+    return new RawVideo(frames, duration);
+}

package/types/backends/onnx.d.ts CHANGED Viewed

@@ -6,12 +6,12 @@
 export function deviceToExecutionProviders(device?: import("../utils/devices.js").DeviceType | "auto" | null): ONNXExecutionProviders[];
 /**
  * Create an ONNX inference session.
- * @param {Uint8Array} buffer The ONNX model buffer.
+ * @param {Uint8Array|string} buffer_or_path The ONNX model buffer or path.
  * @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options ONNX inference session options.
  * @param {Object} session_config ONNX inference session configuration.
  * @returns {Promise<import('onnxruntime-common').InferenceSession & { config: Object}>} The ONNX inference session.
  */
-export function createInferenceSession(buffer: Uint8Array, session_options: import("onnxruntime-common").InferenceSession.SessionOptions, session_config: any): Promise<import("onnxruntime-common").InferenceSession & {
+export function createInferenceSession(buffer_or_path: Uint8Array | string, session_options: import("onnxruntime-common").InferenceSession.SessionOptions, session_config: any): Promise<import("onnxruntime-common").InferenceSession & {
     config: any;
 }>;
 /**

package/types/backends/onnx.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH~~,+CALW~~,UAAU,~~mBACV~~,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,cAAc,wBAE1D,OAAO,CAAC,OAAO,oBAAoB,EAAE,gBAAgB,GAAG;IAAE,MAAM,MAAQ;CAAC,CAAC,CActF;AAED;;;;GAIG;AACH,gCAHW,GAAG,GACD,OAAO,CAInB;~~AA+BD~~;;;GAGG;AACH,+BAFa,OAAO,CAKnB;;~~qCAnLY~~,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,uBAAuB"}
1	+ {"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH,uDALW,UAAU,GAAC,MAAM,mBACjB,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,cAAc,wBAE1D,OAAO,CAAC,OAAO,oBAAoB,EAAE,gBAAgB,GAAG;IAAE,MAAM,MAAQ;CAAC,CAAC,CActF;AAED;;;;GAIG;AACH,gCAHW,GAAG,GACD,OAAO,CAInB;AA8BD;;;GAGG;AACH,+BAFa,OAAO,CAKnB;;qCAlLY,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,uBAAuB"}

package/types/configs.d.ts CHANGED Viewed

@@ -83,6 +83,6 @@ export type TransformersJSConfig = {
     /**
      * Whether to load the model using the external data format (used for models >= 2GB in size).
      */
-    use_external_data_format?: boolean | Record<string, boolean>;
+    use_external_data_format?: import("./utils/hub.js").ExternalData | Record<string, import("./utils/hub.js").ExternalData>;
 };
 //# sourceMappingURL=configs.d.ts.map

package/types/configs.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"~~AAuPA~~;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;~~gCAzWY~~,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;~~qBAsWrC~~,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC"}
1	+ {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AA4PA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCA9WY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBA2WrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,gBAAgB,EAAE,YAAY,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,gBAAgB,EAAE,YAAY,CAAC"}

package/types/generation/streamers.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;~~IA6DtC~~;;;;OAIG;IACH,wBAHW,MAAM,cACN,OAAO,QASjB;CACJ;AAED;;;;;;;GAOG;AACH;IACI;;;;;;;;;;;;OAYG;IACH,uBAZW,OAAO,kBAAkB,EAAE,gBAAgB,gKAEnD;QAA0B,WAAW,GAA7B,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACS,cAAc,GAA/C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACW,YAAY,GAA7C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACK,WAAW,GAAtC,MAAY,IAAI;QACC,cAAc,GAA/B,MAAM;QACY,mBAAmB,GAArC,OAAO;QACU,aAAa;KACxC,EA4BA;IATG,wBAAgD;IAEhD,uBA3BgB,MAAM,KAAG,IAAI,CA2BO;IACpC,qBA3BgB,MAAM,KAAG,IAAI,CA2BG;IAChC,mBA3BmB,IAAI,CA2BO;IAE9B,uBAAoC;IAEpC,+BAAkC;CAiCzC"}
1	+ {"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;IA8DtC;;;;OAIG;IACH,wBAHW,MAAM,cACN,OAAO,QASjB;CACJ;AAED;;;;;;;GAOG;AACH;IACI;;;;;;;;;;;;OAYG;IACH,uBAZW,OAAO,kBAAkB,EAAE,gBAAgB,gKAEnD;QAA0B,WAAW,GAA7B,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACS,cAAc,GAA/C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACW,YAAY,GAA7C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACK,WAAW,GAAtC,MAAY,IAAI;QACC,cAAc,GAA/B,MAAM;QACY,mBAAmB,GAArC,OAAO;QACU,aAAa;KACxC,EA4BA;IATG,wBAAgD;IAEhD,uBA3BgB,MAAM,KAAG,IAAI,CA2BO;IACpC,qBA3BgB,MAAM,KAAG,IAAI,CA2BG;IAChC,mBA3BmB,IAAI,CA2BO;IAE9B,uBAAoC;IAEpC,+BAAkC;CAiCzC"}

package/types/models/dac/feature_extraction_dac.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export class DacFeatureExtractor extends EncodecFeatureExtractor {
+}
+import { EncodecFeatureExtractor } from '../encodec/feature_extraction_encodec.js';
+//# sourceMappingURL=feature_extraction_dac.d.ts.map

package/types/models/dac/feature_extraction_dac.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"feature_extraction_dac.d.ts","sourceRoot":"","sources":["../../../src/models/dac/feature_extraction_dac.js"],"names":[],"mappings":"AAEA;CAAoE;wCAF5B,0CAA0C"}

package/types/models/encodec/feature_extraction_encodec.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+export class EncodecFeatureExtractor extends FeatureExtractor {
+    /**
+     * Asynchronously extracts input values from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
+     */
+    _call(audio: Float32Array | Float64Array): Promise<{
+        input_values: Tensor;
+    }>;
+}
+import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+//# sourceMappingURL=feature_extraction_encodec.d.ts.map

package/types/models/encodec/feature_extraction_encodec.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"feature_extraction_encodec.d.ts","sourceRoot":"","sources":["../../../src/models/encodec/feature_extraction_encodec.js"],"names":[],"mappings":"AAIA;IACI;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;KAAE,CAAC,CAsB9C;CACJ;iCA/BuD,wCAAwC;uBACzE,uBAAuB"}

package/types/models/feature_extractors.d.ts CHANGED Viewed

@@ -1,8 +1,11 @@
 export * from "./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js";
+export * from "./encodec/feature_extraction_encodec.js";
 export * from "./clap/feature_extraction_clap.js";
+export * from "./dac/feature_extraction_dac.js";
 export * from "./moonshine/feature_extraction_moonshine.js";
 export * from "./pyannote/feature_extraction_pyannote.js";
 export * from "./seamless_m4t/feature_extraction_seamless_m4t.js";
+export * from "./snac/feature_extraction_snac.js";
 export * from "./speecht5/feature_extraction_speecht5.js";
 export * from "./wav2vec2/feature_extraction_wav2vec2.js";
 export * from "./wespeaker/feature_extraction_wespeaker.js";

package/types/models/florence2/processing_florence2.d.ts CHANGED Viewed

@@ -26,7 +26,7 @@ export class Florence2Processor extends Processor {
      * @param {[number, number]} image_size The size of the image. height x width.
      */
     post_process_generation(text: string, task: string, image_size: [number, number]): {
-        [x: string]: string | {
+        [task]: string | {
             [x: string]: any[];
             labels: any[];
         };

package/types/models/florence2/processing_florence2.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC~~;;;;;MAsC1B~~;IAID,0DAaC;CACJ;0BAlIyB,gCAAgC;8BAE5B,qBAAqB;mCADhB,kCAAkC"}
1	+ {"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC;QAqCd,CAAC,IAAI,CAAC;;;UAAc;MAChC;IAID,0DAaC;CACJ;0BAlIyB,gCAAgC;8BAE5B,qBAAqB;mCADhB,kCAAkC"}

package/types/models/image_processors.d.ts CHANGED Viewed

@@ -31,6 +31,7 @@ export * from "./rt_detr/image_processing_rt_detr.js";
 export * from "./sam/image_processing_sam.js";
 export * from "./segformer/image_processing_segformer.js";
 export * from "./siglip/image_processing_siglip.js";
+export * from "./smolvlm/image_processing_smolvlm.js";
 export * from "./swin2sr/image_processing_swin2sr.js";
 export * from "./vit/image_processing_vit.js";
 export * from "./vitmatte/image_processing_vitmatte.js";

package/types/models/processors.d.ts CHANGED Viewed

@@ -11,7 +11,9 @@ export * from "./paligemma/processing_paligemma.js";
 export * from "./pyannote/processing_pyannote.js";
 export * from "./qwen2_vl/processing_qwen2_vl.js";
 export * from "./sam/processing_sam.js";
+export * from "./smolvlm/processing_smolvlm.js";
 export * from "./speecht5/processing_speecht5.js";
+export * from "./ultravox/processing_ultravox.js";
 export * from "./wav2vec2/processing_wav2vec2.js";
 export * from "./wav2vec2_with_lm/processing_wav2vec2_with_lm.js";
 export * from "./whisper/processing_whisper.js";

package/types/models/smolvlm/image_processing_smolvlm.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export { Idefics3ImageProcessor as SmolVLMImageProcessor } from "../idefics3/image_processing_idefics3.js";
2	+ //# sourceMappingURL=image_processing_smolvlm.d.ts.map

package/types/models/smolvlm/image_processing_smolvlm.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"image_processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/image_processing_smolvlm.js"],"names":[],"mappings":""}

package/types/models/smolvlm/processing_smolvlm.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export { Idefics3Processor as SmolVLMProcessor } from "../idefics3/processing_idefics3.js";
2	+ //# sourceMappingURL=processing_smolvlm.d.ts.map

package/types/models/smolvlm/processing_smolvlm.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/processing_smolvlm.js"],"names":[],"mappings":""}

package/types/models/snac/feature_extraction_snac.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export class SnacFeatureExtractor extends DacFeatureExtractor {
+}
+import { DacFeatureExtractor } from '../dac/feature_extraction_dac.js';
+//# sourceMappingURL=feature_extraction_snac.d.ts.map

package/types/models/snac/feature_extraction_snac.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"feature_extraction_snac.d.ts","sourceRoot":"","sources":["../../../src/models/snac/feature_extraction_snac.js"],"names":[],"mappings":"AAEA;CAAiE;oCAF7B,kCAAkC"}

package/types/models/ultravox/processing_ultravox.d.ts ADDED Viewed

@@ -0,0 +1,16 @@
+/**
+ * Represents an UltravoxProcessor that extracts features from an audio input.
+ */
+export class UltravoxProcessor extends Processor {
+    static tokenizer_class: typeof AutoTokenizer;
+    static feature_extractor_class: typeof AutoFeatureExtractor;
+    /**
+     * @param {string} text The text input to process.
+     * @param {Float32Array} audio The audio input to process.
+     */
+    _call(text: string, audio?: Float32Array, kwargs?: {}): Promise<any>;
+}
+import { Processor } from "../../base/processing_utils.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
+//# sourceMappingURL=processing_ultravox.d.ts.map

package/types/models/ultravox/processing_ultravox.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"processing_ultravox.d.ts","sourceRoot":"","sources":["../../../src/models/ultravox/processing_ultravox.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACI,6CAAsC;IACtC,4DAAqD;IAGrD;;;OAGG;IACH,YAHW,MAAM,UACN,YAAY,6BAsCtB;CACJ;0BAnDyB,gCAAgC;8BAD5B,qBAAqB;qCADd,oCAAoC"}

package/types/models/whisper/common_whisper.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,~~CAwBlB~~;~~AA7CD~~,qDAAmE;AAEnE,6DAeG"}
1	+ {"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,CA8BlB;AAnDD,qDAAmE;AAEnE,6DAeG"}

package/types/models/whisper/feature_extraction_whisper.d.ts CHANGED Viewed

@@ -12,7 +12,9 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
      * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
      * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
      */
-    _call(audio: Float32Array | Float64Array): Promise<{
+    _call(audio: Float32Array | Float64Array, { max_length, }?: {
+        max_length?: any;
+    }): Promise<{
         input_features: Tensor;
     }>;
 }

package/types/models/whisper/feature_extraction_whisper.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,~~CA0B3B~~;IAED;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,~~GACvB,~~OAAO,CAAC;QAAE,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC,~~CAwB~~/C;CACJ;~~iCAnFuD~~,wCAAwC;uBACzE,uBAAuB"}
1	+ {"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,CA6B3B;IAED;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY;;QACvB,OAAO,CAAC;QAAE,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC,CA6B/C;CACJ;iCA3FuD,wCAAwC;uBACzE,uBAAuB"}

package/types/models.d.ts CHANGED Viewed

@@ -35,8 +35,8 @@ export class PreTrainedModel extends PreTrainedModel_base {
     sessions: Record<string, any>;
     configs: Record<string, any>;
     can_generate: boolean;
-    _forward: typeof decoderForward;
-    _prepare_inputs_for_generation: typeof image_text_to_text_prepare_inputs_for_generation;
+    _forward: typeof decoderForward | typeof autoEncoderForward;
+    _prepare_inputs_for_generation: typeof multimodal_text_to_text_prepare_inputs_for_generation;
     /** @type {import('./configs.js').TransformersJSConfig} */
     custom_config: import("./configs.js").TransformersJSConfig;
     /**
@@ -180,6 +180,9 @@ export class PreTrainedModel extends PreTrainedModel_base {
     encode_text({ input_ids }: {
         input_ids: any;
     }): Promise<any>;
+    encode_audio({ audio_values }: {
+        audio_values: any;
+    }): Promise<any>;
 }
 export class ModelOutput {
 }
@@ -1204,6 +1207,8 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
         sequences: Tensor;
     }, alignment_heads: number[][], num_frames?: number, time_precision?: number): Tensor;
 }
+export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {
+}
 export class MoonshinePreTrainedModel extends PreTrainedModel {
     requires_attention_mask: boolean;
 }
@@ -1283,7 +1288,7 @@ export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel
 export class Idefics3PreTrainedModel extends PreTrainedModel {
 }
 /**
- * The LLAVA model which consists of a vision backbone and a language model.
+ * The Idefics3 model which consists of a vision backbone and a language model.
  */
 export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
     encode_image({ pixel_values, pixel_attention_mask }: {
@@ -1295,6 +1300,12 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
         attention_mask: any;
     };
 }
+/**
+ * The SmolVLM Model with a language modeling head.
+ * It is made up of a SigLIP vision encoder, with a language modeling head on top.
+ */
+export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {
+}
 export class Phi3VPreTrainedModel extends PreTrainedModel {
 }
 export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
@@ -1741,6 +1752,18 @@ export class Gemma2Model extends Gemma2PreTrainedModel {
 }
 export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {
 }
+/**
+ * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
+ */
+export class Gemma3PreTrainedModel extends PreTrainedModel {
+}
+/**
+ * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
+ */
+export class Gemma3Model extends Gemma3PreTrainedModel {
+}
+export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {
+}
 export class OpenELMPreTrainedModel extends PreTrainedModel {
 }
 export class OpenELMModel extends OpenELMPreTrainedModel {
@@ -2173,6 +2196,8 @@ export class SwinForImageClassification extends SwinPreTrainedModel {
      */
     _call(model_inputs: any): Promise<SequenceClassifierOutput>;
 }
+export class SwinForSemanticSegmentation extends SwinPreTrainedModel {
+}
 export class Swin2SRPreTrainedModel extends PreTrainedModel {
 }
 /**
@@ -2283,6 +2308,14 @@ export class DepthProPreTrainedModel extends PreTrainedModel {
 }
 export class DepthProForDepthEstimation extends DepthProPreTrainedModel {
 }
+export class Metric3DPreTrainedModel extends PreTrainedModel {
+}
+export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {
+}
+export class Metric3Dv2PreTrainedModel extends PreTrainedModel {
+}
+export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {
+}
 export class MaskFormerPreTrainedModel extends PreTrainedModel {
 }
 export class MaskFormerModel extends MaskFormerPreTrainedModel {
@@ -3446,6 +3479,8 @@ export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedMode
      */
     _call(model_inputs: any): Promise<SequenceClassifierOutput>;
 }
+export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {
+}
 export class MobileNetV2PreTrainedModel extends PreTrainedModel {
 }
 /**
@@ -3463,6 +3498,8 @@ export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedMode
      */
     _call(model_inputs: any): Promise<SequenceClassifierOutput>;
 }
+export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {
+}
 export class MobileNetV3PreTrainedModel extends PreTrainedModel {
 }
 /**
@@ -3480,6 +3517,8 @@ export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedMode
      */
     _call(model_inputs: any): Promise<SequenceClassifierOutput>;
 }
+export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {
+}
 export class MobileNetV4PreTrainedModel extends PreTrainedModel {
 }
 /**
@@ -3497,6 +3536,8 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
      */
     _call(model_inputs: any): Promise<SequenceClassifierOutput>;
 }
+export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {
+}
 export class DecisionTransformerPreTrainedModel extends PreTrainedModel {
 }
 /**
@@ -3562,6 +3603,134 @@ export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {
  */
 export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {
 }
+export class UltravoxPreTrainedModel extends PreTrainedModel {
+}
+export class UltravoxModel extends UltravoxPreTrainedModel {
+    _merge_input_ids_with_audio_features(kwargs: any): {
+        inputs_embeds: any;
+        attention_mask: any;
+    };
+}
+export class MimiPreTrainedModel extends PreTrainedModel {
+}
+export class MimiEncoderOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
+     */
+    constructor({ audio_codes }: {
+        audio_codes: Tensor;
+    });
+    audio_codes: Tensor;
+}
+export class MimiDecoderOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
+     */
+    constructor({ audio_values }: {
+        audio_values: Tensor;
+    });
+    audio_values: Tensor;
+}
+/**
+ * The Mimi neural audio codec model.
+ */
+export class MimiModel extends MimiPreTrainedModel {
+    /**
+     * Encodes the input audio waveform into discrete codes.
+     * @param {Object} inputs Model inputs
+     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
+     * @returns {Promise<MimiEncoderOutput>} The encoder output, whose `audio_codes` tensor has shape `(batch_size, num_codebooks, sequence_length)`.
+     */
+    encode(inputs: {
+        input_values?: Tensor;
+    }): Promise<MimiEncoderOutput>;
+    /**
+     * Decodes the given frames into an output audio waveform.
+     * @param {MimiEncoderOutput} inputs The encoded audio codes.
+     * @returns {Promise<MimiDecoderOutput>} The decoder output, whose `audio_values` tensor has shape `(batch_size, num_channels, sequence_length)`.
+     */
+    decode(inputs: MimiEncoderOutput): Promise<MimiDecoderOutput>;
+}
+export class MimiEncoderModel extends MimiPreTrainedModel {
+}
+export class MimiDecoderModel extends MimiPreTrainedModel {
+}
+export class DacPreTrainedModel extends PreTrainedModel {
+}
+export class DacEncoderOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
+     */
+    constructor({ audio_codes }: {
+        audio_codes: Tensor;
+    });
+    audio_codes: Tensor;
+}
+export class DacDecoderOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
+     */
+    constructor({ audio_values }: {
+        audio_values: Tensor;
+    });
+    audio_values: Tensor;
+}
+/**
+ * The DAC (Descript Audio Codec) model.
+ */
+export class DacModel extends DacPreTrainedModel {
+    /**
+     * Encodes the input audio waveform into discrete codes.
+     * @param {Object} inputs Model inputs
+     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
+     * @returns {Promise<DacEncoderOutput>} The encoder output, whose `audio_codes` tensor has shape `(batch_size, num_codebooks, sequence_length)`.
+     */
+    encode(inputs: {
+        input_values?: Tensor;
+    }): Promise<DacEncoderOutput>;
+    /**
+     * Decodes the given frames into an output audio waveform.
+     * @param {DacEncoderOutput} inputs The encoded audio codes.
+     * @returns {Promise<DacDecoderOutput>} The decoder output, whose `audio_values` tensor has shape `(batch_size, num_channels, sequence_length)`.
+     */
+    decode(inputs: DacEncoderOutput): Promise<DacDecoderOutput>;
+}
+export class DacEncoderModel extends DacPreTrainedModel {
+}
+export class DacDecoderModel extends DacPreTrainedModel {
+}
+export class SnacPreTrainedModel extends PreTrainedModel {
+}
+/**
+ * The SNAC (Multi-Scale Neural Audio Codec) model.
+ */
+export class SnacModel extends SnacPreTrainedModel {
+    /**
+     * Encodes the input audio waveform into discrete codes.
+     * @param {Object} inputs Model inputs
+     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
+     * @returns {Promise<Record<string, Tensor>>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`.
+     */
+    encode(inputs: {
+        input_values?: Tensor;
+    }): Promise<Record<string, Tensor>>;
+    /**
+     * Decodes the given frames into an output audio waveform.
+     * @param {Record<string, Tensor>} inputs The encoded audio codes.
+     * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
+     */
+    decode(inputs: Record<string, Tensor>): Promise<{
+        audio_values: Tensor;
+    }>;
+}
+export class SnacEncoderModel extends SnacPreTrainedModel {
+}
+export class SnacDecoderModel extends SnacPreTrainedModel {
+}
 /**
  * Base class of all AutoModels. Contains the `from_pretrained` function
  * which is used to instantiate pretrained models.
@@ -3799,6 +3968,12 @@ export class AutoModelForPoseEstimation extends PretrainedMixin {
 export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof CLIPVisionModelWithProjection)[]>[];
 }
+export class AutoModelForImageTextToText extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof Idefics3ForConditionalGeneration)[] | (string | typeof Florence2ForConditionalGeneration)[]>[];
+}
+export class AutoModelForAudioTextToText extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof UltravoxModel)[]>[];
+}
 export class Seq2SeqLMOutput extends ModelOutput {
     /**
      * @param {Object} output The output of the model.
@@ -3961,7 +4136,8 @@ export class VitsModelOutput extends ModelOutput {
  * @private
  */
 declare function decoderForward(self: any, model_inputs: any, is_encoder_decoder?: boolean): Promise<any>;
-declare function image_text_to_text_prepare_inputs_for_generation(self: any, ...args: any[]): any;
+declare function autoEncoderForward(self: any, model_inputs: any): Promise<any>;
+declare function multimodal_text_to_text_prepare_inputs_for_generation(self: any, ...args: any[]): any;
 import { GenerationConfig } from './generation/configuration_utils.js';
 import { LogitsProcessorList } from './generation/logits_process.js';
 import { StoppingCriteriaList } from './generation/stopping_criteria.js';