@huggingface/transformers 3.3.3 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +13 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
  3. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  4. package/dist/transformers.js +2778 -1592
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.js +1 -1
  7. package/dist/transformers.min.js.map +1 -1
  8. package/dist/{transformers.cjs → transformers.node.cjs} +1699 -2530
  9. package/dist/transformers.node.cjs.map +1 -0
  10. package/dist/transformers.node.min.cjs +2 -0
  11. package/dist/transformers.node.min.cjs.map +1 -0
  12. package/dist/transformers.node.min.mjs +2 -0
  13. package/dist/transformers.node.min.mjs.map +1 -0
  14. package/dist/{transformers.mjs → transformers.node.mjs} +1738 -2510
  15. package/dist/transformers.node.mjs.map +1 -0
  16. package/dist/transformers.web.js +35876 -0
  17. package/dist/transformers.web.js.map +1 -0
  18. package/dist/transformers.web.min.js +2 -0
  19. package/dist/transformers.web.min.js.map +1 -0
  20. package/package.json +6 -6
  21. package/src/backends/onnx.js +14 -15
  22. package/src/configs.js +6 -1
  23. package/src/env.js +1 -1
  24. package/src/generation/streamers.js +4 -3
  25. package/src/models/dac/feature_extraction_dac.js +3 -0
  26. package/src/models/encodec/feature_extraction_encodec.js +32 -0
  27. package/src/models/feature_extractors.js +3 -0
  28. package/src/models/idefics3/image_processing_idefics3.js +1 -1
  29. package/src/models/image_processors.js +1 -0
  30. package/src/models/processors.js +2 -0
  31. package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
  32. package/src/models/smolvlm/processing_smolvlm.js +2 -0
  33. package/src/models/snac/feature_extraction_snac.js +3 -0
  34. package/src/models/ultravox/processing_ultravox.js +54 -0
  35. package/src/models/whisper/common_whisper.js +7 -1
  36. package/src/models/whisper/feature_extraction_whisper.js +18 -10
  37. package/src/models.js +546 -78
  38. package/src/pipelines.js +246 -137
  39. package/src/tokenizers.js +42 -28
  40. package/src/transformers.js +1 -0
  41. package/src/utils/audio.js +2 -0
  42. package/src/utils/hub.js +140 -80
  43. package/src/utils/image.js +9 -1
  44. package/src/utils/maths.js +1 -1
  45. package/src/utils/tensor.js +12 -5
  46. package/src/utils/video.js +128 -0
  47. package/types/backends/onnx.d.ts +2 -2
  48. package/types/backends/onnx.d.ts.map +1 -1
  49. package/types/configs.d.ts +1 -1
  50. package/types/configs.d.ts.map +1 -1
  51. package/types/generation/streamers.d.ts.map +1 -1
  52. package/types/models/dac/feature_extraction_dac.d.ts +4 -0
  53. package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
  54. package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
  55. package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
  56. package/types/models/feature_extractors.d.ts +3 -0
  57. package/types/models/florence2/processing_florence2.d.ts +1 -1
  58. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  59. package/types/models/image_processors.d.ts +1 -0
  60. package/types/models/processors.d.ts +2 -0
  61. package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
  62. package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
  63. package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
  64. package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
  65. package/types/models/snac/feature_extraction_snac.d.ts +4 -0
  66. package/types/models/snac/feature_extraction_snac.d.ts.map +1 -0
  67. package/types/models/ultravox/processing_ultravox.d.ts +16 -0
  68. package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
  69. package/types/models/whisper/common_whisper.d.ts.map +1 -1
  70. package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
  71. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  72. package/types/models.d.ts +180 -4
  73. package/types/models.d.ts.map +1 -1
  74. package/types/pipelines.d.ts +51 -5
  75. package/types/pipelines.d.ts.map +1 -1
  76. package/types/tokenizers.d.ts.map +1 -1
  77. package/types/transformers.d.ts +1 -0
  78. package/types/tsconfig.tsbuildinfo +1 -1
  79. package/types/utils/audio.d.ts.map +1 -1
  80. package/types/utils/hub.d.ts +19 -7
  81. package/types/utils/hub.d.ts.map +1 -1
  82. package/types/utils/image.d.ts +2 -2
  83. package/types/utils/image.d.ts.map +1 -1
  84. package/types/utils/maths.d.ts +2 -2
  85. package/types/utils/maths.d.ts.map +1 -1
  86. package/types/utils/tensor.d.ts +17 -18
  87. package/types/utils/tensor.d.ts.map +1 -1
  88. package/types/utils/video.d.ts +37 -0
  89. package/types/utils/video.d.ts.map +1 -0
  90. package/dist/transformers.cjs.map +0 -1
  91. package/dist/transformers.min.cjs +0 -2
  92. package/dist/transformers.min.cjs.map +0 -1
  93. package/dist/transformers.min.mjs +0 -2
  94. package/dist/transformers.min.mjs.map +0 -1
  95. package/dist/transformers.mjs.map +0 -1
@@ -0,0 +1,128 @@
1
+ import { RawImage } from "./image.js";
2
+ import { apis } from "../env.js";
3
+
4
+ export class RawVideoFrame {
5
+
6
+ /**
7
+ * @param {RawImage} image
8
+ * @param {number} timestamp
9
+ */
10
+ constructor(image, timestamp) {
11
+ this.image = image;
12
+ this.timestamp = timestamp;
13
+ }
14
+ }
15
+
16
+ export class RawVideo {
17
+ /**
18
+ * @param {RawVideoFrame[]|RawImage[]} frames
19
+ * @param {number} duration
20
+ */
21
+ constructor(frames, duration) {
22
+ if (frames.length > 0 && frames[0] instanceof RawImage) {
23
+ // Assume uniform timestamps
24
+ frames = frames.map((image, i) => new RawVideoFrame(image, (i + 1) / (frames.length + 1) * duration));
25
+ }
26
+ this.frames = /** @type {RawVideoFrame[]} */ (frames);
27
+ this.duration = duration;
28
+ }
29
+
30
+ get width() {
31
+ return this.frames[0].image.width;
32
+ }
33
+ get height() {
34
+ return this.frames[0].image.height;
35
+ }
36
+
37
+ get fps() {
38
+ return this.frames.length / this.duration;
39
+ }
40
+ }
41
+
42
+
43
+ /**
44
+ * Loads a video.
45
+ *
46
+ * @param {string|Blob|HTMLVideoElement} src The video to process.
47
+ * @param {Object} [options] Optional parameters.
48
+ * @param {number} [options.num_frames=null] The number of frames to sample uniformly.
49
+ * @param {number} [options.fps=null] The number of frames to sample per second.
50
+ *
51
+ * @returns {Promise<RawVideo>} The loaded video.
52
+ */
53
+ export async function load_video(src, { num_frames = null, fps = null } = {}) {
54
+ if (!apis.IS_BROWSER_ENV) {
55
+ throw new Error("`load_video` is currently only supported in browser environments.");
56
+ }
57
+
58
+ // TODO: Support efficiently loading all frames using the WebCodecs API.
59
+ // Specifically, https://developer.mozilla.org/en-US/docs/Web/API/VideoDecoder
60
+ if (num_frames == null && fps == null) {
61
+ throw new Error("Either num_frames or fps must be provided.");
62
+ }
63
+
64
+ const frames = [];
65
+
66
+ const video = document.createElement("video");
67
+ video.crossOrigin = "anonymous";
68
+ video.muted = true; // mute to allow autoplay and seeking
69
+
70
+ if (typeof src === 'string') {
71
+ video.src = src;
72
+ } else if (src instanceof Blob) {
73
+ video.src = URL.createObjectURL(src);
74
+ } else if (src instanceof HTMLVideoElement) {
75
+ video.src = src.src;
76
+ } else {
77
+ throw new Error("Invalid URL or video element provided.");
78
+ }
79
+ // Wait for metadata to load to obtain duration
80
+ await new Promise((resolve) => video.onloadedmetadata = resolve);
81
+
82
+ if (video.seekable.start(0) === video.seekable.end(0)) {
83
+ // Fallback: Download entire video if not seekable
84
+ const response = await fetch(video.src);
85
+ const blob = await response.blob();
86
+ video.src = URL.createObjectURL(blob);
87
+ await new Promise((resolve) => video.onloadedmetadata = resolve);
88
+ }
89
+
90
+ const duration = video.duration;
91
+
92
+ let count, step;
93
+ if (num_frames != null) {
94
+ count = num_frames;
95
+ step = num_frames === 1 ? 0 : duration / (num_frames - 1);
96
+ } else {
97
+ step = 1 / fps;
98
+ count = Math.floor(duration / step);
99
+ }
100
+
101
+ // Build an array of sample times based on num_frames or fps
102
+ let sampleTimes = [];
103
+ for (let i = 0; i < count; ++i) {
104
+ sampleTimes.push(num_frames === 1 ? duration / 2 : i * step);
105
+ }
106
+
107
+ const canvas = document.createElement("canvas");
108
+ canvas.width = video.videoWidth;
109
+ canvas.height = video.videoHeight;
110
+ const ctx = canvas.getContext("2d", { willReadFrequently: true });
111
+ for (const t of sampleTimes) {
112
+ video.currentTime = t;
113
+ await new Promise((resolve) => {
114
+ video.onseeked = resolve;
115
+ });
116
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
117
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
118
+ const frameData = new RawImage(imageData.data, canvas.width, canvas.height, 4);
119
+
120
+ const frame = new RawVideoFrame(frameData, t);
121
+ frames.push(frame);
122
+ }
123
+
124
+ // Clean up video element.
125
+ video.remove();
126
+
127
+ return new RawVideo(frames, duration);
128
+ }
@@ -6,12 +6,12 @@
6
6
  export function deviceToExecutionProviders(device?: import("../utils/devices.js").DeviceType | "auto" | null): ONNXExecutionProviders[];
7
7
  /**
8
8
  * Create an ONNX inference session.
9
- * @param {Uint8Array} buffer The ONNX model buffer.
9
+ * @param {Uint8Array|string} buffer_or_path The ONNX model buffer or path.
10
10
  * @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options ONNX inference session options.
11
11
  * @param {Object} session_config ONNX inference session configuration.
12
12
  * @returns {Promise<import('onnxruntime-common').InferenceSession & { config: Object}>} The ONNX inference session.
13
13
  */
14
- export function createInferenceSession(buffer: Uint8Array, session_options: import("onnxruntime-common").InferenceSession.SessionOptions, session_config: any): Promise<import("onnxruntime-common").InferenceSession & {
14
+ export function createInferenceSession(buffer_or_path: Uint8Array | string, session_options: import("onnxruntime-common").InferenceSession.SessionOptions, session_config: any): Promise<import("onnxruntime-common").InferenceSession & {
15
15
  config: any;
16
16
  }>;
17
17
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH,+CALW,UAAU,mBACV,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,cAAc,wBAE1D,OAAO,CAAC,OAAO,oBAAoB,EAAE,gBAAgB,GAAG;IAAE,MAAM,MAAQ;CAAC,CAAC,CActF;AAED;;;;GAIG;AACH,gCAHW,GAAG,GACD,OAAO,CAInB;AA+BD;;;GAGG;AACH,+BAFa,OAAO,CAKnB;;qCAnLY,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,uBAAuB"}
1
+ {"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH,uDALW,UAAU,GAAC,MAAM,mBACjB,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,cAAc,wBAE1D,OAAO,CAAC,OAAO,oBAAoB,EAAE,gBAAgB,GAAG;IAAE,MAAM,MAAQ;CAAC,CAAC,CActF;AAED;;;;GAIG;AACH,gCAHW,GAAG,GACD,OAAO,CAInB;AA8BD;;;GAGG;AACH,+BAFa,OAAO,CAKnB;;qCAlLY,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,uBAAuB"}
@@ -83,6 +83,6 @@ export type TransformersJSConfig = {
83
83
  /**
84
84
  * Whether to load the model using the external data format (used for models >= 2GB in size).
85
85
  */
86
- use_external_data_format?: boolean | Record<string, boolean>;
86
+ use_external_data_format?: import("./utils/hub.js").ExternalData | Record<string, import("./utils/hub.js").ExternalData>;
87
87
  };
88
88
  //# sourceMappingURL=configs.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AAuPA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCAzWY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBAsWrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC"}
1
+ {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AA4PA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCA9WY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBA2WrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,gBAAgB,EAAE,YAAY,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,gBAAgB,EAAE,YAAY,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;IA6DtC;;;;OAIG;IACH,wBAHW,MAAM,cACN,OAAO,QASjB;CACJ;AAED;;;;;;;GAOG;AACH;IACI;;;;;;;;;;;;OAYG;IACH,uBAZW,OAAO,kBAAkB,EAAE,gBAAgB,gKAEnD;QAA0B,WAAW,GAA7B,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACS,cAAc,GAA/C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACW,YAAY,GAA7C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACK,WAAW,GAAtC,MAAY,IAAI;QACC,cAAc,GAA/B,MAAM;QACY,mBAAmB,GAArC,OAAO;QACU,aAAa;KACxC,EA4BA;IATG,wBAAgD;IAEhD,uBA3BgB,MAAM,KAAG,IAAI,CA2BO;IACpC,qBA3BgB,MAAM,KAAG,IAAI,CA2BG;IAChC,mBA3BmB,IAAI,CA2BO;IAE9B,uBAAoC;IAEpC,+BAAkC;CAiCzC"}
1
+ {"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;IA8DtC;;;;OAIG;IACH,wBAHW,MAAM,cACN,OAAO,QASjB;CACJ;AAED;;;;;;;GAOG;AACH;IACI;;;;;;;;;;;;OAYG;IACH,uBAZW,OAAO,kBAAkB,EAAE,gBAAgB,gKAEnD;QAA0B,WAAW,GAA7B,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACS,cAAc,GAA/C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACW,YAAY,GAA7C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACK,WAAW,GAAtC,MAAY,IAAI;QACC,cAAc,GAA/B,MAAM;QACY,mBAAmB,GAArC,OAAO;QACU,aAAa;KACxC,EA4BA;IATG,wBAAgD;IAEhD,uBA3BgB,MAAM,KAAG,IAAI,CA2BO;IACpC,qBA3BgB,MAAM,KAAG,IAAI,CA2BG;IAChC,mBA3BmB,IAAI,CA2BO;IAE9B,uBAAoC;IAEpC,+BAAkC;CAiCzC"}
@@ -0,0 +1,4 @@
1
+ export class DacFeatureExtractor extends EncodecFeatureExtractor {
2
+ }
3
+ import { EncodecFeatureExtractor } from '../encodec/feature_extraction_encodec.js';
4
+ //# sourceMappingURL=feature_extraction_dac.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"feature_extraction_dac.d.ts","sourceRoot":"","sources":["../../../src/models/dac/feature_extraction_dac.js"],"names":[],"mappings":"AAEA;CAAoE;wCAF5B,0CAA0C"}
@@ -0,0 +1,13 @@
1
+ export class EncodecFeatureExtractor extends FeatureExtractor {
2
+ /**
3
+ * Asynchronously extracts input values from a given audio using the provided configuration.
4
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
5
+ * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
6
+ */
7
+ _call(audio: Float32Array | Float64Array): Promise<{
8
+ input_values: Tensor;
9
+ }>;
10
+ }
11
+ import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
12
+ import { Tensor } from '../../utils/tensor.js';
13
+ //# sourceMappingURL=feature_extraction_encodec.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"feature_extraction_encodec.d.ts","sourceRoot":"","sources":["../../../src/models/encodec/feature_extraction_encodec.js"],"names":[],"mappings":"AAIA;IACI;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;KAAE,CAAC,CAsB9C;CACJ;iCA/BuD,wCAAwC;uBACzE,uBAAuB"}
@@ -1,8 +1,11 @@
1
1
  export * from "./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js";
2
+ export * from "./encodec/feature_extraction_encodec.js";
2
3
  export * from "./clap/feature_extraction_clap.js";
4
+ export * from "./dac/feature_extraction_dac.js";
3
5
  export * from "./moonshine/feature_extraction_moonshine.js";
4
6
  export * from "./pyannote/feature_extraction_pyannote.js";
5
7
  export * from "./seamless_m4t/feature_extraction_seamless_m4t.js";
8
+ export * from "./snac/feature_extraction_snac.js";
6
9
  export * from "./speecht5/feature_extraction_speecht5.js";
7
10
  export * from "./wav2vec2/feature_extraction_wav2vec2.js";
8
11
  export * from "./wespeaker/feature_extraction_wespeaker.js";
@@ -26,7 +26,7 @@ export class Florence2Processor extends Processor {
26
26
  * @param {[number, number]} image_size The size of the image. height x width.
27
27
  */
28
28
  post_process_generation(text: string, task: string, image_size: [number, number]): {
29
- [x: string]: string | {
29
+ [task]: string | {
30
30
  [x: string]: any[];
31
31
  labels: any[];
32
32
  };
@@ -1 +1 @@
1
- {"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC;;;;;MAsC1B;IAID,0DAaC;CACJ;0BAlIyB,gCAAgC;8BAE5B,qBAAqB;mCADhB,kCAAkC"}
1
+ {"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC;QAqCd,CAAC,IAAI,CAAC;;;UAAc;MAChC;IAID,0DAaC;CACJ;0BAlIyB,gCAAgC;8BAE5B,qBAAqB;mCADhB,kCAAkC"}
@@ -31,6 +31,7 @@ export * from "./rt_detr/image_processing_rt_detr.js";
31
31
  export * from "./sam/image_processing_sam.js";
32
32
  export * from "./segformer/image_processing_segformer.js";
33
33
  export * from "./siglip/image_processing_siglip.js";
34
+ export * from "./smolvlm/image_processing_smolvlm.js";
34
35
  export * from "./swin2sr/image_processing_swin2sr.js";
35
36
  export * from "./vit/image_processing_vit.js";
36
37
  export * from "./vitmatte/image_processing_vitmatte.js";
@@ -11,7 +11,9 @@ export * from "./paligemma/processing_paligemma.js";
11
11
  export * from "./pyannote/processing_pyannote.js";
12
12
  export * from "./qwen2_vl/processing_qwen2_vl.js";
13
13
  export * from "./sam/processing_sam.js";
14
+ export * from "./smolvlm/processing_smolvlm.js";
14
15
  export * from "./speecht5/processing_speecht5.js";
16
+ export * from "./ultravox/processing_ultravox.js";
15
17
  export * from "./wav2vec2/processing_wav2vec2.js";
16
18
  export * from "./wav2vec2_with_lm/processing_wav2vec2_with_lm.js";
17
19
  export * from "./whisper/processing_whisper.js";
@@ -0,0 +1,2 @@
1
+ export { Idefics3ImageProcessor as SmolVLMImageProcessor } from "../idefics3/image_processing_idefics3.js";
2
+ //# sourceMappingURL=image_processing_smolvlm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"image_processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/image_processing_smolvlm.js"],"names":[],"mappings":""}
@@ -0,0 +1,2 @@
1
+ export { Idefics3Processor as SmolVLMProcessor } from "../idefics3/processing_idefics3.js";
2
+ //# sourceMappingURL=processing_smolvlm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/processing_smolvlm.js"],"names":[],"mappings":""}
@@ -0,0 +1,4 @@
1
+ export class SnacFeatureExtractor extends DacFeatureExtractor {
2
+ }
3
+ import { DacFeatureExtractor } from '../dac/feature_extraction_dac.js';
4
+ //# sourceMappingURL=feature_extraction_snac.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"feature_extraction_snac.d.ts","sourceRoot":"","sources":["../../../src/models/snac/feature_extraction_snac.js"],"names":[],"mappings":"AAEA;CAAiE;oCAF7B,kCAAkC"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Represents an UltravoxProcessor that extracts features from an audio input.
3
+ */
4
+ export class UltravoxProcessor extends Processor {
5
+ static tokenizer_class: typeof AutoTokenizer;
6
+ static feature_extractor_class: typeof AutoFeatureExtractor;
7
+ /**
8
+ * @param {string} text The text input to process.
9
+ * @param {Float32Array} audio The audio input to process.
10
+ */
11
+ _call(text: string, audio?: Float32Array, kwargs?: {}): Promise<any>;
12
+ }
13
+ import { Processor } from "../../base/processing_utils.js";
14
+ import { AutoTokenizer } from "../../tokenizers.js";
15
+ import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
16
+ //# sourceMappingURL=processing_ultravox.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processing_ultravox.d.ts","sourceRoot":"","sources":["../../../src/models/ultravox/processing_ultravox.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACI,6CAAsC;IACtC,4DAAqD;IAGrD;;;OAGG;IACH,YAHW,MAAM,UACN,YAAY,6BAsCtB;CACJ;0BAnDyB,gCAAgC;8BAD5B,qBAAqB;qCADd,oCAAoC"}
@@ -1 +1 @@
1
- {"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,CAwBlB;AA7CD,qDAAmE;AAEnE,6DAeG"}
1
+ {"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,CA8BlB;AAnDD,qDAAmE;AAEnE,6DAeG"}
@@ -12,7 +12,9 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
12
12
  * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
13
13
  * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
14
14
  */
15
- _call(audio: Float32Array | Float64Array): Promise<{
15
+ _call(audio: Float32Array | Float64Array, { max_length, }?: {
16
+ max_length?: any;
17
+ }): Promise<{
16
18
  input_features: Tensor;
17
19
  }>;
18
20
  }
@@ -1 +1 @@
1
- {"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,CA0B3B;IAED;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC,CAwB/C;CACJ;iCAnFuD,wCAAwC;uBACzE,uBAAuB"}
1
+ {"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,CA6B3B;IAED;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY;;QACvB,OAAO,CAAC;QAAE,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC,CA6B/C;CACJ;iCA3FuD,wCAAwC;uBACzE,uBAAuB"}
package/types/models.d.ts CHANGED
@@ -35,8 +35,8 @@ export class PreTrainedModel extends PreTrainedModel_base {
35
35
  sessions: Record<string, any>;
36
36
  configs: Record<string, any>;
37
37
  can_generate: boolean;
38
- _forward: typeof decoderForward;
39
- _prepare_inputs_for_generation: typeof image_text_to_text_prepare_inputs_for_generation;
38
+ _forward: typeof decoderForward | typeof autoEncoderForward;
39
+ _prepare_inputs_for_generation: typeof multimodal_text_to_text_prepare_inputs_for_generation;
40
40
  /** @type {import('./configs.js').TransformersJSConfig} */
41
41
  custom_config: import("./configs.js").TransformersJSConfig;
42
42
  /**
@@ -180,6 +180,9 @@ export class PreTrainedModel extends PreTrainedModel_base {
180
180
  encode_text({ input_ids }: {
181
181
  input_ids: any;
182
182
  }): Promise<any>;
183
+ encode_audio({ audio_values }: {
184
+ audio_values: any;
185
+ }): Promise<any>;
183
186
  }
184
187
  export class ModelOutput {
185
188
  }
@@ -1204,6 +1207,8 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
1204
1207
  sequences: Tensor;
1205
1208
  }, alignment_heads: number[][], num_frames?: number, time_precision?: number): Tensor;
1206
1209
  }
1210
+ export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {
1211
+ }
1207
1212
  export class MoonshinePreTrainedModel extends PreTrainedModel {
1208
1213
  requires_attention_mask: boolean;
1209
1214
  }
@@ -1283,7 +1288,7 @@ export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel
1283
1288
  export class Idefics3PreTrainedModel extends PreTrainedModel {
1284
1289
  }
1285
1290
  /**
1286
- * The LLAVA model which consists of a vision backbone and a language model.
1291
+ * The Idefics3 model which consists of a vision backbone and a language model.
1287
1292
  */
1288
1293
  export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
1289
1294
  encode_image({ pixel_values, pixel_attention_mask }: {
@@ -1295,6 +1300,12 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
1295
1300
  attention_mask: any;
1296
1301
  };
1297
1302
  }
1303
+ /**
1304
+ * The SmolVLM Model with a language modeling head.
1305
+ * It is made up of a SigLIP vision encoder, with a language modeling head on top.
1306
+ */
1307
+ export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {
1308
+ }
1298
1309
  export class Phi3VPreTrainedModel extends PreTrainedModel {
1299
1310
  }
1300
1311
  export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
@@ -1741,6 +1752,18 @@ export class Gemma2Model extends Gemma2PreTrainedModel {
1741
1752
  }
1742
1753
  export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {
1743
1754
  }
1755
+ /**
1756
+ * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
1757
+ */
1758
+ export class Gemma3PreTrainedModel extends PreTrainedModel {
1759
+ }
1760
+ /**
1761
+ * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
1762
+ */
1763
+ export class Gemma3Model extends Gemma3PreTrainedModel {
1764
+ }
1765
+ export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {
1766
+ }
1744
1767
  export class OpenELMPreTrainedModel extends PreTrainedModel {
1745
1768
  }
1746
1769
  export class OpenELMModel extends OpenELMPreTrainedModel {
@@ -2173,6 +2196,8 @@ export class SwinForImageClassification extends SwinPreTrainedModel {
2173
2196
  */
2174
2197
  _call(model_inputs: any): Promise<SequenceClassifierOutput>;
2175
2198
  }
2199
+ export class SwinForSemanticSegmentation extends SwinPreTrainedModel {
2200
+ }
2176
2201
  export class Swin2SRPreTrainedModel extends PreTrainedModel {
2177
2202
  }
2178
2203
  /**
@@ -2283,6 +2308,14 @@ export class DepthProPreTrainedModel extends PreTrainedModel {
2283
2308
  }
2284
2309
  export class DepthProForDepthEstimation extends DepthProPreTrainedModel {
2285
2310
  }
2311
+ export class Metric3DPreTrainedModel extends PreTrainedModel {
2312
+ }
2313
+ export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {
2314
+ }
2315
+ export class Metric3Dv2PreTrainedModel extends PreTrainedModel {
2316
+ }
2317
+ export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {
2318
+ }
2286
2319
  export class MaskFormerPreTrainedModel extends PreTrainedModel {
2287
2320
  }
2288
2321
  export class MaskFormerModel extends MaskFormerPreTrainedModel {
@@ -3446,6 +3479,8 @@ export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedMode
3446
3479
  */
3447
3480
  _call(model_inputs: any): Promise<SequenceClassifierOutput>;
3448
3481
  }
3482
+ export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {
3483
+ }
3449
3484
  export class MobileNetV2PreTrainedModel extends PreTrainedModel {
3450
3485
  }
3451
3486
  /**
@@ -3463,6 +3498,8 @@ export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedMode
3463
3498
  */
3464
3499
  _call(model_inputs: any): Promise<SequenceClassifierOutput>;
3465
3500
  }
3501
+ export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {
3502
+ }
3466
3503
  export class MobileNetV3PreTrainedModel extends PreTrainedModel {
3467
3504
  }
3468
3505
  /**
@@ -3480,6 +3517,8 @@ export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedMode
3480
3517
  */
3481
3518
  _call(model_inputs: any): Promise<SequenceClassifierOutput>;
3482
3519
  }
3520
+ export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {
3521
+ }
3483
3522
  export class MobileNetV4PreTrainedModel extends PreTrainedModel {
3484
3523
  }
3485
3524
  /**
@@ -3497,6 +3536,8 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
3497
3536
  */
3498
3537
  _call(model_inputs: any): Promise<SequenceClassifierOutput>;
3499
3538
  }
3539
+ export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {
3540
+ }
3500
3541
  export class DecisionTransformerPreTrainedModel extends PreTrainedModel {
3501
3542
  }
3502
3543
  /**
@@ -3562,6 +3603,134 @@ export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {
3562
3603
  */
3563
3604
  export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {
3564
3605
  }
3606
+ export class UltravoxPreTrainedModel extends PreTrainedModel {
3607
+ }
3608
+ export class UltravoxModel extends UltravoxPreTrainedModel {
3609
+ _merge_input_ids_with_audio_features(kwargs: any): {
3610
+ inputs_embeds: any;
3611
+ attention_mask: any;
3612
+ };
3613
+ }
3614
+ export class MimiPreTrainedModel extends PreTrainedModel {
3615
+ }
3616
+ export class MimiEncoderOutput extends ModelOutput {
3617
+ /**
3618
+ * @param {Object} output The output of the model.
3619
+ * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
3620
+ */
3621
+ constructor({ audio_codes }: {
3622
+ audio_codes: Tensor;
3623
+ });
3624
+ audio_codes: Tensor;
3625
+ }
3626
+ export class MimiDecoderOutput extends ModelOutput {
3627
+ /**
3628
+ * @param {Object} output The output of the model.
3629
+ * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
3630
+ */
3631
+ constructor({ audio_values }: {
3632
+ audio_values: Tensor;
3633
+ });
3634
+ audio_values: Tensor;
3635
+ }
3636
+ /**
3637
+ * The Mimi neural audio codec model.
3638
+ */
3639
+ export class MimiModel extends MimiPreTrainedModel {
3640
+ /**
3641
+ * Encodes the input audio waveform into discrete codes.
3642
+ * @param {Object} inputs Model inputs
3643
+ * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
3644
+ * @returns {Promise<MimiEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
3645
+ */
3646
+ encode(inputs: {
3647
+ input_values?: Tensor;
3648
+ }): Promise<MimiEncoderOutput>;
3649
+ /**
3650
+ * Decodes the given frames into an output audio waveform.
3651
+ * @param {MimiEncoderOutput} inputs The encoded audio codes.
3652
+ * @returns {Promise<MimiDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
3653
+ */
3654
+ decode(inputs: MimiEncoderOutput): Promise<MimiDecoderOutput>;
3655
+ }
3656
+ export class MimiEncoderModel extends MimiPreTrainedModel {
3657
+ }
3658
+ export class MimiDecoderModel extends MimiPreTrainedModel {
3659
+ }
3660
+ export class DacPreTrainedModel extends PreTrainedModel {
3661
+ }
3662
+ export class DacEncoderOutput extends ModelOutput {
3663
+ /**
3664
+ * @param {Object} output The output of the model.
3665
+ * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
3666
+ */
3667
+ constructor({ audio_codes }: {
3668
+ audio_codes: Tensor;
3669
+ });
3670
+ audio_codes: Tensor;
3671
+ }
3672
+ export class DacDecoderOutput extends ModelOutput {
3673
+ /**
3674
+ * @param {Object} output The output of the model.
3675
+ * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
3676
+ */
3677
+ constructor({ audio_values }: {
3678
+ audio_values: Tensor;
3679
+ });
3680
+ audio_values: Tensor;
3681
+ }
3682
+ /**
3683
+ * The DAC (Descript Audio Codec) model.
3684
+ */
3685
+ export class DacModel extends DacPreTrainedModel {
3686
+ /**
3687
+ * Encodes the input audio waveform into discrete codes.
3688
+ * @param {Object} inputs Model inputs
3689
+ * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
3690
+ * @returns {Promise<DacEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
3691
+ */
3692
+ encode(inputs: {
3693
+ input_values?: Tensor;
3694
+ }): Promise<DacEncoderOutput>;
3695
+ /**
3696
+ * Decodes the given frames into an output audio waveform.
3697
+ * @param {DacEncoderOutput} inputs The encoded audio codes.
3698
+ * @returns {Promise<DacDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
3699
+ */
3700
+ decode(inputs: DacEncoderOutput): Promise<DacDecoderOutput>;
3701
+ }
3702
+ export class DacEncoderModel extends DacPreTrainedModel {
3703
+ }
3704
+ export class DacDecoderModel extends DacPreTrainedModel {
3705
+ }
3706
+ export class SnacPreTrainedModel extends PreTrainedModel {
3707
+ }
3708
+ /**
3709
+ * The SNAC (Multi-Scale Neural Audio Codec) model.
3710
+ */
3711
+ export class SnacModel extends SnacPreTrainedModel {
3712
+ /**
3713
+ * Encodes the input audio waveform into discrete codes.
3714
+ * @param {Object} inputs Model inputs
3715
+ * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
3716
+ * @returns {Promise<Record<string, Tensor>>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`.
3717
+ */
3718
+ encode(inputs: {
3719
+ input_values?: Tensor;
3720
+ }): Promise<Record<string, Tensor>>;
3721
+ /**
3722
+ * Decodes the given frames into an output audio waveform.
3723
+ * @param {Record<string, Tensor>} inputs The encoded audio codes.
3724
+ * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
3725
+ */
3726
+ decode(inputs: Record<string, Tensor>): Promise<{
3727
+ audio_values: Tensor;
3728
+ }>;
3729
+ }
3730
+ export class SnacEncoderModel extends SnacPreTrainedModel {
3731
+ }
3732
+ export class SnacDecoderModel extends SnacPreTrainedModel {
3733
+ }
3565
3734
  /**
3566
3735
  * Base class of all AutoModels. Contains the `from_pretrained` function
3567
3736
  * which is used to instantiate pretrained models.
@@ -3799,6 +3968,12 @@ export class AutoModelForPoseEstimation extends PretrainedMixin {
3799
3968
  export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
3800
3969
  static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof CLIPVisionModelWithProjection)[]>[];
3801
3970
  }
3971
+ export class AutoModelForImageTextToText extends PretrainedMixin {
3972
+ static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof Idefics3ForConditionalGeneration)[] | (string | typeof Florence2ForConditionalGeneration)[]>[];
3973
+ }
3974
+ export class AutoModelForAudioTextToText extends PretrainedMixin {
3975
+ static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof UltravoxModel)[]>[];
3976
+ }
3802
3977
  export class Seq2SeqLMOutput extends ModelOutput {
3803
3978
  /**
3804
3979
  * @param {Object} output The output of the model.
@@ -3961,7 +4136,8 @@ export class VitsModelOutput extends ModelOutput {
3961
4136
  * @private
3962
4137
  */
3963
4138
  declare function decoderForward(self: any, model_inputs: any, is_encoder_decoder?: boolean): Promise<any>;
3964
- declare function image_text_to_text_prepare_inputs_for_generation(self: any, ...args: any[]): any;
4139
+ declare function autoEncoderForward(self: any, model_inputs: any): Promise<any>;
4140
+ declare function multimodal_text_to_text_prepare_inputs_for_generation(self: any, ...args: any[]): any;
3965
4141
  import { GenerationConfig } from './generation/configuration_utils.js';
3966
4142
  import { LogitsProcessorList } from './generation/logits_process.js';
3967
4143
  import { StoppingCriteriaList } from './generation/stopping_criteria.js';