@huggingface/transformers 3.3.3 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -3
- package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.js +2778 -1592
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/{transformers.cjs → transformers.node.cjs} +1699 -2530
- package/dist/transformers.node.cjs.map +1 -0
- package/dist/transformers.node.min.cjs +2 -0
- package/dist/transformers.node.min.cjs.map +1 -0
- package/dist/transformers.node.min.mjs +2 -0
- package/dist/transformers.node.min.mjs.map +1 -0
- package/dist/{transformers.mjs → transformers.node.mjs} +1738 -2510
- package/dist/transformers.node.mjs.map +1 -0
- package/dist/transformers.web.js +35876 -0
- package/dist/transformers.web.js.map +1 -0
- package/dist/transformers.web.min.js +2 -0
- package/dist/transformers.web.min.js.map +1 -0
- package/package.json +6 -6
- package/src/backends/onnx.js +14 -15
- package/src/configs.js +6 -1
- package/src/env.js +1 -1
- package/src/generation/streamers.js +4 -3
- package/src/models/dac/feature_extraction_dac.js +3 -0
- package/src/models/encodec/feature_extraction_encodec.js +32 -0
- package/src/models/feature_extractors.js +3 -0
- package/src/models/idefics3/image_processing_idefics3.js +1 -1
- package/src/models/image_processors.js +1 -0
- package/src/models/processors.js +2 -0
- package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
- package/src/models/smolvlm/processing_smolvlm.js +2 -0
- package/src/models/snac/feature_extraction_snac.js +3 -0
- package/src/models/ultravox/processing_ultravox.js +54 -0
- package/src/models/whisper/common_whisper.js +7 -1
- package/src/models/whisper/feature_extraction_whisper.js +18 -10
- package/src/models.js +546 -78
- package/src/pipelines.js +246 -137
- package/src/tokenizers.js +42 -28
- package/src/transformers.js +1 -0
- package/src/utils/audio.js +2 -0
- package/src/utils/hub.js +140 -80
- package/src/utils/image.js +9 -1
- package/src/utils/maths.js +1 -1
- package/src/utils/tensor.js +12 -5
- package/src/utils/video.js +128 -0
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/configs.d.ts +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/dac/feature_extraction_dac.d.ts +4 -0
- package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +3 -0
- package/types/models/florence2/processing_florence2.d.ts +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
- package/types/models/snac/feature_extraction_snac.d.ts +4 -0
- package/types/models/snac/feature_extraction_snac.d.ts.map +1 -0
- package/types/models/ultravox/processing_ultravox.d.ts +16 -0
- package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
- package/types/models/whisper/common_whisper.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models.d.ts +180 -4
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +51 -5
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/tsconfig.tsbuildinfo +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/hub.d.ts +19 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +2 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +2 -2
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +17 -18
- package/types/utils/tensor.d.ts.map +1 -1
- package/types/utils/video.d.ts +37 -0
- package/types/utils/video.d.ts.map +1 -0
- package/dist/transformers.cjs.map +0 -1
- package/dist/transformers.min.cjs +0 -2
- package/dist/transformers.min.cjs.map +0 -1
- package/dist/transformers.min.mjs +0 -2
- package/dist/transformers.min.mjs.map +0 -1
- package/dist/transformers.mjs.map +0 -1
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { RawImage } from "./image.js";
|
|
2
|
+
import { apis } from "../env.js";
|
|
3
|
+
|
|
4
|
+
export class RawVideoFrame {
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* @param {RawImage} image
|
|
8
|
+
* @param {number} timestamp
|
|
9
|
+
*/
|
|
10
|
+
constructor(image, timestamp) {
|
|
11
|
+
this.image = image;
|
|
12
|
+
this.timestamp = timestamp;
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
export class RawVideo {
|
|
17
|
+
/**
|
|
18
|
+
* @param {RawVideoFrame[]|RawImage[]} frames
|
|
19
|
+
* @param {number} duration
|
|
20
|
+
*/
|
|
21
|
+
constructor(frames, duration) {
|
|
22
|
+
if (frames.length > 0 && frames[0] instanceof RawImage) {
|
|
23
|
+
// Assume uniform timestamps
|
|
24
|
+
frames = frames.map((image, i) => new RawVideoFrame(image, (i + 1) / (frames.length + 1) * duration));
|
|
25
|
+
}
|
|
26
|
+
this.frames = /** @type {RawVideoFrame[]} */ (frames);
|
|
27
|
+
this.duration = duration;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
get width() {
|
|
31
|
+
return this.frames[0].image.width;
|
|
32
|
+
}
|
|
33
|
+
get height() {
|
|
34
|
+
return this.frames[0].image.height;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
get fps() {
|
|
38
|
+
return this.frames.length / this.duration;
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Loads a video.
|
|
45
|
+
*
|
|
46
|
+
* @param {string|Blob|HTMLVideoElement} src The video to process.
|
|
47
|
+
* @param {Object} [options] Optional parameters.
|
|
48
|
+
* @param {number} [options.num_frames=null] The number of frames to sample uniformly.
|
|
49
|
+
* @param {number} [options.fps=null] The number of frames to sample per second.
|
|
50
|
+
*
|
|
51
|
+
* @returns {Promise<RawVideo>} The loaded video.
|
|
52
|
+
*/
|
|
53
|
+
export async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
54
|
+
if (!apis.IS_BROWSER_ENV) {
|
|
55
|
+
throw new Error("`load_video` is currently only supported in browser environments.");
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// TODO: Support efficiently loading all frames using the WebCodecs API.
|
|
59
|
+
// Specifically, https://developer.mozilla.org/en-US/docs/Web/API/VideoDecoder
|
|
60
|
+
if (num_frames == null && fps == null) {
|
|
61
|
+
throw new Error("Either num_frames or fps must be provided.");
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const frames = [];
|
|
65
|
+
|
|
66
|
+
const video = document.createElement("video");
|
|
67
|
+
video.crossOrigin = "anonymous";
|
|
68
|
+
video.muted = true; // mute to allow autoplay and seeking
|
|
69
|
+
|
|
70
|
+
if (typeof src === 'string') {
|
|
71
|
+
video.src = src;
|
|
72
|
+
} else if (src instanceof Blob) {
|
|
73
|
+
video.src = URL.createObjectURL(src);
|
|
74
|
+
} else if (src instanceof HTMLVideoElement) {
|
|
75
|
+
video.src = src.src;
|
|
76
|
+
} else {
|
|
77
|
+
throw new Error("Invalid URL or video element provided.");
|
|
78
|
+
}
|
|
79
|
+
// Wait for metadata to load to obtain duration
|
|
80
|
+
await new Promise((resolve) => video.onloadedmetadata = resolve);
|
|
81
|
+
|
|
82
|
+
if (video.seekable.start(0) === video.seekable.end(0)) {
|
|
83
|
+
// Fallback: Download entire video if not seekable
|
|
84
|
+
const response = await fetch(video.src);
|
|
85
|
+
const blob = await response.blob();
|
|
86
|
+
video.src = URL.createObjectURL(blob);
|
|
87
|
+
await new Promise((resolve) => video.onloadedmetadata = resolve);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
const duration = video.duration;
|
|
91
|
+
|
|
92
|
+
let count, step;
|
|
93
|
+
if (num_frames != null) {
|
|
94
|
+
count = num_frames;
|
|
95
|
+
step = num_frames === 1 ? 0 : duration / (num_frames - 1);
|
|
96
|
+
} else {
|
|
97
|
+
step = 1 / fps;
|
|
98
|
+
count = Math.floor(duration / step);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Build an array of sample times based on num_frames or fps
|
|
102
|
+
let sampleTimes = [];
|
|
103
|
+
for (let i = 0; i < count; ++i) {
|
|
104
|
+
sampleTimes.push(num_frames === 1 ? duration / 2 : i * step);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
const canvas = document.createElement("canvas");
|
|
108
|
+
canvas.width = video.videoWidth;
|
|
109
|
+
canvas.height = video.videoHeight;
|
|
110
|
+
const ctx = canvas.getContext("2d", { willReadFrequently: true });
|
|
111
|
+
for (const t of sampleTimes) {
|
|
112
|
+
video.currentTime = t;
|
|
113
|
+
await new Promise((resolve) => {
|
|
114
|
+
video.onseeked = resolve;
|
|
115
|
+
});
|
|
116
|
+
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
|
117
|
+
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
|
|
118
|
+
const frameData = new RawImage(imageData.data, canvas.width, canvas.height, 4);
|
|
119
|
+
|
|
120
|
+
const frame = new RawVideoFrame(frameData, t);
|
|
121
|
+
frames.push(frame);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// Clean up video element.
|
|
125
|
+
video.remove();
|
|
126
|
+
|
|
127
|
+
return new RawVideo(frames, duration);
|
|
128
|
+
}
|
package/types/backends/onnx.d.ts
CHANGED
|
@@ -6,12 +6,12 @@
|
|
|
6
6
|
export function deviceToExecutionProviders(device?: import("../utils/devices.js").DeviceType | "auto" | null): ONNXExecutionProviders[];
|
|
7
7
|
/**
|
|
8
8
|
* Create an ONNX inference session.
|
|
9
|
-
* @param {Uint8Array}
|
|
9
|
+
* @param {Uint8Array|string} buffer_or_path The ONNX model buffer or path.
|
|
10
10
|
* @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options ONNX inference session options.
|
|
11
11
|
* @param {Object} session_config ONNX inference session configuration.
|
|
12
12
|
* @returns {Promise<import('onnxruntime-common').InferenceSession & { config: Object}>} The ONNX inference session.
|
|
13
13
|
*/
|
|
14
|
-
export function createInferenceSession(
|
|
14
|
+
export function createInferenceSession(buffer_or_path: Uint8Array | string, session_options: import("onnxruntime-common").InferenceSession.SessionOptions, session_config: any): Promise<import("onnxruntime-common").InferenceSession & {
|
|
15
15
|
config: any;
|
|
16
16
|
}>;
|
|
17
17
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH
|
|
1
|
+
{"version":3,"file":"onnx.d.ts","sourceRoot":"","sources":["../../src/backends/onnx.js"],"names":[],"mappings":"AA0GA;;;;GAIG;AACH,oDAHW,OAAO,qBAAqB,EAAE,UAAU,GAAC,MAAM,GAAC,IAAI,GAClD,sBAAsB,EAAE,CAqBpC;AAWD;;;;;;GAMG;AACH,uDALW,UAAU,GAAC,MAAM,mBACjB,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,cAAc,wBAE1D,OAAO,CAAC,OAAO,oBAAoB,EAAE,gBAAgB,GAAG;IAAE,MAAM,MAAQ;CAAC,CAAC,CActF;AAED;;;;GAIG;AACH,gCAHW,GAAG,GACD,OAAO,CAInB;AA8BD;;;GAGG;AACH,+BAFa,OAAO,CAKnB;;qCAlLY,OAAO,oBAAoB,EAAE,gBAAgB,CAAC,uBAAuB"}
|
package/types/configs.d.ts
CHANGED
|
@@ -83,6 +83,6 @@ export type TransformersJSConfig = {
|
|
|
83
83
|
/**
|
|
84
84
|
* Whether to load the model using the external data format (used for models >= 2GB in size).
|
|
85
85
|
*/
|
|
86
|
-
use_external_data_format?:
|
|
86
|
+
use_external_data_format?: import("./utils/hub.js").ExternalData | Record<string, import("./utils/hub.js").ExternalData>;
|
|
87
87
|
};
|
|
88
88
|
//# sourceMappingURL=configs.d.ts.map
|
package/types/configs.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AA4PA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCA9WY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBA2WrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,gBAAgB,EAAE,YAAY,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,gBAAgB,EAAE,YAAY,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;
|
|
1
|
+
{"version":3,"file":"streamers.d.ts","sourceRoot":"","sources":["../../src/generation/streamers.js"],"names":[],"mappings":"AASA;IACI;;;OAGG;IACH,WAFW,MAAM,EAAE,EAAE,QAIpB;IAED;;OAEG;IACH,YAEC;CACJ;AAMD;;GAEG;AACH;IACI;;;;;;;;;OASG;IACH,uBARW,OAAO,kBAAkB,EAAE,mBAAmB,+GAEtD;QAA0B,WAAW,GAA7B,OAAO;QACW,mBAAmB,GAArC,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACP,aAAa;KACxC,EAoBA;IAVG,0DAA0B;IAC1B,qBAA8B;IAC9B,oCAA0D;IAC1D,gCAfgB,MAAM,EAAE,KAAG,IAAI,CAeuB;IACtD,mBAAyE;IAGzE,mBAAqB;IACrB,kBAAkB;IAClB,gCAAkC;IA8DtC;;;;OAIG;IACH,wBAHW,MAAM,cACN,OAAO,QASjB;CACJ;AAED;;;;;;;GAOG;AACH;IACI;;;;;;;;;;;;OAYG;IACH,uBAZW,OAAO,kBAAkB,EAAE,gBAAgB,gKAEnD;QAA0B,WAAW,GAA7B,OAAO;QAC0B,iBAAiB,GAAlD,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACa,uBAAuB,GAA1D,CAAS,IAAQ,EAAR,MAAM,EAAE,KAAG,IAAI;QACS,cAAc,GAA/C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACW,YAAY,GAA7C,CAAS,IAAM,EAAN,MAAM,KAAG,IAAI;QACK,WAAW,GAAtC,MAAY,IAAI;QACC,cAAc,GAA/B,MAAM;QACY,mBAAmB,GAArC,OAAO;QACU,aAAa;KACxC,EA4BA;IATG,wBAAgD;IAEhD,uBA3BgB,MAAM,KAAG,IAAI,CA2BO;IACpC,qBA3BgB,MAAM,KAAG,IAAI,CA2BG;IAChC,mBA3BmB,IAAI,CA2BO;IAE9B,uBAAoC;IAEpC,+BAAkC;CAiCzC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"feature_extraction_dac.d.ts","sourceRoot":"","sources":["../../../src/models/dac/feature_extraction_dac.js"],"names":[],"mappings":"AAEA;CAAoE;wCAF5B,0CAA0C"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export class EncodecFeatureExtractor extends FeatureExtractor {
|
|
2
|
+
/**
|
|
3
|
+
* Asynchronously extracts input values from a given audio using the provided configuration.
|
|
4
|
+
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
5
|
+
* @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
|
|
6
|
+
*/
|
|
7
|
+
_call(audio: Float32Array | Float64Array): Promise<{
|
|
8
|
+
input_values: Tensor;
|
|
9
|
+
}>;
|
|
10
|
+
}
|
|
11
|
+
import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
|
|
12
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
13
|
+
//# sourceMappingURL=feature_extraction_encodec.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"feature_extraction_encodec.d.ts","sourceRoot":"","sources":["../../../src/models/encodec/feature_extraction_encodec.js"],"names":[],"mappings":"AAIA;IACI;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;KAAE,CAAC,CAsB9C;CACJ;iCA/BuD,wCAAwC;uBACzE,uBAAuB"}
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
export * from "./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js";
|
|
2
|
+
export * from "./encodec/feature_extraction_encodec.js";
|
|
2
3
|
export * from "./clap/feature_extraction_clap.js";
|
|
4
|
+
export * from "./dac/feature_extraction_dac.js";
|
|
3
5
|
export * from "./moonshine/feature_extraction_moonshine.js";
|
|
4
6
|
export * from "./pyannote/feature_extraction_pyannote.js";
|
|
5
7
|
export * from "./seamless_m4t/feature_extraction_seamless_m4t.js";
|
|
8
|
+
export * from "./snac/feature_extraction_snac.js";
|
|
6
9
|
export * from "./speecht5/feature_extraction_speecht5.js";
|
|
7
10
|
export * from "./wav2vec2/feature_extraction_wav2vec2.js";
|
|
8
11
|
export * from "./wespeaker/feature_extraction_wespeaker.js";
|
|
@@ -26,7 +26,7 @@ export class Florence2Processor extends Processor {
|
|
|
26
26
|
* @param {[number, number]} image_size The size of the image. height x width.
|
|
27
27
|
*/
|
|
28
28
|
post_process_generation(text: string, task: string, image_size: [number, number]): {
|
|
29
|
-
[
|
|
29
|
+
[task]: string | {
|
|
30
30
|
[x: string]: any[];
|
|
31
31
|
labels: any[];
|
|
32
32
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC
|
|
1
|
+
{"version":3,"file":"processing_florence2.d.ts","sourceRoot":"","sources":["../../../src/models/florence2/processing_florence2.js"],"names":[],"mappings":"AAIA;IACI,6CAAsC;IACtC,wDAAiD;IAEjD,0CA0BC;IAdG,kCAAkC;IAClC,mCADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC2E;IAEzG,kCAAkC;IAClC,6BADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAC+D;IAE7F,kCAAkC;IAClC,yBADW,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CACuD;IAErF;;;MAGC;IACD,qBAAwB;IAG5B;;;;OAIG;IACH,wBAHW,MAAM,GAAC,MAAM,EAAE,GACb,MAAM,EAAE,CA6BpB;IAED;;;;;OAKG;IACH,8BAJW,MAAM,QACN,MAAM,cACN,CAAC,MAAM,EAAE,MAAM,CAAC;QAqCd,CAAC,IAAI,CAAC;;;UAAc;MAChC;IAID,0DAaC;CACJ;0BAlIyB,gCAAgC;8BAE5B,qBAAqB;mCADhB,kCAAkC"}
|
|
@@ -31,6 +31,7 @@ export * from "./rt_detr/image_processing_rt_detr.js";
|
|
|
31
31
|
export * from "./sam/image_processing_sam.js";
|
|
32
32
|
export * from "./segformer/image_processing_segformer.js";
|
|
33
33
|
export * from "./siglip/image_processing_siglip.js";
|
|
34
|
+
export * from "./smolvlm/image_processing_smolvlm.js";
|
|
34
35
|
export * from "./swin2sr/image_processing_swin2sr.js";
|
|
35
36
|
export * from "./vit/image_processing_vit.js";
|
|
36
37
|
export * from "./vitmatte/image_processing_vitmatte.js";
|
|
@@ -11,7 +11,9 @@ export * from "./paligemma/processing_paligemma.js";
|
|
|
11
11
|
export * from "./pyannote/processing_pyannote.js";
|
|
12
12
|
export * from "./qwen2_vl/processing_qwen2_vl.js";
|
|
13
13
|
export * from "./sam/processing_sam.js";
|
|
14
|
+
export * from "./smolvlm/processing_smolvlm.js";
|
|
14
15
|
export * from "./speecht5/processing_speecht5.js";
|
|
16
|
+
export * from "./ultravox/processing_ultravox.js";
|
|
15
17
|
export * from "./wav2vec2/processing_wav2vec2.js";
|
|
16
18
|
export * from "./wav2vec2_with_lm/processing_wav2vec2_with_lm.js";
|
|
17
19
|
export * from "./whisper/processing_whisper.js";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image_processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/image_processing_smolvlm.js"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processing_smolvlm.d.ts","sourceRoot":"","sources":["../../../src/models/smolvlm/processing_smolvlm.js"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"feature_extraction_snac.d.ts","sourceRoot":"","sources":["../../../src/models/snac/feature_extraction_snac.js"],"names":[],"mappings":"AAEA;CAAiE;oCAF7B,kCAAkC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Represents an UltravoxProcessor that extracts features from an audio input.
|
|
3
|
+
*/
|
|
4
|
+
export class UltravoxProcessor extends Processor {
|
|
5
|
+
static tokenizer_class: typeof AutoTokenizer;
|
|
6
|
+
static feature_extractor_class: typeof AutoFeatureExtractor;
|
|
7
|
+
/**
|
|
8
|
+
* @param {string} text The text input to process.
|
|
9
|
+
* @param {Float32Array} audio The audio input to process.
|
|
10
|
+
*/
|
|
11
|
+
_call(text: string, audio?: Float32Array, kwargs?: {}): Promise<any>;
|
|
12
|
+
}
|
|
13
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
14
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
15
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
|
|
16
|
+
//# sourceMappingURL=processing_ultravox.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processing_ultravox.d.ts","sourceRoot":"","sources":["../../../src/models/ultravox/processing_ultravox.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACI,6CAAsC;IACtC,4DAAqD;IAGrD;;;OAGG;IACH,YAHW,MAAM,UACN,YAAY,6BAsCtB;CACJ;0BAnDyB,gCAAgC;8BAD5B,qBAAqB;qCADd,oCAAoC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,
|
|
1
|
+
{"version":3,"file":"common_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/common_whisper.js"],"names":[],"mappings":"AA4HA;;;GAGG;AACH,mDAHW,MAAM,GACJ,MAAM,CA8BlB;AAnDD,qDAAmE;AAEnE,6DAeG"}
|
|
@@ -12,7 +12,9 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
12
12
|
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
13
13
|
* @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
|
|
14
14
|
*/
|
|
15
|
-
_call(audio: Float32Array | Float64Array
|
|
15
|
+
_call(audio: Float32Array | Float64Array, { max_length, }?: {
|
|
16
|
+
max_length?: any;
|
|
17
|
+
}): Promise<{
|
|
16
18
|
input_features: Tensor;
|
|
17
19
|
}>;
|
|
18
20
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,
|
|
1
|
+
{"version":3,"file":"feature_extraction_whisper.d.ts","sourceRoot":"","sources":["../../../src/models/whisper/feature_extraction_whisper.js"],"names":[],"mappings":"AAKA;IAEI,yBAeC;IADG,sCAAwD;IAG5D;;;;OAIG;IACH,kCAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC,MAAM,CAAC,CA6B3B;IAED;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY;;QACvB,OAAO,CAAC;QAAE,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC,CA6B/C;CACJ;iCA3FuD,wCAAwC;uBACzE,uBAAuB"}
|
package/types/models.d.ts
CHANGED
|
@@ -35,8 +35,8 @@ export class PreTrainedModel extends PreTrainedModel_base {
|
|
|
35
35
|
sessions: Record<string, any>;
|
|
36
36
|
configs: Record<string, any>;
|
|
37
37
|
can_generate: boolean;
|
|
38
|
-
_forward: typeof decoderForward;
|
|
39
|
-
_prepare_inputs_for_generation: typeof
|
|
38
|
+
_forward: typeof decoderForward | typeof autoEncoderForward;
|
|
39
|
+
_prepare_inputs_for_generation: typeof multimodal_text_to_text_prepare_inputs_for_generation;
|
|
40
40
|
/** @type {import('./configs.js').TransformersJSConfig} */
|
|
41
41
|
custom_config: import("./configs.js").TransformersJSConfig;
|
|
42
42
|
/**
|
|
@@ -180,6 +180,9 @@ export class PreTrainedModel extends PreTrainedModel_base {
|
|
|
180
180
|
encode_text({ input_ids }: {
|
|
181
181
|
input_ids: any;
|
|
182
182
|
}): Promise<any>;
|
|
183
|
+
encode_audio({ audio_values }: {
|
|
184
|
+
audio_values: any;
|
|
185
|
+
}): Promise<any>;
|
|
183
186
|
}
|
|
184
187
|
export class ModelOutput {
|
|
185
188
|
}
|
|
@@ -1204,6 +1207,8 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
1204
1207
|
sequences: Tensor;
|
|
1205
1208
|
}, alignment_heads: number[][], num_frames?: number, time_precision?: number): Tensor;
|
|
1206
1209
|
}
|
|
1210
|
+
export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {
|
|
1211
|
+
}
|
|
1207
1212
|
export class MoonshinePreTrainedModel extends PreTrainedModel {
|
|
1208
1213
|
requires_attention_mask: boolean;
|
|
1209
1214
|
}
|
|
@@ -1283,7 +1288,7 @@ export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel
|
|
|
1283
1288
|
export class Idefics3PreTrainedModel extends PreTrainedModel {
|
|
1284
1289
|
}
|
|
1285
1290
|
/**
|
|
1286
|
-
* The
|
|
1291
|
+
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
1287
1292
|
*/
|
|
1288
1293
|
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
1289
1294
|
encode_image({ pixel_values, pixel_attention_mask }: {
|
|
@@ -1295,6 +1300,12 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
|
1295
1300
|
attention_mask: any;
|
|
1296
1301
|
};
|
|
1297
1302
|
}
|
|
1303
|
+
/**
|
|
1304
|
+
* The SmolVLM Model with a language modeling head.
|
|
1305
|
+
* It is made up of a SigLIP vision encoder, with a language modeling head on top.
|
|
1306
|
+
*/
|
|
1307
|
+
export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {
|
|
1308
|
+
}
|
|
1298
1309
|
export class Phi3VPreTrainedModel extends PreTrainedModel {
|
|
1299
1310
|
}
|
|
1300
1311
|
export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
|
|
@@ -1741,6 +1752,18 @@ export class Gemma2Model extends Gemma2PreTrainedModel {
|
|
|
1741
1752
|
}
|
|
1742
1753
|
export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {
|
|
1743
1754
|
}
|
|
1755
|
+
/**
|
|
1756
|
+
* The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
|
|
1757
|
+
*/
|
|
1758
|
+
export class Gemma3PreTrainedModel extends PreTrainedModel {
|
|
1759
|
+
}
|
|
1760
|
+
/**
|
|
1761
|
+
* The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
|
|
1762
|
+
*/
|
|
1763
|
+
export class Gemma3Model extends Gemma3PreTrainedModel {
|
|
1764
|
+
}
|
|
1765
|
+
export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {
|
|
1766
|
+
}
|
|
1744
1767
|
export class OpenELMPreTrainedModel extends PreTrainedModel {
|
|
1745
1768
|
}
|
|
1746
1769
|
export class OpenELMModel extends OpenELMPreTrainedModel {
|
|
@@ -2173,6 +2196,8 @@ export class SwinForImageClassification extends SwinPreTrainedModel {
|
|
|
2173
2196
|
*/
|
|
2174
2197
|
_call(model_inputs: any): Promise<SequenceClassifierOutput>;
|
|
2175
2198
|
}
|
|
2199
|
+
export class SwinForSemanticSegmentation extends SwinPreTrainedModel {
|
|
2200
|
+
}
|
|
2176
2201
|
export class Swin2SRPreTrainedModel extends PreTrainedModel {
|
|
2177
2202
|
}
|
|
2178
2203
|
/**
|
|
@@ -2283,6 +2308,14 @@ export class DepthProPreTrainedModel extends PreTrainedModel {
|
|
|
2283
2308
|
}
|
|
2284
2309
|
export class DepthProForDepthEstimation extends DepthProPreTrainedModel {
|
|
2285
2310
|
}
|
|
2311
|
+
export class Metric3DPreTrainedModel extends PreTrainedModel {
|
|
2312
|
+
}
|
|
2313
|
+
export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {
|
|
2314
|
+
}
|
|
2315
|
+
export class Metric3Dv2PreTrainedModel extends PreTrainedModel {
|
|
2316
|
+
}
|
|
2317
|
+
export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {
|
|
2318
|
+
}
|
|
2286
2319
|
export class MaskFormerPreTrainedModel extends PreTrainedModel {
|
|
2287
2320
|
}
|
|
2288
2321
|
export class MaskFormerModel extends MaskFormerPreTrainedModel {
|
|
@@ -3446,6 +3479,8 @@ export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedMode
|
|
|
3446
3479
|
*/
|
|
3447
3480
|
_call(model_inputs: any): Promise<SequenceClassifierOutput>;
|
|
3448
3481
|
}
|
|
3482
|
+
export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {
|
|
3483
|
+
}
|
|
3449
3484
|
export class MobileNetV2PreTrainedModel extends PreTrainedModel {
|
|
3450
3485
|
}
|
|
3451
3486
|
/**
|
|
@@ -3463,6 +3498,8 @@ export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedMode
|
|
|
3463
3498
|
*/
|
|
3464
3499
|
_call(model_inputs: any): Promise<SequenceClassifierOutput>;
|
|
3465
3500
|
}
|
|
3501
|
+
export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {
|
|
3502
|
+
}
|
|
3466
3503
|
export class MobileNetV3PreTrainedModel extends PreTrainedModel {
|
|
3467
3504
|
}
|
|
3468
3505
|
/**
|
|
@@ -3480,6 +3517,8 @@ export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedMode
|
|
|
3480
3517
|
*/
|
|
3481
3518
|
_call(model_inputs: any): Promise<SequenceClassifierOutput>;
|
|
3482
3519
|
}
|
|
3520
|
+
export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {
|
|
3521
|
+
}
|
|
3483
3522
|
export class MobileNetV4PreTrainedModel extends PreTrainedModel {
|
|
3484
3523
|
}
|
|
3485
3524
|
/**
|
|
@@ -3497,6 +3536,8 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
|
|
|
3497
3536
|
*/
|
|
3498
3537
|
_call(model_inputs: any): Promise<SequenceClassifierOutput>;
|
|
3499
3538
|
}
|
|
3539
|
+
export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {
|
|
3540
|
+
}
|
|
3500
3541
|
export class DecisionTransformerPreTrainedModel extends PreTrainedModel {
|
|
3501
3542
|
}
|
|
3502
3543
|
/**
|
|
@@ -3562,6 +3603,134 @@ export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {
|
|
|
3562
3603
|
*/
|
|
3563
3604
|
export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {
|
|
3564
3605
|
}
|
|
3606
|
+
export class UltravoxPreTrainedModel extends PreTrainedModel {
|
|
3607
|
+
}
|
|
3608
|
+
export class UltravoxModel extends UltravoxPreTrainedModel {
|
|
3609
|
+
_merge_input_ids_with_audio_features(kwargs: any): {
|
|
3610
|
+
inputs_embeds: any;
|
|
3611
|
+
attention_mask: any;
|
|
3612
|
+
};
|
|
3613
|
+
}
|
|
3614
|
+
export class MimiPreTrainedModel extends PreTrainedModel {
|
|
3615
|
+
}
|
|
3616
|
+
export class MimiEncoderOutput extends ModelOutput {
|
|
3617
|
+
/**
|
|
3618
|
+
* @param {Object} output The output of the model.
|
|
3619
|
+
* @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
|
|
3620
|
+
*/
|
|
3621
|
+
constructor({ audio_codes }: {
|
|
3622
|
+
audio_codes: Tensor;
|
|
3623
|
+
});
|
|
3624
|
+
audio_codes: Tensor;
|
|
3625
|
+
}
|
|
3626
|
+
export class MimiDecoderOutput extends ModelOutput {
|
|
3627
|
+
/**
|
|
3628
|
+
* @param {Object} output The output of the model.
|
|
3629
|
+
* @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
|
|
3630
|
+
*/
|
|
3631
|
+
constructor({ audio_values }: {
|
|
3632
|
+
audio_values: Tensor;
|
|
3633
|
+
});
|
|
3634
|
+
audio_values: Tensor;
|
|
3635
|
+
}
|
|
3636
|
+
/**
|
|
3637
|
+
* The Mimi neural audio codec model.
|
|
3638
|
+
*/
|
|
3639
|
+
export class MimiModel extends MimiPreTrainedModel {
|
|
3640
|
+
/**
|
|
3641
|
+
* Encodes the input audio waveform into discrete codes.
|
|
3642
|
+
* @param {Object} inputs Model inputs
|
|
3643
|
+
* @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
|
|
3644
|
+
* @returns {Promise<MimiEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
|
|
3645
|
+
*/
|
|
3646
|
+
encode(inputs: {
|
|
3647
|
+
input_values?: Tensor;
|
|
3648
|
+
}): Promise<MimiEncoderOutput>;
|
|
3649
|
+
/**
|
|
3650
|
+
* Decodes the given frames into an output audio waveform.
|
|
3651
|
+
* @param {MimiEncoderOutput} inputs The encoded audio codes.
|
|
3652
|
+
* @returns {Promise<MimiDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
|
|
3653
|
+
*/
|
|
3654
|
+
decode(inputs: MimiEncoderOutput): Promise<MimiDecoderOutput>;
|
|
3655
|
+
}
|
|
3656
|
+
export class MimiEncoderModel extends MimiPreTrainedModel {
|
|
3657
|
+
}
|
|
3658
|
+
export class MimiDecoderModel extends MimiPreTrainedModel {
|
|
3659
|
+
}
|
|
3660
|
+
export class DacPreTrainedModel extends PreTrainedModel {
|
|
3661
|
+
}
|
|
3662
|
+
export class DacEncoderOutput extends ModelOutput {
|
|
3663
|
+
/**
|
|
3664
|
+
* @param {Object} output The output of the model.
|
|
3665
|
+
* @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
|
|
3666
|
+
*/
|
|
3667
|
+
constructor({ audio_codes }: {
|
|
3668
|
+
audio_codes: Tensor;
|
|
3669
|
+
});
|
|
3670
|
+
audio_codes: Tensor;
|
|
3671
|
+
}
|
|
3672
|
+
export class DacDecoderOutput extends ModelOutput {
|
|
3673
|
+
/**
|
|
3674
|
+
* @param {Object} output The output of the model.
|
|
3675
|
+
* @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
|
|
3676
|
+
*/
|
|
3677
|
+
constructor({ audio_values }: {
|
|
3678
|
+
audio_values: Tensor;
|
|
3679
|
+
});
|
|
3680
|
+
audio_values: Tensor;
|
|
3681
|
+
}
|
|
3682
|
+
/**
|
|
3683
|
+
* The DAC (Descript Audio Codec) model.
|
|
3684
|
+
*/
|
|
3685
|
+
export class DacModel extends DacPreTrainedModel {
|
|
3686
|
+
/**
|
|
3687
|
+
* Encodes the input audio waveform into discrete codes.
|
|
3688
|
+
* @param {Object} inputs Model inputs
|
|
3689
|
+
* @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
|
|
3690
|
+
* @returns {Promise<DacEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
|
|
3691
|
+
*/
|
|
3692
|
+
encode(inputs: {
|
|
3693
|
+
input_values?: Tensor;
|
|
3694
|
+
}): Promise<DacEncoderOutput>;
|
|
3695
|
+
/**
|
|
3696
|
+
* Decodes the given frames into an output audio waveform.
|
|
3697
|
+
* @param {DacEncoderOutput} inputs The encoded audio codes.
|
|
3698
|
+
* @returns {Promise<DacDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
|
|
3699
|
+
*/
|
|
3700
|
+
decode(inputs: DacEncoderOutput): Promise<DacDecoderOutput>;
|
|
3701
|
+
}
|
|
3702
|
+
export class DacEncoderModel extends DacPreTrainedModel {
|
|
3703
|
+
}
|
|
3704
|
+
export class DacDecoderModel extends DacPreTrainedModel {
|
|
3705
|
+
}
|
|
3706
|
+
export class SnacPreTrainedModel extends PreTrainedModel {
|
|
3707
|
+
}
|
|
3708
|
+
/**
|
|
3709
|
+
* The SNAC (Multi-Scale Neural Audio Codec) model.
|
|
3710
|
+
*/
|
|
3711
|
+
export class SnacModel extends SnacPreTrainedModel {
|
|
3712
|
+
/**
|
|
3713
|
+
* Encodes the input audio waveform into discrete codes.
|
|
3714
|
+
* @param {Object} inputs Model inputs
|
|
3715
|
+
* @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
|
|
3716
|
+
* @returns {Promise<Record<string, Tensor>>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`.
|
|
3717
|
+
*/
|
|
3718
|
+
encode(inputs: {
|
|
3719
|
+
input_values?: Tensor;
|
|
3720
|
+
}): Promise<Record<string, Tensor>>;
|
|
3721
|
+
/**
|
|
3722
|
+
* Decodes the given frames into an output audio waveform.
|
|
3723
|
+
* @param {Record<string, Tensor>} inputs The encoded audio codes.
|
|
3724
|
+
* @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
|
|
3725
|
+
*/
|
|
3726
|
+
decode(inputs: Record<string, Tensor>): Promise<{
|
|
3727
|
+
audio_values: Tensor;
|
|
3728
|
+
}>;
|
|
3729
|
+
}
|
|
3730
|
+
export class SnacEncoderModel extends SnacPreTrainedModel {
|
|
3731
|
+
}
|
|
3732
|
+
export class SnacDecoderModel extends SnacPreTrainedModel {
|
|
3733
|
+
}
|
|
3565
3734
|
/**
|
|
3566
3735
|
* Base class of all AutoModels. Contains the `from_pretrained` function
|
|
3567
3736
|
* which is used to instantiate pretrained models.
|
|
@@ -3799,6 +3968,12 @@ export class AutoModelForPoseEstimation extends PretrainedMixin {
|
|
|
3799
3968
|
export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
|
|
3800
3969
|
static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof CLIPVisionModelWithProjection)[]>[];
|
|
3801
3970
|
}
|
|
3971
|
+
export class AutoModelForImageTextToText extends PretrainedMixin {
|
|
3972
|
+
static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof Idefics3ForConditionalGeneration)[] | (string | typeof Florence2ForConditionalGeneration)[]>[];
|
|
3973
|
+
}
|
|
3974
|
+
export class AutoModelForAudioTextToText extends PretrainedMixin {
|
|
3975
|
+
static MODEL_CLASS_MAPPINGS: Map<string, (string | typeof UltravoxModel)[]>[];
|
|
3976
|
+
}
|
|
3802
3977
|
export class Seq2SeqLMOutput extends ModelOutput {
|
|
3803
3978
|
/**
|
|
3804
3979
|
* @param {Object} output The output of the model.
|
|
@@ -3961,7 +4136,8 @@ export class VitsModelOutput extends ModelOutput {
|
|
|
3961
4136
|
* @private
|
|
3962
4137
|
*/
|
|
3963
4138
|
declare function decoderForward(self: any, model_inputs: any, is_encoder_decoder?: boolean): Promise<any>;
|
|
3964
|
-
declare function
|
|
4139
|
+
declare function autoEncoderForward(self: any, model_inputs: any): Promise<any>;
|
|
4140
|
+
declare function multimodal_text_to_text_prepare_inputs_for_generation(self: any, ...args: any[]): any;
|
|
3965
4141
|
import { GenerationConfig } from './generation/configuration_utils.js';
|
|
3966
4142
|
import { LogitsProcessorList } from './generation/logits_process.js';
|
|
3967
4143
|
import { StoppingCriteriaList } from './generation/stopping_criteria.js';
|