@huggingface/transformers 4.0.0-next.0 → 4.0.0-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (364)
  1. package/README.md +32 -6
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +31 -31
  3. package/dist/transformers.js +9261 -1709
  4. package/dist/transformers.min.js +25 -18
  5. package/dist/transformers.node.cjs +6119 -3174
  6. package/dist/transformers.node.min.cjs +25 -23
  7. package/dist/transformers.node.min.mjs +25 -23
  8. package/dist/transformers.node.mjs +6034 -3168
  9. package/dist/transformers.web.js +4255 -1381
  10. package/dist/transformers.web.min.js +23 -19
  11. package/package.json +6 -6
  12. package/src/backends/onnx.js +128 -53
  13. package/src/backends/utils/cacheWasm.js +28 -46
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +123 -23
  16. package/src/env.js +100 -11
  17. package/src/generation/logits_sampler.js +3 -15
  18. package/src/generation/parameters.js +1 -1
  19. package/src/generation/streamers.js +21 -0
  20. package/src/image_processors_utils.js +29 -23
  21. package/src/models/afmoe/modeling_afmoe.js +5 -0
  22. package/src/models/auto/image_processing_auto.js +2 -1
  23. package/src/models/auto/modeling_auto.js +16 -2
  24. package/src/models/auto/tokenization_auto.js +2 -1
  25. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  26. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  27. package/src/models/chmv2/modeling_chmv2.js +4 -0
  28. package/src/models/clap/feature_extraction_clap.js +2 -1
  29. package/src/models/cohere2/modeling_cohere2.js +5 -0
  30. package/src/models/cohere_asr/feature_extraction_cohere_asr.js +117 -0
  31. package/src/models/cohere_asr/modeling_cohere_asr.js +11 -0
  32. package/src/models/cohere_asr/processing_cohere_asr.js +55 -0
  33. package/src/models/cohere_asr/tokenization_cohere_asr.js +3 -0
  34. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  35. package/src/models/detr/image_processing_detr.js +1 -1
  36. package/src/models/eurobert/modeling_eurobert.js +41 -0
  37. package/src/models/feature_extractors.js +3 -0
  38. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  39. package/src/models/gemma3/modeling_gemma3.js +4 -1
  40. package/src/models/gemma3/processing_gemma3.js +45 -0
  41. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  42. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  43. package/src/models/glm46v/processing_glm46v.js +5 -0
  44. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  45. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  46. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  47. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  48. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  49. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  50. package/src/models/idefics3/modeling_idefics3.js +5 -32
  51. package/src/models/image_processors.js +4 -0
  52. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  53. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  54. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  55. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  56. package/src/models/llava/modeling_llava.js +1 -1
  57. package/src/models/marian/tokenization_marian.js +3 -2
  58. package/src/models/mistral3/modeling_mistral3.js +2 -2
  59. package/src/models/mistral4/modeling_mistral4.js +5 -0
  60. package/src/models/modeling_utils.js +283 -300
  61. package/src/models/models.js +26 -1
  62. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  63. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  64. package/src/models/paligemma/modeling_paligemma.js +2 -25
  65. package/src/models/paligemma/processing_paligemma.js +3 -2
  66. package/src/models/processors.js +8 -0
  67. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +9 -0
  68. package/src/models/qwen2_5_vl/processing_qwen2_5_vl.js +3 -0
  69. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  70. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +15 -1
  71. package/src/models/qwen2_vl/modeling_qwen2_vl.js +240 -143
  72. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  73. package/src/models/qwen3_5/modeling_qwen3_5.js +4 -0
  74. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +4 -0
  75. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  76. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  77. package/src/models/qwen3_vl/modeling_qwen3_vl.js +4 -0
  78. package/src/models/qwen3_vl/processing_qwen3_vl.js +3 -0
  79. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  80. package/src/models/registry.js +61 -5
  81. package/src/models/sam/image_processing_sam.js +1 -1
  82. package/src/models/session.js +33 -56
  83. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  84. package/src/models/solar_open/modeling_solar_open.js +5 -0
  85. package/src/models/tokenizers.js +1 -0
  86. package/src/models/ultravox/modeling_ultravox.js +1 -3
  87. package/src/models/voxtral/modeling_voxtral.js +3 -0
  88. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  89. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  90. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  91. package/src/models/whisper/feature_extraction_whisper.js +4 -13
  92. package/src/models/whisper/modeling_whisper.js +6 -5
  93. package/src/models/xlm/tokenization_xlm.js +2 -1
  94. package/src/pipelines/automatic-speech-recognition.js +47 -3
  95. package/src/pipelines/document-question-answering.js +1 -1
  96. package/src/pipelines/image-to-text.js +2 -2
  97. package/src/pipelines/index.js +313 -0
  98. package/src/pipelines/summarization.js +1 -1
  99. package/src/pipelines/text-generation.js +5 -1
  100. package/src/pipelines/text-to-audio.js +4 -2
  101. package/src/pipelines/text2text-generation.js +1 -1
  102. package/src/pipelines/translation.js +1 -1
  103. package/src/pipelines/zero-shot-classification.js +3 -2
  104. package/src/pipelines.js +140 -428
  105. package/src/tokenization_utils.js +42 -21
  106. package/src/transformers.js +10 -1
  107. package/src/utils/audio.js +20 -3
  108. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  109. package/src/utils/cache/FileCache.js +128 -0
  110. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  111. package/src/utils/cache.js +12 -4
  112. package/src/utils/core.js +23 -1
  113. package/src/utils/devices.js +22 -0
  114. package/src/utils/dtypes.js +55 -0
  115. package/src/utils/hub/{files.js → FileResponse.js} +0 -90
  116. package/src/utils/hub/utils.js +45 -5
  117. package/src/utils/hub.js +67 -23
  118. package/src/utils/image.js +14 -14
  119. package/src/utils/logger.js +67 -0
  120. package/src/utils/lru_cache.js +67 -0
  121. package/src/utils/memoize_promise.js +45 -0
  122. package/src/utils/model-loader.js +35 -17
  123. package/src/utils/model_registry/ModelRegistry.js +382 -0
  124. package/src/utils/model_registry/clear_cache.js +128 -0
  125. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  126. package/src/utils/model_registry/get_file_metadata.js +162 -0
  127. package/src/utils/model_registry/get_files.js +42 -0
  128. package/src/utils/model_registry/get_model_files.js +114 -0
  129. package/src/utils/model_registry/get_pipeline_files.js +44 -0
  130. package/src/utils/model_registry/get_processor_files.js +20 -0
  131. package/src/utils/model_registry/get_tokenizer_files.js +21 -0
  132. package/src/utils/model_registry/is_cached.js +169 -0
  133. package/src/utils/model_registry/resolve_model_type.js +66 -0
  134. package/src/utils/random.js +225 -0
  135. package/src/utils/tensor.js +26 -23
  136. package/src/utils/video.js +2 -2
  137. package/types/backends/onnx.d.ts.map +1 -1
  138. package/types/backends/utils/cacheWasm.d.ts +3 -17
  139. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  140. package/types/cache_utils.d.ts +29 -0
  141. package/types/cache_utils.d.ts.map +1 -0
  142. package/types/configs.d.ts.map +1 -1
  143. package/types/env.d.ts +60 -27
  144. package/types/env.d.ts.map +1 -1
  145. package/types/generation/logits_sampler.d.ts +2 -2
  146. package/types/generation/logits_sampler.d.ts.map +1 -1
  147. package/types/generation/parameters.d.ts +1 -1
  148. package/types/generation/parameters.d.ts.map +1 -1
  149. package/types/generation/streamers.d.ts +1 -0
  150. package/types/generation/streamers.d.ts.map +1 -1
  151. package/types/image_processors_utils.d.ts +18 -1
  152. package/types/image_processors_utils.d.ts.map +1 -1
  153. package/types/models/afmoe/modeling_afmoe.d.ts +8 -0
  154. package/types/models/afmoe/modeling_afmoe.d.ts.map +1 -0
  155. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  156. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  157. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  158. package/types/models/auto/modeling_auto.d.ts +6 -0
  159. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  160. package/types/models/auto/tokenization_auto.d.ts.map +1 -1
  161. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  162. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  163. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  164. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  165. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  166. package/types/models/cohere2/modeling_cohere2.d.ts +8 -0
  167. package/types/models/cohere2/modeling_cohere2.d.ts.map +1 -0
  168. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts +25 -0
  169. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts.map +1 -0
  170. package/types/models/cohere_asr/modeling_cohere_asr.d.ts +9 -0
  171. package/types/models/cohere_asr/modeling_cohere_asr.d.ts.map +1 -0
  172. package/types/models/cohere_asr/processing_cohere_asr.d.ts +27 -0
  173. package/types/models/cohere_asr/processing_cohere_asr.d.ts.map +1 -0
  174. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts +4 -0
  175. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts.map +1 -0
  176. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  177. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  178. package/types/models/detr/image_processing_detr.d.ts +1 -1
  179. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  180. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  181. package/types/models/feature_extractors.d.ts +3 -0
  182. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  183. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  184. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  185. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  186. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  187. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  188. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  189. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  190. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  191. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  192. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  193. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  194. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  195. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  196. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  197. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  198. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  199. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  200. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  201. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  202. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  203. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  204. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  205. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  206. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  207. package/types/models/image_processors.d.ts +4 -0
  208. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  209. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  210. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  211. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  212. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  213. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  214. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  215. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  216. package/types/models/marian/tokenization_marian.d.ts.map +1 -1
  217. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  218. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  219. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  220. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  221. package/types/models/modeling_utils.d.ts +46 -27
  222. package/types/models/modeling_utils.d.ts.map +1 -1
  223. package/types/models/models.d.ts +26 -1
  224. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  225. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  226. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  227. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  228. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  229. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  230. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  231. package/types/models/processors.d.ts +8 -0
  232. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +7 -0
  233. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -0
  234. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts +4 -0
  235. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts.map +1 -0
  236. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  237. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  238. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +3 -0
  239. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  240. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +44 -6
  241. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  242. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  243. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  244. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +6 -0
  245. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -0
  246. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +7 -0
  247. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -0
  248. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  249. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  250. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  251. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  252. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +7 -0
  253. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -0
  254. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts +4 -0
  255. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts.map +1 -0
  256. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  257. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  258. package/types/models/registry.d.ts +2 -1
  259. package/types/models/registry.d.ts.map +1 -1
  260. package/types/models/sam/image_processing_sam.d.ts +1 -1
  261. package/types/models/session.d.ts +3 -2
  262. package/types/models/session.d.ts.map +1 -1
  263. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  264. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  265. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  266. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  267. package/types/models/tokenizers.d.ts +1 -0
  268. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  269. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  270. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  271. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  272. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  273. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  274. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  275. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  276. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  277. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  278. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  279. package/types/models/whisper/modeling_whisper.d.ts.map +1 -1
  280. package/types/models/xlm/tokenization_xlm.d.ts.map +1 -1
  281. package/types/pipelines/automatic-speech-recognition.d.ts +7 -2
  282. package/types/pipelines/automatic-speech-recognition.d.ts.map +1 -1
  283. package/types/pipelines/document-question-answering.d.ts +2 -2
  284. package/types/pipelines/document-question-answering.d.ts.map +1 -1
  285. package/types/pipelines/image-to-text.d.ts +4 -4
  286. package/types/pipelines/image-to-text.d.ts.map +1 -1
  287. package/types/pipelines/index.d.ts +265 -0
  288. package/types/pipelines/index.d.ts.map +1 -0
  289. package/types/pipelines/summarization.d.ts +2 -2
  290. package/types/pipelines/summarization.d.ts.map +1 -1
  291. package/types/pipelines/text-generation.d.ts +7 -3
  292. package/types/pipelines/text-generation.d.ts.map +1 -1
  293. package/types/pipelines/text-to-audio.d.ts.map +1 -1
  294. package/types/pipelines/text2text-generation.d.ts +3 -3
  295. package/types/pipelines/text2text-generation.d.ts.map +1 -1
  296. package/types/pipelines/translation.d.ts +2 -2
  297. package/types/pipelines/translation.d.ts.map +1 -1
  298. package/types/pipelines/zero-shot-classification.d.ts.map +1 -1
  299. package/types/pipelines.d.ts +51 -291
  300. package/types/pipelines.d.ts.map +1 -1
  301. package/types/tokenization_utils.d.ts +44 -26
  302. package/types/tokenization_utils.d.ts.map +1 -1
  303. package/types/transformers.d.ts +7 -1
  304. package/types/transformers.d.ts.map +1 -1
  305. package/types/utils/audio.d.ts +5 -2
  306. package/types/utils/audio.d.ts.map +1 -1
  307. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  308. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  309. package/types/utils/cache/FileCache.d.ts +39 -0
  310. package/types/utils/cache/FileCache.d.ts.map +1 -0
  311. package/types/utils/cache.d.ts +10 -4
  312. package/types/utils/cache.d.ts.map +1 -1
  313. package/types/utils/core.d.ts +59 -2
  314. package/types/utils/core.d.ts.map +1 -1
  315. package/types/utils/devices.d.ts +15 -0
  316. package/types/utils/devices.d.ts.map +1 -1
  317. package/types/utils/dtypes.d.ts +17 -1
  318. package/types/utils/dtypes.d.ts.map +1 -1
  319. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -32
  320. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  321. package/types/utils/hub/utils.d.ts +19 -3
  322. package/types/utils/hub/utils.d.ts.map +1 -1
  323. package/types/utils/hub.d.ts +36 -7
  324. package/types/utils/hub.d.ts.map +1 -1
  325. package/types/utils/image.d.ts +1 -1
  326. package/types/utils/logger.d.ts +28 -0
  327. package/types/utils/logger.d.ts.map +1 -0
  328. package/types/utils/lru_cache.d.ts +38 -0
  329. package/types/utils/lru_cache.d.ts.map +1 -0
  330. package/types/utils/memoize_promise.d.ts +14 -0
  331. package/types/utils/memoize_promise.d.ts.map +1 -0
  332. package/types/utils/model-loader.d.ts +15 -0
  333. package/types/utils/model-loader.d.ts.map +1 -1
  334. package/types/utils/model_registry/ModelRegistry.d.ts +298 -0
  335. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -0
  336. package/types/utils/model_registry/clear_cache.d.ts +74 -0
  337. package/types/utils/model_registry/clear_cache.d.ts.map +1 -0
  338. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  339. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  340. package/types/utils/model_registry/get_file_metadata.d.ts +20 -0
  341. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -0
  342. package/types/utils/model_registry/get_files.d.ts +23 -0
  343. package/types/utils/model_registry/get_files.d.ts.map +1 -0
  344. package/types/utils/model_registry/get_model_files.d.ts +48 -0
  345. package/types/utils/model_registry/get_model_files.d.ts.map +1 -0
  346. package/types/utils/model_registry/get_pipeline_files.d.ts +22 -0
  347. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -0
  348. package/types/utils/model_registry/get_processor_files.d.ts +9 -0
  349. package/types/utils/model_registry/get_processor_files.d.ts.map +1 -0
  350. package/types/utils/model_registry/get_tokenizer_files.d.ts +9 -0
  351. package/types/utils/model_registry/get_tokenizer_files.d.ts.map +1 -0
  352. package/types/utils/model_registry/is_cached.d.ts +105 -0
  353. package/types/utils/model_registry/is_cached.d.ts.map +1 -0
  354. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  355. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  356. package/types/utils/random.d.ts +86 -0
  357. package/types/utils/random.d.ts.map +1 -0
  358. package/types/utils/tensor.d.ts.map +1 -1
  359. package/src/utils/data-structures.js +0 -572
  360. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  361. package/types/utils/data-structures.d.ts +0 -294
  362. package/types/utils/data-structures.d.ts.map +0 -1
  363. package/types/utils/hub/files.d.ts.map +0 -1
  364. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -0,0 +1,239 @@
1
+ import { PreTrainedModel } from '../modeling_utils.js';
2
+ import { sessionRun } from '../session.js';
3
+ import { getCacheShapes } from '../../configs.js';
4
+ import { Tensor, ones } from '../../utils/tensor.js';
5
+ import { DataTypeMap } from '../../utils/dtypes.js';
6
+ import { pick } from '../../utils/core.js';
7
+ import { DynamicCache } from '../../cache_utils.js';
8
+ import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
9
+
10
// Amount of left padding (in frames) carried for the encoder's two causal convolutions.
const CONV1_LEFT_PAD = 2;
const CONV2_LEFT_PAD = 1;

/**
 * Holds the per-instance encoder streaming state while a generation session
 * is active. Keyed weakly so the state is reclaimed together with the model
 * and is never exposed as a property on the model instance itself.
 * @private
 * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
 */
const states = new WeakMap();
22
+
23
/**
 * Builds the encoder streaming state for one VoxtralRealtime generation session.
 *
 * Allocates a zero-initialized encoder KV cache and causal-conv padding cache,
 * and wraps `input_features` in a (sync or async) iterator that the generation
 * loop pulls mel-spectrogram chunks from.
 *
 * @param {VoxtralRealtimeForConditionalGeneration} model
 * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features
 * @returns {Object} Encoder state object.
 * @private
 */
function createEncoderState(model, input_features) {
  const { text_config, audio_config } = /** @type {any} */ (model.config);
  const encoder_session = model.sessions['audio_encoder'];

  // Padding-cache channel count combines mel bins and encoder hidden size.
  const { num_mel_bins, hidden_size } = audio_config;
  const padding_channels = num_mel_bins + hidden_size;

  // Zero-filled encoder KV cache, typed to match the session's KV dtype.
  const dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
  const TypedArrayCls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32;
  const cache = new DynamicCache();
  const shapes = getCacheShapes(audio_config, { batch_size: 1 });
  for (const [name, dims] of Object.entries(shapes)) {
    const numel = dims.reduce((acc, d) => acc * d, 1);
    cache[name] = new Tensor(dtype, new TypedArrayCls(numel), dims);
  }

  const padding_cache = new Tensor(
    dtype,
    new TypedArrayCls(padding_channels * CONV1_LEFT_PAD),
    [1, padding_channels, CONV1_LEFT_PAD],
  );

  // Accept either an async iterable or a plain sync iterable of chunks.
  const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
  if (!chunks_iter) {
    throw new Error('input_features must be iterable or async iterable');
  }

  return {
    encoder_session,
    enc_kv_cache: cache,
    enc_padding_cache: padding_cache,
    enc_past_seq_len: 0,
    audio_embed_queue: [],
    audio_embed_total_tokens: 0,
    audio_queue_offset: 0,
    audio_consumed: 0,
    stream_exhausted: false,
    chunks_iter,
    text_hidden_size: text_config.hidden_size,
  };
}
73
+
74
/**
 * Runs one mel-spectrogram chunk through the streaming audio encoder,
 * updating the encoder KV cache and causal-conv padding cache in place.
 *
 * @param {Object} s Encoder state.
 * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
 * @returns {Promise<Tensor>} Audio embeddings.
 * @private
 */
async function encodeChunk(s, chunk_features) {
  const frames = chunk_features.dims[2];
  // Output length after the second downsampling conv. NOTE(review): formula
  // matches a kernel-3, stride-2 causal conv — confirm against the exported model.
  const out_len = Math.floor((CONV2_LEFT_PAD + frames - 3) / 2) + 1;

  const position_ids = new Tensor(
    'int64',
    BigInt64Array.from({ length: out_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
    [1, out_len],
  );

  const total_len = s.enc_past_seq_len + out_len;
  const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
    input_features: chunk_features,
    attention_mask: ones([1, total_len]),
    position_ids,
    past_padding_cache: s.enc_padding_cache,
    ...s.enc_kv_cache,
  });

  // Swap in the new padding cache, releasing the old GPU buffer if any.
  if (s.enc_padding_cache.location === 'gpu-buffer') {
    s.enc_padding_cache.dispose();
  }
  s.enc_padding_cache = present_padding_cache;

  // Promote `present.*` outputs to `past_key_values.*` inputs for the next
  // step, disposing the stale GPU-backed tensors they replace.
  for (const [name, tensor] of Object.entries(present_cache)) {
    if (!name.startsWith('present.')) continue;
    const pastName = name.replace('present', 'past_key_values');
    const stale = s.enc_kv_cache[pastName];
    if (stale?.location === 'gpu-buffer') {
      stale.dispose();
    }
    s.enc_kv_cache[pastName] = tensor;
  }
  s.enc_past_seq_len = total_len;
  return audio_embeds;
}
120
+
121
/**
 * Pulls and encodes audio chunks until at least `needed` audio tokens are
 * buffered, or the input stream ends (which sets `stream_exhausted`).
 *
 * @param {Object} s Encoder state.
 * @param {number} needed Total number of audio tokens needed.
 * @private
 */
async function fillAudioBuffer(s, needed) {
  while (!s.stream_exhausted && s.audio_embed_total_tokens < needed) {
    const { done, value } = await s.chunks_iter.next();
    if (done) {
      s.stream_exhausted = true;
      return;
    }
    const embeds = await encodeChunk(s, value);
    const tokens = embeds.dims[1];
    s.audio_embed_queue.push({ data: embeds.data, tokens });
    s.audio_embed_total_tokens += tokens;
  }
}
139
+
140
/**
 * Adds buffered audio embeddings onto the text embeddings, element-wise,
 * consuming up to `current_len` audio tokens from the front of the queue.
 *
 * @param {Object} s Encoder state.
 * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
 * @param {number} current_len Number of tokens to consume.
 * @private
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
  const queue = s.audio_embed_queue;
  if (queue.length === 0) return;

  const H = s.text_hidden_size;
  const out = inputs_embeds.data;
  let written = 0;

  while (written < current_len && queue.length > 0) {
    const head = queue[0];
    // Consume as much of the head buffer as still fits.
    const take = Math.min(current_len - written, head.tokens - s.audio_queue_offset);

    const src = s.audio_queue_offset * H;
    const dst = written * H;
    for (let i = 0; i < take * H; ++i) {
      out[dst + i] += head.data[src + i];
    }

    written += take;
    s.audio_queue_offset += take;

    // Fully drained this buffer: drop it and rewind the offset.
    if (s.audio_queue_offset >= head.tokens) {
      queue.shift();
      s.audio_queue_offset = 0;
    }
  }
  s.audio_consumed += written;
}
175
+
176
/**
 * Stopping criterion: halts generation once the input audio stream has ended
 * and every buffered audio embedding has been consumed.
 * @private
 */
class AudioExhaustedCriteria extends StoppingCriteria {
  constructor(enc_state) {
    super();
    this._s = enc_state;
  }
  _call(input_ids) {
    const exhausted = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
    // Same verdict for every sequence in the batch.
    return input_ids.map(() => exhausted);
  }
}
191
+
192
/**
 * Base class for Voxtral Realtime models, declaring the decoder's forward inputs.
 */
export class VoxtralRealtimePreTrainedModel extends PreTrainedModel {
  forward_params = ['input_ids', 'attention_mask', 'position_ids', 'past_key_values'];
}
195
+
196
/**
 * Voxtral Realtime model for streaming speech-conditioned text generation.
 *
 * Audio arrives as a (sync or async) iterable of mel-spectrogram chunks; each
 * forward pass encodes just enough audio to cover the new text positions and
 * adds the resulting audio embeddings onto the token embeddings.
 */
export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
  /**
   * Embeds the input tokens, mixes in streamed audio embeddings (when a
   * generation session is active), and runs the merged decoder.
   * @param {Object} inputs Forward inputs (`input_ids`, `past_key_values`, ...).
   * @returns {Promise<Object>} Decoder session outputs.
   */
  async forward({ input_ids, past_key_values, ...kwargs }) {
    const current_len = input_ids.dims[1];

    const enc = states.get(this);
    if (enc) {
      // Encode enough audio chunks to cover the tokens about to be embedded.
      await fillAudioBuffer(enc, enc.audio_consumed + current_len);
    }

    const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
    if (enc) {
      addAudioEmbeddings(enc, inputs_embeds, current_len);
    }

    const decoder_feeds = { inputs_embeds, ...kwargs };
    this.addPastKeyValues(decoder_feeds, past_key_values);

    const session = this.sessions['decoder_model_merged'];
    return await sessionRun(session, pick(decoder_feeds, session.inputNames));
  }

  /**
   * Generates text conditioned on a stream of audio feature chunks.
   * Generation stops automatically once the stream is exhausted and all
   * buffered audio has been consumed, in addition to any user-supplied criteria.
   * @param {Object} options
   * @param {Iterable<Tensor>|AsyncIterable<Tensor>} options.input_features Audio chunk stream.
   * @param {StoppingCriteriaList} [options.stopping_criteria] Extra stopping criteria.
   * @returns {Promise<*>} Result of the base `generate` implementation.
   * @throws {Error} If `input_features` is missing.
   */
  async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
    if (!input_features) {
      throw new Error('input_features (generator/iterable) must be provided');
    }

    const enc_state = createEncoderState(this, input_features);
    states.set(this, enc_state);

    const stopping_criteria = new StoppingCriteriaList();
    stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
    if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

    try {
      return await super.generate({ ...kwargs, stopping_criteria });
    } finally {
      // Cleanup encoder state.
      enc_state.enc_kv_cache.dispose();
      // Fix: the final padding cache was previously leaked; release it the
      // same way encodeChunk releases superseded GPU-backed buffers.
      if (enc_state.enc_padding_cache?.location === 'gpu-buffer') {
        enc_state.enc_padding_cache.dispose();
      }
      states.delete(this);
    }
  }
}
@@ -0,0 +1,113 @@
1
+ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
2
+ import { AutoTokenizer } from '../auto/tokenization_auto.js';
3
+ import { Processor } from '../../processing_utils.js';
4
+ import { Tensor } from '../../utils/tensor.js';
5
+ import { validate_audio_inputs } from '../../feature_extraction_utils.js';
6
+
7
// Voxtral Realtime audio config constants (from mistral_common AudioConfig)
// Number of [STREAMING_PAD] tokens' worth of silence prepended before the
// first streaming audio chunk.
const NUM_LEFT_PAD_TOKENS = 32;
// Token-level lag between audio intake and text emission — presumably the
// model's lookahead delay; see mistral_common AudioConfig for the definition.
const NUM_DELAY_TOKENS = 6;
// Number of mel frames consumed per generated text token.
const AUDIO_LENGTH_PER_TOK = 8;
// Extra buffer tokens added to the right padding in offline (non-streaming)
// mode so the model transcribes the full audio.
const OFFLINE_STREAMING_BUFFER_TOKENS = 10;

/** Token ID for [STREAMING_PAD] in the Voxtral tokenizer. */
const STREAMING_PAD_TOKEN_ID = 32;
15
+
16
/**
 * Processor for Voxtral Realtime: pads raw audio according to the
 * mistral_common AudioConfig framing rules, extracts mel features via the
 * feature extractor, and (for the first streaming chunk) builds the initial
 * `input_ids` prompt of [STREAMING_PAD] tokens.
 */
export class VoxtralRealtimeProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static feature_extractor_class = AutoFeatureExtractor;
    static uses_processor_config = false;

    /**
     * Number of mel frames in the first audio chunk.
     * Covers the delay tokens plus the first emitted token.
     * @returns {number}
     */
    get num_mel_frames_first_audio_chunk() {
        return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
    }

    /**
     * Number of raw audio samples in the first audio chunk.
     * NOTE(review): this inverts the centered-STFT framing formula
     * (hop per frame plus half-window tail) — confirm against the
     * feature extractor's frame count for edge sizes.
     * @returns {number}
     */
    get num_samples_first_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
    }

    /**
     * Number of raw audio samples per subsequent audio chunk
     * (processed with `center=false`, hence the full `n_fft` window).
     * @returns {number}
     */
    get num_samples_per_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
    }

    /**
     * Number of right-pad tokens for non-streaming mode.
     * @returns {number}
     */
    get num_right_pad_tokens() {
        return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
    }

    /**
     * Number of mel frames per text token.
     * @returns {number}
     */
    get audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK;
    }

    /**
     * Number of raw audio samples per token.
     * @returns {number}
     */
    get raw_audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
    }

    /**
     * Process audio input for VoxtralRealtime.
     *
     * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
     * with silence and mel features are extracted with `center=true`.
     * Returns `{ input_ids, input_features }`.
     *
     * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
     * processed with `center=false` and only `{ input_features }` is returned.
     *
     * In non-streaming mode, the audio is right-padded to ensure the model
     * transcribes the full audio, then processed with `center=true`.
     * Returns `{ input_features }`.
     *
     * @param {Float32Array|Float64Array} audio The audio waveform.
     * @param {Object} [options]
     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
     * @returns {Promise<Object>}
     * @throws {Error} If `is_streaming=false` is combined with `is_first_audio_chunk=false`.
     */
    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');

        if (!is_streaming && !is_first_audio_chunk) {
            throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
        }

        if (is_first_audio_chunk) {
            if (is_streaming) {
                // Streaming first chunk: left-pad audio with silence, extract mel with center=true, build input_ids
                const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
                // New Float32Array is zero-initialized, so the left region is silence.
                const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
                padded_audio.set(audio, num_left_pad_samples);

                const audio_encoding = await this.feature_extractor(padded_audio, { center: true });

                // Build input_ids: BOS + (num_left_pad_tokens + num_delay_tokens) * [STREAMING_PAD]
                const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
                const num_input_tokens = 1 + num_pad_tokens;
                const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
                input_ids_data[0] = 1n; // BOS
                const input_ids = new Tensor('int64', input_ids_data, [1, num_input_tokens]);

                return {
                    input_ids,
                    ...audio_encoding,
                };
            } else {
                // Non-streaming: right-pad audio to ensure full transcription, extract mel with center=true
                const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
                const padded_audio = new Float32Array(audio.length + right_pad_samples);
                padded_audio.set(audio);

                return await this.feature_extractor(padded_audio, { center: true });
            }
        } else {
            // Subsequent streaming chunks: extract mel with center=false
            return await this.feature_extractor(audio, { center: false });
        }
    }
}
@@ -1,7 +1,7 @@
1
1
  import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
2
2
  import { Tensor } from '../../utils/tensor.js';
3
3
  import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
- import { max } from '../../utils/maths.js';
4
+ import { logger } from '../../utils/logger.js';
5
5
 
6
6
  export class WhisperFeatureExtractor extends FeatureExtractor {
7
7
  constructor(config) {
@@ -27,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
27
27
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
28
28
  */
29
29
  async _extract_fbank_features(waveform) {
30
- const features = await spectrogram(
30
+ return await spectrogram(
31
31
  waveform,
32
32
  this.window, // window
33
33
  this.config.n_fft, // frame_length
@@ -35,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
35
35
  {
36
36
  power: 2.0,
37
37
  mel_filters: this.config.mel_filters,
38
- log_mel: 'log10',
38
+ log_mel: 'log10_max_norm',
39
39
 
40
40
  // Custom
41
41
  max_num_frames: Math.min(
@@ -44,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
44
44
  ),
45
45
  },
46
46
  );
47
-
48
- const data = features.data;
49
- const maxValue = max(/** @type {Float32Array} */ (data))[0];
50
-
51
- for (let i = 0; i < data.length; ++i) {
52
- data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
53
- }
54
-
55
- return features;
56
47
  }
57
48
 
58
49
  /**
@@ -67,7 +58,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
67
58
  const length = max_length ?? this.config.n_samples;
68
59
  if (audio.length > length) {
69
60
  if (audio.length > this.config.n_samples) {
70
- console.warn(
61
+ logger.warn(
71
62
  'Attempting to extract features for audio longer than 30 seconds. ' +
72
63
  'If using a pipeline to extract transcript from a long audio clip, ' +
73
64
  'remember to specify `chunk_length_s` and/or `stride_length_s`.',
@@ -10,6 +10,7 @@ import {
10
10
  import { medianFilter, dynamic_time_warping } from '../../utils/maths.js';
11
11
  import { mergeArrays } from '../../utils/core.js';
12
12
  import { ModelOutput } from '../modeling_outputs.js';
13
+ import { logger } from '../../utils/logger.js';
13
14
 
14
15
  export class WhisperPreTrainedModel extends PreTrainedModel {
15
16
  requires_attention_mask = false;
@@ -56,7 +57,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
56
57
  if (generation_config.is_multilingual) {
57
58
  if (!language) {
58
59
  // TODO: Implement language detection
59
- console.warn('No language specified - defaulting to English (en).');
60
+ logger.warn('No language specified - defaulting to English (en).');
60
61
  language = 'en';
61
62
  }
62
63
 
@@ -85,7 +86,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
85
86
  generation_config.return_timestamps &&
86
87
  init_tokens.at(-1) === generation_config.no_timestamps_token_id
87
88
  ) {
88
- console.warn(
89
+ logger.warn(
89
90
  '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.',
90
91
  );
91
92
  init_tokens.pop();
@@ -138,7 +139,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
138
139
  }
139
140
 
140
141
  if (generation_config.task === 'translate') {
141
- console.warn("Token-level timestamps may not be reliable for task 'translate'.");
142
+ logger.warn("Token-level timestamps may not be reliable for task 'translate'.");
142
143
  }
143
144
 
144
145
  generation_config.output_attentions = true;
@@ -185,7 +186,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
185
186
  );
186
187
  }
187
188
  if (num_frames == null) {
188
- console.warn(
189
+ logger.warn(
189
190
  '`num_frames` has not been set, meaning the entire audio will be analyzed. ' +
190
191
  'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).',
191
192
  );
@@ -194,7 +195,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
194
195
  // @ts-expect-error TS2339
195
196
  let median_filter_width = this.config.median_filter_width;
196
197
  if (median_filter_width === undefined) {
197
- console.warn('Model config has no `median_filter_width`, using default value of 7.');
198
+ logger.warn('Model config has no `median_filter_width`, using default value of 7.');
198
199
  median_filter_width = 7;
199
200
  }
200
201
 
@@ -1,11 +1,12 @@
1
1
  import { PreTrainedTokenizer } from '../../tokenization_utils.js';
2
+ import { logger } from '../../utils/logger.js';
2
3
 
3
4
  export class XLMTokenizer extends PreTrainedTokenizer {
4
5
  return_token_type_ids = true;
5
6
 
6
7
  constructor(tokenizerJSON, tokenizerConfig) {
7
8
  super(tokenizerJSON, tokenizerConfig);
8
- console.warn(
9
+ logger.warn(
9
10
  'WARNING: `XLMTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.',
10
11
  );
11
12
  }
@@ -2,6 +2,7 @@ import { Pipeline, prepareAudios } from './_base.js';
2
2
 
3
3
  import { Tensor } from '../utils/tensor.js';
4
4
  import { max, round } from '../utils/maths.js';
5
+ import { logger } from '../utils/logger.js';
5
6
 
6
7
  /**
7
8
  * @typedef {import('./_base.js').TextAudioPipelineConstructorArgs} TextAudioPipelineConstructorArgs
@@ -29,7 +30,7 @@ import { max, round } from '../utils/maths.js';
29
30
  * @property {string} [language] The source language. Default is `null`, meaning it should be auto-detected. Use this to potentially improve performance if the source language is known.
30
31
  * @property {string} [task] The task to perform. Default is `null`, meaning it should be auto-detected.
31
32
  * @property {number} [num_frames] The number of frames in the input audio.
32
- * @typedef {import('../generation/configuration_utils.js').GenerationConfig & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig
33
+ * @typedef {import('../generation/parameters.js').GenerationFunctionParameters & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig
33
34
  *
34
35
  * @callback AutomaticSpeechRecognitionPipelineCallbackSingle Transcribe the audio sequence given as inputs to text.
35
36
  * @param {AudioInput} audio The input audio file(s) to be transcribed. The input is either:
@@ -153,6 +154,8 @@ export class AutomaticSpeechRecognitionPipeline
153
154
  return this._call_wav2vec2(audio, kwargs);
154
155
  case 'moonshine':
155
156
  return this._call_moonshine(audio, kwargs);
157
+ case 'cohere_asr':
158
+ return this._call_cohere_asr(audio, kwargs);
156
159
  default:
157
160
  throw new Error(
158
161
  `AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`,
@@ -164,10 +167,10 @@ export class AutomaticSpeechRecognitionPipeline
164
167
  // TODO use kwargs
165
168
 
166
169
  if (kwargs.language) {
167
- console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
170
+ logger.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
168
171
  }
169
172
  if (kwargs.task) {
170
- console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
173
+ logger.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
171
174
  }
172
175
 
173
176
  const single = !Array.isArray(audio);
@@ -319,4 +322,45 @@ export class AutomaticSpeechRecognitionPipeline
319
322
  }
320
323
  return single ? toReturn[0] : toReturn;
321
324
  }
325
+
326
+ async _call_cohere_asr(audio, kwargs) {
327
+ const single = !Array.isArray(audio);
328
+ const batchedAudio = single ? [audio] : audio;
329
+
330
+ const feature_extractor = this.processor.feature_extractor;
331
+ const sampling_rate = feature_extractor.config.sampling_rate;
332
+ const preparedAudios = await prepareAudios(batchedAudio, sampling_rate);
333
+
334
+ const language = kwargs.language ?? 'en';
335
+ // @ts-expect-error TS2339
336
+ const decoder_input_ids = this.processor.get_decoder_prompt_ids(language);
337
+
338
+ const toReturn = [];
339
+ for (const aud of preparedAudios) {
340
+ // Split long audio at energy-based boundaries
341
+ // @ts-expect-error TS2339
342
+ const audioChunks = feature_extractor.split_audio(aud);
343
+
344
+ const chunk_texts = [];
345
+ for (const chunk of audioChunks) {
346
+ const inputs = await this.processor(chunk);
347
+
348
+ const outputs = await this.model.generate({
349
+ ...inputs,
350
+ decoder_input_ids,
351
+ ...kwargs,
352
+ });
353
+
354
+ const text = this.tokenizer
355
+ .decode(/** @type {Tensor} */ (outputs)[0].tolist(), { skip_special_tokens: true })
356
+ .trim();
357
+ chunk_texts.push(text);
358
+ }
359
+
360
+ // @ts-expect-error TS2339
361
+ const full_text = this.processor.constructor.join_chunks(chunk_texts, language);
362
+ toReturn.push({ text: full_text });
363
+ }
364
+ return single ? toReturn[0] : toReturn;
365
+ }
322
366
  }
@@ -16,7 +16,7 @@ import { Tensor } from '../utils/tensor.js';
16
16
  * @callback DocumentQuestionAnsweringPipelineCallback Answer the question given as input by using the document.
17
17
  * @param {ImageInput|ImageInput[]} image The image of the document to use.
18
18
  * @param {string} question A question to ask of the document.
19
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
19
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
20
20
  * @returns {Promise<DocumentQuestionAnsweringOutput>} An object (or array of objects) containing the answer(s).
21
21
  *
22
22
  * @typedef {TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable} DocumentQuestionAnsweringPipelineType
@@ -15,12 +15,12 @@ import { Tensor } from '../utils/tensor.js';
15
15
  *
16
16
  * @callback ImageToTextPipelineCallbackSingle Assign labels to the image passed as input.
17
17
  * @param {ImageInput} texts The image to be captioned.
18
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
18
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
19
19
  * @returns {Promise<ImageToTextOutput>} An object containing the generated text(s).
20
20
  *
21
21
  * @callback ImageToTextPipelineCallbackBatch Assign labels to the images passed as inputs.
22
22
  * @param {ImageInput[]} texts The images to be captioned.
23
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
23
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
24
24
  * @returns {Promise<ImageToTextOutput[]>} An array containing the generated text(s) for each image.
25
25
  *
26
26
  * @typedef {ImageToTextPipelineCallbackSingle & ImageToTextPipelineCallbackBatch} ImageToTextPipelineCallback