@huggingface/transformers 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. package/README.md +14 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16607 -13472
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +16601 -13451
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +238 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +229 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +240 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16017 -12878
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +7 -7
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -3
  20. package/src/env.js +15 -4
  21. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  22. package/src/models/auto/feature_extraction_auto.js +41 -0
  23. package/src/models/auto/image_processing_auto.js +29 -0
  24. package/src/models/auto/processing_auto.js +100 -0
  25. package/src/models/beit/image_processing_beit.js +5 -0
  26. package/src/models/bit/image_processing_bit.js +5 -0
  27. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  28. package/src/models/clap/feature_extraction_clap.js +159 -0
  29. package/src/models/clip/image_processing_clip.js +6 -0
  30. package/src/models/convnext/image_processing_convnext.js +45 -0
  31. package/src/models/deit/image_processing_deit.js +6 -0
  32. package/src/models/detr/image_processing_detr.js +52 -0
  33. package/src/models/donut/image_processing_donut.js +31 -0
  34. package/src/models/dpt/image_processing_dpt.js +6 -0
  35. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  36. package/src/models/feature_extractors.js +12 -0
  37. package/src/models/florence2/processing_florence2.js +128 -0
  38. package/src/models/glpn/image_processing_glpn.js +5 -0
  39. package/src/models/image_processors.js +36 -0
  40. package/src/models/janus/image_processing_janus.js +26 -0
  41. package/src/models/janus/processing_janus.js +123 -0
  42. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  43. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  44. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  45. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  46. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  47. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  48. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  49. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  50. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  51. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  52. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  53. package/src/models/nougat/image_processing_nougat.js +5 -0
  54. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  55. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  56. package/src/models/owlvit/processing_owlvit.js +7 -0
  57. package/src/models/processors.js +11 -0
  58. package/src/models/pvt/image_processing_pvt.js +5 -0
  59. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  60. package/src/models/pyannote/processing_pyannote.js +71 -0
  61. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  62. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  63. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  64. package/src/models/sam/image_processing_sam.js +242 -0
  65. package/src/models/sam/processing_sam.js +20 -0
  66. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  67. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  68. package/src/models/segformer/image_processing_segformer.js +13 -0
  69. package/src/models/siglip/image_processing_siglip.js +5 -0
  70. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  71. package/src/models/speecht5/processing_speecht5.js +17 -0
  72. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  73. package/src/models/vit/image_processing_vit.js +7 -0
  74. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  75. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  76. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  77. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  78. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  79. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  80. package/src/models/whisper/processing_whisper.js +21 -0
  81. package/src/models/yolos/image_processing_yolos.js +12 -0
  82. package/src/models.js +695 -32
  83. package/src/pipelines.js +8 -8
  84. package/src/tokenizers.js +5 -0
  85. package/src/transformers.js +15 -2
  86. package/src/utils/constants.js +8 -1
  87. package/src/utils/core.js +37 -9
  88. package/src/utils/hub.js +2 -1
  89. package/src/utils/image.js +68 -17
  90. package/src/utils/tensor.js +33 -1
  91. package/types/base/feature_extraction_utils.d.ts +41 -0
  92. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  93. package/types/base/image_processors_utils.d.ts +323 -0
  94. package/types/base/image_processors_utils.d.ts.map +1 -0
  95. package/types/base/processing_utils.d.ts +80 -0
  96. package/types/base/processing_utils.d.ts.map +1 -0
  97. package/types/configs.d.ts +4 -1
  98. package/types/configs.d.ts.map +1 -1
  99. package/types/env.d.ts.map +1 -1
  100. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  101. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  102. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  103. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  104. package/types/models/auto/image_processing_auto.d.ts +5 -0
  105. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  106. package/types/models/auto/processing_auto.d.ts +35 -0
  107. package/types/models/auto/processing_auto.d.ts.map +1 -0
  108. package/types/models/beit/image_processing_beit.d.ts +4 -0
  109. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  110. package/types/models/bit/image_processing_bit.d.ts +4 -0
  111. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  112. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  113. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  114. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  115. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  116. package/types/models/clip/image_processing_clip.d.ts +6 -0
  117. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  118. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  119. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  120. package/types/models/deit/image_processing_deit.d.ts +6 -0
  121. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  122. package/types/models/detr/image_processing_detr.d.ts +42 -0
  123. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  124. package/types/models/donut/image_processing_donut.d.ts +7 -0
  125. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  126. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  127. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  128. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  129. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  130. package/types/models/feature_extractors.d.ts +10 -0
  131. package/types/models/feature_extractors.d.ts.map +1 -0
  132. package/types/models/florence2/processing_florence2.d.ts +39 -0
  133. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  134. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  135. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  136. package/types/models/image_processors.d.ts +36 -0
  137. package/types/models/image_processors.d.ts.map +1 -0
  138. package/types/models/janus/image_processing_janus.d.ts +7 -0
  139. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  140. package/types/models/janus/processing_janus.d.ts +77 -0
  141. package/types/models/janus/processing_janus.d.ts.map +1 -0
  142. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  143. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  144. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  145. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  146. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  147. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  148. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  149. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  150. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  151. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  152. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  153. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  154. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  155. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  156. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  157. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  158. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  159. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  160. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  161. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  162. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  163. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  164. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  165. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  166. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  167. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  168. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  169. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  170. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  171. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  172. package/types/models/processors.d.ts +12 -0
  173. package/types/models/processors.d.ts.map +1 -0
  174. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  175. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  176. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  177. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  178. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  179. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  180. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  181. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  182. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  183. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  184. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  185. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  186. package/types/models/sam/image_processing_sam.d.ts +103 -0
  187. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  188. package/types/models/sam/processing_sam.d.ts +9 -0
  189. package/types/models/sam/processing_sam.d.ts.map +1 -0
  190. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  191. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  192. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  193. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  194. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  195. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  196. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  197. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  198. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  199. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  200. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  201. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  202. package/types/models/vit/image_processing_vit.d.ts +6 -0
  203. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  204. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  205. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  206. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  207. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  208. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  209. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  210. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  211. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  212. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  213. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  214. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  215. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  216. package/types/models/whisper/processing_whisper.d.ts +17 -0
  217. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  218. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  219. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  220. package/types/models.d.ts +152 -0
  221. package/types/models.d.ts.map +1 -1
  222. package/types/pipelines.d.ts +2 -3
  223. package/types/pipelines.d.ts.map +1 -1
  224. package/types/tokenizers.d.ts +3 -0
  225. package/types/tokenizers.d.ts.map +1 -1
  226. package/types/transformers.d.ts +10 -1
  227. package/types/utils/constants.d.ts +6 -0
  228. package/types/utils/constants.d.ts.map +1 -1
  229. package/types/utils/core.d.ts +58 -3
  230. package/types/utils/core.d.ts.map +1 -1
  231. package/types/utils/hub.d.ts +1 -1
  232. package/types/utils/hub.d.ts.map +1 -1
  233. package/types/utils/image.d.ts +10 -2
  234. package/types/utils/image.d.ts.map +1 -1
  235. package/types/utils/tensor.d.ts +34 -1
  236. package/types/utils/tensor.d.ts.map +1 -1
  237. package/src/processors.js +0 -2655
  238. package/types/processors.d.ts +0 -924
  239. package/types/processors.d.ts.map +0 -1
@@ -0,0 +1,26 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for JinaCLIP models.
 *
 * JinaCLIP ships a custom `preprocessor_config.json`, so this class translates
 * its fields (`resize_mode`, `interpolation`, scalar `size`, ...) into the
 * options understood by the base `ImageProcessor` before delegating to it.
 */
export class JinaCLIPImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config The raw JinaCLIP preprocessor configuration.
     */
    constructor(config) {
        const { resize_mode, fill_color, interpolation, size, ...rest } = config;

        // Map the JinaCLIP resize mode onto the base processor's size schema.
        let mapped_size;
        if (resize_mode === 'squash') {
            mapped_size = { width: size, height: size };
        } else if (resize_mode === 'shortest') {
            mapped_size = { shortest_edge: size };
        } else {
            mapped_size = { longest_edge: size };
        }

        // 3 = bicubic, 2 = bilinear (PIL-style resampling codes).
        const resample = interpolation === 'bicubic' ? 3 : 2;

        super({
            ...rest,
            size: mapped_size,
            resample,
            do_center_crop: true,
            crop_size: size,
            do_normalize: true,
        });
    }
}
@@ -0,0 +1,24 @@
1
+
2
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";

/**
 * Processor for JinaCLIP: tokenizes text and/or preprocesses images, then
 * merges both result dictionaries into a single model-input object.
 */
export class JinaCLIPProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    /**
     * @param {string|string[]|null} text Text input(s) to tokenize.
     * @param {any} images Image input(s) to preprocess.
     * @param {Object} kwargs Extra options forwarded to both sub-processors.
     * @returns {Promise<Object>} Combined tokenizer and image-processor outputs.
     */
    async _call(text = null, images = null, kwargs = {}) {
        if (!text && !images) {
            throw new Error('Either text or images must be provided');
        }

        let text_inputs = {};
        if (text) {
            text_inputs = this.tokenizer(text, kwargs);
        }

        let image_inputs = {};
        if (images) {
            image_inputs = await this.image_processor(images, kwargs);
        }

        return { ...text_inputs, ...image_inputs };
    }
}
@@ -0,0 +1,5 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// LLaVA-OneVision uses the default image-preprocessing pipeline unchanged;
// this subclass only supplies the model-specific class name for auto-loading.
export class LlavaOnevisionImageProcessor extends ImageProcessor {}
@@ -0,0 +1,5 @@
1
+
2

import { MaskFormerImageProcessor } from "../maskformer/image_processing_maskformer.js";

// NOTE: extends MaskFormerImageProcessor — Mask2Former reuses MaskFormer's
// preprocessing and segmentation post-processing wholesale.
export class Mask2FormerImageProcessor extends MaskFormerImageProcessor { }
@@ -0,0 +1,18 @@
1
import {
    ImageProcessor,
    post_process_panoptic_segmentation,
    post_process_instance_segmentation,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for MaskFormer. In addition to the standard preprocessing,
 * it exposes the shared segmentation post-processing helpers as instance
 * methods so callers can invoke them directly on the processor object.
 */
export class MaskFormerImageProcessor extends ImageProcessor {

    /** @type {typeof post_process_panoptic_segmentation} */
    post_process_panoptic_segmentation(...args) {
        return post_process_panoptic_segmentation(...args);
    }
    /** @type {typeof post_process_instance_segmentation} */
    post_process_instance_segmentation(...args) {
        return post_process_instance_segmentation(...args);
    }
}
// Alias — presumably kept so configs referencing the legacy
// "MaskFormerFeatureExtractor" name keep working; verify against processing_auto.
export class MaskFormerFeatureExtractor extends MaskFormerImageProcessor { }
@@ -0,0 +1,170 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { max, softmax } from "../../utils/maths.js";

// Maps a prediction format to [decode-method name, EOS token id for that stream].
const DECODE_TYPE_MAPPING = {
    'char': ['char_decode', 1],
    'bpe': ['bpe_decode', 2],
    'wp': ['wp_decode', 102],
}

/**
 * Processor for MGP-STR (scene-text recognition). The model emits three
 * parallel logit streams — character, BPE, and WordPiece — which are decoded
 * independently and fused by picking, per sample, the prediction with the
 * highest confidence.
 */
export class MgpstrProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    /**
     * @returns {import('../../tokenizers.js').MgpstrTokenizer} The character tokenizer.
     */
    get char_tokenizer() {
        return this.components.char_tokenizer;
    }

    /**
     * @returns {import('../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer.
     */
    get bpe_tokenizer() {
        return this.components.bpe_tokenizer;
    }

    /**
     * @returns {import('../../tokenizers.js').BertTokenizer} The WordPiece tokenizer.
     */
    get wp_tokenizer() {
        return this.components.wp_tokenizer;
    }

    /**
     * Helper function to decode the model prediction logits.
     * @param {import('../../utils/tensor.js').Tensor} pred_logits Model prediction logits.
     * @param {string} format Type of model prediction. Must be one of ['char', 'bpe', 'wp'].
     * @returns {[string[], number[]]} The decoded sentences and their confidence scores.
     */
    _decode_helper(pred_logits, format) {
        if (!Object.hasOwn(DECODE_TYPE_MAPPING, format)) {
            throw new Error(`Format ${format} is not supported.`);
        }

        const [decoder_name, eos_token] = DECODE_TYPE_MAPPING[format];
        const decode_fn = this[decoder_name].bind(this);

        const [batch_size, batch_max_length] = pred_logits.dims;
        const all_ids = [];
        const conf_scores = [];

        /** @type {number[][][]} */
        const logits_list = pred_logits.tolist();
        for (let b = 0; b < batch_size; ++b) {
            const sample_logits = logits_list[b];
            const token_ids = [];
            const token_scores = [];

            // Start at index 1 to skip the first token.
            for (let t = 1; t < batch_max_length; ++t) {
                const [prob, index] = max(softmax(sample_logits[t]));
                // The EOS score is still counted towards the confidence product.
                token_scores.push(prob);
                // NOTE: == to match bigint and number
                if (index == eos_token) {
                    break;
                }
                token_ids.push(index);
            }

            // Confidence is the product of per-token max probabilities.
            const confidence = token_scores.length > 0
                ? token_scores.reduce((acc, s) => acc * s, 1)
                : 0;

            all_ids.push(token_ids);
            conf_scores.push(confidence);
        }

        return [decode_fn(all_ids), conf_scores];
    }

    /**
     * Convert a list of lists of char token ids into a list of strings by calling char tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of char decoded sentences.
     */
    char_decode(sequences) {
        const decoded = this.char_tokenizer.batch_decode(sequences);
        return decoded.map(str => str.replaceAll(' ', ''));
    }

    /**
     * Convert a list of lists of BPE token ids into a list of strings by calling BPE tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of BPE decoded sentences.
     */
    bpe_decode(sequences) {
        return this.bpe_tokenizer.batch_decode(sequences)
    }

    /**
     * Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer.
     * @param {number[][]} sequences List of tokenized input ids.
     * @returns {string[]} The list of wp decoded sentences.
     */
    wp_decode(sequences) {
        const decoded = this.wp_tokenizer.batch_decode(sequences);
        return decoded.map(str => str.replaceAll(' ', ''));
    }

    /**
     * Convert a list of lists of token ids into a list of strings by calling decode.
     * @param {import('../../utils/tensor.js').Tensor[]} sequences List of tokenized input ids.
     * @returns {{generated_text: string[], scores: number[], char_preds: string[], bpe_preds: string[], wp_preds: string[]}}
     * Dictionary of all the outputs of the decoded results.
     * - generated_text: The final results after fusion of char, bpe, and wp.
     * - scores: The final scores after fusion of char, bpe, and wp.
     * - char_preds: The list of character decoded sentences.
     * - bpe_preds: The list of BPE decoded sentences.
     * - wp_preds: The list of wp decoded sentences.
     */
    batch_decode([char_logits, bpe_logits, wp_logits]) {
        const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
        const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
        const [wp_preds, wp_scores] = this._decode_helper(wp_logits, 'wp');

        const generated_text = [];
        const scores = [];
        for (let i = 0; i < char_preds.length; ++i) {
            // Per sample, keep whichever stream decoded with the highest confidence.
            const candidates = [char_preds[i], bpe_preds[i], wp_preds[i]];
            const [best_score, best_index] = max([char_scores[i], bpe_scores[i], wp_scores[i]]);
            generated_text.push(candidates[best_index]);
            scores.push(best_score);
        }

        return {
            generated_text,
            scores,
            char_preds,
            bpe_preds,
            wp_preds,
        }
    }

    /** @type {typeof Processor.from_pretrained} */
    static async from_pretrained(...args) {
        const processor = await super.from_pretrained(...args);

        // Load Transformers.js-compatible versions of the BPE and WordPiece tokenizers
        const bpe_tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2") // openai-community/gpt2
        const wp_tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased") // google-bert/bert-base-uncased

        // Rewire components: the loaded tokenizer becomes the char tokenizer.
        processor.components = {
            image_processor: processor.image_processor,
            char_tokenizer: processor.tokenizer,
            bpe_tokenizer,
            wp_tokenizer,
        }
        return processor;
    }

    /**
     * Preprocess images and, optionally, tokenize label text.
     * @param {any} images Image input(s) to preprocess.
     * @param {string|string[]|null} text Optional label text.
     * @returns {Promise<any>} Image-processor outputs, with `labels` added when text is given.
     */
    async _call(images, text = null) {
        const result = await this.image_processor(images);

        if (text) {
            result.labels = this.tokenizer(text).input_ids
        }

        return result;
    }
}
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV1 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV1ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV1FeatureExtractor extends MobileNetV1ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV2 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV2ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV2FeatureExtractor extends MobileNetV2ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV3 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV3ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV3FeatureExtractor extends MobileNetV3ImageProcessor { }
@@ -0,0 +1,7 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";


// MobileNetV4 uses the default preprocessing pipeline; only the class name differs.
export class MobileNetV4ImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileNetV4FeatureExtractor extends MobileNetV4ImageProcessor { }
@@ -0,0 +1,6 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// MobileViT uses the default preprocessing pipeline; only the class name differs.
export class MobileViTImageProcessor extends ImageProcessor { }
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class MobileViTFeatureExtractor extends MobileViTImageProcessor { }
@@ -0,0 +1,5 @@
1
+
2

import { DonutImageProcessor } from "../donut/image_processing_donut.js";

// NOTE: extends DonutImageProcessor — Nougat reuses Donut's preprocessing unchanged.
export class NougatImageProcessor extends DonutImageProcessor { }
@@ -0,0 +1,5 @@
1
+
2

import { OwlViTImageProcessor } from "../owlvit/image_processing_owlvit.js";

// NOTE: extends OwlViTImageProcessor — OWLv2 reuses OWL-ViT's preprocessing
// and object-detection post-processing unchanged.
export class Owlv2ImageProcessor extends OwlViTImageProcessor { }
@@ -0,0 +1,12 @@
1
import {
    ImageProcessor,
    post_process_object_detection,
} from "../../base/image_processors_utils.js";

/**
 * Image processor for OWL-ViT. Exposes the shared object-detection
 * post-processing helper as an instance method.
 */
export class OwlViTImageProcessor extends ImageProcessor {
    /** @type {typeof post_process_object_detection} */
    post_process_object_detection(...args) {
        return post_process_object_detection(...args);
    }
}
// Alias — presumably kept for configs that still reference the legacy *FeatureExtractor name.
export class OwlViTFeatureExtractor extends OwlViTImageProcessor { }
@@ -0,0 +1,7 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
// OWL-ViT pairs a tokenizer with an image processor; call behavior is
// inherited unchanged from the base `Processor`.
export class OwlViTProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor
}
@@ -0,0 +1,11 @@
1
// Barrel module re-exporting every model-specific processor class.
export * from './florence2/processing_florence2.js';
export * from './mgp_str/processing_mgp_str.js';
export * from './janus/processing_janus.js';
export * from './jina_clip/processing_jina_clip.js';
export * from './owlvit/processing_owlvit.js';
export * from './pyannote/processing_pyannote.js';
export * from './qwen2_vl/processing_qwen2_vl.js';
export * from './sam/processing_sam.js';
export * from './speecht5/processing_speecht5.js';
export * from './wav2vec2/processing_wav2vec2.js';
export * from './whisper/processing_whisper.js';
@@ -0,0 +1,5 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

// PVT uses the default preprocessing pipeline; only the class name differs.
export class PvtImageProcessor extends ImageProcessor { }
@@ -0,0 +1,28 @@
1
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
import { Tensor } from '../../utils/tensor.js';


/**
 * Feature extractor for PyAnnote speaker-diarization models. The raw
 * waveform is passed through unchanged, wrapped into a
 * (batch_size=1, num_channels=1, num_samples) float32 tensor.
 */
export class PyAnnoteFeatureExtractor extends FeatureExtractor {
    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_values: Tensor; }>} The extracted input features.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'PyAnnoteFeatureExtractor');

        // The tensor is float32; down-convert 64-bit input first.
        const samples = audio instanceof Float64Array
            ? new Float32Array(audio)
            : audio;

        const input_values = new Tensor(
            'float32',
            samples,
            [1, 1, samples.length], // [batch_size, num_channels, num_samples]
        );
        return { input_values };
    }

}
@@ -0,0 +1,71 @@
1
import { Processor } from '../../base/processing_utils.js';
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
import { max, softmax } from '../../utils/maths.js';

/**
 * Processor for PyAnnote speaker-diarization models. Wraps the feature
 * extractor and converts frame-level logits into per-speaker time segments.
 */
export class PyAnnoteProcessor extends Processor {
    static feature_extractor_class = AutoFeatureExtractor

    /**
     * Calls the feature_extractor function with the given audio input.
     * @param {any} audio The audio input to extract features from.
     * @returns {Promise<any>} A Promise that resolves with the extracted features.
     */
    async _call(audio) {
        return await this.feature_extractor(audio)
    }

    /**
     * Converts a sample count to the model's output-frame count.
     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
     * @param {number} samples The number of audio samples.
     * @returns {number} The (possibly fractional) number of frames.
     */
    samples_to_frames(samples) {
        const { offset, step } = this.config;
        return (samples - offset) / step;
    }

    /**
     * Post-processes the speaker diarization logits output by the model.
     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
     * @param {number} num_samples Number of samples in the input audio.
     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
     */
    post_process_speaker_diarization(logits, num_samples) {
        // Seconds of audio represented by a single output frame.
        const ratio = (
            num_samples / this.samples_to_frames(num_samples)
        ) / this.config.sampling_rate;

        const results = [];
        for (const scores of logits.tolist()) {
            /** @type {Array<{ id: number, start: number, end: number, score: number }>} */
            const segments = [];
            let active_speaker = -1;

            for (let frame = 0; frame < scores.length; ++frame) {
                const [score, id] = max(softmax(scores[frame]));

                if (id === active_speaker) {
                    // Same speaker: extend the open segment and accumulate its score.
                    const last = segments[segments.length - 1];
                    last.end = frame + 1;
                    last.score += score;
                } else {
                    // Speaker changed: open a new one-frame segment.
                    active_speaker = id;
                    segments.push({ id, start: frame, end: frame + 1, score });
                }
            }

            // Convert frame-space to time-space and average the accumulated
            // score over the segment length to get a confidence.
            results.push(segments.map(({ id, start, end, score }) => ({
                id,
                start: start * ratio,
                end: end * ratio,
                confidence: score / (end - start),
            })));
        }
        return results;
    }
}
@@ -0,0 +1,52 @@
1
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";
import { cat, Tensor } from "../../utils/tensor.js";

/**
 * Image processor for Qwen2-VL. After standard preprocessing, pixel values
 * are re-arranged into flattened spatio-temporal patches, returned together
 * with an `image_grid_thw` tensor describing the (temporal, height, width)
 * patch grid.
 */
export class Qwen2VLImageProcessor extends ImageProcessor {
    async _call(images, ...args) {
        const preprocessed = await super._call(images, ...args);
        const { original_sizes, reshaped_input_sizes } = preprocessed;

        // @ts-ignore
        const { temporal_patch_size, merge_size, patch_size } = this.config;

        let patches = preprocessed.pixel_values;
        if (patches.dims[0] === 1) {
            // Single frame: repeat along the temporal axis.
            // Equivalent to np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
            const copies = Array.from({ length: temporal_patch_size }, () => patches);
            patches = cat(copies, 0);
        }

        const [frames, channel, height, width] = patches.dims;
        const grid_t = frames / temporal_patch_size;
        const grid_h = Math.floor(height / patch_size);
        const grid_w = Math.floor(width / patch_size);

        // Split the image into patches and flatten each patch's features.
        const flatten_patches = patches
            .view(
                grid_t,
                temporal_patch_size,
                channel,
                Math.floor(grid_h / merge_size),
                merge_size,
                patch_size,
                Math.floor(grid_w / merge_size),
                merge_size,
                patch_size,
            )
            .permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
            .view(
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

        const image_grid_thw = new Tensor('int64', [grid_t, grid_h, grid_w], [1, 3]);

        return {
            pixel_values: flatten_patches,
            image_grid_thw,
            original_sizes,
            reshaped_input_sizes,
        }
    }
}

@@ -0,0 +1,52 @@
1
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";

/**
 * Processor for Qwen2-VL. Preprocesses images and expands each
 * `<|image_pad|>` placeholder in the text into one pad token per merged
 * visual patch before tokenizing.
 */
export class Qwen2VLProcessor extends Processor {
    static image_processor_class = AutoImageProcessor
    static tokenizer_class = AutoTokenizer

    /**
     *
     * @param {string|string[]} text
     * @param {RawImage|RawImage[]} images
     * @param {...any} args
     * @returns {Promise<any>}
     */
    async _call(text, images = null, ...args) {
        const texts = Array.isArray(text) ? text : [text];

        let image_inputs;
        let image_grid_thw;
        if (images) {
            image_inputs = await this.image_processor(images);
            image_grid_thw = image_inputs.image_grid_thw;
        }

        let expanded = texts;
        if (image_grid_thw) {
            // Number of patches that get merged into one visual token.
            const merge_length = this.image_processor.config.merge_size ** 2;
            const grids = image_grid_thw.tolist();
            let grid_index = 0;

            expanded = texts.map((t) => {
                while (t.includes("<|image_pad|>")) {
                    // Grid entries are bigint: total patches = t * h * w.
                    const num_patches = Number(grids[grid_index++].reduce((a, b) => a * b, 1n));
                    const num_tokens = Math.floor(num_patches / merge_length);
                    // Use a temporary marker so already-expanded pads are not re-matched.
                    t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(num_tokens));
                }
                return t.replaceAll("<|placeholder|>", "<|image_pad|>");
            });
        }

        const text_inputs = this.tokenizer(expanded);

        return {
            ...text_inputs,
            ...image_inputs,
            // TODO: ...videos_inputs,
        }
    }
}
@@ -0,0 +1,12 @@
1
+ import {
2
+ ImageProcessor,
3
+ post_process_object_detection,
4
+ } from "../../base/image_processors_utils.js";
5
+
6
+
7
+ export class RTDetrImageProcessor extends ImageProcessor {
8
+ /** @type {typeof post_process_object_detection} */
9
+ post_process_object_detection(...args) {
10
+ return post_process_object_detection(...args);
11
+ }
12
+ }