@huggingface/transformers 3.0.0-alpha.14 → 3.0.0-alpha.16

This diff compares the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (41)
  1. package/README.md +12 -6
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +678 -443
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +1107 -825
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +14 -14
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +17 -17
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +52 -52
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +699 -444
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +4 -5
  16. package/src/configs.js +16 -4
  17. package/src/env.js +4 -4
  18. package/src/models.js +151 -58
  19. package/src/pipelines.js +5 -4
  20. package/src/processors.js +313 -285
  21. package/src/tokenizers.js +111 -72
  22. package/src/utils/core.js +12 -0
  23. package/src/utils/data-structures.js +13 -11
  24. package/src/utils/hub.js +1 -1
  25. package/src/utils/maths.js +13 -4
  26. package/types/configs.d.ts +25 -3
  27. package/types/configs.d.ts.map +1 -1
  28. package/types/models.d.ts +63 -2
  29. package/types/models.d.ts.map +1 -1
  30. package/types/pipelines.d.ts.map +1 -1
  31. package/types/processors.d.ts +42 -52
  32. package/types/processors.d.ts.map +1 -1
  33. package/types/tokenizers.d.ts +23 -1
  34. package/types/tokenizers.d.ts.map +1 -1
  35. package/types/utils/core.d.ts +7 -0
  36. package/types/utils/core.d.ts.map +1 -1
  37. package/types/utils/data-structures.d.ts +6 -6
  38. package/types/utils/data-structures.d.ts.map +1 -1
  39. package/types/utils/hub.d.ts +1 -1
  40. package/types/utils/hub.d.ts.map +1 -1
  41. package/types/utils/maths.d.ts.map +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.0.0-alpha.14",
+  "version": "3.0.0-alpha.16",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -33,8 +33,7 @@
     "typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types",
     "dev": "webpack serve --no-client-overlay",
     "build": "webpack && npm run typegen",
-    "generate-tests": "python -m tests.generate_tests",
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose --maxConcurrency 1",
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
     "readme": "python ./docs/scripts/build_readme.py",
     "docs-api": "node ./docs/scripts/generate.js",
     "docs-preview": "doc-builder preview transformers.js ./docs/source/ --not_python_module",
@@ -63,8 +62,8 @@
   "homepage": "https://github.com/xenova/transformers.js#readme",
   "dependencies": {
     "@huggingface/jinja": "^0.3.0",
-    "onnxruntime-node": "1.19.0",
-    "onnxruntime-web": "1.20.0-dev.20240827-1d059b8702",
+    "onnxruntime-node": "1.19.2",
+    "onnxruntime-web": "1.20.0-dev.20240908-de7a02beef",
     "sharp": "^0.33.5"
   },
   "devDependencies": {
package/src/configs.js CHANGED
@@ -296,16 +296,23 @@ export function getKeyValueShapes(config, {
 export class PretrainedConfig {
     // NOTE: Typo in original

+    /** @type {string|null} */
+    model_type = null;
+
+    /** @type {boolean} */
+    is_encoder_decoder = false;
+
+    /** @type {number} */
     max_position_embeddings;

+    /** @type {TransformersJSConfig} */
+    'transformers.js_config';
+
     /**
      * Create a new PreTrainedTokenizer instance.
      * @param {Object} configJSON The JSON of the config.
      */
     constructor(configJSON) {
-        this.model_type = null;
-        this.is_encoder_decoder = false;
-
         Object.assign(this, configJSON);
         this.normalized_config = getNormalizedConfig(this);
     }
@@ -357,5 +364,10 @@ export class AutoConfig {
 /**
  * Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
  * @typedef {Object} TransformersJSConfig
- * @property {import('./transformers.js').DataType} [kv_cache_dtype]
+ * @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
+ * @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
+ * See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
+ * for more information.
+ * @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
+ * @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
  */
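With these new typedef fields, a model repository can declare its own runtime defaults. A minimal, illustrative sketch of such a block in config.json — the free-dimension names (`batch_size`, `sequence_length`) depend entirely on how the ONNX model was exported, so treat every value here as a placeholder:

    {
      "transformers.js_config": {
        "device": "webgpu",
        "dtype": "fp16",
        "kv_cache_dtype": "float16",
        "free_dimension_overrides": {
          "batch_size": 1,
          "sequence_length": 128
        }
      }
    }

As the getSession() changes in src/models.js below show, these values act only as fallbacks: anything the caller passes explicitly takes precedence.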
package/src/env.js CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';

-const VERSION = '3.0.0-alpha.14';
+const VERSION = '3.0.0-alpha.16';

 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -73,19 +73,19 @@ export const apis = Object.freeze({
 });

 const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
-const __dirname = RUNNING_LOCALLY
+const dirname__ = RUNNING_LOCALLY
     ? path.dirname(path.dirname(url.fileURLToPath(import.meta.url)))
     : './';

 // Only used for environments with access to file system
 const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
-    ? path.join(__dirname, '/.cache/')
+    ? path.join(dirname__, '/.cache/')
     : null;

 // Set local model path, based on available APIs
 const DEFAULT_LOCAL_MODEL_PATH = '/models/';
 const localModelPath = RUNNING_LOCALLY
-    ? path.join(__dirname, DEFAULT_LOCAL_MODEL_PATH)
+    ? path.join(dirname__, DEFAULT_LOCAL_MODEL_PATH)
     : DEFAULT_LOCAL_MODEL_PATH;

 /**
package/src/models.js CHANGED
@@ -146,7 +146,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
  * @private
  */
 async function getSession(pretrained_model_name_or_path, fileName, options) {
-    let device = options.device;
+    const custom_config = options.config?.['transformers.js_config'] ?? {};
+    let device = options.device ?? custom_config.device;
     if (device && typeof device !== 'string') {
         if (device.hasOwnProperty(fileName)) {
             device = device[fileName];
@@ -164,7 +165,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {

     // If options.dtype is specified, we use it to choose the suffix for the model file.
     // Otherwise, we use the default dtype for the device.
-    let dtype = options.dtype;
+    let dtype = options.dtype ?? custom_config.dtype;
     if (typeof dtype !== 'string') {
         if (dtype && dtype.hasOwnProperty(fileName)) {
             dtype = dtype[fileName];
@@ -191,6 +192,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     // Overwrite `executionProviders` if not specified
     session_options.executionProviders ??= executionProviders;

+    // Overwrite `freeDimensionOverrides` if specified in config and not set in session options
+    const free_dimension_overrides = custom_config.free_dimension_overrides;
+    if (free_dimension_overrides) {
+        session_options.freeDimensionOverrides ??= free_dimension_overrides;
+    } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
+        console.warn(
+            'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
+            'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
+        );
+    }

     const bufferPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options);

@@ -239,6 +250,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     /** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
     const preferredOutputLocation = {};
     for (const key in shapes) {
+        // TODO: For now, we keep encoder outputs on the CPU
+        // (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
+        if (key.includes('encoder')) continue;
         preferredOutputLocation[key] = 'gpu-buffer';
     }
     session_options.preferredOutputLocation = preferredOutputLocation;
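For callers, the resulting precedence for the options handled above is: explicit arguments first, then the repository's `transformers.js_config`, then library defaults. A hedged usage sketch — the model id and dimension names are placeholders, not a real checkpoint:

    import { AutoModel } from '@huggingface/transformers';

    const model = await AutoModel.from_pretrained('your-org/your-model', {
        device: 'webnn-gpu', // overrides any `device` set in config.json
        dtype: 'fp16',       // overrides any `dtype` set in config.json
        session_options: {
            // An explicit value here wins over `free_dimension_overrides`
            // from config.json, and also avoids the WebNN warning above.
            freeDimensionOverrides: { batch_size: 1, sequence_length: 128 },
        },
    });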
@@ -394,37 +408,6 @@ function toI64Tensor(items) {
     }
 }

-/**
- * Prepares an attention mask for a sequence of tokens based on configuration options.
- * @param {Object} self The calling object instance.
- * @param {Tensor} tokens The input tokens.
- * @returns {Tensor} The attention mask tensor.
- * @private
- */
-function prepareAttentionMask(self, tokens) {
-
-    // Prepare attention mask
-    let pad_token_id = self.config.pad_token_id ?? null;
-    let eos_token_id = self.config.eos_token_id ?? null;
-    if (isIntegralNumber(eos_token_id)) {
-        eos_token_id = [eos_token_id];
-    }
-
-    let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
-    let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
-
-    if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
-        let data = BigInt64Array.from(
-            // Note: != so that int matches bigint
-            // @ts-ignore
-            tokens.data.map(x => x != pad_token_id)
-        )
-        return new Tensor('int64', data, tokens.dims)
-    } else {
-        return ones_like(tokens);
-    }
-}
-
 /**
  * Creates a boolean tensor with a single value.
  * @param {boolean} value The value of the tensor.
@@ -695,8 +678,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
     } else {
         return decoder_prepare_inputs_for_generation(self, ...args);
     }
-
 }
+
 //////////////////////////////////////////////////

 //////////////////////////////////////////////////
@@ -1459,13 +1442,12 @@ export class PreTrainedModel extends Callable {
         // - GenerationMode.BEAM_SEARCH
         // - GenerationMode.BEAM_SAMPLE
         ////////////////////////////////////////////////////
-        let past_key_values = null;
+        let outputs;
         let attentions = {};
         while (true) {
             // prepare model inputs
             model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
-
-            const outputs = await this.forward(model_inputs);
+            outputs = await this.forward(model_inputs);

             if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
                 // Get attentions if they are present
@@ -1512,10 +1494,6 @@

             const stop = prepared_stopping_criteria(all_input_ids);
             if (stop.every(x => x)) {
-                if (generation_config.return_dict_in_generate) {
-                    // Get past key values without disposing buffers
-                    past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
-                }
                 break;
             }

@@ -1528,6 +1506,9 @@
             streamer.end();
         }

+        // Retrieve and dispose all final past key values (including encoder attentions)
+        const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
+
         // TODO: ensure all_input_ids is padded correctly...
         const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
@@ -1541,6 +1522,12 @@
                 // logits,
             }
         } else {
+            // Dispose all remaining tensors
+            for (const tensor of Object.values(outputs)) {
+                if (tensor.location === 'gpu-buffer') {
+                    tensor.dispose();
+                }
+            }
             return sequences;
         }
     }
@@ -1550,31 +1537,32 @@
      *
      * @param {Object} decoderResults The decoder results object.
      * @param {Object} pastKeyValues The previous past key values.
-     * @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
      * @returns {Object} An object containing past key values.
      */
-    getPastKeyValues(decoderResults, pastKeyValues, dispose = true) {
+    getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
         const pkvs = Object.create(null);

         for (const name in decoderResults) {
             if (name.startsWith('present')) {
                 const newName = name.replace('present', 'past_key_values');
-
-                if (pastKeyValues && name.includes('encoder')) {
-                    // Optimization introduced by optimum to reuse past key values. So, we just replace the constant
-                    // outputs with the previous past key values.
+                const is_encoder_pkv = name.includes('encoder');
+                if (is_encoder_pkv && pastKeyValues) {
+                    // Optimization introduced by optimum to reuse past key values.
+                    // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
                     // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
                     pkvs[newName] = pastKeyValues[newName];
-                } else {
-                    if (dispose && pastKeyValues) {
-                        // Free old gpu buffer
-                        const t = pastKeyValues[newName];
-                        if (t.location === 'gpu-buffer') {
-                            t.dispose();
-                        }
-                    }
+                } else { // decoder or using first encoder PKVs
                     pkvs[newName] = decoderResults[name];
                 }
+
+                if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
+                    // - Always dispose decoder PKVs
+                    // - Only dispose encoder past key values when requested (after generation)
+                    const t = pastKeyValues[newName];
+                    if (t.location === 'gpu-buffer') {
+                        t.dispose();
+                    }
+                }
             }
         }
         return pkvs;
@@ -3502,6 +3490,18 @@ export class CLIPPreTrainedModel extends PreTrainedModel { }
  */
 export class CLIPModel extends CLIPPreTrainedModel { }

+/**
+ * The text model from CLIP without any head or projection on top.
+ */
+export class CLIPTextModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Update default model file name if not provided
+        options.model_file_name ??= 'text_model';
+        return super.from_pretrained(pretrained_model_name_or_path, options);
+    }
+}
+
 /**
  * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
  *
@@ -3529,7 +3529,6 @@ export class CLIPModel extends CLIPPreTrainedModel { }
  * ```
  */
 export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
-
     /** @type {PreTrainedModel.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options = {}) {
         // Update default model file name if not provided
@@ -3538,6 +3537,18 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
     }
 }

+/**
+ * The vision model from CLIP without any head or projection on top.
+ */
+export class CLIPVisionModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Update default model file name if not provided
+        options.model_file_name ??= 'vision_model';
+        return super.from_pretrained(pretrained_model_name_or_path, options);
+    }
+}
+
 /**
  * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
  *
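The two new classes let you load just one half of a CLIP checkpoint. A usage sketch for the text side — this assumes the repository ships a separate `text_model.onnx` export, as the default `model_file_name` above implies; the model id is illustrative, and the exact output names depend on the export (a plain text encoder typically returns `last_hidden_state`):

    import { AutoTokenizer, CLIPTextModel } from '@huggingface/transformers';

    const model_id = 'Xenova/clip-vit-base-patch16';
    const tokenizer = await AutoTokenizer.from_pretrained(model_id);
    const text_model = await CLIPTextModel.from_pretrained(model_id);

    // Run only the text encoder (no projection head)
    const text_inputs = tokenizer(['a photo of a cat', 'a photo of a dog'], { padding: true, truncation: true });
    const { last_hidden_state } = await text_model(text_inputs);

`CLIPVisionModel` mirrors this for the vision tower, defaulting to `vision_model.onnx` and taking processed image inputs instead.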
@@ -4204,6 +4215,43 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class PvtPreTrainedModel extends PreTrainedModel { }
+export class PvtModel extends PvtPreTrainedModel { }
+export class PvtForImageClassification extends PvtPreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class ViTMAEPreTrainedModel extends PreTrainedModel { }
+export class ViTMAEModel extends ViTMAEPreTrainedModel { }
+//////////////////////////////////////////////////
+
+
+//////////////////////////////////////////////////
+export class ViTMSNPreTrainedModel extends PreTrainedModel { }
+export class ViTMSNModel extends ViTMSNPreTrainedModel { }
+export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class GroupViTPreTrainedModel extends PreTrainedModel { }
+export class GroupViTModel extends GroupViTPreTrainedModel { }
+//////////////////////////////////////////////////
+

 //////////////////////////////////////////////////
 export class FastViTPreTrainedModel extends PreTrainedModel { }
@@ -4616,6 +4664,11 @@ export class SapiensForDepthEstimation extends SapiensPreTrainedModel { }
 export class SapiensForNormalEstimation extends SapiensPreTrainedModel { }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class MaskFormerPreTrainedModel extends PreTrainedModel { }
+export class MaskFormerModel extends MaskFormerPreTrainedModel { }
+export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel { }
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 export class GLPNPreTrainedModel extends PreTrainedModel { }
@@ -6138,6 +6191,7 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
         return audio_values;
     }
 }
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 // MobileNetV1 models
@@ -6231,6 +6285,17 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+// Decision Transformer models
+export class DecisionTransformerPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
+ * Refer to the paper for more details: https://arxiv.org/abs/2106.01345
+ */
+export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }
+
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 // AutoModels, used to simplify construction of PreTrainedModels
@@ -6269,7 +6334,7 @@ export class PretrainedMixin {
         session_options = {},
     } = {}) {

-        let options = {
+        const options = {
             progress_callback,
             config,
             cache_dir,
@@ -6288,7 +6353,7 @@
             throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
         }

-        for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
+        for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
             const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
             if (!modelInfo) {
                 continue; // Item not found in this mapping
@@ -6343,6 +6408,10 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['rt_detr', ['RTDetrModel', RTDetrModel]],
     ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
     ['vit', ['ViTModel', ViTModel]],
+    ['pvt', ['PvtModel', PvtModel]],
+    ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
+    ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
+    ['groupvit', ['GroupViTModel', GroupViTModel]],
     ['fastvit', ['FastViTModel', FastViTModel]],
     ['mobilevit', ['MobileViTModel', MobileViTModel]],
     ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
@@ -6365,10 +6434,14 @@
     ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
     ['efficientnet', ['EfficientNetModel', EfficientNetModel]],

+    ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]],
+
     ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]],
     ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]],
     ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]],
     ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]],
+
+    ['maskformer', ['MaskFormerModel', MaskFormerModel]],
 ]);

 const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -6553,6 +6626,8 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([

 const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['vit', ['ViTForImageClassification', ViTForImageClassification]],
+    ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
+    ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
     ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
     ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
     ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
@@ -6585,6 +6660,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
 ]);

 const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
+    // TODO: Do not add new models here
     ['detr', ['DetrForSegmentation', DetrForSegmentation]],
     ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
 ]);
@@ -6594,6 +6670,11 @@ const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
     ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
 ]);

+const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
+    ['detr', ['DetrForSegmentation', DetrForSegmentation]],
+    ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
+]);
+
 const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
     ['sam', ['SamModel', SamModel]],
 ]);
@@ -6669,6 +6750,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
     [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -6871,6 +6953,17 @@ export class AutoModelForSemanticSegmentation extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
 }

+/**
+ * Helper class which is used to instantiate pretrained universal image segmentation models with the `from_pretrained` function.
+ * The chosen model class is determined by the type specified in the model config.
+ *
+ * @example
+ * let model = await AutoModelForUniversalSegmentation.from_pretrained('hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation');
+ */
+export class AutoModelForUniversalSegmentation extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES];
+}
+
 /**
  * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
  * The chosen model class is determined by the type specified in the model config.
package/src/pipelines.js CHANGED
@@ -34,6 +34,7 @@ import {
     AutoModelForImageClassification,
     AutoModelForImageSegmentation,
     AutoModelForSemanticSegmentation,
+    AutoModelForUniversalSegmentation,
     AutoModelForObjectDetection,
     AutoModelForZeroShotObjectDetection,
     AutoModelForDocumentQuestionAnswering,
@@ -3045,7 +3046,7 @@ const SUPPORTED_TASKS = Object.freeze({
     "image-segmentation": {
         // no tokenizer
         "pipeline": ImageSegmentationPipeline,
-        "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+        "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
         "processor": AutoProcessor,
         "default": {
             // TODO: replace with original
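Because the pipeline tries each auto class in order until one accepts the checkpoint (see the loadItems changes below), MaskFormer models now resolve through `AutoModelForUniversalSegmentation` with no change to user code. An illustrative sketch — the model id and image URL are placeholders:

    import { pipeline } from '@huggingface/transformers';

    const segmenter = await pipeline('image-segmentation', 'Xenova/detr-resnet-50-panoptic');
    const output = await segmenter('https://example.com/image.jpg');
    // Result shape: [{ label, score, mask }, ...]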
@@ -3287,7 +3288,7 @@ async function loadItems(mapping, model, pretrainedOptions) {

     /**@type {Promise[]} */
     const promises = [];
-    for (let [name, cls] of mapping.entries()) {
+    for (const [name, cls] of mapping.entries()) {
         if (!cls) continue;

         /**@type {Promise} */
@@ -3295,7 +3296,7 @@
         if (Array.isArray(cls)) {
             promise = new Promise(async (resolve, reject) => {
                 let e;
-                for (let c of cls) {
+                for (const c of cls) {
                     if (c === null) {
                         // If null, we resolve it immediately, meaning the relevant
                         // class was not found, but it is optional.
@@ -3333,7 +3334,7 @@
     await Promise.all(promises);

     // Then assign to result
-    for (let [name, promise] of Object.entries(result)) {
+    for (const [name, promise] of Object.entries(result)) {
         result[name] = await promise;
     }