@huggingface/transformers 3.0.0-alpha.14 → 3.0.0-alpha.16

This diff compares the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (41)
  1. package/README.md +12 -6
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +678 -443
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +1107 -825
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +14 -14
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +17 -17
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +52 -52
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +699 -444
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +4 -5
  16. package/src/configs.js +16 -4
  17. package/src/env.js +4 -4
  18. package/src/models.js +151 -58
  19. package/src/pipelines.js +5 -4
  20. package/src/processors.js +313 -285
  21. package/src/tokenizers.js +111 -72
  22. package/src/utils/core.js +12 -0
  23. package/src/utils/data-structures.js +13 -11
  24. package/src/utils/hub.js +1 -1
  25. package/src/utils/maths.js +13 -4
  26. package/types/configs.d.ts +25 -3
  27. package/types/configs.d.ts.map +1 -1
  28. package/types/models.d.ts +63 -2
  29. package/types/models.d.ts.map +1 -1
  30. package/types/pipelines.d.ts.map +1 -1
  31. package/types/processors.d.ts +42 -52
  32. package/types/processors.d.ts.map +1 -1
  33. package/types/tokenizers.d.ts +23 -1
  34. package/types/tokenizers.d.ts.map +1 -1
  35. package/types/utils/core.d.ts +7 -0
  36. package/types/utils/core.d.ts.map +1 -1
  37. package/types/utils/data-structures.d.ts +6 -6
  38. package/types/utils/data-structures.d.ts.map +1 -1
  39. package/types/utils/hub.d.ts +1 -1
  40. package/types/utils/hub.d.ts.map +1 -1
  41. package/types/utils/maths.d.ts.map +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.0.0-alpha.14",
+  "version": "3.0.0-alpha.16",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -33,8 +33,7 @@
     "typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types",
     "dev": "webpack serve --no-client-overlay",
     "build": "webpack && npm run typegen",
-    "generate-tests": "python -m tests.generate_tests",
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose --maxConcurrency 1",
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
     "readme": "python ./docs/scripts/build_readme.py",
     "docs-api": "node ./docs/scripts/generate.js",
     "docs-preview": "doc-builder preview transformers.js ./docs/source/ --not_python_module",
@@ -63,8 +62,8 @@
   "homepage": "https://github.com/xenova/transformers.js#readme",
   "dependencies": {
     "@huggingface/jinja": "^0.3.0",
-    "onnxruntime-node": "1.19.0",
-    "onnxruntime-web": "1.20.0-dev.20240827-1d059b8702",
+    "onnxruntime-node": "1.19.2",
+    "onnxruntime-web": "1.20.0-dev.20240908-de7a02beef",
     "sharp": "^0.33.5"
   },
   "devDependencies": {
package/src/configs.js CHANGED
@@ -296,16 +296,23 @@ export function getKeyValueShapes(config, {
 export class PretrainedConfig {
     // NOTE: Typo in original

+    /** @type {string|null} */
+    model_type = null;
+
+    /** @type {boolean} */
+    is_encoder_decoder = false;
+
+    /** @type {number} */
     max_position_embeddings;

+    /** @type {TransformersJSConfig} */
+    'transformers.js_config';
+
     /**
      * Create a new PreTrainedTokenizer instance.
      * @param {Object} configJSON The JSON of the config.
      */
     constructor(configJSON) {
-        this.model_type = null;
-        this.is_encoder_decoder = false;
-
         Object.assign(this, configJSON);
         this.normalized_config = getNormalizedConfig(this);
     }
@@ -357,5 +364,10 @@ export class AutoConfig {
 /**
  * Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
  * @typedef {Object} TransformersJSConfig
- * @property {import('./transformers.js').DataType} [kv_cache_dtype]
+ * @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
+ * @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
+ * See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
+ * for more information.
+ * @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
+ * @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
  */
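With these new typedef fields, a model repository can declare its own runtime defaults. A minimal, illustrative sketch of such a block in config.json — the free-dimension names (`batch_size`, `sequence_length`) depend entirely on how the ONNX model was exported, so treat every value here as a placeholder:

    {
      "transformers.js_config": {
        "device": "webgpu",
        "dtype": "fp16",
        "kv_cache_dtype": "float16",
        "free_dimension_overrides": {
          "batch_size": 1,
          "sequence_length": 128
        }
      }
    }

As the getSession() changes in src/models.js below show, these values act only as fallbacks: anything the caller passes explicitly takes precedence.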
package/src/env.js CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';

-const VERSION = '3.0.0-alpha.14';
+const VERSION = '3.0.0-alpha.16';

 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -73,19 +73,19 @@ export const apis = Object.freeze({
 });

 const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
-const __dirname = RUNNING_LOCALLY
+const dirname__ = RUNNING_LOCALLY
     ? path.dirname(path.dirname(url.fileURLToPath(import.meta.url)))
     : './';

 // Only used for environments with access to file system
 const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
-    ? path.join(__dirname, '/.cache/')
+    ? path.join(dirname__, '/.cache/')
     : null;

 // Set local model path, based on available APIs
 const DEFAULT_LOCAL_MODEL_PATH = '/models/';
 const localModelPath = RUNNING_LOCALLY
-    ? path.join(__dirname, DEFAULT_LOCAL_MODEL_PATH)
+    ? path.join(dirname__, DEFAULT_LOCAL_MODEL_PATH)
     : DEFAULT_LOCAL_MODEL_PATH;

 /**
package/src/models.js CHANGED
@@ -146,7 +146,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
  * @private
  */
 async function getSession(pretrained_model_name_or_path, fileName, options) {
-    let device = options.device;
+    const custom_config = options.config?.['transformers.js_config'] ?? {};
+    let device = options.device ?? custom_config.device;
     if (device && typeof device !== 'string') {
         if (device.hasOwnProperty(fileName)) {
             device = device[fileName];
@@ -164,7 +165,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {

     // If options.dtype is specified, we use it to choose the suffix for the model file.
     // Otherwise, we use the default dtype for the device.
-    let dtype = options.dtype;
+    let dtype = options.dtype ?? custom_config.dtype;
     if (typeof dtype !== 'string') {
         if (dtype && dtype.hasOwnProperty(fileName)) {
             dtype = dtype[fileName];
@@ -191,6 +192,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     // Overwrite `executionProviders` if not specified
     session_options.executionProviders ??= executionProviders;

+    // Overwrite `freeDimensionOverrides` if specified in config and not set in session options
+    const free_dimension_overrides = custom_config.free_dimension_overrides;
+    if (free_dimension_overrides) {
+        session_options.freeDimensionOverrides ??= free_dimension_overrides;
+    } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
+        console.warn(
+            'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
+            'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
+        );
+    }

     const bufferPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options);

@@ -239,6 +250,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     /** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
     const preferredOutputLocation = {};
     for (const key in shapes) {
+        // TODO: For now, we keep encoder outputs on the CPU
+        // (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
+        if (key.includes('encoder')) continue;
         preferredOutputLocation[key] = 'gpu-buffer';
     }
     session_options.preferredOutputLocation = preferredOutputLocation;
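For callers, the resulting precedence for the options handled above is: explicit arguments first, then the repository's `transformers.js_config`, then library defaults. A hedged usage sketch — the model id and dimension names are placeholders, not a real checkpoint:

    import { AutoModel } from '@huggingface/transformers';

    const model = await AutoModel.from_pretrained('your-org/your-model', {
        device: 'webnn-gpu', // overrides any `device` set in config.json
        dtype: 'fp16',       // overrides any `dtype` set in config.json
        session_options: {
            // An explicit value here wins over `free_dimension_overrides`
            // from config.json, and also avoids the WebNN warning above.
            freeDimensionOverrides: { batch_size: 1, sequence_length: 128 },
        },
    });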
@@ -394,37 +408,6 @@ function toI64Tensor(items) {
     }
 }

-/**
- * Prepares an attention mask for a sequence of tokens based on configuration options.
- * @param {Object} self The calling object instance.
- * @param {Tensor} tokens The input tokens.
- * @returns {Tensor} The attention mask tensor.
- * @private
- */
-function prepareAttentionMask(self, tokens) {
-
-    // Prepare attention mask
-    let pad_token_id = self.config.pad_token_id ?? null;
-    let eos_token_id = self.config.eos_token_id ?? null;
-    if (isIntegralNumber(eos_token_id)) {
-        eos_token_id = [eos_token_id];
-    }
-
-    let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
-    let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
-
-    if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
-        let data = BigInt64Array.from(
-            // Note: != so that int matches bigint
-            // @ts-ignore
-            tokens.data.map(x => x != pad_token_id)
-        )
-        return new Tensor('int64', data, tokens.dims)
-    } else {
-        return ones_like(tokens);
-    }
-}
-
 /**
  * Creates a boolean tensor with a single value.
  * @param {boolean} value The value of the tensor.
@@ -695,8 +678,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
     } else {
         return decoder_prepare_inputs_for_generation(self, ...args);
     }
-
 }
+
 //////////////////////////////////////////////////

 //////////////////////////////////////////////////
@@ -1459,13 +1442,12 @@ export class PreTrainedModel extends Callable {
         // - GenerationMode.BEAM_SEARCH
         // - GenerationMode.BEAM_SAMPLE
         ////////////////////////////////////////////////////
-        let past_key_values = null;
+        let outputs;
         let attentions = {};
         while (true) {
             // prepare model inputs
             model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
-
-            const outputs = await this.forward(model_inputs);
+            outputs = await this.forward(model_inputs);

             if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
                 // Get attentions if they are present
@@ -1512,10 +1494,6 @@

             const stop = prepared_stopping_criteria(all_input_ids);
             if (stop.every(x => x)) {
-                if (generation_config.return_dict_in_generate) {
-                    // Get past key values without disposing buffers
-                    past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
-                }
                 break;
             }

@@ -1528,6 +1506,9 @@
             streamer.end();
         }

+        // Retrieve and dispose all final past key values (including encoder attentions)
+        const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
+
         // TODO: ensure all_input_ids is padded correctly...
         const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
@@ -1541,6 +1522,12 @@
                 // logits,
             }
         } else {
+            // Dispose all remaining tensors
+            for (const tensor of Object.values(outputs)) {
+                if (tensor.location === 'gpu-buffer') {
+                    tensor.dispose();
+                }
+            }
             return sequences;
         }
     }
@@ -1550,31 +1537,32 @@
      *
      * @param {Object} decoderResults The decoder results object.
      * @param {Object} pastKeyValues The previous past key values.
-     * @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
      * @returns {Object} An object containing past key values.
      */
-    getPastKeyValues(decoderResults, pastKeyValues, dispose = true) {
+    getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
         const pkvs = Object.create(null);

         for (const name in decoderResults) {
             if (name.startsWith('present')) {
                 const newName = name.replace('present', 'past_key_values');
-
-                if (pastKeyValues && name.includes('encoder')) {
-                    // Optimization introduced by optimum to reuse past key values. So, we just replace the constant
-                    // outputs with the previous past key values.
+                const is_encoder_pkv = name.includes('encoder');
+                if (is_encoder_pkv && pastKeyValues) {
+                    // Optimization introduced by optimum to reuse past key values.
+                    // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
                     // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
                     pkvs[newName] = pastKeyValues[newName];
-                } else {
-                    if (dispose && pastKeyValues) {
-                        // Free old gpu buffer
-                        const t = pastKeyValues[newName];
-                        if (t.location === 'gpu-buffer') {
-                            t.dispose();
-                        }
-                    }
+                } else { // decoder or using first encoder PKVs
                     pkvs[newName] = decoderResults[name];
                 }
+
+                if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
+                    // - Always dispose decoder PKVs
+                    // - Only dispose encoder past key values when requested (after generation)
+                    const t = pastKeyValues[newName];
+                    if (t.location === 'gpu-buffer') {
+                        t.dispose();
+                    }
+                }
             }
         }
         return pkvs;
@@ -3502,6 +3490,18 @@ export class CLIPPreTrainedModel extends PreTrainedModel { }
  */
 export class CLIPModel extends CLIPPreTrainedModel { }

+/**
+ * The text model from CLIP without any head or projection on top.
+ */
+export class CLIPTextModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Update default model file name if not provided
+        options.model_file_name ??= 'text_model';
+        return super.from_pretrained(pretrained_model_name_or_path, options);
+    }
+}
+
 /**
  * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
  *
@@ -3529,7 +3529,6 @@ export class CLIPModel extends CLIPPreTrainedModel { }
  * ```
  */
 export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
-
     /** @type {PreTrainedModel.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options = {}) {
         // Update default model file name if not provided
@@ -3538,6 +3537,18 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
     }
 }

+/**
+ * The vision model from CLIP without any head or projection on top.
+ */
+export class CLIPVisionModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Update default model file name if not provided
+        options.model_file_name ??= 'vision_model';
+        return super.from_pretrained(pretrained_model_name_or_path, options);
+    }
+}
+
 /**
  * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
  *
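The two new classes let you load just one half of a CLIP checkpoint. A usage sketch for the text side — this assumes the repository ships a separate `text_model.onnx` export, as the default `model_file_name` above implies; the model id is illustrative, and the exact output names depend on the export (a plain text encoder typically returns `last_hidden_state`):

    import { AutoTokenizer, CLIPTextModel } from '@huggingface/transformers';

    const model_id = 'Xenova/clip-vit-base-patch16';
    const tokenizer = await AutoTokenizer.from_pretrained(model_id);
    const text_model = await CLIPTextModel.from_pretrained(model_id);

    // Run only the text encoder (no projection head)
    const text_inputs = tokenizer(['a photo of a cat', 'a photo of a dog'], { padding: true, truncation: true });
    const { last_hidden_state } = await text_model(text_inputs);

`CLIPVisionModel` mirrors this for the vision tower, defaulting to `vision_model.onnx` and taking processed image inputs instead.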
@@ -4204,6 +4215,43 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class PvtPreTrainedModel extends PreTrainedModel { }
+export class PvtModel extends PvtPreTrainedModel { }
+export class PvtForImageClassification extends PvtPreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class ViTMAEPreTrainedModel extends PreTrainedModel { }
+export class ViTMAEModel extends ViTMAEPreTrainedModel { }
+//////////////////////////////////////////////////
+
+
+//////////////////////////////////////////////////
+export class ViTMSNPreTrainedModel extends PreTrainedModel { }
+export class ViTMSNModel extends ViTMSNPreTrainedModel { }
+export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class GroupViTPreTrainedModel extends PreTrainedModel { }
+export class GroupViTModel extends GroupViTPreTrainedModel { }
+//////////////////////////////////////////////////
+

 //////////////////////////////////////////////////
 export class FastViTPreTrainedModel extends PreTrainedModel { }
@@ -4616,6 +4664,11 @@ export class SapiensForDepthEstimation extends SapiensPreTrainedModel { }
 export class SapiensForNormalEstimation extends SapiensPreTrainedModel { }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class MaskFormerPreTrainedModel extends PreTrainedModel { }
+export class MaskFormerModel extends MaskFormerPreTrainedModel { }
+export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel { }
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 export class GLPNPreTrainedModel extends PreTrainedModel { }
@@ -6138,6 +6191,7 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
         return audio_values;
     }
 }
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 // MobileNetV1 models
@@ -6231,6 +6285,17 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+// Decision Transformer models
+export class DecisionTransformerPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
+ * Refer to the paper for more details: https://arxiv.org/abs/2106.01345
+ */
+export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }
+
+//////////////////////////////////////////////////

 //////////////////////////////////////////////////
 // AutoModels, used to simplify construction of PreTrainedModels
@@ -6269,7 +6334,7 @@ export class PretrainedMixin {
         session_options = {},
     } = {}) {

-        let options = {
+        const options = {
             progress_callback,
             config,
             cache_dir,
@@ -6288,7 +6353,7 @@
             throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
         }

-        for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
+        for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
             const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
             if (!modelInfo) {
                 continue; // Item not found in this mapping
@@ -6343,6 +6408,10 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['rt_detr', ['RTDetrModel', RTDetrModel]],
     ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
     ['vit', ['ViTModel', ViTModel]],
+    ['pvt', ['PvtModel', PvtModel]],
+    ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
+    ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
+    ['groupvit', ['GroupViTModel', GroupViTModel]],
     ['fastvit', ['FastViTModel', FastViTModel]],
     ['mobilevit', ['MobileViTModel', MobileViTModel]],
     ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
@@ -6365,10 +6434,14 @@
     ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
     ['efficientnet', ['EfficientNetModel', EfficientNetModel]],

+    ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]],
+
     ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]],
     ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]],
     ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]],
     ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]],
+
+    ['maskformer', ['MaskFormerModel', MaskFormerModel]],
 ]);

 const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -6553,6 +6626,8 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([

 const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['vit', ['ViTForImageClassification', ViTForImageClassification]],
+    ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
+    ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
     ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
     ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
     ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
@@ -6585,6 +6660,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
 ]);

 const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
+    // TODO: Do not add new models here
     ['detr', ['DetrForSegmentation', DetrForSegmentation]],
     ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
 ]);
@@ -6594,6 +6670,11 @@ const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
     ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
 ]);

+const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
+    ['detr', ['DetrForSegmentation', DetrForSegmentation]],
+    ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
+]);
+
 const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
     ['sam', ['SamModel', SamModel]],
 ]);
@@ -6669,6 +6750,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
     [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -6871,6 +6953,17 @@ export class AutoModelForSemanticSegmentation extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
 }

+/**
+ * Helper class which is used to instantiate pretrained universal image segmentation models with the `from_pretrained` function.
+ * The chosen model class is determined by the type specified in the model config.
+ *
+ * @example
+ * let model = await AutoModelForUniversalSegmentation.from_pretrained('hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation');
+ */
+export class AutoModelForUniversalSegmentation extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES];
+}
+
 /**
  * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
  * The chosen model class is determined by the type specified in the model config.
package/src/pipelines.js CHANGED
@@ -34,6 +34,7 @@ import {
     AutoModelForImageClassification,
     AutoModelForImageSegmentation,
     AutoModelForSemanticSegmentation,
+    AutoModelForUniversalSegmentation,
     AutoModelForObjectDetection,
     AutoModelForZeroShotObjectDetection,
     AutoModelForDocumentQuestionAnswering,
@@ -3045,7 +3046,7 @@ const SUPPORTED_TASKS = Object.freeze({
     "image-segmentation": {
         // no tokenizer
         "pipeline": ImageSegmentationPipeline,
-        "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+        "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
         "processor": AutoProcessor,
         "default": {
             // TODO: replace with original
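Because the pipeline tries each auto class in order until one accepts the checkpoint (see the loadItems changes below), MaskFormer models now resolve through `AutoModelForUniversalSegmentation` with no change to user code. An illustrative sketch — the model id and image URL are placeholders:

    import { pipeline } from '@huggingface/transformers';

    const segmenter = await pipeline('image-segmentation', 'Xenova/detr-resnet-50-panoptic');
    const output = await segmenter('https://example.com/image.jpg');
    // Result shape: [{ label, score, mask }, ...]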
@@ -3287,7 +3288,7 @@ async function loadItems(mapping, model, pretrainedOptions) {

     /**@type {Promise[]} */
     const promises = [];
-    for (let [name, cls] of mapping.entries()) {
+    for (const [name, cls] of mapping.entries()) {
         if (!cls) continue;

         /**@type {Promise} */
@@ -3295,7 +3296,7 @@
         if (Array.isArray(cls)) {
             promise = new Promise(async (resolve, reject) => {
                 let e;
-                for (let c of cls) {
+                for (const c of cls) {
                     if (c === null) {
                         // If null, we resolve it immediately, meaning the relevant
                         // class was not found, but it is optional.
@@ -3333,7 +3334,7 @@
     await Promise.all(promises);

     // Then assign to result
-    for (let [name, promise] of Object.entries(result)) {
+    for (const [name, promise] of Object.entries(result)) {
         result[name] = await promise;
     }