@huggingface/transformers 3.0.0-alpha.14 → 3.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3779,7 +3779,7 @@ __webpack_require__.r(__webpack_exports__);
3779
3779
  // Licensed under the MIT License.
3780
3780
  // This file is generated by /js/scripts/update-version.ts
3781
3781
  // Do not modify file content manually.
3782
- const version = '1.19.0';
3782
+ const version = '1.19.2';
3783
3783
  //# sourceMappingURL=version.js.map
3784
3784
 
3785
3785
  /***/ }),
@@ -4437,7 +4437,7 @@ __webpack_require__.r(__webpack_exports__);
4437
4437
 
4438
4438
 
4439
4439
 
4440
- const VERSION = '3.0.0-alpha.14';
4440
+ const VERSION = '3.0.0-alpha.15';
4441
4441
 
4442
4442
  // Check if various APIs are available (depends on environment)
4443
4443
  const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -4484,19 +4484,19 @@ const apis = Object.freeze({
4484
4484
  });
4485
4485
 
4486
4486
  const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
4487
- const __dirname = RUNNING_LOCALLY
4487
+ const dirname__ = RUNNING_LOCALLY
4488
4488
  ? path__WEBPACK_IMPORTED_MODULE_1__.dirname(path__WEBPACK_IMPORTED_MODULE_1__.dirname(url__WEBPACK_IMPORTED_MODULE_2__.fileURLToPath("file:///workspaces/transformers.js/src/env.js")))
4489
4489
  : './';
4490
4490
 
4491
4491
  // Only used for environments with access to file system
4492
4492
  const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
4493
- ? path__WEBPACK_IMPORTED_MODULE_1__.join(__dirname, '/.cache/')
4493
+ ? path__WEBPACK_IMPORTED_MODULE_1__.join(dirname__, '/.cache/')
4494
4494
  : null;
4495
4495
 
4496
4496
  // Set local model path, based on available APIs
4497
4497
  const DEFAULT_LOCAL_MODEL_PATH = '/models/';
4498
4498
  const localModelPath = RUNNING_LOCALLY
4499
- ? path__WEBPACK_IMPORTED_MODULE_1__.join(__dirname, DEFAULT_LOCAL_MODEL_PATH)
4499
+ ? path__WEBPACK_IMPORTED_MODULE_1__.join(dirname__, DEFAULT_LOCAL_MODEL_PATH)
4500
4500
  : DEFAULT_LOCAL_MODEL_PATH;
4501
4501
 
4502
4502
  /**
@@ -6382,6 +6382,7 @@ __webpack_require__.r(__webpack_exports__);
6382
6382
  /* harmony export */ AutoModelForTextToSpectrogram: () => (/* binding */ AutoModelForTextToSpectrogram),
6383
6383
  /* harmony export */ AutoModelForTextToWaveform: () => (/* binding */ AutoModelForTextToWaveform),
6384
6384
  /* harmony export */ AutoModelForTokenClassification: () => (/* binding */ AutoModelForTokenClassification),
6385
+ /* harmony export */ AutoModelForUniversalSegmentation: () => (/* binding */ AutoModelForUniversalSegmentation),
6385
6386
  /* harmony export */ AutoModelForVision2Seq: () => (/* binding */ AutoModelForVision2Seq),
6386
6387
  /* harmony export */ AutoModelForXVector: () => (/* binding */ AutoModelForXVector),
6387
6388
  /* harmony export */ AutoModelForZeroShotObjectDetection: () => (/* binding */ AutoModelForZeroShotObjectDetection),
@@ -6413,7 +6414,9 @@ __webpack_require__.r(__webpack_exports__);
6413
6414
  /* harmony export */ CLIPSegForImageSegmentation: () => (/* binding */ CLIPSegForImageSegmentation),
6414
6415
  /* harmony export */ CLIPSegModel: () => (/* binding */ CLIPSegModel),
6415
6416
  /* harmony export */ CLIPSegPreTrainedModel: () => (/* binding */ CLIPSegPreTrainedModel),
6417
+ /* harmony export */ CLIPTextModel: () => (/* binding */ CLIPTextModel),
6416
6418
  /* harmony export */ CLIPTextModelWithProjection: () => (/* binding */ CLIPTextModelWithProjection),
6419
+ /* harmony export */ CLIPVisionModel: () => (/* binding */ CLIPVisionModel),
6417
6420
  /* harmony export */ CLIPVisionModelWithProjection: () => (/* binding */ CLIPVisionModelWithProjection),
6418
6421
  /* harmony export */ CamembertForMaskedLM: () => (/* binding */ CamembertForMaskedLM),
6419
6422
  /* harmony export */ CamembertForQuestionAnswering: () => (/* binding */ CamembertForQuestionAnswering),
@@ -6462,6 +6465,8 @@ __webpack_require__.r(__webpack_exports__);
6462
6465
  /* harmony export */ DebertaV2ForTokenClassification: () => (/* binding */ DebertaV2ForTokenClassification),
6463
6466
  /* harmony export */ DebertaV2Model: () => (/* binding */ DebertaV2Model),
6464
6467
  /* harmony export */ DebertaV2PreTrainedModel: () => (/* binding */ DebertaV2PreTrainedModel),
6468
+ /* harmony export */ DecisionTransformerModel: () => (/* binding */ DecisionTransformerModel),
6469
+ /* harmony export */ DecisionTransformerPreTrainedModel: () => (/* binding */ DecisionTransformerPreTrainedModel),
6465
6470
  /* harmony export */ DeiTForImageClassification: () => (/* binding */ DeiTForImageClassification),
6466
6471
  /* harmony export */ DeiTModel: () => (/* binding */ DeiTModel),
6467
6472
  /* harmony export */ DeiTPreTrainedModel: () => (/* binding */ DeiTPreTrainedModel),
@@ -6530,6 +6535,8 @@ __webpack_require__.r(__webpack_exports__);
6530
6535
  /* harmony export */ GemmaForCausalLM: () => (/* binding */ GemmaForCausalLM),
6531
6536
  /* harmony export */ GemmaModel: () => (/* binding */ GemmaModel),
6532
6537
  /* harmony export */ GemmaPreTrainedModel: () => (/* binding */ GemmaPreTrainedModel),
6538
+ /* harmony export */ GroupViTModel: () => (/* binding */ GroupViTModel),
6539
+ /* harmony export */ GroupViTPreTrainedModel: () => (/* binding */ GroupViTPreTrainedModel),
6533
6540
  /* harmony export */ HieraForImageClassification: () => (/* binding */ HieraForImageClassification),
6534
6541
  /* harmony export */ HieraModel: () => (/* binding */ HieraModel),
6535
6542
  /* harmony export */ HieraPreTrainedModel: () => (/* binding */ HieraPreTrainedModel),
@@ -6569,6 +6576,9 @@ __webpack_require__.r(__webpack_exports__);
6569
6576
  /* harmony export */ MarianMTModel: () => (/* binding */ MarianMTModel),
6570
6577
  /* harmony export */ MarianModel: () => (/* binding */ MarianModel),
6571
6578
  /* harmony export */ MarianPreTrainedModel: () => (/* binding */ MarianPreTrainedModel),
6579
+ /* harmony export */ MaskFormerForInstanceSegmentation: () => (/* binding */ MaskFormerForInstanceSegmentation),
6580
+ /* harmony export */ MaskFormerModel: () => (/* binding */ MaskFormerModel),
6581
+ /* harmony export */ MaskFormerPreTrainedModel: () => (/* binding */ MaskFormerPreTrainedModel),
6572
6582
  /* harmony export */ MaskedLMOutput: () => (/* binding */ MaskedLMOutput),
6573
6583
  /* harmony export */ MistralForCausalLM: () => (/* binding */ MistralForCausalLM),
6574
6584
  /* harmony export */ MistralModel: () => (/* binding */ MistralModel),
@@ -6627,6 +6637,9 @@ __webpack_require__.r(__webpack_exports__);
6627
6637
  /* harmony export */ PhiPreTrainedModel: () => (/* binding */ PhiPreTrainedModel),
6628
6638
  /* harmony export */ PreTrainedModel: () => (/* binding */ PreTrainedModel),
6629
6639
  /* harmony export */ PretrainedMixin: () => (/* binding */ PretrainedMixin),
6640
+ /* harmony export */ PvtForImageClassification: () => (/* binding */ PvtForImageClassification),
6641
+ /* harmony export */ PvtModel: () => (/* binding */ PvtModel),
6642
+ /* harmony export */ PvtPreTrainedModel: () => (/* binding */ PvtPreTrainedModel),
6630
6643
  /* harmony export */ PyAnnoteForAudioFrameClassification: () => (/* binding */ PyAnnoteForAudioFrameClassification),
6631
6644
  /* harmony export */ PyAnnoteModel: () => (/* binding */ PyAnnoteModel),
6632
6645
  /* harmony export */ PyAnnotePreTrainedModel: () => (/* binding */ PyAnnotePreTrainedModel),
@@ -6712,6 +6725,11 @@ __webpack_require__.r(__webpack_exports__);
6712
6725
  /* harmony export */ UniSpeechSatModel: () => (/* binding */ UniSpeechSatModel),
6713
6726
  /* harmony export */ UniSpeechSatPreTrainedModel: () => (/* binding */ UniSpeechSatPreTrainedModel),
6714
6727
  /* harmony export */ ViTForImageClassification: () => (/* binding */ ViTForImageClassification),
6728
+ /* harmony export */ ViTMAEModel: () => (/* binding */ ViTMAEModel),
6729
+ /* harmony export */ ViTMAEPreTrainedModel: () => (/* binding */ ViTMAEPreTrainedModel),
6730
+ /* harmony export */ ViTMSNForImageClassification: () => (/* binding */ ViTMSNForImageClassification),
6731
+ /* harmony export */ ViTMSNModel: () => (/* binding */ ViTMSNModel),
6732
+ /* harmony export */ ViTMSNPreTrainedModel: () => (/* binding */ ViTMSNPreTrainedModel),
6715
6733
  /* harmony export */ ViTModel: () => (/* binding */ ViTModel),
6716
6734
  /* harmony export */ ViTPreTrainedModel: () => (/* binding */ ViTPreTrainedModel),
6717
6735
  /* harmony export */ VisionEncoderDecoderModel: () => (/* binding */ VisionEncoderDecoderModel),
@@ -10227,6 +10245,18 @@ class CLIPPreTrainedModel extends PreTrainedModel { }
10227
10245
  */
10228
10246
  class CLIPModel extends CLIPPreTrainedModel { }
10229
10247
 
10248
+ /**
10249
+ * The text model from CLIP without any head or projection on top.
10250
+ */
10251
+ class CLIPTextModel extends CLIPPreTrainedModel {
10252
+ /** @type {PreTrainedModel.from_pretrained} */
10253
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10254
+ // Update default model file name if not provided
10255
+ options.model_file_name ??= 'text_model';
10256
+ return super.from_pretrained(pretrained_model_name_or_path, options);
10257
+ }
10258
+ }
10259
+
10230
10260
  /**
10231
10261
  * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
10232
10262
  *
@@ -10254,7 +10284,6 @@ class CLIPModel extends CLIPPreTrainedModel { }
10254
10284
  * ```
10255
10285
  */
10256
10286
  class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
10257
-
10258
10287
  /** @type {PreTrainedModel.from_pretrained} */
10259
10288
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10260
10289
  // Update default model file name if not provided
@@ -10263,6 +10292,18 @@ class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
10263
10292
  }
10264
10293
  }
10265
10294
 
10295
+ /**
10296
+ * The vision model from CLIP without any head or projection on top.
10297
+ */
10298
+ class CLIPVisionModel extends CLIPPreTrainedModel {
10299
+ /** @type {PreTrainedModel.from_pretrained} */
10300
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10301
+ // Update default model file name if not provided
10302
+ options.model_file_name ??= 'vision_model';
10303
+ return super.from_pretrained(pretrained_model_name_or_path, options);
10304
+ }
10305
+ }
10306
+
10266
10307
  /**
10267
10308
  * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
10268
10309
  *
@@ -10929,6 +10970,43 @@ class ViTForImageClassification extends ViTPreTrainedModel {
10929
10970
  }
10930
10971
  //////////////////////////////////////////////////
10931
10972
 
10973
+ //////////////////////////////////////////////////
10974
+ class PvtPreTrainedModel extends PreTrainedModel { }
10975
+ class PvtModel extends PvtPreTrainedModel { }
10976
+ class PvtForImageClassification extends PvtPreTrainedModel {
10977
+ /**
10978
+ * @param {any} model_inputs
10979
+ */
10980
+ async _call(model_inputs) {
10981
+ return new SequenceClassifierOutput(await super._call(model_inputs));
10982
+ }
10983
+ }
10984
+ //////////////////////////////////////////////////
10985
+
10986
+ //////////////////////////////////////////////////
10987
+ class ViTMAEPreTrainedModel extends PreTrainedModel { }
10988
+ class ViTMAEModel extends ViTMAEPreTrainedModel { }
10989
+ //////////////////////////////////////////////////
10990
+
10991
+
10992
+ //////////////////////////////////////////////////
10993
+ class ViTMSNPreTrainedModel extends PreTrainedModel { }
10994
+ class ViTMSNModel extends ViTMSNPreTrainedModel { }
10995
+ class ViTMSNForImageClassification extends ViTMSNPreTrainedModel {
10996
+ /**
10997
+ * @param {any} model_inputs
10998
+ */
10999
+ async _call(model_inputs) {
11000
+ return new SequenceClassifierOutput(await super._call(model_inputs));
11001
+ }
11002
+ }
11003
+ //////////////////////////////////////////////////
11004
+
11005
+ //////////////////////////////////////////////////
11006
+ class GroupViTPreTrainedModel extends PreTrainedModel { }
11007
+ class GroupViTModel extends GroupViTPreTrainedModel { }
11008
+ //////////////////////////////////////////////////
11009
+
10932
11010
 
10933
11011
  //////////////////////////////////////////////////
10934
11012
  class FastViTPreTrainedModel extends PreTrainedModel { }
@@ -11341,6 +11419,11 @@ class SapiensForDepthEstimation extends SapiensPreTrainedModel { }
11341
11419
  class SapiensForNormalEstimation extends SapiensPreTrainedModel { }
11342
11420
  //////////////////////////////////////////////////
11343
11421
 
11422
+ //////////////////////////////////////////////////
11423
+ class MaskFormerPreTrainedModel extends PreTrainedModel { }
11424
+ class MaskFormerModel extends MaskFormerPreTrainedModel { }
11425
+ class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel { }
11426
+ //////////////////////////////////////////////////
11344
11427
 
11345
11428
  //////////////////////////////////////////////////
11346
11429
  class GLPNPreTrainedModel extends PreTrainedModel { }
@@ -12863,6 +12946,7 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
12863
12946
  return audio_values;
12864
12947
  }
12865
12948
  }
12949
+ //////////////////////////////////////////////////
12866
12950
 
12867
12951
  //////////////////////////////////////////////////
12868
12952
  // MobileNetV1 models
@@ -12956,6 +13040,17 @@ class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel {
12956
13040
  }
12957
13041
  //////////////////////////////////////////////////
12958
13042
 
13043
+ //////////////////////////////////////////////////
13044
+ // Decision Transformer models
13045
+ class DecisionTransformerPreTrainedModel extends PreTrainedModel { }
13046
+
13047
+ /**
13048
+ * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
13049
+ * Refer to the paper for more details: https://arxiv.org/abs/2106.01345
13050
+ */
13051
+ class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }
13052
+
13053
+ //////////////////////////////////////////////////
12959
13054
 
12960
13055
  //////////////////////////////////////////////////
12961
13056
  // AutoModels, used to simplify construction of PreTrainedModels
@@ -12994,7 +13089,7 @@ class PretrainedMixin {
12994
13089
  session_options = {},
12995
13090
  } = {}) {
12996
13091
 
12997
- let options = {
13092
+ const options = {
12998
13093
  progress_callback,
12999
13094
  config,
13000
13095
  cache_dir,
@@ -13013,7 +13108,7 @@ class PretrainedMixin {
13013
13108
  throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
13014
13109
  }
13015
13110
 
13016
- for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
13111
+ for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
13017
13112
  const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
13018
13113
  if (!modelInfo) {
13019
13114
  continue; // Item not found in this mapping
@@ -13068,6 +13163,10 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
13068
13163
  ['rt_detr', ['RTDetrModel', RTDetrModel]],
13069
13164
  ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
13070
13165
  ['vit', ['ViTModel', ViTModel]],
13166
+ ['pvt', ['PvtModel', PvtModel]],
13167
+ ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
13168
+ ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
13169
+ ['groupvit', ['GroupViTModel', GroupViTModel]],
13071
13170
  ['fastvit', ['FastViTModel', FastViTModel]],
13072
13171
  ['mobilevit', ['MobileViTModel', MobileViTModel]],
13073
13172
  ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
@@ -13090,10 +13189,14 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
13090
13189
  ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
13091
13190
  ['efficientnet', ['EfficientNetModel', EfficientNetModel]],
13092
13191
 
13192
+ ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]],
13193
+
13093
13194
  ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]],
13094
13195
  ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]],
13095
13196
  ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]],
13096
13197
  ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]],
13198
+
13199
+ ['maskformer', ['MaskFormerModel', MaskFormerModel]],
13097
13200
  ]);
13098
13201
 
13099
13202
  const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -13278,6 +13381,8 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
13278
13381
 
13279
13382
  const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
13280
13383
  ['vit', ['ViTForImageClassification', ViTForImageClassification]],
13384
+ ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
13385
+ ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
13281
13386
  ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
13282
13387
  ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
13283
13388
  ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
@@ -13310,6 +13415,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
13310
13415
  ]);
13311
13416
 
13312
13417
  const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
13418
+ // TODO: Do not add new models here
13313
13419
  ['detr', ['DetrForSegmentation', DetrForSegmentation]],
13314
13420
  ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
13315
13421
  ]);
@@ -13319,6 +13425,11 @@ const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
13319
13425
  ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
13320
13426
  ]);
13321
13427
 
13428
+ const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
13429
+ ['detr', ['DetrForSegmentation', DetrForSegmentation]],
13430
+ ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
13431
+ ]);
13432
+
13322
13433
  const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
13323
13434
  ['sam', ['SamModel', SamModel]],
13324
13435
  ]);
@@ -13394,6 +13505,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
13394
13505
  [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
13395
13506
  [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13396
13507
  [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13508
+ [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13397
13509
  [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13398
13510
  [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13399
13511
  [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -13596,6 +13708,17 @@ class AutoModelForSemanticSegmentation extends PretrainedMixin {
13596
13708
  static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
13597
13709
  }
13598
13710
 
13711
+ /**
13712
+ * Helper class which is used to instantiate pretrained universal image segmentation models with the `from_pretrained` function.
13713
+ * The chosen model class is determined by the type specified in the model config.
13714
+ *
13715
+ * @example
13716
+ * let model = await AutoModelForUniversalSegmentation.from_pretrained('hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation');
13717
+ */
13718
+ class AutoModelForUniversalSegmentation extends PretrainedMixin {
13719
+ static MODEL_CLASS_MAPPINGS = [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES];
13720
+ }
13721
+
13599
13722
  /**
13600
13723
  * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
13601
13724
  * The chosen model class is determined by the type specified in the model config.
@@ -17259,7 +17382,7 @@ const SUPPORTED_TASKS = Object.freeze({
17259
17382
  "image-segmentation": {
17260
17383
  // no tokenizer
17261
17384
  "pipeline": ImageSegmentationPipeline,
17262
- "model": [_models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForImageSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForSemanticSegmentation],
17385
+ "model": [_models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForImageSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForSemanticSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForUniversalSegmentation],
17263
17386
  "processor": _processors_js__WEBPACK_IMPORTED_MODULE_2__.AutoProcessor,
17264
17387
  "default": {
17265
17388
  // TODO: replace with original
@@ -17501,7 +17624,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17501
17624
 
17502
17625
  /**@type {Promise[]} */
17503
17626
  const promises = [];
17504
- for (let [name, cls] of mapping.entries()) {
17627
+ for (const [name, cls] of mapping.entries()) {
17505
17628
  if (!cls) continue;
17506
17629
 
17507
17630
  /**@type {Promise} */
@@ -17509,7 +17632,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17509
17632
  if (Array.isArray(cls)) {
17510
17633
  promise = new Promise(async (resolve, reject) => {
17511
17634
  let e;
17512
- for (let c of cls) {
17635
+ for (const c of cls) {
17513
17636
  if (c === null) {
17514
17637
  // If null, we resolve it immediately, meaning the relevant
17515
17638
  // class was not found, but it is optional.
@@ -17547,7 +17670,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17547
17670
  await Promise.all(promises);
17548
17671
 
17549
17672
  // Then assign to result
17550
- for (let [name, promise] of Object.entries(result)) {
17673
+ for (const [name, promise] of Object.entries(result)) {
17551
17674
  result[name] = await promise;
17552
17675
  }
17553
17676
 
@@ -17585,6 +17708,7 @@ __webpack_require__.r(__webpack_exports__);
17585
17708
  /* harmony export */ Florence2Processor: () => (/* binding */ Florence2Processor),
17586
17709
  /* harmony export */ GLPNFeatureExtractor: () => (/* binding */ GLPNFeatureExtractor),
17587
17710
  /* harmony export */ ImageFeatureExtractor: () => (/* binding */ ImageFeatureExtractor),
17711
+ /* harmony export */ MaskFormerFeatureExtractor: () => (/* binding */ MaskFormerFeatureExtractor),
17588
17712
  /* harmony export */ MobileNetV1FeatureExtractor: () => (/* binding */ MobileNetV1FeatureExtractor),
17589
17713
  /* harmony export */ MobileNetV2FeatureExtractor: () => (/* binding */ MobileNetV2FeatureExtractor),
17590
17714
  /* harmony export */ MobileNetV3FeatureExtractor: () => (/* binding */ MobileNetV3FeatureExtractor),
@@ -17596,6 +17720,7 @@ __webpack_require__.r(__webpack_exports__);
17596
17720
  /* harmony export */ OwlViTProcessor: () => (/* binding */ OwlViTProcessor),
17597
17721
  /* harmony export */ Owlv2ImageProcessor: () => (/* binding */ Owlv2ImageProcessor),
17598
17722
  /* harmony export */ Processor: () => (/* binding */ Processor),
17723
+ /* harmony export */ PvtImageProcessor: () => (/* binding */ PvtImageProcessor),
17599
17724
  /* harmony export */ PyAnnoteFeatureExtractor: () => (/* binding */ PyAnnoteFeatureExtractor),
17600
17725
  /* harmony export */ PyAnnoteProcessor: () => (/* binding */ PyAnnoteProcessor),
17601
17726
  /* harmony export */ RTDetrImageProcessor: () => (/* binding */ RTDetrImageProcessor),
@@ -17684,7 +17809,7 @@ function center_to_corners_format([centerX, centerY, width, height]) {
17684
17809
  * @param {Tensor} outputs.logits The logits
17685
17810
  * @param {Tensor} outputs.pred_boxes The predicted boxes.
17686
17811
  * @param {number} [threshold=0.5] The threshold to use for the scores.
17687
- * @param {number[][]} [target_sizes=null] The sizes of the original images.
17812
+ * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
17688
17813
  * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
17689
17814
  * @return {Object[]} An array of objects containing the post-processed outputs.
17690
17815
  * @private
@@ -17765,7 +17890,7 @@ function post_process_object_detection(outputs, threshold = 0.5, target_sizes =
17765
17890
  /**
17766
17891
  * Post-processes the outputs of the model (for semantic segmentation).
17767
17892
  * @param {*} outputs Raw outputs of the model.
17768
- * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size
17893
+ * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
17769
17894
  * (height, width) of each prediction. If unset, predictions will not be resized.
17770
17895
  * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
17771
17896
  */
@@ -17825,6 +17950,300 @@ function post_process_semantic_segmentation(outputs, target_sizes = null) {
17825
17950
  return toReturn;
17826
17951
  }
17827
17952
 
17953
+
17954
+ /**
17955
+ * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`.
17956
+ * @param {Tensor} class_logits The class logits.
17957
+ * @param {Tensor} mask_logits The mask logits.
17958
+ * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks.
17959
+ * @param {number} num_labels The number of labels.
17960
+ * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels.
17961
+ * @private
17962
+ */
17963
+ function remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
17964
+
17965
+ const mask_probs_item = [];
17966
+ const pred_scores_item = [];
17967
+ const pred_labels_item = [];
17968
+
17969
+ for (let j = 0; j < class_logits.dims[0]; ++j) {
17970
+ const cls = class_logits[j];
17971
+ const mask = mask_logits[j];
17972
+
17973
+ const pred_label = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(cls.data)[1];
17974
+ if (pred_label === num_labels) {
17975
+ // Is the background, so we ignore it
17976
+ continue;
17977
+ }
17978
+
17979
+ const scores = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.softmax)(cls.data);
17980
+ const pred_score = scores[pred_label];
17981
+ if (pred_score > object_mask_threshold) {
17982
+ mask_probs_item.push(mask);
17983
+ pred_scores_item.push(pred_score);
17984
+ pred_labels_item.push(pred_label);
17985
+ }
17986
+ }
17987
+
17988
+ return [mask_probs_item, pred_scores_item, pred_labels_item];
17989
+ }
17990
+
17991
+ /**
17992
+ * Checks whether the segment is valid or not.
17993
+ * @param {Int32Array} mask_labels Labels for each pixel in the mask.
17994
+ * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
17995
+ * @param {number} k The class id of the segment.
17996
+ * @param {number} mask_threshold The mask threshold.
17997
+ * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
17998
+ * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels.
17999
+ * @private
18000
+ */
18001
+ function check_segment_validity(
18002
+ mask_labels,
18003
+ mask_probs,
18004
+ k,
18005
+ mask_threshold = 0.5,
18006
+ overlap_mask_area_threshold = 0.8
18007
+ ) {
18008
+ // mask_k is a 1D array of indices, indicating where the mask is equal to k
18009
+ const mask_k = [];
18010
+ let mask_k_area = 0;
18011
+ let original_area = 0;
18012
+
18013
+ const mask_probs_k_data = mask_probs[k].data;
18014
+
18015
+ // Compute the area of all the stuff in query k
18016
+ for (let i = 0; i < mask_labels.length; ++i) {
18017
+ if (mask_labels[i] === k) {
18018
+ mask_k.push(i);
18019
+ ++mask_k_area;
18020
+ }
18021
+
18022
+ if (mask_probs_k_data[i] >= mask_threshold) {
18023
+ ++original_area;
18024
+ }
18025
+ }
18026
+ let mask_exists = mask_k_area > 0 && original_area > 0;
18027
+
18028
+ // Eliminate disconnected tiny segments
18029
+ if (mask_exists) {
18030
+ // Perform additional check
18031
+ let area_ratio = mask_k_area / original_area;
18032
+ mask_exists = area_ratio > overlap_mask_area_threshold;
18033
+ }
18034
+
18035
+ return [mask_exists, mask_k]
18036
+ }
18037
+
18038
+ /**
18039
+ * Computes the segments.
18040
+ * @param {Tensor[]} mask_probs The mask probabilities.
18041
+ * @param {number[]} pred_scores The predicted scores.
18042
+ * @param {number[]} pred_labels The predicted labels.
18043
+ * @param {number} mask_threshold The mask threshold.
18044
+ * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18045
+ * @param {Set<number>} label_ids_to_fuse The label ids to fuse.
18046
+ * @param {number[]} target_size The target size of the image.
18047
+ * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
18048
+ * @private
18049
+ */
18050
+ function compute_segments(
18051
+ mask_probs,
18052
+ pred_scores,
18053
+ pred_labels,
18054
+ mask_threshold,
18055
+ overlap_mask_area_threshold,
18056
+ label_ids_to_fuse = null,
18057
+ target_size = null,
18058
+ ) {
18059
+ const [height, width] = target_size ?? mask_probs[0].dims;
18060
+
18061
+ const segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18062
+ 'int32',
18063
+ new Int32Array(height * width),
18064
+ [height, width]
18065
+ );
18066
+ const segments = [];
18067
+
18068
+ // 1. If target_size is not null, we need to resize the masks to the target size
18069
+ if (target_size !== null) {
18070
+ // resize the masks to the target size
18071
+ for (let i = 0; i < mask_probs.length; ++i) {
18072
+ mask_probs[i] = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.interpolate)(mask_probs[i], target_size, 'bilinear', false);
18073
+ }
18074
+ }
18075
+
18076
+ // 2. Weigh each mask by its prediction score
18077
+ // NOTE: `mask_probs` is updated in-place
18078
+ //
18079
+ // Temporary storage for the best label/scores for each pixel ([height, width]):
18080
+ const mask_labels = new Int32Array(mask_probs[0].data.length);
18081
+ const bestScores = new Float32Array(mask_probs[0].data.length);
18082
+
18083
+ for (let i = 0; i < mask_probs.length; ++i) {
18084
+ let score = pred_scores[i];
18085
+
18086
+ const mask_probs_i_data = mask_probs[i].data;
18087
+
18088
+ for (let j = 0; j < mask_probs_i_data.length; ++j) {
18089
+ mask_probs_i_data[j] *= score
18090
+ if (mask_probs_i_data[j] > bestScores[j]) {
18091
+ mask_labels[j] = i;
18092
+ bestScores[j] = mask_probs_i_data[j];
18093
+ }
18094
+ }
18095
+ }
18096
+
18097
+ let current_segment_id = 0;
18098
+
18099
+ // let stuff_memory_list = {}
18100
+ const segmentation_data = segmentation.data;
18101
+ for (let k = 0; k < pred_labels.length; ++k) {
18102
+ const pred_class = pred_labels[k];
18103
+
18104
+ // TODO add `should_fuse`
18105
+ // let should_fuse = pred_class in label_ids_to_fuse
18106
+
18107
+ // Check if mask exists and large enough to be a segment
18108
+ const [mask_exists, mask_k] = check_segment_validity(
18109
+ mask_labels,
18110
+ mask_probs,
18111
+ k,
18112
+ mask_threshold,
18113
+ overlap_mask_area_threshold
18114
+ )
18115
+
18116
+ if (!mask_exists) {
18117
+ // Nothing to see here
18118
+ continue;
18119
+ }
18120
+
18121
+ // TODO
18122
+ // if (pred_class in stuff_memory_list) {
18123
+ // current_segment_id = stuff_memory_list[pred_class]
18124
+ // } else {
18125
+ // current_segment_id += 1;
18126
+ // }
18127
+ ++current_segment_id;
18128
+
18129
+
18130
+ // Add current object segment to final segmentation map
18131
+ for (const index of mask_k) {
18132
+ segmentation_data[index] = current_segment_id;
18133
+ }
18134
+
18135
+ segments.push({
18136
+ id: current_segment_id,
18137
+ label_id: pred_class,
18138
+ // was_fused: should_fuse, TODO
18139
+ score: pred_scores[k],
18140
+ })
18141
+
18142
+ // TODO
18143
+ // if(should_fuse){
18144
+ // stuff_memory_list[pred_class] = current_segment_id
18145
+ // }
18146
+ }
18147
+
18148
+ return [segmentation, segments];
18149
+ }
18150
+
18151
+
18152
/**
 * Post-process the model output to generate the final panoptic segmentation.
 *
 * Each batch item is handled independently: its queries are filtered with `remove_low_and_no_objects`,
 * and the surviving masks are merged into a single segmentation map with `compute_segments`.
 * @param {*} outputs The model output to post process. Class logits are read from `class_queries_logits`
 * (falling back to `logits`) and mask logits from `masks_queries_logits` (falling back to `pred_masks`).
 * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
 * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
 * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
 * @param {Set<number>} [label_ids_to_fuse=null] The labels in this set will have all their instances fused together.
 * When `null`, a warning is logged and an empty set is used instead.
 * @param {[number, number][]} [target_sizes=null] The target sizes, as (height, width), to resize the masks to. One entry per batch item.
 * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>} One entry per batch item.
 * @throws {Error} If `target_sizes` is given but its length differs from the batch size.
 */
function post_process_panoptic_segmentation(
    outputs,
    threshold = 0.5,
    mask_threshold = 0.5,
    overlap_mask_area_threshold = 0.8,
    label_ids_to_fuse = null,
    target_sizes = null,
) {
    if (label_ids_to_fuse === null) {
        console.warn("`label_ids_to_fuse` unset. No instance will be fused.");
    }
    // Use a local binding instead of reassigning the parameter.
    const fuse_ids = label_ids_to_fuse ?? new Set();

    const class_queries_logits = outputs.class_queries_logits ?? outputs.logits; // [batch_size, num_queries, num_classes+1]
    const masks_queries_logits = outputs.masks_queries_logits ?? outputs.pred_masks; // [batch_size, num_queries, height, width]

    const mask_probs = masks_queries_logits.sigmoid(); // [batch_size, num_queries, height, width]

    // The query dimension (index 1) is not needed here, so it is skipped.
    const [batch_size, , num_classes] = class_queries_logits.dims;
    const num_labels = num_classes - 1; // Remove last class (background)

    if (target_sizes !== null && target_sizes.length !== batch_size) {
        throw new Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");
    }

    const toReturn = [];
    for (let i = 0; i < batch_size; ++i) {
        const target_size = target_sizes !== null ? target_sizes[i] : null;

        const class_logits = class_queries_logits[i];
        const mask_logits = mask_probs[i];

        const [mask_probs_item, pred_scores_item, pred_labels_item] =
            remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);

        if (pred_labels_item.length === 0) {
            // No mask found: return a map filled with -1 and no segments.
            const [height, width] = target_size ?? mask_logits.dims.slice(-2);

            const segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
                'int32',
                new Int32Array(height * width).fill(-1),
                [height, width]
            );
            toReturn.push({
                segmentation,
                segments_info: [],
            });
            continue;
        }

        // Get segmentation map and segment information of batch item
        const [segmentation, segments] = compute_segments(
            mask_probs_item,
            pred_scores_item,
            pred_labels_item,
            mask_threshold,
            overlap_mask_area_threshold,
            fuse_ids,
            target_size,
        );

        toReturn.push({
            segmentation,
            segments_info: segments,
        });
    }

    return toReturn;
}
18232
+
18233
+
18234
/**
 * Post-processes the outputs of the model (for instance segmentation).
 *
 * NOTE: This is a placeholder. It is not implemented yet, so every call throws.
 * @param {*} outputs Raw outputs of the model.
 * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
 * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
 * (height, width) of each prediction. If unset, predictions will not be resized.
 * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
 * @throws {Error} Always, with the message 'Not implemented yet'.
 */
function post_process_instance_segmentation(outputs, threshold = 0.5, target_sizes = null) {
    throw new Error('Not implemented yet');
}
18246
+
17828
18247
  /**
17829
18248
  * Named tuple to indicate the order we are using is (height x width), even though
17830
18249
  * the Graphics’ industry standard is (width x height).
@@ -18413,6 +18832,7 @@ class SegformerFeatureExtractor extends ImageFeatureExtractor {
18413
18832
  return post_process_semantic_segmentation(...args);
18414
18833
  }
18415
18834
  }
18835
+ class PvtImageProcessor extends ImageFeatureExtractor { } // NOTE: adds no members of its own; all behavior is inherited from ImageFeatureExtractor
18416
18836
  class DPTFeatureExtractor extends ImageFeatureExtractor { }
18417
18837
  class DPTImageProcessor extends DPTFeatureExtractor { } // NOTE: extends DPTFeatureExtractor
18418
18838
  class BitImageProcessor extends ImageFeatureExtractor { }
@@ -18552,302 +18972,32 @@ class DetrFeatureExtractor extends ImageFeatureExtractor {
18552
18972
  // TODO support different mask sizes (not just 64x64)
18553
18973
  // Currently, just fill pixel mask with 1s
18554
18974
  const maskSize = [result.pixel_values.dims[0], 64, 64];
18555
- const pixel_mask = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18556
- 'int64',
18557
- new BigInt64Array(maskSize.reduce((a, b) => a * b)).fill(1n),
18558
- maskSize
18559
- );
18975
+ const pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.full)(maskSize, 1n);
18560
18976
 
18561
18977
  return { ...result, pixel_mask };
18562
18978
  }
18563
18979
 
18564
- /**
18565
- * Post-processes the outputs of the model (for object detection).
18566
- * @param {Object} outputs The outputs of the model that must be post-processed
18567
- * @param {Tensor} outputs.logits The logits
18568
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
18569
- * @return {Object[]} An array of objects containing the post-processed outputs.
18570
- */
18571
-
18572
18980
  /** @type {typeof post_process_object_detection} */
18573
18981
  post_process_object_detection(...args) {
18574
18982
  return post_process_object_detection(...args);
18575
18983
  }
18576
18984
 
18577
- /**
18578
- * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`.
18579
- * @param {Tensor} class_logits The class logits.
18580
- * @param {Tensor} mask_logits The mask logits.
18581
- * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks.
18582
- * @param {number} num_labels The number of labels.
18583
- * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels.
18584
- */
18585
- remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
18586
-
18587
- let mask_probs_item = [];
18588
- let pred_scores_item = [];
18589
- let pred_labels_item = [];
18590
-
18591
- for (let j = 0; j < class_logits.dims[0]; ++j) {
18592
- let cls = class_logits[j];
18593
- let mask = mask_logits[j];
18594
-
18595
- let pred_label = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(cls.data)[1];
18596
- if (pred_label === num_labels) {
18597
- // Is the background, so we ignore it
18598
- continue;
18599
- }
18600
-
18601
- let scores = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.softmax)(cls.data);
18602
- let pred_score = scores[pred_label];
18603
- if (pred_score > object_mask_threshold) {
18604
- mask_probs_item.push(mask);
18605
- pred_scores_item.push(pred_score);
18606
- pred_labels_item.push(pred_label);
18607
- }
18608
- }
18609
-
18610
- return [mask_probs_item, pred_scores_item, pred_labels_item];
18611
-
18612
- }
18613
-
18614
- /**
18615
- * Checks whether the segment is valid or not.
18616
- * @param {Int32Array} mask_labels Labels for each pixel in the mask.
18617
- * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
18618
- * @param {number} k The class id of the segment.
18619
- * @param {number} mask_threshold The mask threshold.
18620
- * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18621
- * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels.
18622
- */
18623
- check_segment_validity(
18624
- mask_labels,
18625
- mask_probs,
18626
- k,
18627
- mask_threshold = 0.5,
18628
- overlap_mask_area_threshold = 0.8
18629
- ) {
18630
- // mask_k is a 1D array of indices, indicating where the mask is equal to k
18631
- let mask_k = [];
18632
- let mask_k_area = 0;
18633
- let original_area = 0;
18634
-
18635
- const mask_probs_k_data = mask_probs[k].data;
18636
-
18637
- // Compute the area of all the stuff in query k
18638
- for (let i = 0; i < mask_labels.length; ++i) {
18639
- if (mask_labels[i] === k) {
18640
- mask_k.push(i);
18641
- ++mask_k_area;
18642
- }
18643
-
18644
- if (mask_probs_k_data[i] >= mask_threshold) {
18645
- ++original_area;
18646
- }
18647
- }
18648
- let mask_exists = mask_k_area > 0 && original_area > 0;
18649
-
18650
- // Eliminate disconnected tiny segments
18651
- if (mask_exists) {
18652
- // Perform additional check
18653
- let area_ratio = mask_k_area / original_area;
18654
- mask_exists = area_ratio > overlap_mask_area_threshold;
18655
- }
18656
-
18657
- return [mask_exists, mask_k]
18985
+ /** @type {typeof post_process_panoptic_segmentation} */
18986
+ post_process_panoptic_segmentation(...args) {
18987
+ return post_process_panoptic_segmentation(...args);
18658
18988
  }
18659
18989
 
18660
- /**
18661
- * Computes the segments.
18662
- * @param {Tensor[]} mask_probs The mask probabilities.
18663
- * @param {number[]} pred_scores The predicted scores.
18664
- * @param {number[]} pred_labels The predicted labels.
18665
- * @param {number} mask_threshold The mask threshold.
18666
- * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18667
- * @param {Set<number>} label_ids_to_fuse The label ids to fuse.
18668
- * @param {number[]} target_size The target size of the image.
18669
- * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
18670
- */
18671
- compute_segments(
18672
- mask_probs,
18673
- pred_scores,
18674
- pred_labels,
18675
- mask_threshold,
18676
- overlap_mask_area_threshold,
18677
- label_ids_to_fuse = null,
18678
- target_size = null,
18679
- ) {
18680
- let [height, width] = target_size ?? mask_probs[0].dims;
18681
-
18682
- let segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18683
- 'int32',
18684
- new Int32Array(height * width),
18685
- [height, width]
18686
- );
18687
- let segments = [];
18688
-
18689
- // 1. If target_size is not null, we need to resize the masks to the target size
18690
- if (target_size !== null) {
18691
- // resize the masks to the target size
18692
- for (let i = 0; i < mask_probs.length; ++i) {
18693
- mask_probs[i] = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.interpolate)(mask_probs[i], target_size, 'bilinear', false);
18694
- }
18695
- }
18696
-
18697
- // 2. Weigh each mask by its prediction score
18698
- // NOTE: `mask_probs` is updated in-place
18699
- //
18700
- // Temporary storage for the best label/scores for each pixel ([height, width]):
18701
- let mask_labels = new Int32Array(mask_probs[0].data.length);
18702
- let bestScores = new Float32Array(mask_probs[0].data.length);
18703
-
18704
- for (let i = 0; i < mask_probs.length; ++i) {
18705
- let score = pred_scores[i];
18706
-
18707
- const mask_probs_i_data = mask_probs[i].data;
18708
-
18709
- for (let j = 0; j < mask_probs_i_data.length; ++j) {
18710
- mask_probs_i_data[j] *= score
18711
- if (mask_probs_i_data[j] > bestScores[j]) {
18712
- mask_labels[j] = i;
18713
- bestScores[j] = mask_probs_i_data[j];
18714
- }
18715
- }
18716
- }
18717
-
18718
- let current_segment_id = 0;
18719
-
18720
- // let stuff_memory_list = {}
18721
- const segmentation_data = segmentation.data;
18722
- for (let k = 0; k < pred_labels.length; ++k) {
18723
- let pred_class = pred_labels[k];
18724
-
18725
- // TODO add `should_fuse`
18726
- // let should_fuse = pred_class in label_ids_to_fuse
18727
-
18728
- // Check if mask exists and large enough to be a segment
18729
- let [mask_exists, mask_k] = this.check_segment_validity(
18730
- mask_labels,
18731
- mask_probs,
18732
- k,
18733
- mask_threshold,
18734
- overlap_mask_area_threshold
18735
- )
18736
-
18737
- if (!mask_exists) {
18738
- // Nothing to see here
18739
- continue;
18740
- }
18741
-
18742
- // TODO
18743
- // if (pred_class in stuff_memory_list) {
18744
- // current_segment_id = stuff_memory_list[pred_class]
18745
- // } else {
18746
- // current_segment_id += 1;
18747
- // }
18748
- ++current_segment_id;
18749
-
18750
-
18751
- // Add current object segment to final segmentation map
18752
- for (let index of mask_k) {
18753
- segmentation_data[index] = current_segment_id;
18754
- }
18755
-
18756
- segments.push({
18757
- id: current_segment_id,
18758
- label_id: pred_class,
18759
- // was_fused: should_fuse, TODO
18760
- score: pred_scores[k],
18761
- })
18762
-
18763
- // TODO
18764
- // if(should_fuse){
18765
- // stuff_memory_list[pred_class] = current_segment_id
18766
- // }
18767
- }
18768
-
18769
- return [segmentation, segments];
18990
+ post_process_instance_segmentation() {
18991
+ // TODO
18992
+ throw Error("Not implemented yet");
18770
18993
  }
18994
+ }
18771
18995
 
18772
- /**
18773
- * Post-process the model output to generate the final panoptic segmentation.
18774
- * @param {*} outputs The model output to post process
18775
- * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
18776
- * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
18777
- * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
18778
- * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
18779
- * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to.
18780
- * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
18781
- */
18782
- post_process_panoptic_segmentation(
18783
- outputs,
18784
- threshold = 0.5,
18785
- mask_threshold = 0.5,
18786
- overlap_mask_area_threshold = 0.8,
18787
- label_ids_to_fuse = null,
18788
- target_sizes = null,
18789
- ) {
18790
- if (label_ids_to_fuse === null) {
18791
- console.warn("`label_ids_to_fuse` unset. No instance will be fused.")
18792
- label_ids_to_fuse = new Set();
18793
- }
18794
-
18795
- const class_queries_logits = outputs.logits; // [batch_size, num_queries, num_classes+1]
18796
- const masks_queries_logits = outputs.pred_masks; // [batch_size, num_queries, height, width]
18797
-
18798
- const mask_probs = masks_queries_logits.sigmoid() // [batch_size, num_queries, height, width]
18799
-
18800
- let [batch_size, num_queries, num_labels] = class_queries_logits.dims;
18801
- num_labels -= 1; // Remove last class (background)
18802
-
18803
- if (target_sizes !== null && target_sizes.length !== batch_size) {
18804
- throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
18805
- }
18806
-
18807
- let toReturn = [];
18808
- for (let i = 0; i < batch_size; ++i) {
18809
- let target_size = target_sizes !== null ? target_sizes[i] : null;
18810
-
18811
- let class_logits = class_queries_logits[i];
18812
- let mask_logits = mask_probs[i];
18813
-
18814
- let [mask_probs_item, pred_scores_item, pred_labels_item] = this.remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);
18815
-
18816
- if (pred_labels_item.length === 0) {
18817
- // No mask found
18818
- let [height, width] = target_size ?? mask_logits.dims.slice(-2);
18819
-
18820
- let segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18821
- 'int32',
18822
- new Int32Array(height * width).fill(-1),
18823
- [height, width]
18824
- )
18825
- toReturn.push({
18826
- segmentation: segmentation,
18827
- segments_info: []
18828
- });
18829
- continue;
18830
- }
18831
-
18832
-
18833
- // Get segmentation map and segment information of batch item
18834
- let [segmentation, segments] = this.compute_segments(
18835
- mask_probs_item,
18836
- pred_scores_item,
18837
- pred_labels_item,
18838
- mask_threshold,
18839
- overlap_mask_area_threshold,
18840
- label_ids_to_fuse,
18841
- target_size,
18842
- )
18843
-
18844
- toReturn.push({
18845
- segmentation: segmentation,
18846
- segments_info: segments
18847
- })
18848
- }
18996
+ class MaskFormerFeatureExtractor extends ImageFeatureExtractor {
18849
18997
 
18850
- return toReturn;
18998
+ /** @type {typeof post_process_panoptic_segmentation} */
18999
+ post_process_panoptic_segmentation(...args) {
19000
+ return post_process_panoptic_segmentation(...args);
18851
19001
  }
18852
19002
 
18853
19003
  post_process_instance_segmentation() {
@@ -18856,6 +19006,7 @@ class DetrFeatureExtractor extends ImageFeatureExtractor {
18856
19006
  }
18857
19007
  }
18858
19008
 
19009
+
18859
19010
  class YolosFeatureExtractor extends ImageFeatureExtractor {
18860
19011
  /** @type {typeof post_process_object_detection} */
18861
19012
  post_process_object_detection(...args) {
@@ -20145,11 +20296,13 @@ class AutoProcessor {
20145
20296
  BitImageProcessor,
20146
20297
  DPTImageProcessor,
20147
20298
  DPTFeatureExtractor,
20299
+ PvtImageProcessor,
20148
20300
  GLPNFeatureExtractor,
20149
20301
  BeitFeatureExtractor,
20150
20302
  DeiTFeatureExtractor,
20151
20303
  DetrFeatureExtractor,
20152
20304
  RTDetrImageProcessor,
20305
+ MaskFormerFeatureExtractor,
20153
20306
  YolosFeatureExtractor,
20154
20307
  DonutFeatureExtractor,
20155
20308
  NougatImageProcessor,
@@ -20527,23 +20680,26 @@ function is_chinese_char(cp) {
20527
20680
  }
20528
20681
 
20529
20682
  /**
20530
- * Helper function to fuse consecutive values in an array equal to the specified value.
20531
- * @param {string[]} arr The input array
20532
- * @param {any} value The value to fuse on.
20533
- * @param {Map<string, any>} mapping The mapping from input domain to value.
20683
+ * Helper function to fuse consecutive unknown tokens.
20684
+ * @param {string[]} arr The list of input tokens
20685
+ * @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids.
20686
+ * @param {number} unk_token_id The value to fuse on.
20687
+ * @private
20534
20688
  */
20535
- function fuse(arr, value, mapping) {
20689
+ function fuse_unk(arr, tokens_to_ids, unk_token_id) {
20536
20690
  const fused = [];
20537
20691
  let i = 0;
20538
20692
  while (i < arr.length) {
20539
20693
  fused.push(arr[i])
20540
- if ((mapping.get(arr[i]) ?? value) !== value) {
20694
+ if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) {
20541
20695
  ++i;
20542
20696
  continue;
20543
20697
  }
20544
20698
 
20545
- while (i < arr.length && (mapping.get(arr[i]) ?? value) === value) {
20546
- ++i;
20699
+ while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) {
20700
+ if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) {
20701
+ fused[fused.length - 1] += arr[i];
20702
+ }
20547
20703
  }
20548
20704
  }
20549
20705
 
@@ -20660,15 +20816,15 @@ class TokenizerModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
20660
20816
  /**
20661
20817
  * Internal function to call the TokenizerModel instance.
20662
20818
  * @param {string[]} tokens The tokens to encode.
20663
- * @returns {string[]} The encoded token IDs.
20819
+ * @returns {string[]} The encoded tokens.
20664
20820
  */
20665
20821
  _call(tokens) {
20666
- let ids = this.encode(tokens);
20822
+ tokens = this.encode(tokens);
20667
20823
  if (this.fuse_unk) {
20668
20824
  // Fuse unknown tokens
20669
- ids = fuse(ids, this.unk_token_id, this.tokens_to_ids);
20825
+ tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id);
20670
20826
  }
20671
- return ids;
20827
+ return tokens;
20672
20828
  }
20673
20829
 
20674
20830
  /**
@@ -21166,15 +21322,19 @@ class BPE extends TokenizerModel {
21166
21322
  for (const t of bpe_token_list) {
21167
21323
  if (this.tokens_to_ids.has(t)) {
21168
21324
  outputTokens.push(t);
21169
- } else {
21170
- if (this.byte_fallback) {
21171
- outputTokens.push(
21172
- ...Array.from(this.text_encoder.encode(t))
21173
- .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`)
21174
- );
21325
+ } else if (this.byte_fallback) {
21326
+ const byteTokens = Array.from(this.text_encoder.encode(t))
21327
+ .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`);
21328
+ if (byteTokens.every(x => this.tokens_to_ids.has(x))) {
21329
+ // Ensure the byte tokens are actually in the vocabulary, otherwise
21330
+ // we fall back to the unknown token. For more information, see
21331
+ // https://github.com/huggingface/transformers/issues/28096.
21332
+ outputTokens.push(...byteTokens);
21175
21333
  } else {
21176
21334
  outputTokens.push(this.unk_token);
21177
21335
  }
21336
+ } else {
21337
+ outputTokens.push(this.unk_token);
21178
21338
  }
21179
21339
  }
21180
21340
  }
@@ -22846,11 +23006,10 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
22846
23006
  this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
22847
23007
  }
22848
23008
 
22849
-
22850
23009
  this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
22851
- this.added_tokens
23010
+ this.added_tokens.slice()
22852
23011
  // Sort by length (desc) to avoid early partial matches
22853
- .toSorted((a, b) => b.content.length - a.content.length)
23012
+ .sort((a, b) => b.content.length - a.content.length)
22854
23013
  .map(x => `${x.lstrip ? '\\s*' : ''}(${(0,_utils_core_js__WEBPACK_IMPORTED_MODULE_1__.escapeRegExp)(x.content)})${x.rstrip ? '\\s*' : ''}`)
22855
23014
  .join('|')
22856
23015
  ) : null;
@@ -23348,6 +23507,67 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
23348
23507
 
23349
23508
  return decoded;
23350
23509
  }
23510
+
23511
+ /**
23512
+ * Retrieve the chat template string used for tokenizing chat messages. This template is used
23513
+ * internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
23514
+ * template for better generation tracking.
23515
+ *
23516
+ * @param {Object} options An optional object containing the following properties:
23517
+ * @param {string} [options.chat_template=null]
23518
+ * A Jinja template or the name of a template to use for this conversion.
23519
+ * It is usually not necessary to pass anything to this argument,
23520
+ * as the model's template will be used by default.
23521
+ * @param {Object[]} [options.tools=null]
23522
+ * A list of tools (callable functions) that will be accessible to the model. If the template does not
23523
+ * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
23524
+ * giving the name, description and argument types for the tool. See our
23525
+ * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
23526
+ * for more information.
23527
+ * @returns {string} The chat template string.
23528
+ */
23529
+ get_chat_template({
23530
+ chat_template = null,
23531
+ tools = null,
23532
+ } = {}) {
23533
+
23534
+ // First, handle the cases when the model has a dict of multiple templates
23535
+ if (this.chat_template && typeof this.chat_template === 'object') {
23536
+ const template_dict = this.chat_template;
23537
+
23538
+ if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
23539
+ // The user can pass the name of a template to the chat template argument instead of an entire template
23540
+ chat_template = template_dict[chat_template];
23541
+ } else if (chat_template === null) {
23542
+ if (tools !== null && 'tool_use' in template_dict) {
23543
+ chat_template = template_dict['tool_use'];
23544
+ } else if ('default' in template_dict) {
23545
+ chat_template = template_dict['default'];
23546
+ } else {
23547
+ throw Error(
23548
+ `This model has multiple chat templates with no default specified! Please either pass a chat ` +
23549
+ `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
23550
+ `template names are ${Object.keys(template_dict).sort()}.`
23551
+ )
23552
+ }
23553
+ }
23554
+ } else if (chat_template === null) {
23555
+ // These are the cases when the model has a single template
23556
+ // priority: `chat_template` argument > `tokenizer.chat_template`
23557
+ if (this.chat_template) {
23558
+ chat_template = this.chat_template;
23559
+ } else {
23560
+ throw Error(
23561
+ "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
23562
+ "argument was passed! For information about writing templates and setting the " +
23563
+ "tokenizer.chat_template attribute, please see the documentation at " +
23564
+ "https://huggingface.co/docs/transformers/main/en/chat_templating"
23565
+ )
23566
+ }
23567
+ }
23568
+ return chat_template;
23569
+ }
23570
+
23351
23571
  /**
23352
23572
  * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
23353
23573
  * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
@@ -23421,39 +23641,8 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
23421
23641
  ...kwargs
23422
23642
  } = {}) {
23423
23643
 
23424
- // First, handle the cases when the model has a dict of multiple templates
23425
- if (
23426
- (this.chat_template && typeof this.chat_template === 'object')
23427
- || this.chat_template === null
23428
- ) {
23429
- const template_dict = this.chat_template;
23644
+ chat_template = this.get_chat_template({ chat_template, tools });
23430
23645
 
23431
- if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
23432
- // The user can pass the name of a template to the chat template argument instead of an entire template
23433
- chat_template = template_dict[chat_template];
23434
- } else if (chat_template === null && 'default' in template_dict) {
23435
- chat_template = template_dict['default'];
23436
- } else if (chat_template === null) {
23437
- throw Error(
23438
- `This model has multiple chat templates with no default specified! Please either pass a chat ` +
23439
- `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
23440
- `template names are ${Object.keys(template_dict).sort()}.`
23441
- )
23442
- }
23443
- } else {
23444
- // These are the cases when the model has a single template
23445
- // priority: `chat_template` argument > `tokenizer.chat_template`
23446
- if (this.chat_template) {
23447
- chat_template = this.chat_template;
23448
- } else {
23449
- throw Error(
23450
- "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
23451
- "argument was passed! For information about writing templates and setting the " +
23452
- "tokenizer.chat_template attribute, please see the documentation at " +
23453
- "https://huggingface.co/docs/transformers/main/en/chat_templating"
23454
- )
23455
- }
23456
- }
23457
23646
  if (typeof chat_template !== 'string') {
23458
23647
  throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
23459
23648
  }
@@ -27871,11 +28060,20 @@ function softmax(arr) {
27871
28060
  * @returns {T} The resulting log_softmax array.
27872
28061
  */
27873
28062
  function log_softmax(arr) {
27874
- // Compute the softmax values
27875
- const softmaxArr = softmax(arr);
28063
+ // Compute the maximum value in the array
28064
+ const maxVal = max(arr)[0];
28065
+
28066
+ // Compute the sum of the exponentials
28067
+ let sumExps = 0;
28068
+ for(let i = 0; i < arr.length; ++i) {
28069
+ sumExps += Math.exp(arr[i] - maxVal);
28070
+ }
28071
+
28072
+ // Compute the log of the sum
28073
+ const logSum = Math.log(sumExps);
27876
28074
 
27877
- // Apply log formula to each element
27878
- const logSoftmaxArr = softmaxArr.map(x => Math.log(x));
28075
+ // Compute the log-softmax values
28076
+ const logSoftmaxArr = arr.map(x => x - maxVal - logSum);
27879
28077
 
27880
28078
  return /** @type {T} */(logSoftmaxArr);
27881
28079
  }
@@ -30329,6 +30527,7 @@ __webpack_require__.r(__webpack_exports__);
30329
30527
  /* harmony export */ AutoModelForTextToSpectrogram: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTextToSpectrogram),
30330
30528
  /* harmony export */ AutoModelForTextToWaveform: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTextToWaveform),
30331
30529
  /* harmony export */ AutoModelForTokenClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTokenClassification),
30530
+ /* harmony export */ AutoModelForUniversalSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForUniversalSegmentation),
30332
30531
  /* harmony export */ AutoModelForVision2Seq: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForVision2Seq),
30333
30532
  /* harmony export */ AutoModelForXVector: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForXVector),
30334
30533
  /* harmony export */ AutoModelForZeroShotObjectDetection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForZeroShotObjectDetection),
@@ -30373,8 +30572,10 @@ __webpack_require__.r(__webpack_exports__);
30373
30572
  /* harmony export */ CLIPSegForImageSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegForImageSegmentation),
30374
30573
  /* harmony export */ CLIPSegModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegModel),
30375
30574
  /* harmony export */ CLIPSegPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegPreTrainedModel),
30575
+ /* harmony export */ CLIPTextModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPTextModel),
30376
30576
  /* harmony export */ CLIPTextModelWithProjection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPTextModelWithProjection),
30377
30577
  /* harmony export */ CLIPTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.CLIPTokenizer),
30578
+ /* harmony export */ CLIPVisionModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPVisionModel),
30378
30579
  /* harmony export */ CLIPVisionModelWithProjection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPVisionModelWithProjection),
30379
30580
  /* harmony export */ CamembertForMaskedLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CamembertForMaskedLM),
30380
30581
  /* harmony export */ CamembertForQuestionAnswering: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CamembertForQuestionAnswering),
@@ -30436,6 +30637,8 @@ __webpack_require__.r(__webpack_exports__);
30436
30637
  /* harmony export */ DebertaV2Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DebertaV2Model),
30437
30638
  /* harmony export */ DebertaV2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DebertaV2PreTrainedModel),
30438
30639
  /* harmony export */ DebertaV2Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.DebertaV2Tokenizer),
30640
+ /* harmony export */ DecisionTransformerModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DecisionTransformerModel),
30641
+ /* harmony export */ DecisionTransformerPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DecisionTransformerPreTrainedModel),
30439
30642
  /* harmony export */ DeiTFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.DeiTFeatureExtractor),
30440
30643
  /* harmony export */ DeiTForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DeiTForImageClassification),
30441
30644
  /* harmony export */ DeiTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DeiTModel),
@@ -30525,6 +30728,8 @@ __webpack_require__.r(__webpack_exports__);
30525
30728
  /* harmony export */ GemmaPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GemmaPreTrainedModel),
30526
30729
  /* harmony export */ GemmaTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.GemmaTokenizer),
30527
30730
  /* harmony export */ Grok1Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.Grok1Tokenizer),
30731
+ /* harmony export */ GroupViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTModel),
30732
+ /* harmony export */ GroupViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTPreTrainedModel),
30528
30733
  /* harmony export */ HerbertTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.HerbertTokenizer),
30529
30734
  /* harmony export */ HieraForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HieraForImageClassification),
30530
30735
  /* harmony export */ HieraModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HieraModel),
@@ -30578,6 +30783,10 @@ __webpack_require__.r(__webpack_exports__);
30578
30783
  /* harmony export */ MarianModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MarianModel),
30579
30784
  /* harmony export */ MarianPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MarianPreTrainedModel),
30580
30785
  /* harmony export */ MarianTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.MarianTokenizer),
30786
+ /* harmony export */ MaskFormerFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.MaskFormerFeatureExtractor),
30787
+ /* harmony export */ MaskFormerForInstanceSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerForInstanceSegmentation),
30788
+ /* harmony export */ MaskFormerModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerModel),
30789
+ /* harmony export */ MaskFormerPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerPreTrainedModel),
30581
30790
  /* harmony export */ MaskedLMOutput: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskedLMOutput),
30582
30791
  /* harmony export */ MaxLengthCriteria: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_11__.MaxLengthCriteria),
30583
30792
  /* harmony export */ MistralForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MistralForCausalLM),
@@ -30655,6 +30864,10 @@ __webpack_require__.r(__webpack_exports__);
30655
30864
  /* harmony export */ PretrainedConfig: () => (/* reexport safe */ _configs_js__WEBPACK_IMPORTED_MODULE_5__.PretrainedConfig),
30656
30865
  /* harmony export */ PretrainedMixin: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PretrainedMixin),
30657
30866
  /* harmony export */ Processor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.Processor),
30867
+ /* harmony export */ PvtForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtForImageClassification),
30868
+ /* harmony export */ PvtImageProcessor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.PvtImageProcessor),
30869
+ /* harmony export */ PvtModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtModel),
30870
+ /* harmony export */ PvtPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtPreTrainedModel),
30658
30871
  /* harmony export */ PyAnnoteFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.PyAnnoteFeatureExtractor),
30659
30872
  /* harmony export */ PyAnnoteForAudioFrameClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PyAnnoteForAudioFrameClassification),
30660
30873
  /* harmony export */ PyAnnoteModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PyAnnoteModel),
@@ -30775,6 +30988,11 @@ __webpack_require__.r(__webpack_exports__);
30775
30988
  /* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.ViTFeatureExtractor),
30776
30989
  /* harmony export */ ViTForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTForImageClassification),
30777
30990
  /* harmony export */ ViTImageProcessor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.ViTImageProcessor),
30991
+ /* harmony export */ ViTMAEModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMAEModel),
30992
+ /* harmony export */ ViTMAEPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMAEPreTrainedModel),
30993
+ /* harmony export */ ViTMSNForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNForImageClassification),
30994
+ /* harmony export */ ViTMSNModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNModel),
30995
+ /* harmony export */ ViTMSNPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNPreTrainedModel),
30778
30996
  /* harmony export */ ViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTModel),
30779
30997
  /* harmony export */ ViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTPreTrainedModel),
30780
30998
  /* harmony export */ VisionEncoderDecoderModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.VisionEncoderDecoderModel),