@huggingface/transformers 3.0.0-alpha.13 → 3.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3779,7 +3779,7 @@ __webpack_require__.r(__webpack_exports__);
3779
3779
  // Licensed under the MIT License.
3780
3780
  // This file is generated by /js/scripts/update-version.ts
3781
3781
  // Do not modify file content manually.
3782
- const version = '1.19.0';
3782
+ const version = '1.19.2';
3783
3783
  //# sourceMappingURL=version.js.map
3784
3784
 
3785
3785
  /***/ }),
@@ -4437,7 +4437,7 @@ __webpack_require__.r(__webpack_exports__);
4437
4437
 
4438
4438
 
4439
4439
 
4440
- const VERSION = '3.0.0-alpha.13';
4440
+ const VERSION = '3.0.0-alpha.15';
4441
4441
 
4442
4442
  // Check if various APIs are available (depends on environment)
4443
4443
  const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -4484,19 +4484,19 @@ const apis = Object.freeze({
4484
4484
  });
4485
4485
 
4486
4486
  const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
4487
- const __dirname = RUNNING_LOCALLY
4487
+ const dirname__ = RUNNING_LOCALLY
4488
4488
  ? path__WEBPACK_IMPORTED_MODULE_1__.dirname(path__WEBPACK_IMPORTED_MODULE_1__.dirname(url__WEBPACK_IMPORTED_MODULE_2__.fileURLToPath("file:///workspaces/transformers.js/src/env.js")))
4489
4489
  : './';
4490
4490
 
4491
4491
  // Only used for environments with access to file system
4492
4492
  const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
4493
- ? path__WEBPACK_IMPORTED_MODULE_1__.join(__dirname, '/.cache/')
4493
+ ? path__WEBPACK_IMPORTED_MODULE_1__.join(dirname__, '/.cache/')
4494
4494
  : null;
4495
4495
 
4496
4496
  // Set local model path, based on available APIs
4497
4497
  const DEFAULT_LOCAL_MODEL_PATH = '/models/';
4498
4498
  const localModelPath = RUNNING_LOCALLY
4499
- ? path__WEBPACK_IMPORTED_MODULE_1__.join(__dirname, DEFAULT_LOCAL_MODEL_PATH)
4499
+ ? path__WEBPACK_IMPORTED_MODULE_1__.join(dirname__, DEFAULT_LOCAL_MODEL_PATH)
4500
4500
  : DEFAULT_LOCAL_MODEL_PATH;
4501
4501
 
4502
4502
  /**
@@ -5533,18 +5533,18 @@ class NoBadWordsLogitsProcessor extends LogitsProcessor {
5533
5533
  _call(input_ids, logits) {
5534
5534
  for (let i = 0; i < input_ids.length; ++i) {
5535
5535
  const batch_logits_data = /** @type {Float32Array} */(logits[i].data);
5536
-
5536
+ const ids = input_ids[i];
5537
5537
  for (const bad_word_ids of this.bad_words_ids) {
5538
5538
  // Whether to modify the logits of the last token in the bad word id sequence
5539
5539
  let mark = true;
5540
5540
 
5541
5541
  // For each bad word in the list, if the current sequence of input ids ends with this sequence (excluding the last),
5542
5542
  // then we set the logits of the last bad word id to -Infinity.
5543
- for (let i = 1; i <= bad_word_ids.length - 1 && bad_word_ids.length < input_ids[i].length; ++i) {
5543
+ for (let j = 1; j <= bad_word_ids.length - 1 && bad_word_ids.length < ids.length; ++j) {
5544
5544
 
5545
5545
  // NOTE: We use != instead of !== to compare bigint and number
5546
5546
  // @ts-ignore
5547
- if (bad_word_ids.at(-i - 1) != input_ids[i].at(-i)) {
5547
+ if (bad_word_ids.at(-j - 1) != ids.at(-j)) {
5548
5548
  // We have found a mismatch
5549
5549
  mark = false;
5550
5550
  break;
@@ -6382,6 +6382,7 @@ __webpack_require__.r(__webpack_exports__);
6382
6382
  /* harmony export */ AutoModelForTextToSpectrogram: () => (/* binding */ AutoModelForTextToSpectrogram),
6383
6383
  /* harmony export */ AutoModelForTextToWaveform: () => (/* binding */ AutoModelForTextToWaveform),
6384
6384
  /* harmony export */ AutoModelForTokenClassification: () => (/* binding */ AutoModelForTokenClassification),
6385
+ /* harmony export */ AutoModelForUniversalSegmentation: () => (/* binding */ AutoModelForUniversalSegmentation),
6385
6386
  /* harmony export */ AutoModelForVision2Seq: () => (/* binding */ AutoModelForVision2Seq),
6386
6387
  /* harmony export */ AutoModelForXVector: () => (/* binding */ AutoModelForXVector),
6387
6388
  /* harmony export */ AutoModelForZeroShotObjectDetection: () => (/* binding */ AutoModelForZeroShotObjectDetection),
@@ -6413,7 +6414,9 @@ __webpack_require__.r(__webpack_exports__);
6413
6414
  /* harmony export */ CLIPSegForImageSegmentation: () => (/* binding */ CLIPSegForImageSegmentation),
6414
6415
  /* harmony export */ CLIPSegModel: () => (/* binding */ CLIPSegModel),
6415
6416
  /* harmony export */ CLIPSegPreTrainedModel: () => (/* binding */ CLIPSegPreTrainedModel),
6417
+ /* harmony export */ CLIPTextModel: () => (/* binding */ CLIPTextModel),
6416
6418
  /* harmony export */ CLIPTextModelWithProjection: () => (/* binding */ CLIPTextModelWithProjection),
6419
+ /* harmony export */ CLIPVisionModel: () => (/* binding */ CLIPVisionModel),
6417
6420
  /* harmony export */ CLIPVisionModelWithProjection: () => (/* binding */ CLIPVisionModelWithProjection),
6418
6421
  /* harmony export */ CamembertForMaskedLM: () => (/* binding */ CamembertForMaskedLM),
6419
6422
  /* harmony export */ CamembertForQuestionAnswering: () => (/* binding */ CamembertForQuestionAnswering),
@@ -6462,6 +6465,8 @@ __webpack_require__.r(__webpack_exports__);
6462
6465
  /* harmony export */ DebertaV2ForTokenClassification: () => (/* binding */ DebertaV2ForTokenClassification),
6463
6466
  /* harmony export */ DebertaV2Model: () => (/* binding */ DebertaV2Model),
6464
6467
  /* harmony export */ DebertaV2PreTrainedModel: () => (/* binding */ DebertaV2PreTrainedModel),
6468
+ /* harmony export */ DecisionTransformerModel: () => (/* binding */ DecisionTransformerModel),
6469
+ /* harmony export */ DecisionTransformerPreTrainedModel: () => (/* binding */ DecisionTransformerPreTrainedModel),
6465
6470
  /* harmony export */ DeiTForImageClassification: () => (/* binding */ DeiTForImageClassification),
6466
6471
  /* harmony export */ DeiTModel: () => (/* binding */ DeiTModel),
6467
6472
  /* harmony export */ DeiTPreTrainedModel: () => (/* binding */ DeiTPreTrainedModel),
@@ -6530,6 +6535,11 @@ __webpack_require__.r(__webpack_exports__);
6530
6535
  /* harmony export */ GemmaForCausalLM: () => (/* binding */ GemmaForCausalLM),
6531
6536
  /* harmony export */ GemmaModel: () => (/* binding */ GemmaModel),
6532
6537
  /* harmony export */ GemmaPreTrainedModel: () => (/* binding */ GemmaPreTrainedModel),
6538
+ /* harmony export */ GroupViTModel: () => (/* binding */ GroupViTModel),
6539
+ /* harmony export */ GroupViTPreTrainedModel: () => (/* binding */ GroupViTPreTrainedModel),
6540
+ /* harmony export */ HieraForImageClassification: () => (/* binding */ HieraForImageClassification),
6541
+ /* harmony export */ HieraModel: () => (/* binding */ HieraModel),
6542
+ /* harmony export */ HieraPreTrainedModel: () => (/* binding */ HieraPreTrainedModel),
6533
6543
  /* harmony export */ HubertForCTC: () => (/* binding */ HubertForCTC),
6534
6544
  /* harmony export */ HubertForSequenceClassification: () => (/* binding */ HubertForSequenceClassification),
6535
6545
  /* harmony export */ HubertModel: () => (/* binding */ HubertModel),
@@ -6566,6 +6576,9 @@ __webpack_require__.r(__webpack_exports__);
6566
6576
  /* harmony export */ MarianMTModel: () => (/* binding */ MarianMTModel),
6567
6577
  /* harmony export */ MarianModel: () => (/* binding */ MarianModel),
6568
6578
  /* harmony export */ MarianPreTrainedModel: () => (/* binding */ MarianPreTrainedModel),
6579
+ /* harmony export */ MaskFormerForInstanceSegmentation: () => (/* binding */ MaskFormerForInstanceSegmentation),
6580
+ /* harmony export */ MaskFormerModel: () => (/* binding */ MaskFormerModel),
6581
+ /* harmony export */ MaskFormerPreTrainedModel: () => (/* binding */ MaskFormerPreTrainedModel),
6569
6582
  /* harmony export */ MaskedLMOutput: () => (/* binding */ MaskedLMOutput),
6570
6583
  /* harmony export */ MistralForCausalLM: () => (/* binding */ MistralForCausalLM),
6571
6584
  /* harmony export */ MistralModel: () => (/* binding */ MistralModel),
@@ -6624,6 +6637,9 @@ __webpack_require__.r(__webpack_exports__);
6624
6637
  /* harmony export */ PhiPreTrainedModel: () => (/* binding */ PhiPreTrainedModel),
6625
6638
  /* harmony export */ PreTrainedModel: () => (/* binding */ PreTrainedModel),
6626
6639
  /* harmony export */ PretrainedMixin: () => (/* binding */ PretrainedMixin),
6640
+ /* harmony export */ PvtForImageClassification: () => (/* binding */ PvtForImageClassification),
6641
+ /* harmony export */ PvtModel: () => (/* binding */ PvtModel),
6642
+ /* harmony export */ PvtPreTrainedModel: () => (/* binding */ PvtPreTrainedModel),
6627
6643
  /* harmony export */ PyAnnoteForAudioFrameClassification: () => (/* binding */ PyAnnoteForAudioFrameClassification),
6628
6644
  /* harmony export */ PyAnnoteModel: () => (/* binding */ PyAnnoteModel),
6629
6645
  /* harmony export */ PyAnnotePreTrainedModel: () => (/* binding */ PyAnnotePreTrainedModel),
@@ -6709,6 +6725,11 @@ __webpack_require__.r(__webpack_exports__);
6709
6725
  /* harmony export */ UniSpeechSatModel: () => (/* binding */ UniSpeechSatModel),
6710
6726
  /* harmony export */ UniSpeechSatPreTrainedModel: () => (/* binding */ UniSpeechSatPreTrainedModel),
6711
6727
  /* harmony export */ ViTForImageClassification: () => (/* binding */ ViTForImageClassification),
6728
+ /* harmony export */ ViTMAEModel: () => (/* binding */ ViTMAEModel),
6729
+ /* harmony export */ ViTMAEPreTrainedModel: () => (/* binding */ ViTMAEPreTrainedModel),
6730
+ /* harmony export */ ViTMSNForImageClassification: () => (/* binding */ ViTMSNForImageClassification),
6731
+ /* harmony export */ ViTMSNModel: () => (/* binding */ ViTMSNModel),
6732
+ /* harmony export */ ViTMSNPreTrainedModel: () => (/* binding */ ViTMSNPreTrainedModel),
6712
6733
  /* harmony export */ ViTModel: () => (/* binding */ ViTModel),
6713
6734
  /* harmony export */ ViTPreTrainedModel: () => (/* binding */ ViTPreTrainedModel),
6714
6735
  /* harmony export */ VisionEncoderDecoderModel: () => (/* binding */ VisionEncoderDecoderModel),
@@ -6958,6 +6979,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
6958
6979
  });
6959
6980
  if (Object.keys(shapes).length > 0 && !(0,_backends_onnx_js__WEBPACK_IMPORTED_MODULE_1__.isONNXProxy)()) {
6960
6981
  // Only set preferredOutputLocation if shapes are present and we aren't proxying ONNX
6982
+ /** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
6961
6983
  const preferredOutputLocation = {};
6962
6984
  for (const key in shapes) {
6963
6985
  preferredOutputLocation[key] = 'gpu-buffer';
@@ -10223,6 +10245,18 @@ class CLIPPreTrainedModel extends PreTrainedModel { }
10223
10245
  */
10224
10246
  class CLIPModel extends CLIPPreTrainedModel { }
10225
10247
 
10248
+ /**
10249
+ * The text model from CLIP without any head or projection on top.
10250
+ */
10251
+ class CLIPTextModel extends CLIPPreTrainedModel {
10252
+ /** @type {PreTrainedModel.from_pretrained} */
10253
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10254
+ // Update default model file name if not provided
10255
+ options.model_file_name ??= 'text_model';
10256
+ return super.from_pretrained(pretrained_model_name_or_path, options);
10257
+ }
10258
+ }
10259
+
10226
10260
  /**
10227
10261
  * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
10228
10262
  *
@@ -10250,7 +10284,6 @@ class CLIPModel extends CLIPPreTrainedModel { }
10250
10284
  * ```
10251
10285
  */
10252
10286
  class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
10253
-
10254
10287
  /** @type {PreTrainedModel.from_pretrained} */
10255
10288
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10256
10289
  // Update default model file name if not provided
@@ -10259,6 +10292,18 @@ class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
10259
10292
  }
10260
10293
  }
10261
10294
 
10295
+ /**
10296
+ * The vision model from CLIP without any head or projection on top.
10297
+ */
10298
+ class CLIPVisionModel extends CLIPPreTrainedModel {
10299
+ /** @type {PreTrainedModel.from_pretrained} */
10300
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
10301
+ // Update default model file name if not provided
10302
+ options.model_file_name ??= 'vision_model';
10303
+ return super.from_pretrained(pretrained_model_name_or_path, options);
10304
+ }
10305
+ }
10306
+
10262
10307
  /**
10263
10308
  * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
10264
10309
  *
@@ -10925,6 +10970,43 @@ class ViTForImageClassification extends ViTPreTrainedModel {
10925
10970
  }
10926
10971
  //////////////////////////////////////////////////
10927
10972
 
10973
+ //////////////////////////////////////////////////
10974
+ class PvtPreTrainedModel extends PreTrainedModel { }
10975
+ class PvtModel extends PvtPreTrainedModel { }
10976
+ class PvtForImageClassification extends PvtPreTrainedModel {
10977
+ /**
10978
+ * @param {any} model_inputs
10979
+ */
10980
+ async _call(model_inputs) {
10981
+ return new SequenceClassifierOutput(await super._call(model_inputs));
10982
+ }
10983
+ }
10984
+ //////////////////////////////////////////////////
10985
+
10986
+ //////////////////////////////////////////////////
10987
+ class ViTMAEPreTrainedModel extends PreTrainedModel { }
10988
+ class ViTMAEModel extends ViTMAEPreTrainedModel { }
10989
+ //////////////////////////////////////////////////
10990
+
10991
+
10992
+ //////////////////////////////////////////////////
10993
+ class ViTMSNPreTrainedModel extends PreTrainedModel { }
10994
+ class ViTMSNModel extends ViTMSNPreTrainedModel { }
10995
+ class ViTMSNForImageClassification extends ViTMSNPreTrainedModel {
10996
+ /**
10997
+ * @param {any} model_inputs
10998
+ */
10999
+ async _call(model_inputs) {
11000
+ return new SequenceClassifierOutput(await super._call(model_inputs));
11001
+ }
11002
+ }
11003
+ //////////////////////////////////////////////////
11004
+
11005
+ //////////////////////////////////////////////////
11006
+ class GroupViTPreTrainedModel extends PreTrainedModel { }
11007
+ class GroupViTModel extends GroupViTPreTrainedModel { }
11008
+ //////////////////////////////////////////////////
11009
+
10928
11010
 
10929
11011
  //////////////////////////////////////////////////
10930
11012
  class FastViTPreTrainedModel extends PreTrainedModel { }
@@ -11178,6 +11260,19 @@ class DeiTForImageClassification extends DeiTPreTrainedModel {
11178
11260
  }
11179
11261
  //////////////////////////////////////////////////
11180
11262
 
11263
+ //////////////////////////////////////////////////
11264
+ class HieraPreTrainedModel extends PreTrainedModel { }
11265
+ class HieraModel extends HieraPreTrainedModel { }
11266
+ class HieraForImageClassification extends HieraPreTrainedModel {
11267
+ /**
11268
+ * @param {any} model_inputs
11269
+ */
11270
+ async _call(model_inputs) {
11271
+ return new SequenceClassifierOutput(await super._call(model_inputs));
11272
+ }
11273
+ }
11274
+ //////////////////////////////////////////////////
11275
+
11181
11276
 
11182
11277
  //////////////////////////////////////////////////
11183
11278
  /**
@@ -11324,6 +11419,11 @@ class SapiensForDepthEstimation extends SapiensPreTrainedModel { }
11324
11419
  class SapiensForNormalEstimation extends SapiensPreTrainedModel { }
11325
11420
  //////////////////////////////////////////////////
11326
11421
 
11422
+ //////////////////////////////////////////////////
11423
+ class MaskFormerPreTrainedModel extends PreTrainedModel { }
11424
+ class MaskFormerModel extends MaskFormerPreTrainedModel { }
11425
+ class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel { }
11426
+ //////////////////////////////////////////////////
11327
11427
 
11328
11428
  //////////////////////////////////////////////////
11329
11429
  class GLPNPreTrainedModel extends PreTrainedModel { }
@@ -12846,6 +12946,7 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
12846
12946
  return audio_values;
12847
12947
  }
12848
12948
  }
12949
+ //////////////////////////////////////////////////
12849
12950
 
12850
12951
  //////////////////////////////////////////////////
12851
12952
  // MobileNetV1 models
@@ -12939,6 +13040,17 @@ class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel {
12939
13040
  }
12940
13041
  //////////////////////////////////////////////////
12941
13042
 
13043
+ //////////////////////////////////////////////////
13044
+ // Decision Transformer models
13045
+ class DecisionTransformerPreTrainedModel extends PreTrainedModel { }
13046
+
13047
+ /**
13048
+ * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
13049
+ * Refer to the paper for more details: https://arxiv.org/abs/2106.01345
13050
+ */
13051
+ class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }
13052
+
13053
+ //////////////////////////////////////////////////
12942
13054
 
12943
13055
  //////////////////////////////////////////////////
12944
13056
  // AutoModels, used to simplify construction of PreTrainedModels
@@ -12977,7 +13089,7 @@ class PretrainedMixin {
12977
13089
  session_options = {},
12978
13090
  } = {}) {
12979
13091
 
12980
- let options = {
13092
+ const options = {
12981
13093
  progress_callback,
12982
13094
  config,
12983
13095
  cache_dir,
@@ -12996,7 +13108,7 @@ class PretrainedMixin {
12996
13108
  throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
12997
13109
  }
12998
13110
 
12999
- for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
13111
+ for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
13000
13112
  const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
13001
13113
  if (!modelInfo) {
13002
13114
  continue; // Item not found in this mapping
@@ -13051,6 +13163,10 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
13051
13163
  ['rt_detr', ['RTDetrModel', RTDetrModel]],
13052
13164
  ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
13053
13165
  ['vit', ['ViTModel', ViTModel]],
13166
+ ['pvt', ['PvtModel', PvtModel]],
13167
+ ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
13168
+ ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
13169
+ ['groupvit', ['GroupViTModel', GroupViTModel]],
13054
13170
  ['fastvit', ['FastViTModel', FastViTModel]],
13055
13171
  ['mobilevit', ['MobileViTModel', MobileViTModel]],
13056
13172
  ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
@@ -13058,6 +13174,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
13058
13174
  ['owlv2', ['Owlv2Model', Owlv2Model]],
13059
13175
  ['beit', ['BeitModel', BeitModel]],
13060
13176
  ['deit', ['DeiTModel', DeiTModel]],
13177
+ ['hiera', ['HieraModel', HieraModel]],
13061
13178
  ['convnext', ['ConvNextModel', ConvNextModel]],
13062
13179
  ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
13063
13180
  ['dinov2', ['Dinov2Model', Dinov2Model]],
@@ -13072,10 +13189,14 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
13072
13189
  ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
13073
13190
  ['efficientnet', ['EfficientNetModel', EfficientNetModel]],
13074
13191
 
13192
+ ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]],
13193
+
13075
13194
  ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]],
13076
13195
  ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]],
13077
13196
  ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]],
13078
13197
  ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]],
13198
+
13199
+ ['maskformer', ['MaskFormerModel', MaskFormerModel]],
13079
13200
  ]);
13080
13201
 
13081
13202
  const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -13260,11 +13381,14 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
13260
13381
 
13261
13382
  const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
13262
13383
  ['vit', ['ViTForImageClassification', ViTForImageClassification]],
13384
+ ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
13385
+ ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
13263
13386
  ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
13264
13387
  ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
13265
13388
  ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
13266
13389
  ['beit', ['BeitForImageClassification', BeitForImageClassification]],
13267
13390
  ['deit', ['DeiTForImageClassification', DeiTForImageClassification]],
13391
+ ['hiera', ['HieraForImageClassification', HieraForImageClassification]],
13268
13392
  ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
13269
13393
  ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]],
13270
13394
  ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
@@ -13291,6 +13415,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
13291
13415
  ]);
13292
13416
 
13293
13417
  const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
13418
+ // TODO: Do not add new models here
13294
13419
  ['detr', ['DetrForSegmentation', DetrForSegmentation]],
13295
13420
  ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
13296
13421
  ]);
@@ -13300,6 +13425,11 @@ const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
13300
13425
  ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
13301
13426
  ]);
13302
13427
 
13428
+ const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
13429
+ ['detr', ['DetrForSegmentation', DetrForSegmentation]],
13430
+ ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
13431
+ ]);
13432
+
13303
13433
  const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
13304
13434
  ['sam', ['SamModel', SamModel]],
13305
13435
  ]);
@@ -13375,6 +13505,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
13375
13505
  [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
13376
13506
  [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13377
13507
  [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13508
+ [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13378
13509
  [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13379
13510
  [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
13380
13511
  [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -13577,6 +13708,17 @@ class AutoModelForSemanticSegmentation extends PretrainedMixin {
13577
13708
  static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
13578
13709
  }
13579
13710
 
13711
+ /**
13712
+ * Helper class which is used to instantiate pretrained universal image segmentation models with the `from_pretrained` function.
13713
+ * The chosen model class is determined by the type specified in the model config.
13714
+ *
13715
+ * @example
13716
+ * let model = await AutoModelForUniversalSegmentation.from_pretrained('hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation');
13717
+ */
13718
+ class AutoModelForUniversalSegmentation extends PretrainedMixin {
13719
+ static MODEL_CLASS_MAPPINGS = [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES];
13720
+ }
13721
+
13580
13722
  /**
13581
13723
  * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
13582
13724
  * The chosen model class is determined by the type specified in the model config.
@@ -14092,20 +14234,31 @@ __webpack_require__.r(__webpack_exports__);
14092
14234
 
14093
14235
 
14094
14236
 
14237
+ /**
14238
+ * Asynchronously creates a wrapper function for running an ONNX inference session.
14239
+ *
14240
+ * @param {number[]} session_bytes The session data in bytes.
14241
+ * @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options The options for the ONNX session.
14242
+ * @template {string | [string] | string[]} T
14243
+ * @param {T} names The name(s) of the output tensor(s).
14244
+ *
14245
+ * @returns {Promise<function(Record<string, Tensor>): Promise<T extends string ? Tensor : T extends string[] ? { [K in keyof T]: Tensor } : never>>}
14246
+ * The wrapper function for running the ONNX inference session.
14247
+ */
14095
14248
  const wrap = async (session_bytes, session_options, names) => {
14096
14249
  const session = await (0,_backends_onnx_js__WEBPACK_IMPORTED_MODULE_0__.createInferenceSession)(
14097
14250
  new Uint8Array(session_bytes), session_options,
14098
14251
  );
14099
- return async (inputs) => {
14252
+ return /** @type {any} */(async (/** @type {Record<string, Tensor>} */ inputs) => {
14100
14253
  const ortFeed = Object.fromEntries(Object.entries(inputs).map(([k, v]) => [k, v.ort_tensor]));
14101
14254
  const outputs = await session.run(ortFeed);
14102
14255
 
14103
14256
  if (Array.isArray(names)) {
14104
14257
  return names.map((n) => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.Tensor(outputs[n]));
14105
14258
  } else {
14106
- return new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.Tensor(outputs[names]);
14259
+ return new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.Tensor(outputs[/** @type {string} */(names)]);
14107
14260
  }
14108
- }
14261
+ })
14109
14262
  }
14110
14263
 
14111
14264
  // In-memory registry of initialized ONNX operators
@@ -17229,7 +17382,7 @@ const SUPPORTED_TASKS = Object.freeze({
17229
17382
  "image-segmentation": {
17230
17383
  // no tokenizer
17231
17384
  "pipeline": ImageSegmentationPipeline,
17232
- "model": [_models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForImageSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForSemanticSegmentation],
17385
+ "model": [_models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForImageSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForSemanticSegmentation, _models_js__WEBPACK_IMPORTED_MODULE_1__.AutoModelForUniversalSegmentation],
17233
17386
  "processor": _processors_js__WEBPACK_IMPORTED_MODULE_2__.AutoProcessor,
17234
17387
  "default": {
17235
17388
  // TODO: replace with original
@@ -17471,7 +17624,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17471
17624
 
17472
17625
  /**@type {Promise[]} */
17473
17626
  const promises = [];
17474
- for (let [name, cls] of mapping.entries()) {
17627
+ for (const [name, cls] of mapping.entries()) {
17475
17628
  if (!cls) continue;
17476
17629
 
17477
17630
  /**@type {Promise} */
@@ -17479,7 +17632,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17479
17632
  if (Array.isArray(cls)) {
17480
17633
  promise = new Promise(async (resolve, reject) => {
17481
17634
  let e;
17482
- for (let c of cls) {
17635
+ for (const c of cls) {
17483
17636
  if (c === null) {
17484
17637
  // If null, we resolve it immediately, meaning the relevant
17485
17638
  // class was not found, but it is optional.
@@ -17517,7 +17670,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
17517
17670
  await Promise.all(promises);
17518
17671
 
17519
17672
  // Then assign to result
17520
- for (let [name, promise] of Object.entries(result)) {
17673
+ for (const [name, promise] of Object.entries(result)) {
17521
17674
  result[name] = await promise;
17522
17675
  }
17523
17676
 
@@ -17555,6 +17708,7 @@ __webpack_require__.r(__webpack_exports__);
17555
17708
  /* harmony export */ Florence2Processor: () => (/* binding */ Florence2Processor),
17556
17709
  /* harmony export */ GLPNFeatureExtractor: () => (/* binding */ GLPNFeatureExtractor),
17557
17710
  /* harmony export */ ImageFeatureExtractor: () => (/* binding */ ImageFeatureExtractor),
17711
+ /* harmony export */ MaskFormerFeatureExtractor: () => (/* binding */ MaskFormerFeatureExtractor),
17558
17712
  /* harmony export */ MobileNetV1FeatureExtractor: () => (/* binding */ MobileNetV1FeatureExtractor),
17559
17713
  /* harmony export */ MobileNetV2FeatureExtractor: () => (/* binding */ MobileNetV2FeatureExtractor),
17560
17714
  /* harmony export */ MobileNetV3FeatureExtractor: () => (/* binding */ MobileNetV3FeatureExtractor),
@@ -17566,6 +17720,7 @@ __webpack_require__.r(__webpack_exports__);
17566
17720
  /* harmony export */ OwlViTProcessor: () => (/* binding */ OwlViTProcessor),
17567
17721
  /* harmony export */ Owlv2ImageProcessor: () => (/* binding */ Owlv2ImageProcessor),
17568
17722
  /* harmony export */ Processor: () => (/* binding */ Processor),
17723
+ /* harmony export */ PvtImageProcessor: () => (/* binding */ PvtImageProcessor),
17569
17724
  /* harmony export */ PyAnnoteFeatureExtractor: () => (/* binding */ PyAnnoteFeatureExtractor),
17570
17725
  /* harmony export */ PyAnnoteProcessor: () => (/* binding */ PyAnnoteProcessor),
17571
17726
  /* harmony export */ RTDetrImageProcessor: () => (/* binding */ RTDetrImageProcessor),
@@ -17654,7 +17809,7 @@ function center_to_corners_format([centerX, centerY, width, height]) {
17654
17809
  * @param {Tensor} outputs.logits The logits
17655
17810
  * @param {Tensor} outputs.pred_boxes The predicted boxes.
17656
17811
  * @param {number} [threshold=0.5] The threshold to use for the scores.
17657
- * @param {number[][]} [target_sizes=null] The sizes of the original images.
17812
+ * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
17658
17813
  * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
17659
17814
  * @return {Object[]} An array of objects containing the post-processed outputs.
17660
17815
  * @private
@@ -17735,7 +17890,7 @@ function post_process_object_detection(outputs, threshold = 0.5, target_sizes =
17735
17890
  /**
17736
17891
  * Post-processes the outputs of the model (for semantic segmentation).
17737
17892
  * @param {*} outputs Raw outputs of the model.
17738
- * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size
17893
+ * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
17739
17894
  * (height, width) of each prediction. If unset, predictions will not be resized.
17740
17895
  * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
17741
17896
  */
@@ -17783,9 +17938,8 @@ function post_process_semantic_segmentation(outputs, target_sizes = null) {
17783
17938
  // Store which objects have labels
17784
17939
  // This is much more efficient that creating a set of the final values
17785
17940
  const hasLabel = new Array(data.dims[0]);
17786
- const out = segmentation.data;
17787
- for (let j = 0; j < out.length; ++j) {
17788
- const index = out[j];
17941
+ for (let j = 0; j < segmentation_data.length; ++j) {
17942
+ const index = segmentation_data[j];
17789
17943
  hasLabel[index] = index;
17790
17944
  }
17791
17945
  /** @type {number[]} The unique list of labels that were detected */
@@ -17796,6 +17950,300 @@ function post_process_semantic_segmentation(outputs, target_sizes = null) {
17796
17950
  return toReturn;
17797
17951
  }
17798
17952
 
17953
+
17954
+ /**
17955
+ * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`.
17956
+ * @param {Tensor} class_logits The class logits.
17957
+ * @param {Tensor} mask_logits The mask logits.
17958
+ * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks.
17959
+ * @param {number} num_labels The number of labels.
17960
+ * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels.
17961
+ * @private
17962
+ */
17963
+ function remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
17964
+
17965
+ const mask_probs_item = [];
17966
+ const pred_scores_item = [];
17967
+ const pred_labels_item = [];
17968
+
17969
+ for (let j = 0; j < class_logits.dims[0]; ++j) {
17970
+ const cls = class_logits[j];
17971
+ const mask = mask_logits[j];
17972
+
17973
+ const pred_label = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(cls.data)[1];
17974
+ if (pred_label === num_labels) {
17975
+ // Is the background, so we ignore it
17976
+ continue;
17977
+ }
17978
+
17979
+ const scores = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.softmax)(cls.data);
17980
+ const pred_score = scores[pred_label];
17981
+ if (pred_score > object_mask_threshold) {
17982
+ mask_probs_item.push(mask);
17983
+ pred_scores_item.push(pred_score);
17984
+ pred_labels_item.push(pred_label);
17985
+ }
17986
+ }
17987
+
17988
+ return [mask_probs_item, pred_scores_item, pred_labels_item];
17989
+ }
17990
+
17991
+ /**
17992
+ * Checks whether the segment is valid or not.
17993
+ * @param {Int32Array} mask_labels Labels for each pixel in the mask.
17994
+ * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
17995
+ * @param {number} k The class id of the segment.
17996
+ * @param {number} mask_threshold The mask threshold.
17997
+ * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
17998
+ * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels.
17999
+ * @private
18000
+ */
18001
+ function check_segment_validity(
18002
+ mask_labels,
18003
+ mask_probs,
18004
+ k,
18005
+ mask_threshold = 0.5,
18006
+ overlap_mask_area_threshold = 0.8
18007
+ ) {
18008
+ // mask_k is a 1D array of indices, indicating where the mask is equal to k
18009
+ const mask_k = [];
18010
+ let mask_k_area = 0;
18011
+ let original_area = 0;
18012
+
18013
+ const mask_probs_k_data = mask_probs[k].data;
18014
+
18015
+ // Compute the area of all the stuff in query k
18016
+ for (let i = 0; i < mask_labels.length; ++i) {
18017
+ if (mask_labels[i] === k) {
18018
+ mask_k.push(i);
18019
+ ++mask_k_area;
18020
+ }
18021
+
18022
+ if (mask_probs_k_data[i] >= mask_threshold) {
18023
+ ++original_area;
18024
+ }
18025
+ }
18026
+ let mask_exists = mask_k_area > 0 && original_area > 0;
18027
+
18028
+ // Eliminate disconnected tiny segments
18029
+ if (mask_exists) {
18030
+ // Perform additional check
18031
+ let area_ratio = mask_k_area / original_area;
18032
+ mask_exists = area_ratio > overlap_mask_area_threshold;
18033
+ }
18034
+
18035
+ return [mask_exists, mask_k]
18036
+ }
18037
+
18038
+ /**
18039
+ * Computes the segments.
18040
+ * @param {Tensor[]} mask_probs The mask probabilities.
18041
+ * @param {number[]} pred_scores The predicted scores.
18042
+ * @param {number[]} pred_labels The predicted labels.
18043
+ * @param {number} mask_threshold The mask threshold.
18044
+ * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18045
+ * @param {Set<number>} label_ids_to_fuse The label ids to fuse.
18046
+ * @param {number[]} target_size The target size of the image.
18047
+ * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
18048
+ * @private
18049
+ */
18050
+ function compute_segments(
18051
+ mask_probs,
18052
+ pred_scores,
18053
+ pred_labels,
18054
+ mask_threshold,
18055
+ overlap_mask_area_threshold,
18056
+ label_ids_to_fuse = null,
18057
+ target_size = null,
18058
+ ) {
18059
+ const [height, width] = target_size ?? mask_probs[0].dims;
18060
+
18061
+ const segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18062
+ 'int32',
18063
+ new Int32Array(height * width),
18064
+ [height, width]
18065
+ );
18066
+ const segments = [];
18067
+
18068
+ // 1. If target_size is not null, we need to resize the masks to the target size
18069
+ if (target_size !== null) {
18070
+ // resize the masks to the target size
18071
+ for (let i = 0; i < mask_probs.length; ++i) {
18072
+ mask_probs[i] = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.interpolate)(mask_probs[i], target_size, 'bilinear', false);
18073
+ }
18074
+ }
18075
+
18076
+ // 2. Weigh each mask by its prediction score
18077
+ // NOTE: `mask_probs` is updated in-place
18078
+ //
18079
+ // Temporary storage for the best label/scores for each pixel ([height, width]):
18080
+ const mask_labels = new Int32Array(mask_probs[0].data.length);
18081
+ const bestScores = new Float32Array(mask_probs[0].data.length);
18082
+
18083
+ for (let i = 0; i < mask_probs.length; ++i) {
18084
+ let score = pred_scores[i];
18085
+
18086
+ const mask_probs_i_data = mask_probs[i].data;
18087
+
18088
+ for (let j = 0; j < mask_probs_i_data.length; ++j) {
18089
+ mask_probs_i_data[j] *= score
18090
+ if (mask_probs_i_data[j] > bestScores[j]) {
18091
+ mask_labels[j] = i;
18092
+ bestScores[j] = mask_probs_i_data[j];
18093
+ }
18094
+ }
18095
+ }
18096
+
18097
+ let current_segment_id = 0;
18098
+
18099
+ // let stuff_memory_list = {}
18100
+ const segmentation_data = segmentation.data;
18101
+ for (let k = 0; k < pred_labels.length; ++k) {
18102
+ const pred_class = pred_labels[k];
18103
+
18104
+ // TODO add `should_fuse`
18105
+ // let should_fuse = pred_class in label_ids_to_fuse
18106
+
18107
+ // Check if mask exists and large enough to be a segment
18108
+ const [mask_exists, mask_k] = check_segment_validity(
18109
+ mask_labels,
18110
+ mask_probs,
18111
+ k,
18112
+ mask_threshold,
18113
+ overlap_mask_area_threshold
18114
+ )
18115
+
18116
+ if (!mask_exists) {
18117
+ // Nothing to see here
18118
+ continue;
18119
+ }
18120
+
18121
+ // TODO
18122
+ // if (pred_class in stuff_memory_list) {
18123
+ // current_segment_id = stuff_memory_list[pred_class]
18124
+ // } else {
18125
+ // current_segment_id += 1;
18126
+ // }
18127
+ ++current_segment_id;
18128
+
18129
+
18130
+ // Add current object segment to final segmentation map
18131
+ for (const index of mask_k) {
18132
+ segmentation_data[index] = current_segment_id;
18133
+ }
18134
+
18135
+ segments.push({
18136
+ id: current_segment_id,
18137
+ label_id: pred_class,
18138
+ // was_fused: should_fuse, TODO
18139
+ score: pred_scores[k],
18140
+ })
18141
+
18142
+ // TODO
18143
+ // if(should_fuse){
18144
+ // stuff_memory_list[pred_class] = current_segment_id
18145
+ // }
18146
+ }
18147
+
18148
+ return [segmentation, segments];
18149
+ }
18150
+
18151
+
18152
+ /**
18153
+ * Post-process the model output to generate the final panoptic segmentation.
18154
+ * @param {*} outputs The model output to post process
18155
+ * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
18156
+ * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
18157
+ * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
18158
+ * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
18159
+ * @param {[number, number][]} [target_sizes=null] The target sizes to resize the masks to.
18160
+ * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
18161
+ */
18162
+ function post_process_panoptic_segmentation(
18163
+ outputs,
18164
+ threshold = 0.5,
18165
+ mask_threshold = 0.5,
18166
+ overlap_mask_area_threshold = 0.8,
18167
+ label_ids_to_fuse = null,
18168
+ target_sizes = null,
18169
+ ) {
18170
+ if (label_ids_to_fuse === null) {
18171
+ console.warn("`label_ids_to_fuse` unset. No instance will be fused.")
18172
+ label_ids_to_fuse = new Set();
18173
+ }
18174
+
18175
+ const class_queries_logits = outputs.class_queries_logits ?? outputs.logits; // [batch_size, num_queries, num_classes+1]
18176
+ const masks_queries_logits = outputs.masks_queries_logits ?? outputs.pred_masks; // [batch_size, num_queries, height, width]
18177
+
18178
+ const mask_probs = masks_queries_logits.sigmoid() // [batch_size, num_queries, height, width]
18179
+
18180
+ let [batch_size, num_queries, num_labels] = class_queries_logits.dims;
18181
+ num_labels -= 1; // Remove last class (background)
18182
+
18183
+ if (target_sizes !== null && target_sizes.length !== batch_size) {
18184
+ throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
18185
+ }
18186
+
18187
+ let toReturn = [];
18188
+ for (let i = 0; i < batch_size; ++i) {
18189
+ let target_size = target_sizes !== null ? target_sizes[i] : null;
18190
+
18191
+ let class_logits = class_queries_logits[i];
18192
+ let mask_logits = mask_probs[i];
18193
+
18194
+ let [mask_probs_item, pred_scores_item, pred_labels_item] = remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);
18195
+
18196
+ if (pred_labels_item.length === 0) {
18197
+ // No mask found
18198
+ let [height, width] = target_size ?? mask_logits.dims.slice(-2);
18199
+
18200
+ let segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18201
+ 'int32',
18202
+ new Int32Array(height * width).fill(-1),
18203
+ [height, width]
18204
+ )
18205
+ toReturn.push({
18206
+ segmentation: segmentation,
18207
+ segments_info: []
18208
+ });
18209
+ continue;
18210
+ }
18211
+
18212
+
18213
+ // Get segmentation map and segment information of batch item
18214
+ let [segmentation, segments] = compute_segments(
18215
+ mask_probs_item,
18216
+ pred_scores_item,
18217
+ pred_labels_item,
18218
+ mask_threshold,
18219
+ overlap_mask_area_threshold,
18220
+ label_ids_to_fuse,
18221
+ target_size,
18222
+ )
18223
+
18224
+ toReturn.push({
18225
+ segmentation: segmentation,
18226
+ segments_info: segments
18227
+ })
18228
+ }
18229
+
18230
+ return toReturn;
18231
+ }
18232
+
18233
+
18234
+ /**
18235
+ * Post-processes the outputs of the model (for instance segmentation).
18236
+ * @param {*} outputs Raw outputs of the model.
18237
+ * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
18238
+ * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
18239
+ * (height, width) of each prediction. If unset, predictions will not be resized.
18240
+ * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
18241
+ */
18242
+ function post_process_instance_segmentation(outputs, threshold = 0.5, target_sizes = null) {
18243
+ throw new Error('Not implemented yet');
18244
+ return [];
18245
+ }
18246
+
17799
18247
  /**
17800
18248
  * Named tuple to indicate the order we are using is (height x width), even though
17801
18249
  * the Graphics’ industry standard is (width x height).
@@ -18384,6 +18832,7 @@ class SegformerFeatureExtractor extends ImageFeatureExtractor {
18384
18832
  return post_process_semantic_segmentation(...args);
18385
18833
  }
18386
18834
  }
18835
+ class PvtImageProcessor extends ImageFeatureExtractor { }
18387
18836
  class DPTFeatureExtractor extends ImageFeatureExtractor { }
18388
18837
  class DPTImageProcessor extends DPTFeatureExtractor { } // NOTE: extends DPTFeatureExtractor
18389
18838
  class BitImageProcessor extends ImageFeatureExtractor { }
@@ -18523,302 +18972,32 @@ class DetrFeatureExtractor extends ImageFeatureExtractor {
18523
18972
  // TODO support different mask sizes (not just 64x64)
18524
18973
  // Currently, just fill pixel mask with 1s
18525
18974
  const maskSize = [result.pixel_values.dims[0], 64, 64];
18526
- const pixel_mask = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18527
- 'int64',
18528
- new BigInt64Array(maskSize.reduce((a, b) => a * b)).fill(1n),
18529
- maskSize
18530
- );
18975
+ const pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.full)(maskSize, 1n);
18531
18976
 
18532
18977
  return { ...result, pixel_mask };
18533
18978
  }
18534
18979
 
18535
- /**
18536
- * Post-processes the outputs of the model (for object detection).
18537
- * @param {Object} outputs The outputs of the model that must be post-processed
18538
- * @param {Tensor} outputs.logits The logits
18539
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
18540
- * @return {Object[]} An array of objects containing the post-processed outputs.
18541
- */
18542
-
18543
18980
  /** @type {typeof post_process_object_detection} */
18544
18981
  post_process_object_detection(...args) {
18545
18982
  return post_process_object_detection(...args);
18546
18983
  }
18547
18984
 
18548
- /**
18549
- * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`.
18550
- * @param {Tensor} class_logits The class logits.
18551
- * @param {Tensor} mask_logits The mask logits.
18552
- * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks.
18553
- * @param {number} num_labels The number of labels.
18554
- * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels.
18555
- */
18556
- remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
18557
-
18558
- let mask_probs_item = [];
18559
- let pred_scores_item = [];
18560
- let pred_labels_item = [];
18561
-
18562
- for (let j = 0; j < class_logits.dims[0]; ++j) {
18563
- let cls = class_logits[j];
18564
- let mask = mask_logits[j];
18565
-
18566
- let pred_label = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(cls.data)[1];
18567
- if (pred_label === num_labels) {
18568
- // Is the background, so we ignore it
18569
- continue;
18570
- }
18571
-
18572
- let scores = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.softmax)(cls.data);
18573
- let pred_score = scores[pred_label];
18574
- if (pred_score > object_mask_threshold) {
18575
- mask_probs_item.push(mask);
18576
- pred_scores_item.push(pred_score);
18577
- pred_labels_item.push(pred_label);
18578
- }
18579
- }
18580
-
18581
- return [mask_probs_item, pred_scores_item, pred_labels_item];
18582
-
18985
+ /** @type {typeof post_process_panoptic_segmentation} */
18986
+ post_process_panoptic_segmentation(...args) {
18987
+ return post_process_panoptic_segmentation(...args);
18583
18988
  }
18584
18989
 
18585
- /**
18586
- * Checks whether the segment is valid or not.
18587
- * @param {Int32Array} mask_labels Labels for each pixel in the mask.
18588
- * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
18589
- * @param {number} k The class id of the segment.
18590
- * @param {number} mask_threshold The mask threshold.
18591
- * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18592
- * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels.
18593
- */
18594
- check_segment_validity(
18595
- mask_labels,
18596
- mask_probs,
18597
- k,
18598
- mask_threshold = 0.5,
18599
- overlap_mask_area_threshold = 0.8
18600
- ) {
18601
- // mask_k is a 1D array of indices, indicating where the mask is equal to k
18602
- let mask_k = [];
18603
- let mask_k_area = 0;
18604
- let original_area = 0;
18605
-
18606
- const mask_probs_k_data = mask_probs[k].data;
18607
-
18608
- // Compute the area of all the stuff in query k
18609
- for (let i = 0; i < mask_labels.length; ++i) {
18610
- if (mask_labels[i] === k) {
18611
- mask_k.push(i);
18612
- ++mask_k_area;
18613
- }
18614
-
18615
- if (mask_probs_k_data[i] >= mask_threshold) {
18616
- ++original_area;
18617
- }
18618
- }
18619
- let mask_exists = mask_k_area > 0 && original_area > 0;
18620
-
18621
- // Eliminate disconnected tiny segments
18622
- if (mask_exists) {
18623
- // Perform additional check
18624
- let area_ratio = mask_k_area / original_area;
18625
- mask_exists = area_ratio > overlap_mask_area_threshold;
18626
- }
18627
-
18628
- return [mask_exists, mask_k]
18629
- }
18630
-
18631
- /**
18632
- * Computes the segments.
18633
- * @param {Tensor[]} mask_probs The mask probabilities.
18634
- * @param {number[]} pred_scores The predicted scores.
18635
- * @param {number[]} pred_labels The predicted labels.
18636
- * @param {number} mask_threshold The mask threshold.
18637
- * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
18638
- * @param {Set<number>} label_ids_to_fuse The label ids to fuse.
18639
- * @param {number[]} target_size The target size of the image.
18640
- * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
18641
- */
18642
- compute_segments(
18643
- mask_probs,
18644
- pred_scores,
18645
- pred_labels,
18646
- mask_threshold,
18647
- overlap_mask_area_threshold,
18648
- label_ids_to_fuse = null,
18649
- target_size = null,
18650
- ) {
18651
- let [height, width] = target_size ?? mask_probs[0].dims;
18652
-
18653
- let segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18654
- 'int32',
18655
- new Int32Array(height * width),
18656
- [height, width]
18657
- );
18658
- let segments = [];
18659
-
18660
- // 1. If target_size is not null, we need to resize the masks to the target size
18661
- if (target_size !== null) {
18662
- // resize the masks to the target size
18663
- for (let i = 0; i < mask_probs.length; ++i) {
18664
- mask_probs[i] = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.interpolate)(mask_probs[i], target_size, 'bilinear', false);
18665
- }
18666
- }
18667
-
18668
- // 2. Weigh each mask by its prediction score
18669
- // NOTE: `mask_probs` is updated in-place
18670
- //
18671
- // Temporary storage for the best label/scores for each pixel ([height, width]):
18672
- let mask_labels = new Int32Array(mask_probs[0].data.length);
18673
- let bestScores = new Float32Array(mask_probs[0].data.length);
18674
-
18675
- for (let i = 0; i < mask_probs.length; ++i) {
18676
- let score = pred_scores[i];
18677
-
18678
- const mask_probs_i_data = mask_probs[i].data;
18679
-
18680
- for (let j = 0; j < mask_probs_i_data.length; ++j) {
18681
- mask_probs_i_data[j] *= score
18682
- if (mask_probs_i_data[j] > bestScores[j]) {
18683
- mask_labels[j] = i;
18684
- bestScores[j] = mask_probs_i_data[j];
18685
- }
18686
- }
18687
- }
18688
-
18689
- let current_segment_id = 0;
18690
-
18691
- // let stuff_memory_list = {}
18692
- const segmentation_data = segmentation.data;
18693
- for (let k = 0; k < pred_labels.length; ++k) {
18694
- let pred_class = pred_labels[k];
18695
-
18696
- // TODO add `should_fuse`
18697
- // let should_fuse = pred_class in label_ids_to_fuse
18698
-
18699
- // Check if mask exists and large enough to be a segment
18700
- let [mask_exists, mask_k] = this.check_segment_validity(
18701
- mask_labels,
18702
- mask_probs,
18703
- k,
18704
- mask_threshold,
18705
- overlap_mask_area_threshold
18706
- )
18707
-
18708
- if (!mask_exists) {
18709
- // Nothing to see here
18710
- continue;
18711
- }
18712
-
18713
- // TODO
18714
- // if (pred_class in stuff_memory_list) {
18715
- // current_segment_id = stuff_memory_list[pred_class]
18716
- // } else {
18717
- // current_segment_id += 1;
18718
- // }
18719
- ++current_segment_id;
18720
-
18721
-
18722
- // Add current object segment to final segmentation map
18723
- for (let index of mask_k) {
18724
- segmentation_data[index] = current_segment_id;
18725
- }
18726
-
18727
- segments.push({
18728
- id: current_segment_id,
18729
- label_id: pred_class,
18730
- // was_fused: should_fuse, TODO
18731
- score: pred_scores[k],
18732
- })
18733
-
18734
- // TODO
18735
- // if(should_fuse){
18736
- // stuff_memory_list[pred_class] = current_segment_id
18737
- // }
18738
- }
18739
-
18740
- return [segmentation, segments];
18990
+ post_process_instance_segmentation() {
18991
+ // TODO
18992
+ throw Error("Not implemented yet");
18741
18993
  }
18994
+ }
18742
18995
 
18743
- /**
18744
- * Post-process the model output to generate the final panoptic segmentation.
18745
- * @param {*} outputs The model output to post process
18746
- * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
18747
- * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
18748
- * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
18749
- * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
18750
- * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to.
18751
- * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
18752
- */
18753
- post_process_panoptic_segmentation(
18754
- outputs,
18755
- threshold = 0.5,
18756
- mask_threshold = 0.5,
18757
- overlap_mask_area_threshold = 0.8,
18758
- label_ids_to_fuse = null,
18759
- target_sizes = null,
18760
- ) {
18761
- if (label_ids_to_fuse === null) {
18762
- console.warn("`label_ids_to_fuse` unset. No instance will be fused.")
18763
- label_ids_to_fuse = new Set();
18764
- }
18765
-
18766
- const class_queries_logits = outputs.logits; // [batch_size, num_queries, num_classes+1]
18767
- const masks_queries_logits = outputs.pred_masks; // [batch_size, num_queries, height, width]
18768
-
18769
- const mask_probs = masks_queries_logits.sigmoid() // [batch_size, num_queries, height, width]
18770
-
18771
- let [batch_size, num_queries, num_labels] = class_queries_logits.dims;
18772
- num_labels -= 1; // Remove last class (background)
18773
-
18774
- if (target_sizes !== null && target_sizes.length !== batch_size) {
18775
- throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
18776
- }
18777
-
18778
- let toReturn = [];
18779
- for (let i = 0; i < batch_size; ++i) {
18780
- let target_size = target_sizes !== null ? target_sizes[i] : null;
18781
-
18782
- let class_logits = class_queries_logits[i];
18783
- let mask_logits = mask_probs[i];
18784
-
18785
- let [mask_probs_item, pred_scores_item, pred_labels_item] = this.remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);
18786
-
18787
- if (pred_labels_item.length === 0) {
18788
- // No mask found
18789
- let [height, width] = target_size ?? mask_logits.dims.slice(-2);
18790
-
18791
- let segmentation = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_4__.Tensor(
18792
- 'int32',
18793
- new Int32Array(height * width).fill(-1),
18794
- [height, width]
18795
- )
18796
- toReturn.push({
18797
- segmentation: segmentation,
18798
- segments_info: []
18799
- });
18800
- continue;
18801
- }
18802
-
18803
-
18804
- // Get segmentation map and segment information of batch item
18805
- let [segmentation, segments] = this.compute_segments(
18806
- mask_probs_item,
18807
- pred_scores_item,
18808
- pred_labels_item,
18809
- mask_threshold,
18810
- overlap_mask_area_threshold,
18811
- label_ids_to_fuse,
18812
- target_size,
18813
- )
18814
-
18815
- toReturn.push({
18816
- segmentation: segmentation,
18817
- segments_info: segments
18818
- })
18819
- }
18996
+ class MaskFormerFeatureExtractor extends ImageFeatureExtractor {
18820
18997
 
18821
- return toReturn;
18998
+ /** @type {typeof post_process_panoptic_segmentation} */
18999
+ post_process_panoptic_segmentation(...args) {
19000
+ return post_process_panoptic_segmentation(...args);
18822
19001
  }
18823
19002
 
18824
19003
  post_process_instance_segmentation() {
@@ -18827,6 +19006,7 @@ class DetrFeatureExtractor extends ImageFeatureExtractor {
18827
19006
  }
18828
19007
  }
18829
19008
 
19009
+
18830
19010
  class YolosFeatureExtractor extends ImageFeatureExtractor {
18831
19011
  /** @type {typeof post_process_object_detection} */
18832
19012
  post_process_object_detection(...args) {
@@ -20116,11 +20296,13 @@ class AutoProcessor {
20116
20296
  BitImageProcessor,
20117
20297
  DPTImageProcessor,
20118
20298
  DPTFeatureExtractor,
20299
+ PvtImageProcessor,
20119
20300
  GLPNFeatureExtractor,
20120
20301
  BeitFeatureExtractor,
20121
20302
  DeiTFeatureExtractor,
20122
20303
  DetrFeatureExtractor,
20123
20304
  RTDetrImageProcessor,
20305
+ MaskFormerFeatureExtractor,
20124
20306
  YolosFeatureExtractor,
20125
20307
  DonutFeatureExtractor,
20126
20308
  NougatImageProcessor,
@@ -20498,23 +20680,26 @@ function is_chinese_char(cp) {
20498
20680
  }
20499
20681
 
20500
20682
  /**
20501
- * Helper function to fuse consecutive values in an array equal to the specified value.
20502
- * @param {string[]} arr The input array
20503
- * @param {any} value The value to fuse on.
20504
- * @param {Map<string, any>} mapping The mapping from input domain to value.
20683
+ * Helper function to fuse consecutive unknown tokens.
20684
+ * @param {string[]} arr The list of input tokens
20685
+ * @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids.
20686
+ * @param {number} unk_token_id The value to fuse on.
20687
+ * @private
20505
20688
  */
20506
- function fuse(arr, value, mapping) {
20689
+ function fuse_unk(arr, tokens_to_ids, unk_token_id) {
20507
20690
  const fused = [];
20508
20691
  let i = 0;
20509
20692
  while (i < arr.length) {
20510
20693
  fused.push(arr[i])
20511
- if ((mapping.get(arr[i]) ?? value) !== value) {
20694
+ if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) {
20512
20695
  ++i;
20513
20696
  continue;
20514
20697
  }
20515
20698
 
20516
- while (i < arr.length && (mapping.get(arr[i]) ?? value) === value) {
20517
- ++i;
20699
+ while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) {
20700
+ if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) {
20701
+ fused[fused.length - 1] += arr[i];
20702
+ }
20518
20703
  }
20519
20704
  }
20520
20705
 
@@ -20631,15 +20816,15 @@ class TokenizerModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
20631
20816
  /**
20632
20817
  * Internal function to call the TokenizerModel instance.
20633
20818
  * @param {string[]} tokens The tokens to encode.
20634
- * @returns {string[]} The encoded token IDs.
20819
+ * @returns {string[]} The encoded tokens.
20635
20820
  */
20636
20821
  _call(tokens) {
20637
- let ids = this.encode(tokens);
20822
+ tokens = this.encode(tokens);
20638
20823
  if (this.fuse_unk) {
20639
20824
  // Fuse unknown tokens
20640
- ids = fuse(ids, this.unk_token_id, this.tokens_to_ids);
20825
+ tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id);
20641
20826
  }
20642
- return ids;
20827
+ return tokens;
20643
20828
  }
20644
20829
 
20645
20830
  /**
@@ -21137,15 +21322,19 @@ class BPE extends TokenizerModel {
21137
21322
  for (const t of bpe_token_list) {
21138
21323
  if (this.tokens_to_ids.has(t)) {
21139
21324
  outputTokens.push(t);
21140
- } else {
21141
- if (this.byte_fallback) {
21142
- outputTokens.push(
21143
- ...Array.from(this.text_encoder.encode(t))
21144
- .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`)
21145
- );
21325
+ } else if (this.byte_fallback) {
21326
+ const byteTokens = Array.from(this.text_encoder.encode(t))
21327
+ .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`);
21328
+ if (byteTokens.every(x => this.tokens_to_ids.has(x))) {
21329
+ // Ensure the byte tokens are actually in the vocabulary, otherwise
21330
+ // we fall back to the unknown token. For more information, see
21331
+ // https://github.com/huggingface/transformers/issues/28096.
21332
+ outputTokens.push(...byteTokens);
21146
21333
  } else {
21147
21334
  outputTokens.push(this.unk_token);
21148
21335
  }
21336
+ } else {
21337
+ outputTokens.push(this.unk_token);
21149
21338
  }
21150
21339
  }
21151
21340
  }
@@ -22817,11 +23006,10 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
22817
23006
  this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
22818
23007
  }
22819
23008
 
22820
-
22821
23009
  this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
22822
- this.added_tokens
23010
+ this.added_tokens.slice()
22823
23011
  // Sort by length (desc) to avoid early partial matches
22824
- .toSorted((a, b) => b.content.length - a.content.length)
23012
+ .sort((a, b) => b.content.length - a.content.length)
22825
23013
  .map(x => `${x.lstrip ? '\\s*' : ''}(${(0,_utils_core_js__WEBPACK_IMPORTED_MODULE_1__.escapeRegExp)(x.content)})${x.rstrip ? '\\s*' : ''}`)
22826
23014
  .join('|')
22827
23015
  ) : null;
@@ -23319,6 +23507,67 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
23319
23507
 
23320
23508
  return decoded;
23321
23509
  }
23510
+
23511
+ /**
23512
+ * Retrieve the chat template string used for tokenizing chat messages. This template is used
23513
+ * internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
23514
+ * template for better generation tracking.
23515
+ *
23516
+ * @param {Object} options An optional object containing the following properties:
23517
+ * @param {string} [options.chat_template=null]
23518
+ * A Jinja template or the name of a template to use for this conversion.
23519
+ * It is usually not necessary to pass anything to this argument,
23520
+ * as the model's template will be used by default.
23521
+ * @param {Object[]} [options.tools=null]
23522
+ * A list of tools (callable functions) that will be accessible to the model. If the template does not
23523
+ * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
23524
+ * giving the name, description and argument types for the tool. See our
23525
+ * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
23526
+ * for more information.
23527
+ * @returns {string} The chat template string.
23528
+ */
23529
+ get_chat_template({
23530
+ chat_template = null,
23531
+ tools = null,
23532
+ } = {}) {
23533
+
23534
+ // First, handle the cases when the model has a dict of multiple templates
23535
+ if (this.chat_template && typeof this.chat_template === 'object') {
23536
+ const template_dict = this.chat_template;
23537
+
23538
+ if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
23539
+ // The user can pass the name of a template to the chat template argument instead of an entire template
23540
+ chat_template = template_dict[chat_template];
23541
+ } else if (chat_template === null) {
23542
+ if (tools !== null && 'tool_use' in template_dict) {
23543
+ chat_template = template_dict['tool_use'];
23544
+ } else if ('default' in template_dict) {
23545
+ chat_template = template_dict['default'];
23546
+ } else {
23547
+ throw Error(
23548
+ `This model has multiple chat templates with no default specified! Please either pass a chat ` +
23549
+ `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
23550
+ `template names are ${Object.keys(template_dict).sort()}.`
23551
+ )
23552
+ }
23553
+ }
23554
+ } else if (chat_template === null) {
23555
+ // These are the cases when the model has a single template
23556
+ // priority: `chat_template` argument > `tokenizer.chat_template`
23557
+ if (this.chat_template) {
23558
+ chat_template = this.chat_template;
23559
+ } else {
23560
+ throw Error(
23561
+ "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
23562
+ "argument was passed! For information about writing templates and setting the " +
23563
+ "tokenizer.chat_template attribute, please see the documentation at " +
23564
+ "https://huggingface.co/docs/transformers/main/en/chat_templating"
23565
+ )
23566
+ }
23567
+ }
23568
+ return chat_template;
23569
+ }
23570
+
23322
23571
  /**
23323
23572
  * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
23324
23573
  * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
@@ -23392,39 +23641,8 @@ class PreTrainedTokenizer extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__
23392
23641
  ...kwargs
23393
23642
  } = {}) {
23394
23643
 
23395
- // First, handle the cases when the model has a dict of multiple templates
23396
- if (
23397
- (this.chat_template && typeof this.chat_template === 'object')
23398
- || this.chat_template === null
23399
- ) {
23400
- const template_dict = this.chat_template;
23644
+ chat_template = this.get_chat_template({ chat_template, tools });
23401
23645
 
23402
- if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
23403
- // The user can pass the name of a template to the chat template argument instead of an entire template
23404
- chat_template = template_dict[chat_template];
23405
- } else if (chat_template === null && 'default' in template_dict) {
23406
- chat_template = template_dict['default'];
23407
- } else if (chat_template === null) {
23408
- throw Error(
23409
- `This model has multiple chat templates with no default specified! Please either pass a chat ` +
23410
- `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
23411
- `template names are ${Object.keys(template_dict).sort()}.`
23412
- )
23413
- }
23414
- } else {
23415
- // These are the cases when the model has a single template
23416
- // priority: `chat_template` argument > `tokenizer.chat_template`
23417
- if (this.chat_template) {
23418
- chat_template = this.chat_template;
23419
- } else {
23420
- throw Error(
23421
- "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
23422
- "argument was passed! For information about writing templates and setting the " +
23423
- "tokenizer.chat_template attribute, please see the documentation at " +
23424
- "https://huggingface.co/docs/transformers/main/en/chat_templating"
23425
- )
23426
- }
23427
- }
23428
23646
  if (typeof chat_template !== 'string') {
23429
23647
  throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
23430
23648
  }
@@ -27842,11 +28060,20 @@ function softmax(arr) {
27842
28060
  * @returns {T} The resulting log_softmax array.
27843
28061
  */
27844
28062
  function log_softmax(arr) {
27845
- // Compute the softmax values
27846
- const softmaxArr = softmax(arr);
28063
+ // Compute the maximum value in the array
28064
+ const maxVal = max(arr)[0];
28065
+
28066
+ // Compute the sum of the exponentials
28067
+ let sumExps = 0;
28068
+ for(let i = 0; i < arr.length; ++i) {
28069
+ sumExps += Math.exp(arr[i] - maxVal);
28070
+ }
28071
+
28072
+ // Compute the log of the sum
28073
+ const logSum = Math.log(sumExps);
27847
28074
 
27848
- // Apply log formula to each element
27849
- const logSoftmaxArr = softmaxArr.map(x => Math.log(x));
28075
+ // Compute the log-softmax values: x - max - log(sum(exp(x - max)))
28076
+ const logSoftmaxArr = arr.map(x => x - maxVal - logSum);
27850
28077
 
27851
28078
  return /** @type {T} */(logSoftmaxArr);
27852
28079
  }
@@ -27901,7 +28128,7 @@ function magnitude(arr) {
27901
28128
  /**
27902
28129
  * Returns the value and index of the minimum element in an array.
27903
28130
  * @param {number[]|TypedArray} arr array of numbers.
27904
- * @returns {number[]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
28131
+ * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
27905
28132
  * @throws {Error} If array is empty.
27906
28133
  */
27907
28134
  function min(arr) {
@@ -30300,6 +30527,7 @@ __webpack_require__.r(__webpack_exports__);
30300
30527
  /* harmony export */ AutoModelForTextToSpectrogram: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTextToSpectrogram),
30301
30528
  /* harmony export */ AutoModelForTextToWaveform: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTextToWaveform),
30302
30529
  /* harmony export */ AutoModelForTokenClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForTokenClassification),
30530
+ /* harmony export */ AutoModelForUniversalSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForUniversalSegmentation),
30303
30531
  /* harmony export */ AutoModelForVision2Seq: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForVision2Seq),
30304
30532
  /* harmony export */ AutoModelForXVector: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForXVector),
30305
30533
  /* harmony export */ AutoModelForZeroShotObjectDetection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.AutoModelForZeroShotObjectDetection),
@@ -30344,8 +30572,10 @@ __webpack_require__.r(__webpack_exports__);
30344
30572
  /* harmony export */ CLIPSegForImageSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegForImageSegmentation),
30345
30573
  /* harmony export */ CLIPSegModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegModel),
30346
30574
  /* harmony export */ CLIPSegPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPSegPreTrainedModel),
30575
+ /* harmony export */ CLIPTextModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPTextModel),
30347
30576
  /* harmony export */ CLIPTextModelWithProjection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPTextModelWithProjection),
30348
30577
  /* harmony export */ CLIPTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.CLIPTokenizer),
30578
+ /* harmony export */ CLIPVisionModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPVisionModel),
30349
30579
  /* harmony export */ CLIPVisionModelWithProjection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CLIPVisionModelWithProjection),
30350
30580
  /* harmony export */ CamembertForMaskedLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CamembertForMaskedLM),
30351
30581
  /* harmony export */ CamembertForQuestionAnswering: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.CamembertForQuestionAnswering),
@@ -30407,6 +30637,8 @@ __webpack_require__.r(__webpack_exports__);
30407
30637
  /* harmony export */ DebertaV2Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DebertaV2Model),
30408
30638
  /* harmony export */ DebertaV2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DebertaV2PreTrainedModel),
30409
30639
  /* harmony export */ DebertaV2Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.DebertaV2Tokenizer),
30640
+ /* harmony export */ DecisionTransformerModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DecisionTransformerModel),
30641
+ /* harmony export */ DecisionTransformerPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DecisionTransformerPreTrainedModel),
30410
30642
  /* harmony export */ DeiTFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.DeiTFeatureExtractor),
30411
30643
  /* harmony export */ DeiTForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DeiTForImageClassification),
30412
30644
  /* harmony export */ DeiTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.DeiTModel),
@@ -30496,7 +30728,12 @@ __webpack_require__.r(__webpack_exports__);
30496
30728
  /* harmony export */ GemmaPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GemmaPreTrainedModel),
30497
30729
  /* harmony export */ GemmaTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.GemmaTokenizer),
30498
30730
  /* harmony export */ Grok1Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.Grok1Tokenizer),
30731
+ /* harmony export */ GroupViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTModel),
30732
+ /* harmony export */ GroupViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTPreTrainedModel),
30499
30733
  /* harmony export */ HerbertTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.HerbertTokenizer),
30734
+ /* harmony export */ HieraForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HieraForImageClassification),
30735
+ /* harmony export */ HieraModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HieraModel),
30736
+ /* harmony export */ HieraPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HieraPreTrainedModel),
30500
30737
  /* harmony export */ HubertForCTC: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HubertForCTC),
30501
30738
  /* harmony export */ HubertForSequenceClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HubertForSequenceClassification),
30502
30739
  /* harmony export */ HubertModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.HubertModel),
@@ -30546,6 +30783,10 @@ __webpack_require__.r(__webpack_exports__);
30546
30783
  /* harmony export */ MarianModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MarianModel),
30547
30784
  /* harmony export */ MarianPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MarianPreTrainedModel),
30548
30785
  /* harmony export */ MarianTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.MarianTokenizer),
30786
+ /* harmony export */ MaskFormerFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.MaskFormerFeatureExtractor),
30787
+ /* harmony export */ MaskFormerForInstanceSegmentation: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerForInstanceSegmentation),
30788
+ /* harmony export */ MaskFormerModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerModel),
30789
+ /* harmony export */ MaskFormerPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskFormerPreTrainedModel),
30549
30790
  /* harmony export */ MaskedLMOutput: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MaskedLMOutput),
30550
30791
  /* harmony export */ MaxLengthCriteria: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_11__.MaxLengthCriteria),
30551
30792
  /* harmony export */ MistralForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MistralForCausalLM),
@@ -30623,6 +30864,10 @@ __webpack_require__.r(__webpack_exports__);
30623
30864
  /* harmony export */ PretrainedConfig: () => (/* reexport safe */ _configs_js__WEBPACK_IMPORTED_MODULE_5__.PretrainedConfig),
30624
30865
  /* harmony export */ PretrainedMixin: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PretrainedMixin),
30625
30866
  /* harmony export */ Processor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.Processor),
30867
+ /* harmony export */ PvtForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtForImageClassification),
30868
+ /* harmony export */ PvtImageProcessor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.PvtImageProcessor),
30869
+ /* harmony export */ PvtModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtModel),
30870
+ /* harmony export */ PvtPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PvtPreTrainedModel),
30626
30871
  /* harmony export */ PyAnnoteFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.PyAnnoteFeatureExtractor),
30627
30872
  /* harmony export */ PyAnnoteForAudioFrameClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PyAnnoteForAudioFrameClassification),
30628
30873
  /* harmony export */ PyAnnoteModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PyAnnoteModel),
@@ -30743,6 +30988,11 @@ __webpack_require__.r(__webpack_exports__);
30743
30988
  /* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.ViTFeatureExtractor),
30744
30989
  /* harmony export */ ViTForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTForImageClassification),
30745
30990
  /* harmony export */ ViTImageProcessor: () => (/* reexport safe */ _processors_js__WEBPACK_IMPORTED_MODULE_4__.ViTImageProcessor),
30991
+ /* harmony export */ ViTMAEModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMAEModel),
30992
+ /* harmony export */ ViTMAEPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMAEPreTrainedModel),
30993
+ /* harmony export */ ViTMSNForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNForImageClassification),
30994
+ /* harmony export */ ViTMSNModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNModel),
30995
+ /* harmony export */ ViTMSNPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTMSNPreTrainedModel),
30746
30996
  /* harmony export */ ViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTModel),
30747
30997
  /* harmony export */ ViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ViTPreTrainedModel),
30748
30998
  /* harmony export */ VisionEncoderDecoderModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.VisionEncoderDecoderModel),