@huggingface/tasks 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -444,6 +444,13 @@ interface WidgetExampleBase<TOutput> {
   */
  output?: TOutput;
  }
+ interface ChatMessage {
+ role: "user" | "assistant" | "system";
+ content: string;
+ }
+ interface WidgetExampleChatInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+ messages: ChatMessage[];
+ }
  interface WidgetExampleTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
  text: string;
  }
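As an illustrative sketch (not part of the package itself, and assuming these types are exported from the package root), a chat-style widget example for a model card can now be typed like this:

// Sketch of a conversational widget example using the new 0.4.0 types
const chatExample: WidgetExampleChatInput = {
  messages: [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "What is the capital of France?" },
  ],
};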
@@ -476,7 +483,7 @@ interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput> ex
  source_sentence: string;
  sentences: string[];
  }
- type WidgetExample<TOutput = WidgetExampleOutput> = WidgetExampleTextInput<TOutput> | WidgetExampleTextAndContextInput<TOutput> | WidgetExampleTextAndTableInput<TOutput> | WidgetExampleAssetInput<TOutput> | WidgetExampleAssetAndPromptInput<TOutput> | WidgetExampleAssetAndTextInput<TOutput> | WidgetExampleAssetAndZeroShotInput<TOutput> | WidgetExampleStructuredDataInput<TOutput> | WidgetExampleTableDataInput<TOutput> | WidgetExampleZeroShotTextInput<TOutput> | WidgetExampleSentenceSimilarityInput<TOutput>;
+ type WidgetExample<TOutput = WidgetExampleOutput> = WidgetExampleChatInput<TOutput> | WidgetExampleTextInput<TOutput> | WidgetExampleTextAndContextInput<TOutput> | WidgetExampleTextAndTableInput<TOutput> | WidgetExampleAssetInput<TOutput> | WidgetExampleAssetAndPromptInput<TOutput> | WidgetExampleAssetAndTextInput<TOutput> | WidgetExampleAssetAndZeroShotInput<TOutput> | WidgetExampleStructuredDataInput<TOutput> | WidgetExampleTableDataInput<TOutput> | WidgetExampleZeroShotTextInput<TOutput> | WidgetExampleSentenceSimilarityInput<TOutput>;
  type KeysOfUnion<T> = T extends unknown ? keyof T : never;
  type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
 
@@ -532,24 +539,46 @@ interface ModelData {
  /**
  * this dictionary has useful information about the model configuration
  */
- config?: Record<string, unknown> & {
+ config?: {
+ architectures?: string[];
+ /**
+ * Dict of AutoModel or Auto… class name to local import path in the repo
+ */
+ auto_map?: {
+ /**
+ * String Property
+ */
+ [x: string]: string;
+ };
+ model_type?: string;
+ quantization_config?: {
+ bits?: number;
+ load_in_4bit?: boolean;
+ load_in_8bit?: boolean;
+ };
+ tokenizer_config?: TokenizerConfig;
  adapter_transformers?: {
- model_class?: string;
  model_name?: string;
+ model_class?: string;
+ };
+ diffusers?: {
+ _class_name?: string;
  };
- architectures?: string[];
  sklearn?: {
- filename?: string;
+ model?: {
+ file?: string;
+ };
  model_format?: string;
  };
  speechbrain?: {
- interface?: string;
+ speechbrain_interface?: string;
+ vocoder_interface?: string;
+ vocoder_model_id?: string;
  };
  peft?: {
- base_model_name?: string;
+ base_model_name_or_path?: string;
  task_type?: string;
  };
- tokenizer_config?: TokenizerConfig;
  };
  /**
  * all the model tags
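For illustration (assuming `ModelData` is exported from the package root), the narrowed `config` field now accepts structured entries such as:

// Hypothetical config payload matching the new, more specific shape
const config: NonNullable<ModelData["config"]> = {
  architectures: ["LlamaForCausalLM"],
  model_type: "llama",
  quantization_config: { bits: 4, load_in_4bit: true },
};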
@@ -575,7 +604,7 @@ interface ModelData {
  */
  widgetData?: WidgetExample[] | undefined;
  /**
- * Parameters that will be used by the widget when calling Inference Endpoints (serverless)
+ * Parameters that will be used by the widget when calling Inference API (serverless)
  * https://huggingface.co/docs/api-inference/detailed_parameters
  *
  * can be set in the model card metadata (under `inference/parameters`)
@@ -732,6 +761,13 @@ declare const MODEL_LIBRARIES_UI_ELEMENTS: {
  };
  };
  };
+ audiocraft: {
+ prettyLabel: string;
+ repoName: string;
+ repoUrl: string;
+ snippets: (model: ModelData) => string[];
+ filter: false;
+ };
  bertopic: {
  prettyLabel: string;
  repoName: string;
@@ -1051,8 +1087,8 @@ declare const MODEL_LIBRARIES_UI_ELEMENTS: {
  };
  };
  type ModelLibraryKey = keyof typeof MODEL_LIBRARIES_UI_ELEMENTS;
- declare const ALL_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
- declare const ALL_DISPLAY_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
+ declare const ALL_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "audiocraft" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
+ declare const ALL_DISPLAY_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "audiocraft" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
 
  /**
  * Mapping from library name (excluding Transformers) to its supported tasks.
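A minimal illustration of the widened key union (assuming `ModelLibraryKey` is exported):

const key: ModelLibraryKey = "audiocraft"; // accepted in 0.4.0; this was a type error in 0.3.3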
@@ -1066,6 +1102,1987 @@ declare const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelL
  type PerLanguageMapping = Map<WidgetType, string[] | WidgetExample[]>;
  declare const MAPPING_DEFAULT_WIDGET: Map<string, PerLanguageMapping>;
 
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Audio Classification inference
+ */
+ interface AudioClassificationInput {
+ /**
+ * The input audio data
+ */
+ inputs: unknown;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: AudioClassificationParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Audio Classification
+ */
+ interface AudioClassificationParameters {
+ function_to_apply?: ClassificationOutputTransform$3;
+ /**
+ * When specified, limits the output to the top K most probable classes.
+ */
+ top_k?: number;
+ [property: string]: unknown;
+ }
+ /**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+ type ClassificationOutputTransform$3 = "sigmoid" | "softmax" | "none";
+ type AudioClassificationOutput = AudioClassificationOutputElement[];
+ /**
+ * Outputs for Audio Classification inference
+ */
+ interface AudioClassificationOutputElement {
+ /**
+ * The predicted class label.
+ */
+ label: string;
+ /**
+ * The corresponding probability.
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
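An illustrative request/response pair for the generated Audio Classification types (values invented; the audio payload is shown as a string only because the schema types it as `unknown`):

const audioRequest: AudioClassificationInput = {
  inputs: "<base64-encoded audio>",
  parameters: { function_to_apply: "softmax", top_k: 2 },
};
const audioResponse: AudioClassificationOutput = [
  { label: "dog_bark", score: 0.92 },
  { label: "siren", score: 0.05 },
];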
1157
+ /**
1158
+ * Inference code generated from the JSON schema spec in ./spec
1159
+ *
1160
+ * Using src/scripts/inference-codegen
1161
+ */
1162
+ /**
1163
+ * Inputs for Automatic Speech Recognition inference
1164
+ */
1165
+ interface AutomaticSpeechRecognitionInput {
1166
+ /**
1167
+ * The input audio data
1168
+ */
1169
+ inputs: unknown;
1170
+ /**
1171
+ * Additional inference parameters
1172
+ */
1173
+ parameters?: AutomaticSpeechRecognitionParameters;
1174
+ [property: string]: unknown;
1175
+ }
1176
+ /**
1177
+ * Additional inference parameters
1178
+ *
1179
+ * Additional inference parameters for Automatic Speech Recognition
1180
+ */
1181
+ interface AutomaticSpeechRecognitionParameters {
1182
+ /**
1183
+ * Parametrization of the text generation process
1184
+ */
1185
+ generate?: GenerationParameters$2;
1186
+ /**
1187
+ * Whether to output corresponding timestamps with the generated text
1188
+ */
1189
+ return_timestamps?: boolean;
1190
+ [property: string]: unknown;
1191
+ }
1192
+ /**
1193
+ * Parametrization of the text generation process
1194
+ *
1195
+ * Ad-hoc parametrization of the text generation process
1196
+ */
1197
+ interface GenerationParameters$2 {
1198
+ /**
1199
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
1200
+ */
1201
+ do_sample?: boolean;
1202
+ /**
1203
+ * Controls the stopping condition for beam-based methods.
1204
+ */
1205
+ early_stopping?: EarlyStoppingUnion$2;
1206
+ /**
1207
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
1208
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
1209
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
1210
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
1211
+ */
1212
+ epsilon_cutoff?: number;
1213
+ /**
1214
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
1215
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
1216
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
1217
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
1218
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
1219
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
1220
+ * for more details.
1221
+ */
1222
+ eta_cutoff?: number;
1223
+ /**
1224
+ * The maximum length (in tokens) of the generated text, including the input.
1225
+ */
1226
+ max_length?: number;
1227
+ /**
1228
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
1229
+ */
1230
+ max_new_tokens?: number;
1231
+ /**
1232
+ * The minimum length (in tokens) of the generated text, including the input.
1233
+ */
1234
+ min_length?: number;
1235
+ /**
1236
+ * The minimum number of tokens to generate. Takes precedence over maxLength.
1237
+ */
1238
+ min_new_tokens?: number;
1239
+ /**
1240
+ * Number of groups to divide num_beams into in order to ensure diversity among different
1241
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
1242
+ */
1243
+ num_beam_groups?: number;
1244
+ /**
1245
+ * Number of beams to use for beam search.
1246
+ */
1247
+ num_beams?: number;
1248
+ /**
1249
+ * The value balances the model confidence and the degeneration penalty in contrastive
1250
+ * search decoding.
1251
+ */
1252
+ penalty_alpha?: number;
1253
+ /**
1254
+ * The value used to modulate the next token probabilities.
1255
+ */
1256
+ temperature?: number;
1257
+ /**
1258
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
1259
+ */
1260
+ top_k?: number;
1261
+ /**
1262
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
1263
+ * that add up to top_p or higher are kept for generation.
1264
+ */
1265
+ top_p?: number;
1266
+ /**
1267
+ * Local typicality measures how similar the conditional probability of predicting a target
1268
+ * token next is to the expected conditional probability of predicting a random token next,
1269
+ * given the partial text already generated. If set to float < 1, the smallest set of the
1270
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
1271
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
1272
+ */
1273
+ typical_p?: number;
1274
+ /**
1275
+ * Whether the model should use the past last key/values attentions to speed up decoding
1276
+ */
1277
+ use_cache?: boolean;
1278
+ [property: string]: unknown;
1279
+ }
1280
+ /**
1281
+ * Controls the stopping condition for beam-based methods.
1282
+ */
1283
+ type EarlyStoppingUnion$2 = boolean | "never";
1284
+ /**
1285
+ * Outputs of inference for the Automatic Speech Recognition task
1286
+ */
1287
+ interface AutomaticSpeechRecognitionOutput {
1288
+ /**
1289
+ * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
1290
+ * the model.
1291
+ */
1292
+ chunks?: AutomaticSpeechRecognitionOutputChunk[];
1293
+ /**
1294
+ * The recognized text.
1295
+ */
1296
+ text: string;
1297
+ [property: string]: unknown;
1298
+ }
1299
+ interface AutomaticSpeechRecognitionOutputChunk {
1300
+ /**
1301
+ * A chunk of text identified by the model
1302
+ */
1303
+ text: string;
1304
+ /**
1305
+ * The start and end timestamps corresponding with the text
1306
+ */
1307
+ timestamps: number[];
1308
+ [property: string]: unknown;
1309
+ }
1310
+
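For example, a hypothetical Automatic Speech Recognition result with `return_timestamps` enabled would conform to the generated output type like this (values invented):

const asrResult: AutomaticSpeechRecognitionOutput = {
  text: "hello world",
  chunks: [
    { text: "hello", timestamps: [0.0, 0.4] },
    { text: "world", timestamps: [0.5, 0.9] },
  ],
};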
1311
+ /**
1312
+ * Inference code generated from the JSON schema spec in ./spec
1313
+ *
1314
+ * Using src/scripts/inference-codegen
1315
+ */
1316
+ /**
1317
+ * Inputs for Document Question Answering inference
1318
+ */
1319
+ interface DocumentQuestionAnsweringInput {
1320
+ /**
1321
+ * One (document, question) pair to answer
1322
+ */
1323
+ inputs: DocumentQuestionAnsweringInputData;
1324
+ /**
1325
+ * Additional inference parameters
1326
+ */
1327
+ parameters?: DocumentQuestionAnsweringParameters;
1328
+ [property: string]: unknown;
1329
+ }
1330
+ /**
1331
+ * One (document, question) pair to answer
1332
+ */
1333
+ interface DocumentQuestionAnsweringInputData {
1334
+ /**
1335
+ * The image on which the question is asked
1336
+ */
1337
+ image: unknown;
1338
+ /**
1339
+ * A question to ask of the document
1340
+ */
1341
+ question: string;
1342
+ [property: string]: unknown;
1343
+ }
1344
+ /**
1345
+ * Additional inference parameters
1346
+ *
1347
+ * Additional inference parameters for Document Question Answering
1348
+ */
1349
+ interface DocumentQuestionAnsweringParameters {
1350
+ /**
1351
+ * If the words in the document are too long to fit with the question for the model, it will
1352
+ * be split in several chunks with some overlap. This argument controls the size of that
1353
+ * overlap.
1354
+ */
1355
+ doc_stride?: number;
1356
+ /**
1357
+ * Whether to accept impossible as an answer
1358
+ */
1359
+ handle_impossible_answer?: boolean;
1360
+ /**
1361
+ * Language to use while running OCR. Defaults to english.
1362
+ */
1363
+ lang?: string;
1364
+ /**
1365
+ * The maximum length of predicted answers (e.g., only answers with a shorter length are
1366
+ * considered).
1367
+ */
1368
+ max_answer_len?: number;
1369
+ /**
1370
+ * The maximum length of the question after tokenization. It will be truncated if needed.
1371
+ */
1372
+ max_question_len?: number;
1373
+ /**
1374
+ * The maximum length of the total sentence (context + question) in tokens of each chunk
1375
+ * passed to the model. The context will be split in several chunks (using doc_stride as
1376
+ * overlap) if needed.
1377
+ */
1378
+ max_seq_len?: number;
1379
+ /**
1380
+ * The number of answers to return (will be chosen by order of likelihood). Can return less
1381
+ * than top_k answers if there are not enough options available within the context.
1382
+ */
1383
+ top_k?: number;
1384
+ /**
1385
+ * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
1386
+ * skip the OCR step and use the provided bounding boxes instead.
1387
+ */
1388
+ word_boxes?: WordBox[];
1389
+ [property: string]: unknown;
1390
+ }
1391
+ type WordBox = number[] | string;
1392
+ type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[];
1393
+ /**
1394
+ * Outputs of inference for the Document Question Answering task
1395
+ */
1396
+ interface DocumentQuestionAnsweringOutputElement {
1397
+ /**
1398
+ * The answer to the question.
1399
+ */
1400
+ answer: string;
1401
+ /**
1402
+ * The end word index of the answer (in the OCR’d version of the input or provided word
1403
+ * boxes).
1404
+ */
1405
+ end: number;
1406
+ /**
1407
+ * The probability associated to the answer.
1408
+ */
1409
+ score: number;
1410
+ /**
1411
+ * The start word index of the answer (in the OCR’d version of the input or provided word
1412
+ * boxes).
1413
+ */
1414
+ start: number;
1415
+ /**
1416
+ * The index of each word/box pair that is in the answer
1417
+ */
1418
+ words: number[];
1419
+ [property: string]: unknown;
1420
+ }
1421
+
1422
+ /**
1423
+ * Inference code generated from the JSON schema spec in ./spec
1424
+ *
1425
+ * Using src/scripts/inference-codegen
1426
+ */
1427
+ type FeatureExtractionOutput = unknown[];
1428
+ /**
1429
+ * Inputs for Text Embedding inference
1430
+ */
1431
+ interface FeatureExtractionInput {
1432
+ /**
1433
+ * The text to get the embeddings of
1434
+ */
1435
+ inputs: string;
1436
+ /**
1437
+ * Additional inference parameters
1438
+ */
1439
+ parameters?: {
1440
+ [key: string]: unknown;
1441
+ };
1442
+ [property: string]: unknown;
1443
+ }
1444
+
1445
+ /**
1446
+ * Inference code generated from the JSON schema spec in ./spec
1447
+ *
1448
+ * Using src/scripts/inference-codegen
1449
+ */
1450
+ /**
1451
+ * Inputs for Fill Mask inference
1452
+ */
1453
+ interface FillMaskInput {
1454
+ /**
1455
+ * The text with masked tokens
1456
+ */
1457
+ inputs: string;
1458
+ /**
1459
+ * Additional inference parameters
1460
+ */
1461
+ parameters?: FillMaskParameters;
1462
+ [property: string]: unknown;
1463
+ }
1464
+ /**
1465
+ * Additional inference parameters
1466
+ *
1467
+ * Additional inference parameters for Fill Mask
1468
+ */
1469
+ interface FillMaskParameters {
1470
+ /**
1471
+ * When passed, the model will limit the scores to the passed targets instead of looking up
1472
+ * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
1473
+ * tokenized and the first resulting token will be used (with a warning, and that might be
1474
+ * slower).
1475
+ */
1476
+ targets?: string[];
1477
+ /**
1478
+ * When passed, overrides the number of predictions to return.
1479
+ */
1480
+ top_k?: number;
1481
+ [property: string]: unknown;
1482
+ }
1483
+ type FillMaskOutput = FillMaskOutputElement[];
1484
+ /**
1485
+ * Outputs of inference for the Fill Mask task
1486
+ */
1487
+ interface FillMaskOutputElement {
1488
+ /**
1489
+ * The corresponding probability
1490
+ */
1491
+ score: number;
1492
+ /**
1493
+ * The corresponding input with the mask token prediction.
1494
+ */
1495
+ sequence: string;
1496
+ /**
1497
+ * The predicted token id (to replace the masked one).
1498
+ */
1499
+ token: number;
1500
+ tokenStr: unknown;
1501
+ /**
1502
+ * The predicted token (to replace the masked one).
1503
+ */
1504
+ token_str?: string;
1505
+ [property: string]: unknown;
1506
+ }
1507
+
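An illustrative Fill Mask request against these types (the mask token and values are made up and depend on the model):

const fillMaskRequest: FillMaskInput = {
  inputs: "Paris is the [MASK] of France.",
  parameters: { targets: ["capital"], top_k: 2 },
};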
1508
+ /**
1509
+ * Inference code generated from the JSON schema spec in ./spec
1510
+ *
1511
+ * Using src/scripts/inference-codegen
1512
+ */
1513
+ /**
1514
+ * Inputs for Image Classification inference
1515
+ */
1516
+ interface ImageClassificationInput {
1517
+ /**
1518
+ * The input image data
1519
+ */
1520
+ inputs: unknown;
1521
+ /**
1522
+ * Additional inference parameters
1523
+ */
1524
+ parameters?: ImageClassificationParameters;
1525
+ [property: string]: unknown;
1526
+ }
1527
+ /**
1528
+ * Additional inference parameters
1529
+ *
1530
+ * Additional inference parameters for Image Classification
1531
+ */
1532
+ interface ImageClassificationParameters {
1533
+ function_to_apply?: ClassificationOutputTransform$2;
1534
+ /**
1535
+ * When specified, limits the output to the top K most probable classes.
1536
+ */
1537
+ top_k?: number;
1538
+ [property: string]: unknown;
1539
+ }
1540
+ /**
1541
+ * The function to apply to the model outputs in order to retrieve the scores.
1542
+ */
1543
+ type ClassificationOutputTransform$2 = "sigmoid" | "softmax" | "none";
1544
+ type ImageClassificationOutput = ImageClassificationOutputElement[];
1545
+ /**
1546
+ * Outputs of inference for the Image Classification task
1547
+ */
1548
+ interface ImageClassificationOutputElement {
1549
+ /**
1550
+ * The predicted class label.
1551
+ */
1552
+ label: string;
1553
+ /**
1554
+ * The corresponding probability.
1555
+ */
1556
+ score: number;
1557
+ [property: string]: unknown;
1558
+ }
1559
+
1560
+ /**
1561
+ * Inference code generated from the JSON schema spec in ./spec
1562
+ *
1563
+ * Using src/scripts/inference-codegen
1564
+ */
1565
+ /**
1566
+ * Inputs for Image To Image inference
1567
+ */
1568
+ interface ImageToImageInput {
1569
+ /**
1570
+ * The input image data
1571
+ */
1572
+ inputs: unknown;
1573
+ /**
1574
+ * Additional inference parameters
1575
+ */
1576
+ parameters?: ImageToImageParameters;
1577
+ [property: string]: unknown;
1578
+ }
1579
+ /**
1580
+ * Additional inference parameters
1581
+ *
1582
+ * Additional inference parameters for Image To Image
1583
+ */
1584
+ interface ImageToImageParameters {
1585
+ /**
1586
+ * For diffusion models. A higher guidance scale value encourages the model to generate
1587
+ * images closely linked to the text prompt at the expense of lower image quality.
1588
+ */
1589
+ guidance_scale?: number;
1590
+ /**
1591
+ * One or several prompt to guide what NOT to include in image generation.
1592
+ */
1593
+ negative_prompt?: string[];
1594
+ /**
1595
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
1596
+ * a higher quality image at the expense of slower inference.
1597
+ */
1598
+ num_inference_steps?: number;
1599
+ /**
1600
+ * The size in pixel of the output image
1601
+ */
1602
+ target_size?: TargetSize$1;
1603
+ [property: string]: unknown;
1604
+ }
1605
+ /**
1606
+ * The size in pixel of the output image
1607
+ */
1608
+ interface TargetSize$1 {
1609
+ height: number;
1610
+ width: number;
1611
+ [property: string]: unknown;
1612
+ }
1613
+ /**
1614
+ * Outputs of inference for the Image To Image task
1615
+ */
1616
+ interface ImageToImageOutput {
1617
+ /**
1618
+ * The output image
1619
+ */
1620
+ image?: unknown;
1621
+ [property: string]: unknown;
1622
+ }
1623
+
1624
+ /**
1625
+ * Inference code generated from the JSON schema spec in ./spec
1626
+ *
1627
+ * Using src/scripts/inference-codegen
1628
+ */
1629
+ /**
1630
+ * Inputs for Image To Text inference
1631
+ */
1632
+ interface ImageToTextInput {
1633
+ /**
1634
+ * The input image data
1635
+ */
1636
+ inputs: unknown;
1637
+ /**
1638
+ * Additional inference parameters
1639
+ */
1640
+ parameters?: ImageToTextParameters;
1641
+ [property: string]: unknown;
1642
+ }
1643
+ /**
1644
+ * Additional inference parameters
1645
+ *
1646
+ * Additional inference parameters for Image To Text
1647
+ */
1648
+ interface ImageToTextParameters {
1649
+ /**
1650
+ * Parametrization of the text generation process
1651
+ */
1652
+ generate?: GenerationParameters$1;
1653
+ /**
1654
+ * The maximum number of tokens to generate.
1655
+ */
1656
+ max_new_tokens?: number;
1657
+ [property: string]: unknown;
1658
+ }
1659
+ /**
1660
+ * Parametrization of the text generation process
1661
+ *
1662
+ * Ad-hoc parametrization of the text generation process
1663
+ */
1664
+ interface GenerationParameters$1 {
1665
+ /**
1666
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
1667
+ */
1668
+ do_sample?: boolean;
1669
+ /**
1670
+ * Controls the stopping condition for beam-based methods.
1671
+ */
1672
+ early_stopping?: EarlyStoppingUnion$1;
1673
+ /**
1674
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
1675
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
1676
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
1677
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
1678
+ */
1679
+ epsilon_cutoff?: number;
1680
+ /**
1681
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
1682
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
1683
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
1684
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
1685
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
1686
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
1687
+ * for more details.
1688
+ */
1689
+ eta_cutoff?: number;
1690
+ /**
1691
+ * The maximum length (in tokens) of the generated text, including the input.
1692
+ */
1693
+ max_length?: number;
1694
+ /**
1695
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
1696
+ */
1697
+ max_new_tokens?: number;
1698
+ /**
1699
+ * The minimum length (in tokens) of the generated text, including the input.
1700
+ */
1701
+ min_length?: number;
1702
+ /**
1703
+ * The minimum number of tokens to generate. Takes precedence over maxLength.
1704
+ */
1705
+ min_new_tokens?: number;
1706
+ /**
1707
+ * Number of groups to divide num_beams into in order to ensure diversity among different
1708
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
1709
+ */
1710
+ num_beam_groups?: number;
1711
+ /**
1712
+ * Number of beams to use for beam search.
1713
+ */
1714
+ num_beams?: number;
1715
+ /**
1716
+ * The value balances the model confidence and the degeneration penalty in contrastive
1717
+ * search decoding.
1718
+ */
1719
+ penalty_alpha?: number;
1720
+ /**
1721
+ * The value used to modulate the next token probabilities.
1722
+ */
1723
+ temperature?: number;
1724
+ /**
1725
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
1726
+ */
1727
+ top_k?: number;
1728
+ /**
1729
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
1730
+ * that add up to top_p or higher are kept for generation.
1731
+ */
1732
+ top_p?: number;
1733
+ /**
1734
+ * Local typicality measures how similar the conditional probability of predicting a target
1735
+ * token next is to the expected conditional probability of predicting a random token next,
1736
+ * given the partial text already generated. If set to float < 1, the smallest set of the
1737
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
1738
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
1739
+ */
1740
+ typical_p?: number;
1741
+ /**
1742
+ * Whether the model should use the past last key/values attentions to speed up decoding
1743
+ */
1744
+ use_cache?: boolean;
1745
+ [property: string]: unknown;
1746
+ }
1747
+ /**
1748
+ * Controls the stopping condition for beam-based methods.
1749
+ */
1750
+ type EarlyStoppingUnion$1 = boolean | "never";
1751
+ /**
1752
+ * Outputs of inference for the Image To Text task
1753
+ */
1754
+ interface ImageToTextOutput {
1755
+ generatedText: unknown;
1756
+ /**
1757
+ * The generated text.
1758
+ */
1759
+ generated_text?: string;
1760
+ [property: string]: unknown;
1761
+ }
1762
+
1763
+ /**
1764
+ * Inference code generated from the JSON schema spec in ./spec
1765
+ *
1766
+ * Using src/scripts/inference-codegen
1767
+ */
1768
+ /**
1769
+ * Inputs for Image Segmentation inference
1770
+ */
1771
+ interface ImageSegmentationInput {
1772
+ /**
1773
+ * The input image data
1774
+ */
1775
+ inputs: unknown;
1776
+ /**
1777
+ * Additional inference parameters
1778
+ */
1779
+ parameters?: ImageSegmentationParameters;
1780
+ [property: string]: unknown;
1781
+ }
1782
+ /**
1783
+ * Additional inference parameters
1784
+ *
1785
+ * Additional inference parameters for Image Segmentation
1786
+ */
1787
+ interface ImageSegmentationParameters {
1788
+ /**
1789
+ * Threshold to use when turning the predicted masks into binary values.
1790
+ */
1791
+ mask_threshold?: number;
1792
+ /**
1793
+ * Mask overlap threshold to eliminate small, disconnected segments.
1794
+ */
1795
+ overlap_mask_area_threshold?: number;
1796
+ /**
1797
+ * Segmentation task to be performed, depending on model capabilities.
1798
+ */
1799
+ subtask?: ImageSegmentationSubtask;
1800
+ /**
1801
+ * Probability threshold to filter out predicted masks.
1802
+ */
1803
+ threshold?: number;
1804
+ [property: string]: unknown;
1805
+ }
1806
+ type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
1807
+ type ImageSegmentationOutput = ImageSegmentationOutputElement[];
1808
+ /**
1809
+ * Outputs of inference for the Image Segmentation task
1810
+ *
1811
+ * A predicted mask / segment
1812
+ */
1813
+ interface ImageSegmentationOutputElement {
1814
+ /**
1815
+ * The label of the predicted segment
1816
+ */
1817
+ label: string;
1818
+ /**
1819
+ * The corresponding mask as a black-and-white image
1820
+ */
1821
+ mask: unknown;
1822
+ /**
1823
+ * The score or confidence degree the model has
1824
+ */
1825
+ score?: number;
1826
+ [property: string]: unknown;
1827
+ }
1828
+
1829
+ /**
1830
+ * Inference code generated from the JSON schema spec in ./spec
1831
+ *
1832
+ * Using src/scripts/inference-codegen
1833
+ */
1834
+ /**
1835
+ * Inputs for Object Detection inference
1836
+ */
1837
+ interface ObjectDetectionInput {
1838
+ /**
1839
+ * The input image data
1840
+ */
1841
+ inputs: unknown;
1842
+ /**
1843
+ * Additional inference parameters
1844
+ */
1845
+ parameters?: ObjectDetectionParameters;
1846
+ [property: string]: unknown;
1847
+ }
1848
+ /**
1849
+ * Additional inference parameters
1850
+ *
1851
+ * Additional inference parameters for Object Detection
1852
+ */
1853
+ interface ObjectDetectionParameters {
1854
+ /**
1855
+ * The probability necessary to make a prediction.
1856
+ */
1857
+ threshold?: number;
1858
+ [property: string]: unknown;
1859
+ }
1860
+ /**
1861
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
1862
+ * image.
1863
+ */
1864
+ interface BoundingBox$1 {
1865
+ xmax: number;
1866
+ xmin: number;
1867
+ ymax: number;
1868
+ ymin: number;
1869
+ [property: string]: unknown;
1870
+ }
1871
+ type ObjectDetectionOutput = ObjectDetectionOutputElement[];
1872
+ /**
1873
+ * Outputs of inference for the Object Detection task
1874
+ */
1875
+ interface ObjectDetectionOutputElement {
1876
+ /**
1877
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
1878
+ * image.
1879
+ */
1880
+ box: BoundingBox$1;
1881
+ /**
1882
+ * The predicted label for the bounding box
1883
+ */
1884
+ label: string;
1885
+ /**
1886
+ * The associated score / probability
1887
+ */
1888
+ score: number;
1889
+ [property: string]: unknown;
1890
+ }
1891
+
1892
+ /**
1893
+ * Inference code generated from the JSON schema spec in ./spec
1894
+ *
1895
+ * Using src/scripts/inference-codegen
1896
+ */
1897
+ /**
1898
+ * Inputs for Depth Estimation inference
1899
+ */
1900
+ interface DepthEstimationInput {
1901
+ /**
1902
+ * The input image data
1903
+ */
1904
+ inputs: unknown;
1905
+ /**
1906
+ * Additional inference parameters
1907
+ */
1908
+ parameters?: {
1909
+ [key: string]: unknown;
1910
+ };
1911
+ [property: string]: unknown;
1912
+ }
1913
+ /**
1914
+ * Outputs of inference for the Depth Estimation task
1915
+ */
1916
+ interface DepthEstimationOutput {
1917
+ /**
1918
+ * The predicted depth as an image
1919
+ */
1920
+ depth?: unknown;
1921
+ /**
1922
+ * The predicted depth as a tensor
1923
+ */
1924
+ predicted_depth?: unknown;
1925
+ [property: string]: unknown;
1926
+ }
1927
+
1928
+ /**
1929
+ * Inference code generated from the JSON schema spec in ./spec
1930
+ *
1931
+ * Using src/scripts/inference-codegen
1932
+ */
1933
+ /**
1934
+ * Inputs for Question Answering inference
1935
+ */
1936
+ interface QuestionAnsweringInput {
1937
+ /**
1938
+ * One (context, question) pair to answer
1939
+ */
1940
+ inputs: QuestionAnsweringInputData;
1941
+ /**
1942
+ * Additional inference parameters
1943
+ */
1944
+ parameters?: QuestionAnsweringParameters;
1945
+ [property: string]: unknown;
1946
+ }
1947
+ /**
1948
+ * One (context, question) pair to answer
1949
+ */
1950
+ interface QuestionAnsweringInputData {
1951
+ /**
1952
+ * The context to be used for answering the question
1953
+ */
1954
+ context: string;
1955
+ /**
1956
+ * The question to be answered
1957
+ */
1958
+ question: string;
1959
+ [property: string]: unknown;
1960
+ }
1961
+ /**
1962
+ * Additional inference parameters
1963
+ *
1964
+ * Additional inference parameters for Question Answering
1965
+ */
1966
+ interface QuestionAnsweringParameters {
1967
+ /**
1968
+ * Attempts to align the answer to real words. Improves quality on space separated
1969
+ * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
1970
+ */
1971
+ align_to_words?: boolean;
1972
+ /**
1973
+ * If the context is too long to fit with the question for the model, it will be split in
1974
+ * several chunks with some overlap. This argument controls the size of that overlap.
1975
+ */
1976
+ doc_stride?: number;
1977
+ /**
1978
+ * Whether to accept impossible as an answer.
1979
+ */
1980
+ handle_impossible_answer?: boolean;
1981
+ /**
1982
+ * The maximum length of predicted answers (e.g., only answers with a shorter length are
1983
+ * considered).
1984
+ */
1985
+ max_answer_len?: number;
1986
+ /**
1987
+ * The maximum length of the question after tokenization. It will be truncated if needed.
1988
+ */
1989
+ max_question_len?: number;
1990
+ /**
1991
+ * The maximum length of the total sentence (context + question) in tokens of each chunk
1992
+ * passed to the model. The context will be split in several chunks (using docStride as
1993
+ * overlap) if needed.
1994
+ */
1995
+ max_seq_len?: number;
1996
+ /**
1997
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
1998
+ * return fewer than top_k answers if there are not enough options available within the
1999
+ * context.
2000
+ */
2001
+ top_k?: number;
2002
+ [property: string]: unknown;
2003
+ }
2004
+ type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
2005
+ /**
2006
+ * Outputs of inference for the Question Answering task
2007
+ */
2008
+ interface QuestionAnsweringOutputElement {
2009
+ /**
2010
+ * The answer to the question.
2011
+ */
2012
+ answer: string;
2013
+ /**
2014
+ * The character position in the input where the answer ends.
2015
+ */
2016
+ end: number;
2017
+ /**
2018
+ * The probability associated to the answer.
2019
+ */
2020
+ score: number;
2021
+ /**
2022
+ * The character position in the input where the answer begins.
2023
+ */
2024
+ start: number;
2025
+ [property: string]: unknown;
2026
+ }
2027
+
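An illustrative Question Answering exchange typed with the generated interfaces (values invented; the character offsets refer to the context string):

const qaRequest: QuestionAnsweringInput = {
  inputs: {
    question: "Where do I live?",
    context: "My name is Clara and I live in Berkeley.",
  },
  parameters: { top_k: 1 },
};
const qaAnswer: QuestionAnsweringOutputElement = {
  answer: "Berkeley",
  start: 31,
  end: 39,
  score: 0.97,
};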
2028
+ /**
2029
+ * Inference code generated from the JSON schema spec in ./spec
2030
+ *
2031
+ * Using src/scripts/inference-codegen
2032
+ */
2033
+ type SentenceSimilarityOutput = number[];
2034
+ /**
2035
+ * Inputs for Sentence similarity inference
2036
+ */
2037
+ interface SentenceSimilarityInput {
2038
+ inputs: SentenceSimilarityInputData;
2039
+ /**
2040
+ * Additional inference parameters
2041
+ */
2042
+ parameters?: {
2043
+ [key: string]: unknown;
2044
+ };
2045
+ [property: string]: unknown;
2046
+ }
2047
+ interface SentenceSimilarityInputData {
2048
+ /**
2049
+ * A list of strings which will be compared against the source_sentence.
2050
+ */
2051
+ sentences: string[];
2052
+ /**
2053
+ * The string that you wish to compare the other strings with. This can be a phrase,
2054
+ * sentence, or longer passage, depending on the model being used.
2055
+ */
2056
+ sourceSentence: string;
2057
+ [property: string]: unknown;
2058
+ }
2059
+
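A sketch of a Sentence Similarity call using these types; note that the generated input field is the camel-cased `sourceSentence`, unlike the widget example's `source_sentence` (values invented):

const similarityRequest: SentenceSimilarityInput = {
  inputs: {
    sourceSentence: "That is a happy person",
    sentences: ["That is a happy dog", "Today is a sunny day"],
  },
};
const similarityScores: SentenceSimilarityOutput = [0.69, 0.14];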
2060
+ /**
2061
+ * Inference code generated from the JSON schema spec in ./spec
2062
+ *
2063
+ * Using src/scripts/inference-codegen
2064
+ */
2065
+ /**
2066
+ * Inputs for Summarization inference
2067
+ *
2068
+ * Inputs for Text2text Generation inference
2069
+ */
2070
+ interface SummarizationInput {
2071
+ /**
2072
+ * The input text data
2073
+ */
2074
+ inputs: string;
2075
+ /**
2076
+ * Additional inference parameters
2077
+ */
2078
+ parameters?: Text2TextGenerationParameters$1;
2079
+ [property: string]: unknown;
2080
+ }
2081
+ /**
2082
+ * Additional inference parameters
2083
+ *
2084
+ * Additional inference parameters for Text2text Generation
2085
+ */
2086
+ interface Text2TextGenerationParameters$1 {
2087
+ /**
2088
+ * Whether to clean up the potential extra spaces in the text output.
2089
+ */
2090
+ clean_up_tokenization_spaces?: boolean;
2091
+ /**
2092
+ * Additional parametrization of the text generation algorithm
2093
+ */
2094
+ generate_parameters?: {
2095
+ [key: string]: unknown;
2096
+ };
2097
+ /**
2098
+ * The truncation strategy to use
2099
+ */
2100
+ truncation?: Text2TextGenerationTruncationStrategy$1;
2101
+ [property: string]: unknown;
2102
+ }
2103
+ type Text2TextGenerationTruncationStrategy$1 = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
2104
+ /**
2105
+ * Outputs of inference for the Summarization task
2106
+ */
2107
+ interface SummarizationOutput {
2108
+ /**
2109
+ * The summarized text.
2110
+ */
2111
+ summary_text: string;
2112
+ [property: string]: unknown;
2113
+ }
2114
+
2115
+ /**
2116
+ * Inference code generated from the JSON schema spec in ./spec
2117
+ *
2118
+ * Using src/scripts/inference-codegen
2119
+ */
2120
+ /**
2121
+ * Inputs for Table Question Answering inference
2122
+ */
2123
+ interface TableQuestionAnsweringInput {
2124
+ /**
2125
+ * One (table, question) pair to answer
2126
+ */
2127
+ inputs: TableQuestionAnsweringInputData;
2128
+ /**
2129
+ * Additional inference parameters
2130
+ */
2131
+ parameters?: {
2132
+ [key: string]: unknown;
2133
+ };
2134
+ [property: string]: unknown;
2135
+ }
2136
+ /**
2137
+ * One (table, question) pair to answer
2138
+ */
2139
+ interface TableQuestionAnsweringInputData {
2140
+ /**
2141
+ * The question to be answered about the table
2142
+ */
2143
+ question: string;
2144
+ /**
2145
+ * The table to serve as context for the questions
2146
+ */
2147
+ table: {
2148
+ [key: string]: string[];
2149
+ };
2150
+ [property: string]: unknown;
2151
+ }
2152
+ type TableQuestionAnsweringOutput = TableQuestionAnsweringOutputElement[];
2153
+ /**
2154
+ * Outputs of inference for the Table Question Answering task
2155
+ */
2156
+ interface TableQuestionAnsweringOutputElement {
2157
+ /**
2158
+ * If the model has an aggregator, this returns the aggregator.
2159
+ */
2160
+ aggregator?: string;
2161
+ /**
2162
+ * The answer of the question given the table. If there is an aggregator, the answer will be
2163
+ * preceded by `AGGREGATOR >`.
2164
+ */
2165
+ answer: string;
2166
+ /**
2167
+ * List of strings made up of the answer cell values.
2168
+ */
2169
+ cells: string[];
2170
+ /**
2171
+ * Coordinates of the cells of the answers.
2172
+ */
2173
+ coordinates: Array<number[]>;
2174
+ [property: string]: unknown;
2175
+ }
2176
+
2177
+ /**
2178
+ * Inference code generated from the JSON schema spec in ./spec
2179
+ *
2180
+ * Using src/scripts/inference-codegen
2181
+ */
2182
+ /**
2183
+ * Inputs for Text To Image inference
2184
+ */
2185
+ interface TextToImageInput {
2186
+ /**
2187
+ * The input text data (sometimes called "prompt")
2188
+ */
2189
+ inputs: string;
2190
+ /**
2191
+ * Additional inference parameters
2192
+ */
2193
+ parameters?: TextToImageParameters;
2194
+ [property: string]: unknown;
2195
+ }
2196
+ /**
2197
+ * Additional inference parameters
2198
+ *
2199
+ * Additional inference parameters for Text To Image
2200
+ */
2201
+ interface TextToImageParameters {
2202
+ /**
2203
+ * For diffusion models. A higher guidance scale value encourages the model to generate
2204
+ * images closely linked to the text prompt at the expense of lower image quality.
2205
+ */
2206
+ guidance_scale?: number;
2207
+ /**
2208
+ * One or several prompt to guide what NOT to include in image generation.
2209
+ */
2210
+ negative_prompt?: string[];
2211
+ /**
2212
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
2213
+ * a higher quality image at the expense of slower inference.
2214
+ */
2215
+ num_inference_steps?: number;
2216
+ /**
2217
+ * For diffusion models. Override the scheduler with a compatible one
2218
+ */
2219
+ scheduler?: string;
2220
+ /**
2221
+ * The size in pixel of the output image
2222
+ */
2223
+ target_size?: TargetSize;
2224
+ [property: string]: unknown;
2225
+ }
2226
+ /**
2227
+ * The size in pixel of the output image
2228
+ */
2229
+ interface TargetSize {
2230
+ height: number;
2231
+ width: number;
2232
+ [property: string]: unknown;
2233
+ }
2234
+ /**
2235
+ * Outputs of inference for the Text To Image task
2236
+ */
2237
+ interface TextToImageOutput {
2238
+ /**
2239
+ * The generated image
2240
+ */
2241
+ image: unknown;
2242
+ [property: string]: unknown;
2243
+ }
2244
+
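An illustrative Text To Image request conforming to the generated parameter types (all values are arbitrary examples):

const textToImageRequest: TextToImageInput = {
  inputs: "An astronaut riding a horse on the moon",
  parameters: {
    negative_prompt: ["blurry", "low quality"],
    num_inference_steps: 30,
    guidance_scale: 7.5,
    target_size: { width: 768, height: 768 },
  },
};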
2245
+ /**
2246
+ * Inference code generated from the JSON schema spec in ./spec
2247
+ *
2248
+ * Using src/scripts/inference-codegen
2249
+ */
2250
+ /**
2251
+ * Inputs for Text to Speech inference
2252
+ *
2253
+ * Inputs for Text To Audio inference
2254
+ */
2255
+ interface TextToSpeechInput {
2256
+ /**
2257
+ * The input text data
2258
+ */
2259
+ inputs: string;
2260
+ /**
2261
+ * Additional inference parameters
2262
+ */
2263
+ parameters?: TextToAudioParameters;
2264
+ [property: string]: unknown;
2265
+ }
2266
+ /**
2267
+ * Additional inference parameters
2268
+ *
2269
+ * Additional inference parameters for Text To Audio
2270
+ */
2271
+ interface TextToAudioParameters {
2272
+ /**
2273
+ * Parametrization of the text generation process
2274
+ */
2275
+ generate?: GenerationParameters;
2276
+ [property: string]: unknown;
2277
+ }
2278
+ /**
2279
+ * Parametrization of the text generation process
2280
+ *
2281
+ * Ad-hoc parametrization of the text generation process
2282
+ */
2283
+ interface GenerationParameters {
2284
+ /**
2285
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
2286
+ */
2287
+ do_sample?: boolean;
2288
+ /**
2289
+ * Controls the stopping condition for beam-based methods.
2290
+ */
2291
+ early_stopping?: EarlyStoppingUnion;
2292
+ /**
2293
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
2294
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
2295
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
2296
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
2297
+ */
2298
+ epsilon_cutoff?: number;
2299
+ /**
2300
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
2301
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
2302
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
2303
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
2304
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
2305
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
2306
+ * for more details.
2307
+ */
2308
+ eta_cutoff?: number;
2309
+ /**
2310
+ * The maximum length (in tokens) of the generated text, including the input.
2311
+ */
2312
+ max_length?: number;
2313
+ /**
2314
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
2315
+ */
2316
+ max_new_tokens?: number;
2317
+ /**
2318
+ * The minimum length (in tokens) of the generated text, including the input.
2319
+ */
2320
+ min_length?: number;
2321
+ /**
2322
+ * The minimum number of tokens to generate. Takes precedence over maxLength.
2323
+ */
2324
+ min_new_tokens?: number;
2325
+ /**
2326
+ * Number of groups to divide num_beams into in order to ensure diversity among different
2327
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
2328
+ */
2329
+ num_beam_groups?: number;
2330
+ /**
2331
+ * Number of beams to use for beam search.
2332
+ */
2333
+ num_beams?: number;
2334
+ /**
2335
+ * The value balances the model confidence and the degeneration penalty in contrastive
2336
+ * search decoding.
2337
+ */
2338
+ penalty_alpha?: number;
2339
+ /**
2340
+ * The value used to modulate the next token probabilities.
2341
+ */
2342
+ temperature?: number;
2343
+ /**
2344
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
2345
+ */
2346
+ top_k?: number;
2347
+ /**
2348
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
2349
+ * that add up to top_p or higher are kept for generation.
2350
+ */
2351
+ top_p?: number;
2352
+ /**
2353
+ * Local typicality measures how similar the conditional probability of predicting a target
2354
+ * token next is to the expected conditional probability of predicting a random token next,
2355
+ * given the partial text already generated. If set to float < 1, the smallest set of the
2356
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
2357
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
2358
+ */
2359
+ typical_p?: number;
2360
+ /**
2361
+ * Whether the model should use the past last key/values attentions to speed up decoding
2362
+ */
2363
+ use_cache?: boolean;
2364
+ [property: string]: unknown;
2365
+ }
2366
+ /**
2367
+ * Controls the stopping condition for beam-based methods.
2368
+ */
2369
+ type EarlyStoppingUnion = boolean | "never";
2370
+ /**
2371
+ * Outputs for Text to Speech inference
2372
+ *
2373
+ * Outputs of inference for the Text To Audio task
2374
+ */
2375
+ interface TextToSpeechOutput {
2376
+ /**
2377
+ * The generated audio waveform.
2378
+ */
2379
+ audio: unknown;
2380
+ samplingRate: unknown;
2381
+ /**
2382
+ * The sampling rate of the generated audio waveform.
2383
+ */
2384
+ sampling_rate?: number;
2385
+ [property: string]: unknown;
2386
+ }
2387
+
2388
+ /**
2389
+ * Inference code generated from the JSON schema spec in ./spec
2390
+ *
2391
+ * Using src/scripts/inference-codegen
2392
+ */
2393
+ /**
2394
+ * Inputs for Token Classification inference
2395
+ */
2396
+ interface TokenClassificationInput {
2397
+ /**
2398
+ * The input text data
2399
+ */
2400
+ inputs: string;
2401
+ /**
2402
+ * Additional inference parameters
2403
+ */
2404
+ parameters?: TokenClassificationParameters;
2405
+ [property: string]: unknown;
2406
+ }
2407
+ /**
2408
+ * Additional inference parameters
2409
+ *
2410
+ * Additional inference parameters for Token Classification
2411
+ */
2412
+ interface TokenClassificationParameters {
2413
+ /**
2414
+ * The strategy used to fuse tokens based on model predictions
2415
+ */
2416
+ aggregation_strategy?: TokenClassificationAggregationStrategy;
2417
+ /**
2418
+ * A list of labels to ignore
2419
+ */
2420
+ ignore_labels?: string[];
2421
+ /**
2422
+ * The number of overlapping tokens between chunks when splitting the input text.
2423
+ */
2424
+ stride?: number;
2425
+ [property: string]: unknown;
2426
+ }
2427
+ /**
2428
+ * Do not aggregate tokens
2429
+ *
2430
+ * Group consecutive tokens with the same label in a single entity.
2431
+ *
2432
+ * Similar to "simple", also preserves word integrity (use the label predicted for the first
2433
+ * token in a word).
2434
+ *
2435
+ * Similar to "simple", also preserves word integrity (uses the label with the highest
2436
+ * score, averaged across the word's tokens).
2437
+ *
2438
+ * Similar to "simple", also preserves word integrity (uses the label with the highest score
2439
+ * across the word's tokens).
2440
+ */
2441
+ type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
2442
+ type TokenClassificationOutput = TokenClassificationOutputElement[];
2443
+ /**
2444
+ * Outputs of inference for the Token Classification task
2445
+ */
2446
+ interface TokenClassificationOutputElement {
2447
+ /**
2448
+ * The character position in the input where this group ends.
2449
+ */
2450
+ end?: number;
2451
+ /**
2452
+ * The predicted label for that group of tokens
2453
+ */
2454
+ entity_group?: string;
2455
+ label: unknown;
2456
+ /**
2457
+ * The associated score / probability
2458
+ */
2459
+ score: number;
2460
+ /**
2461
+ * The character position in the input where this group begins.
2462
+ */
2463
+ start?: number;
2464
+ /**
2465
+ * The corresponding text
2466
+ */
2467
+ word?: string;
2468
+ [property: string]: unknown;
2469
+ }
2470
+
2471
+ /**
2472
+ * Inference code generated from the JSON schema spec in ./spec
2473
+ *
2474
+ * Using src/scripts/inference-codegen
2475
+ */
2476
+ /**
2477
+ * Inputs for Translation inference
2478
+ *
2479
+ * Inputs for Text2text Generation inference
2480
+ */
2481
+ interface TranslationInput {
2482
+ /**
2483
+ * The input text data
2484
+ */
2485
+ inputs: string;
2486
+ /**
2487
+ * Additional inference parameters
2488
+ */
2489
+ parameters?: Text2TextGenerationParameters;
2490
+ [property: string]: unknown;
2491
+ }
2492
+ /**
2493
+ * Additional inference parameters
2494
+ *
2495
+ * Additional inference parameters for Text2text Generation
2496
+ */
2497
+ interface Text2TextGenerationParameters {
2498
+ /**
2499
+ * Whether to clean up the potential extra spaces in the text output.
2500
+ */
2501
+ clean_up_tokenization_spaces?: boolean;
2502
+ /**
2503
+ * Additional parametrization of the text generation algorithm
2504
+ */
2505
+ generate_parameters?: {
2506
+ [key: string]: unknown;
2507
+ };
2508
+ /**
2509
+ * The truncation strategy to use
2510
+ */
2511
+ truncation?: Text2TextGenerationTruncationStrategy;
2512
+ [property: string]: unknown;
2513
+ }
2514
+ type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
2515
+ /**
2516
+ * Outputs of inference for the Translation task
2517
+ */
2518
+ interface TranslationOutput {
2519
+ /**
2520
+ * The translated text.
2521
+ */
2522
+ translation_text: string;
2523
+ [property: string]: unknown;
2524
+ }
2525
+
2526
+ /**
2527
+ * Inference code generated from the JSON schema spec in ./spec
2528
+ *
2529
+ * Using src/scripts/inference-codegen
2530
+ */
2531
+ /**
2532
+ * Inputs for Text Classification inference
2533
+ */
2534
+ interface TextClassificationInput {
2535
+ /**
2536
+ * The text to classify
2537
+ */
2538
+ inputs: string;
2539
+ /**
2540
+ * Additional inference parameters
2541
+ */
2542
+ parameters?: TextClassificationParameters;
2543
+ [property: string]: unknown;
2544
+ }
2545
+ /**
2546
+ * Additional inference parameters
2547
+ *
2548
+ * Additional inference parameters for Text Classification
2549
+ */
2550
+ interface TextClassificationParameters {
2551
+ function_to_apply?: ClassificationOutputTransform$1;
2552
+ /**
2553
+ * When specified, limits the output to the top K most probable classes.
2554
+ */
2555
+ top_k?: number;
2556
+ [property: string]: unknown;
2557
+ }
2558
+ /**
2559
+ * The function to apply to the model outputs in order to retrieve the scores.
2560
+ */
2561
+ type ClassificationOutputTransform$1 = "sigmoid" | "softmax" | "none";
2562
+ type TextClassificationOutput = TextClassificationOutputElement[];
2563
+ /**
2564
+ * Outputs of inference for the Text Classification task
2565
+ */
2566
+ interface TextClassificationOutputElement {
2567
+ /**
2568
+ * The predicted class label.
2569
+ */
2570
+ label: string;
2571
+ /**
2572
+ * The corresponding probability.
2573
+ */
2574
+ score: number;
2575
+ [property: string]: unknown;
2576
+ }
2577
+
2578
+ /**
2579
+ * Inference code generated from the JSON schema spec in ./spec
2580
+ *
2581
+ * Using src/scripts/inference-codegen
2582
+ */
2583
+ /**
2584
+ * Inputs for Text Generation inference
2585
+ */
2586
+ interface TextGenerationInput {
2587
+ /**
2588
+ * The text to initialize generation with
2589
+ */
2590
+ inputs: string;
2591
+ /**
2592
+ * Additional inference parameters
2593
+ */
2594
+ parameters?: TextGenerationParameters;
2595
+ [property: string]: unknown;
2596
+ }
2597
+ /**
2598
+ * Additional inference parameters
2599
+ *
2600
+ * Additional inference parameters for Text Generation
2601
+ */
2602
+ interface TextGenerationParameters {
2603
+ /**
2604
+ * The number of sampling queries to run. Only the best one (in terms of total logprob) will
2605
+ * be returned.
2606
+ */
2607
+ best_of?: number;
2608
+ /**
2609
+ * Whether or not to output decoder input details
2610
+ */
2611
+ decoder_input_details?: boolean;
2612
+ /**
2613
+ * Whether or not to output details
2614
+ */
2615
+ details?: boolean;
2616
+ /**
2617
+ * Whether to use logits sampling instead of greedy decoding when generating new tokens.
2618
+ */
2619
+ do_sample?: boolean;
2620
+ /**
2621
+ * The maximum number of tokens to generate.
2622
+ */
2623
+ max_new_tokens?: number;
2624
+ /**
2625
+ * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
2626
+ * paper](https://hf.co/papers/1909.05858) for more details.
2627
+ */
2628
+ repetition_penalty?: number;
2629
+ /**
2630
+ * Whether to prepend the prompt to the generated text.
2631
+ */
2632
+ return_full_text?: boolean;
2633
+ /**
2634
+ * The random sampling seed.
2635
+ */
2636
+ seed?: number;
2637
+ /**
2638
+ * Stop generating tokens if a member of `stop_sequences` is generated.
2639
+ */
2640
+ stop_sequences?: string[];
2641
+ /**
2642
+ * The value used to modulate the logits distribution.
2643
+ */
2644
+ temperature?: number;
2645
+ /**
2646
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
2647
+ */
2648
+ top_k?: number;
2649
+ /**
2650
+ * If set to < 1, only the smallest set of most probable tokens with probabilities that add
2651
+ * up to `top_p` or higher are kept for generation.
2652
+ */
2653
+ top_p?: number;
2654
+ /**
2655
+ * Truncate input tokens to the given size.
2656
+ */
2657
+ truncate?: number;
2658
+ /**
2659
+ * Typical Decoding mass. See [Typical Decoding for Natural Language
2660
+ * Generation](https://hf.co/papers/2202.00666) for more information
2661
+ */
2662
+ typical_p?: number;
2663
+ /**
2664
+ * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
2665
+ */
2666
+ watermark?: boolean;
2667
+ [property: string]: unknown;
2668
+ }
2669
+ /**
2670
+ * Outputs for Text Generation inference
2671
+ */
2672
+ interface TextGenerationOutput {
2673
+ /**
2674
+ * When enabled, details about the generation
2675
+ */
2676
+ details?: TextGenerationOutputDetails;
2677
+ /**
2678
+ * The generated text
2679
+ */
2680
+ generated_text: string;
2681
+ [property: string]: unknown;
2682
+ }
2683
+ /**
2684
+ * When enabled, details about the generation
2685
+ */
2686
+ interface TextGenerationOutputDetails {
2687
+ /**
2688
+ * Details about additional sequences when best_of is provided
2689
+ */
2690
+ best_of_sequences?: TextGenerationSequenceDetails[];
2691
+ /**
2692
+ * The reason why the generation was stopped.
2693
+ */
2694
+ finish_reason: FinishReason;
2695
+ /**
2696
+ * The number of generated tokens
2697
+ */
2698
+ generated_tokens: number;
2699
+ prefill: PrefillToken[];
2700
+ /**
2701
+ * The random seed used for generation
2702
+ */
2703
+ seed?: number;
2704
+ /**
2705
+ * The generated tokens and associated details
2706
+ */
2707
+ tokens: Token[];
2708
+ [property: string]: unknown;
2709
+ }
2710
+ interface TextGenerationSequenceDetails {
2711
+ /**
2712
+ * The reason why the generation was stopped.
2713
+ */
2714
+ finish_reason: FinishReason;
2715
+ /**
2716
+ * The generated text
2717
+ */
2718
+ generated_text: number;
2719
+ /**
2720
+ * The number of generated tokens
2721
+ */
2722
+ generated_tokens: number;
2723
+ prefill: PrefillToken[];
2724
+ /**
2725
+ * The random seed used for generation
2726
+ */
2727
+ seed?: number;
2728
+ /**
2729
+ * The generated tokens and associated details
2730
+ */
2731
+ tokens: Token[];
2732
+ [property: string]: unknown;
2733
+ }
2734
+ /**
2735
+ * The generated sequence reached the maximum allowed length
2736
+ *
2737
+ * The model generated an end-of-sentence (EOS) token
2738
+ *
2739
 + * One of the sequences in stop_sequences was generated
2740
+ */
2741
+ type FinishReason = "length" | "eos_token" | "stop_sequence";
2742
+ interface PrefillToken {
2743
+ id: number;
2744
+ logprob: number;
2745
+ /**
2746
+ * The text associated with that token
2747
+ */
2748
+ text: string;
2749
+ [property: string]: unknown;
2750
+ }
2751
+ interface Token {
2752
+ id: number;
2753
+ logprob: number;
2754
+ /**
2755
+ * Whether or not that token is a special one
2756
+ */
2757
+ special: boolean;
2758
+ /**
2759
+ * The text associated with that token
2760
+ */
2761
+ text: string;
2762
+ [property: string]: unknown;
2763
+ }
2764
+
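A hedged sketch of reading a TextGenerationOutput, including the optional per-generation details; the helper name is hypothetical.

// Summarizes a generation result; falls back gracefully when details are absent.
function summarizeGeneration(output: TextGenerationOutput): string {
  const tokens = output.details?.generated_tokens ?? "unknown";
  const reason: FinishReason | undefined = output.details?.finish_reason;
  return `${output.generated_text} (${tokens} tokens, finish_reason=${reason ?? "n/a"})`;
}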
2765
+ /**
2766
+ * Inference code generated from the JSON schema spec in ./spec
2767
+ *
2768
+ * Using src/scripts/inference-codegen
2769
+ */
2770
+ /**
2771
+ * Inputs for Video Classification inference
2772
+ */
2773
+ interface VideoClassificationInput {
2774
+ /**
2775
+ * The input video data
2776
+ */
2777
+ inputs: unknown;
2778
+ /**
2779
+ * Additional inference parameters
2780
+ */
2781
+ parameters?: VideoClassificationParameters;
2782
+ [property: string]: unknown;
2783
+ }
2784
+ /**
2785
+ * Additional inference parameters
2786
+ *
2787
+ * Additional inference parameters for Video Classification
2788
+ */
2789
+ interface VideoClassificationParameters {
2790
+ /**
2791
+ * The sampling rate used to select frames from the video.
2792
+ */
2793
+ frame_sampling_rate?: number;
2794
+ function_to_apply?: ClassificationOutputTransform;
2795
+ /**
2796
+ * The number of sampled frames to consider for classification.
2797
+ */
2798
+ num_frames?: number;
2799
+ /**
2800
+ * When specified, limits the output to the top K most probable classes.
2801
+ */
2802
+ top_k?: number;
2803
+ [property: string]: unknown;
2804
+ }
2805
+ /**
2806
+ * The function to apply to the model outputs in order to retrieve the scores.
2807
+ */
2808
+ type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
2809
+ type VideoClassificationOutput = VideoClassificationOutputElement[];
2810
+ /**
2811
+ * Outputs of inference for the Video Classification task
2812
+ */
2813
+ interface VideoClassificationOutputElement {
2814
+ /**
2815
+ * The predicted class label.
2816
+ */
2817
+ label: string;
2818
+ /**
2819
+ * The corresponding probability.
2820
+ */
2821
+ score: number;
2822
+ [property: string]: unknown;
2823
+ }
2824
+
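A minimal sketch of a VideoClassificationInput; since inputs is typed as unknown, the raw-byte payload below is only an assumption for illustration.

const videoRequest: VideoClassificationInput = {
  inputs: new Uint8Array([/* raw video bytes */]), // payload shape is an assumption
  parameters: {
    frame_sampling_rate: 4,
    num_frames: 16,
    function_to_apply: "softmax",
    top_k: 3,
  },
};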
2825
+ /**
2826
+ * Inference code generated from the JSON schema spec in ./spec
2827
+ *
2828
+ * Using src/scripts/inference-codegen
2829
+ */
2830
+ /**
2831
+ * Inputs for Visual Question Answering inference
2832
+ */
2833
+ interface VisualQuestionAnsweringInput {
2834
+ /**
2835
+ * One (image, question) pair to answer
2836
+ */
2837
+ inputs: VisualQuestionAnsweringInputData;
2838
+ /**
2839
+ * Additional inference parameters
2840
+ */
2841
+ parameters?: VisualQuestionAnsweringParameters;
2842
+ [property: string]: unknown;
2843
+ }
2844
+ /**
2845
+ * One (image, question) pair to answer
2846
+ */
2847
+ interface VisualQuestionAnsweringInputData {
2848
+ /**
2849
+ * The image.
2850
+ */
2851
+ image: unknown;
2852
+ /**
2853
+ * The question to answer based on the image.
2854
+ */
2855
+ question: unknown;
2856
+ [property: string]: unknown;
2857
+ }
2858
+ /**
2859
+ * Additional inference parameters
2860
+ *
2861
+ * Additional inference parameters for Visual Question Answering
2862
+ */
2863
+ interface VisualQuestionAnsweringParameters {
2864
+ /**
2865
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
2866
 + * return fewer than top_k answers if there are not enough options available within the
2867
+ * context.
2868
+ */
2869
+ top_k?: number;
2870
+ [property: string]: unknown;
2871
+ }
2872
+ type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
2873
+ /**
2874
+ * Outputs of inference for the Visual Question Answering task
2875
+ */
2876
+ interface VisualQuestionAnsweringOutputElement {
2877
+ /**
2878
+ * The answer to the question
2879
+ */
2880
+ answer?: string;
2881
+ label: unknown;
2882
+ /**
2883
+ * The associated score / probability
2884
+ */
2885
+ score: number;
2886
+ [property: string]: unknown;
2887
+ }
2888
+
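A sketch of one (image, question) pair and of reading the most likely answer; the image payload and the helper name are assumptions, since image is typed as unknown.

const vqaRequest: VisualQuestionAnsweringInput = {
  inputs: {
    image: new Uint8Array([/* image bytes */]), // payload shape is an assumption
    question: "How many cats are in the picture?",
  },
  parameters: { top_k: 1 },
};

// Returns the highest-ranked answer, if any.
function bestAnswer(out: VisualQuestionAnsweringOutput): string | undefined {
  return out[0]?.answer;
}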
2889
+ /**
2890
+ * Inference code generated from the JSON schema spec in ./spec
2891
+ *
2892
+ * Using src/scripts/inference-codegen
2893
+ */
2894
+ /**
2895
+ * Inputs for Zero Shot Classification inference
2896
+ */
2897
+ interface ZeroShotClassificationInput {
2898
+ /**
2899
+ * The input text data, with candidate labels
2900
+ */
2901
+ inputs: ZeroShotClassificationInputData;
2902
+ /**
2903
+ * Additional inference parameters
2904
+ */
2905
+ parameters?: ZeroShotClassificationParameters;
2906
+ [property: string]: unknown;
2907
+ }
2908
+ /**
2909
+ * The input text data, with candidate labels
2910
+ */
2911
+ interface ZeroShotClassificationInputData {
2912
+ /**
2913
+ * The set of possible class labels to classify the text into.
2914
+ */
2915
+ candidateLabels: string[];
2916
+ /**
2917
+ * The text to classify
2918
+ */
2919
+ text: string;
2920
+ [property: string]: unknown;
2921
+ }
2922
+ /**
2923
+ * Additional inference parameters
2924
+ *
2925
+ * Additional inference parameters for Zero Shot Classification
2926
+ */
2927
+ interface ZeroShotClassificationParameters {
2928
+ /**
2929
+ * The sentence used in conjunction with candidateLabels to attempt the text classification
2930
+ * by replacing the placeholder with the candidate labels.
2931
+ */
2932
+ hypothesis_template?: string;
2933
+ /**
2934
+ * Whether multiple candidate labels can be true. If false, the scores are normalized such
2935
+ * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
2936
+ * considered independent and probabilities are normalized for each candidate.
2937
+ */
2938
+ multi_label?: boolean;
2939
+ [property: string]: unknown;
2940
+ }
2941
+ type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
2942
+ /**
2943
+ * Outputs of inference for the Zero Shot Classification task
2944
+ */
2945
+ interface ZeroShotClassificationOutputElement {
2946
+ /**
2947
+ * The predicted class label.
2948
+ */
2949
+ label: string;
2950
+ /**
2951
+ * The corresponding probability.
2952
+ */
2953
+ score: number;
2954
+ [property: string]: unknown;
2955
+ }
2956
+
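A minimal sketch of a zero-shot classification request; the "{}" placeholder form in hypothesis_template and the label set are assumptions for illustration.

const zeroShotRequest: ZeroShotClassificationInput = {
  inputs: {
    text: "The delivery arrived two weeks late and the box was damaged.",
    candidateLabels: ["shipping", "billing", "product quality"],
  },
  parameters: {
    hypothesis_template: "This complaint is about {}.", // placeholder syntax is an assumption
    multi_label: true,
  },
};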
2957
+ /**
2958
+ * Inference code generated from the JSON schema spec in ./spec
2959
+ *
2960
+ * Using src/scripts/inference-codegen
2961
+ */
2962
+ /**
2963
+ * Inputs for Zero Shot Image Classification inference
2964
+ */
2965
+ interface ZeroShotImageClassificationInput {
2966
+ /**
2967
+ * The input image data, with candidate labels
2968
+ */
2969
+ inputs: ZeroShotImageClassificationInputData;
2970
+ /**
2971
+ * Additional inference parameters
2972
+ */
2973
+ parameters?: ZeroShotImageClassificationParameters;
2974
+ [property: string]: unknown;
2975
+ }
2976
+ /**
2977
+ * The input image data, with candidate labels
2978
+ */
2979
+ interface ZeroShotImageClassificationInputData {
2980
+ /**
2981
+ * The candidate labels for this image
2982
+ */
2983
+ candidateLabels: string[];
2984
+ /**
2985
+ * The image data to classify
2986
+ */
2987
+ image: unknown;
2988
+ [property: string]: unknown;
2989
+ }
2990
+ /**
2991
+ * Additional inference parameters
2992
+ *
2993
+ * Additional inference parameters for Zero Shot Image Classification
2994
+ */
2995
+ interface ZeroShotImageClassificationParameters {
2996
+ /**
2997
+ * The sentence used in conjunction with candidateLabels to attempt the text classification
2998
+ * by replacing the placeholder with the candidate labels.
2999
+ */
3000
+ hypothesis_template?: string;
3001
+ [property: string]: unknown;
3002
+ }
3003
+ type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
3004
+ /**
3005
+ * Outputs of inference for the Zero Shot Image Classification task
3006
+ */
3007
+ interface ZeroShotImageClassificationOutputElement {
3008
+ /**
3009
+ * The predicted class label.
3010
+ */
3011
+ label: string;
3012
+ /**
3013
+ * The corresponding probability.
3014
+ */
3015
+ score: number;
3016
+ [property: string]: unknown;
3017
+ }
3018
+
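A sketch of a zero-shot image classification request plus a small helper that picks the top-scoring label; the image payload and the helper are assumptions.

const zeroShotImageRequest: ZeroShotImageClassificationInput = {
  inputs: {
    image: new Uint8Array([/* image bytes */]), // payload shape is an assumption
    candidateLabels: ["cat", "dog", "bird"],
  },
};

function topLabel(out: ZeroShotImageClassificationOutput): string | undefined {
  return [...out].sort((a, b) => b.score - a.score)[0]?.label;
}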
3019
+ /**
3020
+ * Inference code generated from the JSON schema spec in ./spec
3021
+ *
3022
+ * Using src/scripts/inference-codegen
3023
+ */
3024
+ /**
3025
+ * Inputs for Zero Shot Object Detection inference
3026
+ */
3027
+ interface ZeroShotObjectDetectionInput {
3028
+ /**
3029
+ * The input image data, with candidate labels
3030
+ */
3031
+ inputs: ZeroShotObjectDetectionInputData;
3032
+ /**
3033
+ * Additional inference parameters
3034
+ */
3035
+ parameters?: {
3036
+ [key: string]: unknown;
3037
+ };
3038
+ [property: string]: unknown;
3039
+ }
3040
+ /**
3041
+ * The input image data, with candidate labels
3042
+ */
3043
+ interface ZeroShotObjectDetectionInputData {
3044
+ /**
3045
+ * The candidate labels for this image
3046
+ */
3047
+ candidateLabels: string[];
3048
+ /**
3049
+ * The image data to generate bounding boxes from
3050
+ */
3051
+ image: unknown;
3052
+ [property: string]: unknown;
3053
+ }
3054
+ /**
3055
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
3056
+ * image.
3057
+ */
3058
+ interface BoundingBox {
3059
+ xmax: number;
3060
+ xmin: number;
3061
+ ymax: number;
3062
+ ymin: number;
3063
+ [property: string]: unknown;
3064
+ }
3065
+ type ZeroShotObjectDetectionOutput = ZeroShotObjectDetectionOutputElement[];
3066
+ /**
3067
+ * Outputs of inference for the Zero Shot Object Detection task
3068
+ */
3069
+ interface ZeroShotObjectDetectionOutputElement {
3070
+ /**
3071
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
3072
+ * image.
3073
+ */
3074
+ box: BoundingBox;
3075
+ /**
3076
+ * A candidate label
3077
+ */
3078
+ label: string;
3079
+ /**
3080
+ * The associated score / probability
3081
+ */
3082
+ score: number;
3083
+ [property: string]: unknown;
3084
+ }
3085
+
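A hedged sketch of post-processing zero-shot object detections, keeping the boxes above a score threshold; the helper name and threshold are illustrative.

function confidentBoxes(
  detections: ZeroShotObjectDetectionOutput,
  threshold = 0.5
): BoundingBox[] {
  return detections.filter((d) => d.score >= threshold).map((d) => d.box);
}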
1069
3086
  /**
1070
3087
  * Model libraries compatible with each ML task
1071
3088
  */
@@ -1231,4 +3248,4 @@ declare namespace index {
1231
3248
  };
1232
3249
  }
1233
3250
 
1234
- export { ALL_DISPLAY_MODEL_LIBRARY_KEYS, ALL_MODEL_LIBRARY_KEYS, ExampleRepo, InferenceDisplayability, LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS, LibraryUiElement, MAPPING_DEFAULT_WIDGET, MODALITIES, MODALITY_LABELS, MODEL_LIBRARIES_UI_ELEMENTS, Modality, ModelData, ModelLibraryKey, PIPELINE_DATA, PIPELINE_TYPES, PIPELINE_TYPES_SET, PipelineData, PipelineType, SPECIAL_TOKENS_ATTRIBUTES, SUBTASK_TYPES, SpecialTokensMap, TASKS_DATA, TASKS_MODEL_LIBRARIES, TaskData, TaskDataCustom, TaskDemo, TaskDemoEntry, TokenizerConfig, TransformersInfo, WidgetExample, WidgetExampleAssetAndPromptInput, WidgetExampleAssetAndTextInput, WidgetExampleAssetAndZeroShotInput, WidgetExampleAssetInput, WidgetExampleAttribute, WidgetExampleOutput, WidgetExampleOutputAnswerScore, WidgetExampleOutputLabels, WidgetExampleOutputText, WidgetExampleOutputUrl, WidgetExampleSentenceSimilarityInput, WidgetExampleStructuredDataInput, WidgetExampleTableDataInput, WidgetExampleTextAndContextInput, WidgetExampleTextAndTableInput, WidgetExampleTextInput, WidgetExampleZeroShotTextInput, WidgetType, index as snippets };
3251
 + export { ALL_DISPLAY_MODEL_LIBRARY_KEYS, ALL_MODEL_LIBRARY_KEYS, AudioClassificationInput, AudioClassificationOutput, AudioClassificationOutputElement, AudioClassificationParameters, AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput, AutomaticSpeechRecognitionOutputChunk, AutomaticSpeechRecognitionParameters, BoundingBox, ChatMessage, ClassificationOutputTransform$1 as ClassificationOutputTransform, DepthEstimationInput, DepthEstimationOutput, DocumentQuestionAnsweringInput, DocumentQuestionAnsweringInputData, DocumentQuestionAnsweringOutput, DocumentQuestionAnsweringOutputElement, DocumentQuestionAnsweringParameters, EarlyStoppingUnion$2 as EarlyStoppingUnion, ExampleRepo, FeatureExtractionInput, FeatureExtractionOutput, FillMaskInput, FillMaskOutput, FillMaskOutputElement, FillMaskParameters, FinishReason, GenerationParameters$2 as GenerationParameters, ImageClassificationInput, ImageClassificationOutput, ImageClassificationOutputElement, ImageClassificationParameters, ImageSegmentationInput, ImageSegmentationOutput, ImageSegmentationOutputElement, ImageSegmentationParameters, ImageSegmentationSubtask, ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToTextInput, ImageToTextOutput, ImageToTextParameters, InferenceDisplayability, LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS, LibraryUiElement, MAPPING_DEFAULT_WIDGET, MODALITIES, MODALITY_LABELS, MODEL_LIBRARIES_UI_ELEMENTS, Modality, ModelData, ModelLibraryKey, ObjectDetectionInput, ObjectDetectionOutput, ObjectDetectionOutputElement, ObjectDetectionParameters, PIPELINE_DATA, PIPELINE_TYPES, PIPELINE_TYPES_SET, PipelineData, PipelineType, PrefillToken, QuestionAnsweringInput, QuestionAnsweringInputData, QuestionAnsweringOutput, QuestionAnsweringOutputElement, QuestionAnsweringParameters, SPECIAL_TOKENS_ATTRIBUTES, SUBTASK_TYPES, SentenceSimilarityInput, SentenceSimilarityInputData, SentenceSimilarityOutput, SpecialTokensMap, SummarizationInput, SummarizationOutput, TASKS_DATA, TASKS_MODEL_LIBRARIES, TableQuestionAnsweringInput, TableQuestionAnsweringInputData, TableQuestionAnsweringOutput, TableQuestionAnsweringOutputElement, TargetSize$1 as TargetSize, TaskData, TaskDataCustom, TaskDemo, TaskDemoEntry, Text2TextGenerationParameters, Text2TextGenerationTruncationStrategy, TextClassificationInput, TextClassificationOutput, TextClassificationOutputElement, TextClassificationParameters, TextGenerationInput, TextGenerationOutput, TextGenerationOutputDetails, TextGenerationParameters, TextGenerationSequenceDetails, TextToAudioParameters, TextToImageInput, TextToImageOutput, TextToImageParameters, TextToSpeechInput, TextToSpeechOutput, Token, TokenClassificationAggregationStrategy, TokenClassificationInput, TokenClassificationOutput, TokenClassificationOutputElement, TokenClassificationParameters, TokenizerConfig, TransformersInfo, TranslationInput, TranslationOutput, VideoClassificationInput, VideoClassificationOutput, VideoClassificationOutputElement, VideoClassificationParameters, VisualQuestionAnsweringInput, VisualQuestionAnsweringInputData, VisualQuestionAnsweringOutput, VisualQuestionAnsweringOutputElement, VisualQuestionAnsweringParameters, WidgetExample, WidgetExampleAssetAndPromptInput, WidgetExampleAssetAndTextInput, WidgetExampleAssetAndZeroShotInput, WidgetExampleAssetInput, WidgetExampleAttribute, WidgetExampleChatInput, WidgetExampleOutput, WidgetExampleOutputAnswerScore, WidgetExampleOutputLabels, WidgetExampleOutputText, WidgetExampleOutputUrl, WidgetExampleSentenceSimilarityInput, WidgetExampleStructuredDataInput, WidgetExampleTableDataInput, WidgetExampleTextAndContextInput, WidgetExampleTextAndTableInput, WidgetExampleTextInput, WidgetExampleZeroShotTextInput, WidgetType, WordBox, ZeroShotClassificationInput, ZeroShotClassificationInputData, ZeroShotClassificationOutput, ZeroShotClassificationOutputElement, ZeroShotClassificationParameters, ZeroShotImageClassificationInput, ZeroShotImageClassificationInputData, ZeroShotImageClassificationOutput, ZeroShotImageClassificationOutputElement, ZeroShotImageClassificationParameters, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionInputData, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, index as snippets };
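With these additions, the inference input/output types are available from the package root; a minimal consumption sketch, assuming @huggingface/tasks 0.4.0 is installed:

import type {
  TextGenerationInput,
  TextGenerationOutput,
  ZeroShotClassificationInput,
} from "@huggingface/tasks";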