@huggingface/tasks 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,6 +31,7 @@ __export(src_exports, {
31
31
  PIPELINE_DATA: () => PIPELINE_DATA,
32
32
  PIPELINE_TYPES: () => PIPELINE_TYPES,
33
33
  PIPELINE_TYPES_SET: () => PIPELINE_TYPES_SET,
34
+ SPECIAL_TOKENS_ATTRIBUTES: () => SPECIAL_TOKENS_ATTRIBUTES,
34
35
  SUBTASK_TYPES: () => SUBTASK_TYPES,
35
36
  TASKS_DATA: () => TASKS_DATA,
36
37
  TASKS_MODEL_LIBRARIES: () => TASKS_MODEL_LIBRARIES,
@@ -58,7 +59,7 @@ var LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS = {
58
59
  keras: ["image-classification"],
59
60
  nemo: ["automatic-speech-recognition"],
60
61
  open_clip: ["zero-shot-classification", "zero-shot-image-classification"],
61
- paddlenlp: ["conversational", "fill-mask", "summarization", "zero-shot-classification"],
62
+ paddlenlp: ["fill-mask", "summarization", "zero-shot-classification"],
62
63
  peft: ["text-generation"],
63
64
  "pyannote-audio": ["automatic-speech-recognition"],
64
65
  "sentence-transformers": ["feature-extraction", "sentence-similarity"],
@@ -929,20 +930,9 @@ var PIPELINE_DATA = {
929
930
  modality: "nlp",
930
931
  color: "indigo"
931
932
  },
932
- conversational: {
933
- name: "Conversational",
934
- subtasks: [
935
- {
936
- type: "dialogue-generation",
937
- name: "Dialogue Generation"
938
- }
939
- ],
940
- modality: "nlp",
941
- color: "green"
942
- },
943
933
  "feature-extraction": {
944
934
  name: "Feature Extraction",
945
- modality: "multimodal",
935
+ modality: "nlp",
946
936
  color: "red"
947
937
  },
948
938
  "text-generation": {
@@ -952,6 +942,14 @@ var PIPELINE_DATA = {
952
942
  type: "dialogue-modeling",
953
943
  name: "Dialogue Modeling"
954
944
  },
945
+ {
946
+ type: "dialogue-generation",
947
+ name: "Dialogue Generation"
948
+ },
949
+ {
950
+ type: "conversational",
951
+ name: "Conversational"
952
+ },
955
953
  {
956
954
  type: "language-modeling",
957
955
  name: "Language Modeling"
@@ -1123,7 +1121,7 @@ var PIPELINE_DATA = {
1123
1121
  },
1124
1122
  "text-to-image": {
1125
1123
  name: "Text-to-Image",
1126
- modality: "multimodal",
1124
+ modality: "cv",
1127
1125
  color: "yellow"
1128
1126
  },
1129
1127
  "image-to-text": {
@@ -1134,7 +1132,7 @@ var PIPELINE_DATA = {
1134
1132
  name: "Image Captioning"
1135
1133
  }
1136
1134
  ],
1137
- modality: "multimodal",
1135
+ modality: "cv",
1138
1136
  color: "red"
1139
1137
  },
1140
1138
  "image-to-image": {
@@ -1158,7 +1156,7 @@ var PIPELINE_DATA = {
1158
1156
  },
1159
1157
  "image-to-video": {
1160
1158
  name: "Image-to-Video",
1161
- modality: "multimodal",
1159
+ modality: "cv",
1162
1160
  color: "indigo"
1163
1161
  },
1164
1162
  "unconditional-image-generation": {
@@ -1293,9 +1291,15 @@ var PIPELINE_DATA = {
1293
1291
  },
1294
1292
  "text-to-video": {
1295
1293
  name: "Text-to-Video",
1296
- modality: "multimodal",
1294
+ modality: "cv",
1297
1295
  color: "green"
1298
1296
  },
1297
+ "image-text-to-text": {
1298
+ name: "Image + Text to Text (VLLMs)",
1299
+ modality: "multimodal",
1300
+ color: "red",
1301
+ hideInDatasets: true
1302
+ },
1299
1303
  "visual-question-answering": {
1300
1304
  name: "Visual Question Answering",
1301
1305
  subtasks: [
@@ -1326,7 +1330,7 @@ var PIPELINE_DATA = {
1326
1330
  },
1327
1331
  "graph-ml": {
1328
1332
  name: "Graph Machine Learning",
1329
- modality: "multimodal",
1333
+ modality: "other",
1330
1334
  color: "green"
1331
1335
  },
1332
1336
  "mask-generation": {
@@ -1341,14 +1345,19 @@ var PIPELINE_DATA = {
1341
1345
  },
1342
1346
  "text-to-3d": {
1343
1347
  name: "Text-to-3D",
1344
- modality: "multimodal",
1348
+ modality: "cv",
1345
1349
  color: "yellow"
1346
1350
  },
1347
1351
  "image-to-3d": {
1348
1352
  name: "Image-to-3D",
1349
- modality: "multimodal",
1353
+ modality: "cv",
1350
1354
  color: "green"
1351
1355
  },
1356
+ "image-feature-extraction": {
1357
+ name: "Image Feature Extraction",
1358
+ modality: "cv",
1359
+ color: "indigo"
1360
+ },
1352
1361
  other: {
1353
1362
  name: "Other",
1354
1363
  modality: "other",
@@ -1574,68 +1583,8 @@ var taskData3 = {
1574
1583
  };
1575
1584
  var data_default3 = taskData3;
1576
1585
 
1577
- // src/tasks/conversational/data.ts
1578
- var taskData4 = {
1579
- datasets: [
1580
- {
1581
- description: "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.",
1582
- id: "blended_skill_talk"
1583
- },
1584
- {
1585
- description: "ConvAI is a dataset of human-to-bot conversations labeled for quality. This data can be used to train a metric for evaluating dialogue systems",
1586
- id: "conv_ai_2"
1587
- },
1588
- {
1589
- description: "EmpatheticDialogues, is a dataset of 25k conversations grounded in emotional situations",
1590
- id: "empathetic_dialogues"
1591
- }
1592
- ],
1593
- demo: {
1594
- inputs: [
1595
- {
1596
- label: "Input",
1597
- content: "Hey my name is Julien! How are you?",
1598
- type: "text"
1599
- }
1600
- ],
1601
- outputs: [
1602
- {
1603
- label: "Answer",
1604
- content: "Hi Julien! My name is Julia! I am well.",
1605
- type: "text"
1606
- }
1607
- ]
1608
- },
1609
- metrics: [
1610
- {
1611
- description: "BLEU score is calculated by counting the number of shared single or subsequent tokens between the generated sequence and the reference. Subsequent n tokens are called \u201Cn-grams\u201D. Unigram refers to a single token while bi-gram refers to token pairs and n-grams refer to n subsequent tokens. The score ranges from 0 to 1, where 1 means the translation perfectly matched and 0 did not match at all",
1612
- id: "bleu"
1613
- }
1614
- ],
1615
- models: [
1616
- {
1617
- description: "A faster and smaller model than the famous BERT model.",
1618
- id: "facebook/blenderbot-400M-distill"
1619
- },
1620
- {
1621
- description: "DialoGPT is a large-scale pretrained dialogue response generation model for multiturn conversations.",
1622
- id: "microsoft/DialoGPT-large"
1623
- }
1624
- ],
1625
- spaces: [
1626
- {
1627
- description: "A chatbot based on Blender model.",
1628
- id: "EXFINITE/BlenderBot-UI"
1629
- }
1630
- ],
1631
- summary: "Conversational response modelling is the task of generating conversational text that is relevant, coherent and knowledgable given a prompt. These models have applications in chatbots, and as a part of voice assistants",
1632
- widgetModels: ["facebook/blenderbot-400M-distill"],
1633
- youtubeId: ""
1634
- };
1635
- var data_default4 = taskData4;
1636
-
1637
1586
  // src/tasks/document-question-answering/data.ts
1638
- var taskData5 = {
1587
+ var taskData4 = {
1639
1588
  datasets: [
1640
1589
  {
1641
1590
  // TODO write proper description
@@ -1705,10 +1654,10 @@ var taskData5 = {
1705
1654
  widgetModels: ["impira/layoutlm-document-qa"],
1706
1655
  youtubeId: ""
1707
1656
  };
1708
- var data_default5 = taskData5;
1657
+ var data_default4 = taskData4;
1709
1658
 
1710
1659
  // src/tasks/feature-extraction/data.ts
1711
- var taskData6 = {
1660
+ var taskData5 = {
1712
1661
  datasets: [
1713
1662
  {
1714
1663
  description: "Wikipedia dataset containing cleaned articles of all languages. Can be used to train `feature-extraction` models.",
@@ -1751,10 +1700,10 @@ var taskData6 = {
1751
1700
  summary: "Feature extraction refers to the process of transforming raw data into numerical features that can be processed while preserving the information in the original dataset.",
1752
1701
  widgetModels: ["facebook/bart-base"]
1753
1702
  };
1754
- var data_default6 = taskData6;
1703
+ var data_default5 = taskData5;
1755
1704
 
1756
1705
  // src/tasks/fill-mask/data.ts
1757
- var taskData7 = {
1706
+ var taskData6 = {
1758
1707
  datasets: [
1759
1708
  {
1760
1709
  description: "A common dataset that is used to train models for many languages.",
@@ -1826,10 +1775,10 @@ var taskData7 = {
1826
1775
  widgetModels: ["distilroberta-base"],
1827
1776
  youtubeId: "mqElG5QJWUg"
1828
1777
  };
1829
- var data_default7 = taskData7;
1778
+ var data_default6 = taskData6;
1830
1779
 
1831
1780
  // src/tasks/image-classification/data.ts
1832
- var taskData8 = {
1781
+ var taskData7 = {
1833
1782
  datasets: [
1834
1783
  {
1835
1784
  // TODO write proper description
@@ -1912,10 +1861,10 @@ var taskData8 = {
1912
1861
  widgetModels: ["google/vit-base-patch16-224"],
1913
1862
  youtubeId: "tjAIM7BOYhw"
1914
1863
  };
1915
- var data_default8 = taskData8;
1864
+ var data_default7 = taskData7;
1916
1865
 
1917
1866
  // src/tasks/image-to-image/data.ts
1918
- var taskData9 = {
1867
+ var taskData8 = {
1919
1868
  datasets: [
1920
1869
  {
1921
1870
  description: "Synthetic dataset, for image relighting",
@@ -2007,10 +1956,10 @@ var taskData9 = {
2007
1956
  widgetModels: ["lllyasviel/sd-controlnet-canny"],
2008
1957
  youtubeId: ""
2009
1958
  };
2010
- var data_default9 = taskData9;
1959
+ var data_default8 = taskData8;
2011
1960
 
2012
1961
  // src/tasks/image-to-text/data.ts
2013
- var taskData10 = {
1962
+ var taskData9 = {
2014
1963
  datasets: [
2015
1964
  {
2016
1965
  // TODO write proper description
@@ -2087,10 +2036,10 @@ var taskData10 = {
2087
2036
  widgetModels: ["Salesforce/blip-image-captioning-base"],
2088
2037
  youtubeId: ""
2089
2038
  };
2090
- var data_default10 = taskData10;
2039
+ var data_default9 = taskData9;
2091
2040
 
2092
2041
  // src/tasks/image-segmentation/data.ts
2093
- var taskData11 = {
2042
+ var taskData10 = {
2094
2043
  datasets: [
2095
2044
  {
2096
2045
  description: "Scene segmentation dataset.",
@@ -2182,10 +2131,10 @@ var taskData11 = {
2182
2131
  widgetModels: ["facebook/detr-resnet-50-panoptic"],
2183
2132
  youtubeId: "dKE8SIt9C-w"
2184
2133
  };
2185
- var data_default11 = taskData11;
2134
+ var data_default10 = taskData10;
2186
2135
 
2187
2136
  // src/tasks/mask-generation/data.ts
2188
- var taskData12 = {
2137
+ var taskData11 = {
2189
2138
  datasets: [],
2190
2139
  demo: {
2191
2140
  inputs: [
@@ -2234,10 +2183,10 @@ var taskData12 = {
2234
2183
  widgetModels: [],
2235
2184
  youtubeId: ""
2236
2185
  };
2237
- var data_default12 = taskData12;
2186
+ var data_default11 = taskData11;
2238
2187
 
2239
2188
  // src/tasks/object-detection/data.ts
2240
- var taskData13 = {
2189
+ var taskData12 = {
2241
2190
  datasets: [
2242
2191
  {
2243
2192
  // TODO write proper description
@@ -2309,10 +2258,10 @@ var taskData13 = {
2309
2258
  widgetModels: ["facebook/detr-resnet-50"],
2310
2259
  youtubeId: "WdAeKSOpxhw"
2311
2260
  };
2312
- var data_default13 = taskData13;
2261
+ var data_default12 = taskData12;
2313
2262
 
2314
2263
  // src/tasks/depth-estimation/data.ts
2315
- var taskData14 = {
2264
+ var taskData13 = {
2316
2265
  datasets: [
2317
2266
  {
2318
2267
  description: "NYU Depth V2 Dataset: Video dataset containing both RGB and depth sensor data",
@@ -2366,10 +2315,10 @@ var taskData14 = {
2366
2315
  widgetModels: [""],
2367
2316
  youtubeId: ""
2368
2317
  };
2369
- var data_default14 = taskData14;
2318
+ var data_default13 = taskData13;
2370
2319
 
2371
2320
  // src/tasks/placeholder/data.ts
2372
- var taskData15 = {
2321
+ var taskData14 = {
2373
2322
  datasets: [],
2374
2323
  demo: {
2375
2324
  inputs: [],
@@ -2386,10 +2335,10 @@ var taskData15 = {
2386
2335
  /// (eg, text2text-generation is the canonical ID of translation)
2387
2336
  canonicalId: void 0
2388
2337
  };
2389
- var data_default15 = taskData15;
2338
+ var data_default14 = taskData14;
2390
2339
 
2391
2340
  // src/tasks/reinforcement-learning/data.ts
2392
- var taskData16 = {
2341
+ var taskData15 = {
2393
2342
  datasets: [
2394
2343
  {
2395
2344
  description: "A curation of widely used datasets for Data Driven Deep Reinforcement Learning (D4RL)",
@@ -2455,10 +2404,10 @@ var taskData16 = {
2455
2404
  widgetModels: [],
2456
2405
  youtubeId: "q0BiUn5LiBc"
2457
2406
  };
2458
- var data_default16 = taskData16;
2407
+ var data_default15 = taskData15;
2459
2408
 
2460
2409
  // src/tasks/question-answering/data.ts
2461
- var taskData17 = {
2410
+ var taskData16 = {
2462
2411
  datasets: [
2463
2412
  {
2464
2413
  // TODO write proper description
@@ -2522,10 +2471,10 @@ var taskData17 = {
2522
2471
  widgetModels: ["deepset/roberta-base-squad2"],
2523
2472
  youtubeId: "ajPx5LwJD-I"
2524
2473
  };
2525
- var data_default17 = taskData17;
2474
+ var data_default16 = taskData16;
2526
2475
 
2527
2476
  // src/tasks/sentence-similarity/data.ts
2528
- var taskData18 = {
2477
+ var taskData17 = {
2529
2478
  datasets: [
2530
2479
  {
2531
2480
  description: "Bing queries with relevant passages from various web sources.",
@@ -2617,10 +2566,10 @@ var taskData18 = {
2617
2566
  widgetModels: ["sentence-transformers/all-MiniLM-L6-v2"],
2618
2567
  youtubeId: "VCZq5AkbNEU"
2619
2568
  };
2620
- var data_default18 = taskData18;
2569
+ var data_default17 = taskData17;
2621
2570
 
2622
2571
  // src/tasks/summarization/data.ts
2623
- var taskData19 = {
2572
+ var taskData18 = {
2624
2573
  canonicalId: "text2text-generation",
2625
2574
  datasets: [
2626
2575
  {
@@ -2686,10 +2635,10 @@ var taskData19 = {
2686
2635
  widgetModels: ["sshleifer/distilbart-cnn-12-6"],
2687
2636
  youtubeId: "yHnr5Dk2zCI"
2688
2637
  };
2689
- var data_default19 = taskData19;
2638
+ var data_default18 = taskData18;
2690
2639
 
2691
2640
  // src/tasks/table-question-answering/data.ts
2692
- var taskData20 = {
2641
+ var taskData19 = {
2693
2642
  datasets: [
2694
2643
  {
2695
2644
  description: "The WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.",
@@ -2740,10 +2689,10 @@ var taskData20 = {
2740
2689
  summary: "Table Question Answering (Table QA) is the answering a question about an information on a given table.",
2741
2690
  widgetModels: ["google/tapas-base-finetuned-wtq"]
2742
2691
  };
2743
- var data_default20 = taskData20;
2692
+ var data_default19 = taskData19;
2744
2693
 
2745
2694
  // src/tasks/tabular-classification/data.ts
2746
- var taskData21 = {
2695
+ var taskData20 = {
2747
2696
  datasets: [
2748
2697
  {
2749
2698
  description: "A comprehensive curation of datasets covering all benchmarks.",
@@ -2807,10 +2756,10 @@ var taskData21 = {
2807
2756
  widgetModels: ["scikit-learn/tabular-playground"],
2808
2757
  youtubeId: ""
2809
2758
  };
2810
- var data_default21 = taskData21;
2759
+ var data_default20 = taskData20;
2811
2760
 
2812
2761
  // src/tasks/tabular-regression/data.ts
2813
- var taskData22 = {
2762
+ var taskData21 = {
2814
2763
  datasets: [
2815
2764
  {
2816
2765
  description: "A comprehensive curation of datasets covering all benchmarks.",
@@ -2862,10 +2811,10 @@ var taskData22 = {
2862
2811
  widgetModels: ["scikit-learn/Fish-Weight"],
2863
2812
  youtubeId: ""
2864
2813
  };
2865
- var data_default22 = taskData22;
2814
+ var data_default21 = taskData21;
2866
2815
 
2867
2816
  // src/tasks/text-to-image/data.ts
2868
- var taskData23 = {
2817
+ var taskData22 = {
2869
2818
  datasets: [
2870
2819
  {
2871
2820
  description: "RedCaps is a large-scale dataset of 12M image-text pairs collected from Reddit.",
@@ -2957,10 +2906,10 @@ var taskData23 = {
2957
2906
  widgetModels: ["CompVis/stable-diffusion-v1-4"],
2958
2907
  youtubeId: ""
2959
2908
  };
2960
- var data_default23 = taskData23;
2909
+ var data_default22 = taskData22;
2961
2910
 
2962
2911
  // src/tasks/text-to-speech/data.ts
2963
- var taskData24 = {
2912
+ var taskData23 = {
2964
2913
  canonicalId: "text-to-audio",
2965
2914
  datasets: [
2966
2915
  {
@@ -3025,10 +2974,10 @@ var taskData24 = {
3025
2974
  widgetModels: ["suno/bark"],
3026
2975
  youtubeId: "NW62DpzJ274"
3027
2976
  };
3028
- var data_default24 = taskData24;
2977
+ var data_default23 = taskData23;
3029
2978
 
3030
2979
  // src/tasks/token-classification/data.ts
3031
- var taskData25 = {
2980
+ var taskData24 = {
3032
2981
  datasets: [
3033
2982
  {
3034
2983
  description: "A widely used dataset useful to benchmark named entity recognition models.",
@@ -3104,10 +3053,10 @@ var taskData25 = {
3104
3053
  widgetModels: ["dslim/bert-base-NER"],
3105
3054
  youtubeId: "wVHdVlPScxA"
3106
3055
  };
3107
- var data_default25 = taskData25;
3056
+ var data_default24 = taskData24;
3108
3057
 
3109
3058
  // src/tasks/translation/data.ts
3110
- var taskData26 = {
3059
+ var taskData25 = {
3111
3060
  canonicalId: "text2text-generation",
3112
3061
  datasets: [
3113
3062
  {
@@ -3169,10 +3118,10 @@ var taskData26 = {
3169
3118
  widgetModels: ["t5-small"],
3170
3119
  youtubeId: "1JvfrvZgi6c"
3171
3120
  };
3172
- var data_default26 = taskData26;
3121
+ var data_default25 = taskData25;
3173
3122
 
3174
3123
  // src/tasks/text-classification/data.ts
3175
- var taskData27 = {
3124
+ var taskData26 = {
3176
3125
  datasets: [
3177
3126
  {
3178
3127
  description: "A widely used dataset used to benchmark multiple variants of text classification.",
@@ -3257,10 +3206,10 @@ var taskData27 = {
3257
3206
  widgetModels: ["distilbert-base-uncased-finetuned-sst-2-english"],
3258
3207
  youtubeId: "leNG9fN9FQU"
3259
3208
  };
3260
- var data_default27 = taskData27;
3209
+ var data_default26 = taskData26;
3261
3210
 
3262
3211
  // src/tasks/text-generation/data.ts
3263
- var taskData28 = {
3212
+ var taskData27 = {
3264
3213
  datasets: [
3265
3214
  {
3266
3215
  description: "A large multilingual dataset of text crawled from the web.",
@@ -3361,10 +3310,10 @@ var taskData28 = {
3361
3310
  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
3362
3311
  youtubeId: "Vpjb1lu0MDk"
3363
3312
  };
3364
- var data_default28 = taskData28;
3313
+ var data_default27 = taskData27;
3365
3314
 
3366
3315
  // src/tasks/text-to-video/data.ts
3367
- var taskData29 = {
3316
+ var taskData28 = {
3368
3317
  datasets: [
3369
3318
  {
3370
3319
  description: "Microsoft Research Video to Text is a large-scale dataset for open domain video captioning",
@@ -3456,10 +3405,10 @@ var taskData29 = {
3456
3405
  widgetModels: [],
3457
3406
  youtubeId: void 0
3458
3407
  };
3459
- var data_default29 = taskData29;
3408
+ var data_default28 = taskData28;
3460
3409
 
3461
3410
  // src/tasks/unconditional-image-generation/data.ts
3462
- var taskData30 = {
3411
+ var taskData29 = {
3463
3412
  datasets: [
3464
3413
  {
3465
3414
  description: "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images per class.",
@@ -3521,10 +3470,10 @@ var taskData30 = {
3521
3470
  // TODO: Add related video
3522
3471
  youtubeId: ""
3523
3472
  };
3524
- var data_default30 = taskData30;
3473
+ var data_default29 = taskData29;
3525
3474
 
3526
3475
  // src/tasks/video-classification/data.ts
3527
- var taskData31 = {
3476
+ var taskData30 = {
3528
3477
  datasets: [
3529
3478
  {
3530
3479
  // TODO write proper description
@@ -3603,10 +3552,10 @@ var taskData31 = {
3603
3552
  widgetModels: [],
3604
3553
  youtubeId: ""
3605
3554
  };
3606
- var data_default31 = taskData31;
3555
+ var data_default30 = taskData30;
3607
3556
 
3608
3557
  // src/tasks/visual-question-answering/data.ts
3609
- var taskData32 = {
3558
+ var taskData31 = {
3610
3559
  datasets: [
3611
3560
  {
3612
3561
  description: "A widely used dataset containing questions (with answers) about images.",
@@ -3696,10 +3645,10 @@ var taskData32 = {
3696
3645
  widgetModels: ["dandelin/vilt-b32-finetuned-vqa"],
3697
3646
  youtubeId: ""
3698
3647
  };
3699
- var data_default32 = taskData32;
3648
+ var data_default31 = taskData31;
3700
3649
 
3701
3650
  // src/tasks/zero-shot-classification/data.ts
3702
- var taskData33 = {
3651
+ var taskData32 = {
3703
3652
  datasets: [
3704
3653
  {
3705
3654
  description: "A widely used dataset used to benchmark multiple variants of text classification.",
@@ -3758,10 +3707,10 @@ var taskData33 = {
3758
3707
  summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
3759
3708
  widgetModels: ["facebook/bart-large-mnli"]
3760
3709
  };
3761
- var data_default33 = taskData33;
3710
+ var data_default32 = taskData32;
3762
3711
 
3763
3712
  // src/tasks/zero-shot-image-classification/data.ts
3764
- var taskData34 = {
3713
+ var taskData33 = {
3765
3714
  datasets: [
3766
3715
  {
3767
3716
  // TODO write proper description
@@ -3835,10 +3784,10 @@ var taskData34 = {
3835
3784
  widgetModels: ["openai/clip-vit-large-patch14-336"],
3836
3785
  youtubeId: ""
3837
3786
  };
3838
- var data_default34 = taskData34;
3787
+ var data_default33 = taskData33;
3839
3788
 
3840
3789
  // src/tasks/zero-shot-object-detection/data.ts
3841
- var taskData35 = {
3790
+ var taskData34 = {
3842
3791
  datasets: [],
3843
3792
  demo: {
3844
3793
  inputs: [
@@ -3893,21 +3842,22 @@ var taskData35 = {
3893
3842
  widgetModels: [],
3894
3843
  youtubeId: ""
3895
3844
  };
3896
- var data_default35 = taskData35;
3845
+ var data_default34 = taskData34;
3897
3846
 
3898
3847
  // src/tasks/index.ts
3899
3848
  var TASKS_MODEL_LIBRARIES = {
3900
3849
  "audio-classification": ["speechbrain", "transformers", "transformers.js"],
3901
3850
  "audio-to-audio": ["asteroid", "speechbrain"],
3902
3851
  "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
3903
- conversational: ["transformers"],
3904
3852
  "depth-estimation": ["transformers", "transformers.js"],
3905
3853
  "document-question-answering": ["transformers", "transformers.js"],
3906
3854
  "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
3907
3855
  "fill-mask": ["transformers", "transformers.js"],
3908
3856
  "graph-ml": ["transformers"],
3909
3857
  "image-classification": ["keras", "timm", "transformers", "transformers.js"],
3858
+ "image-feature-extraction": ["timm", "transformers"],
3910
3859
  "image-segmentation": ["transformers", "transformers.js"],
3860
+ "image-text-to-text": ["transformers"],
3911
3861
  "image-to-image": ["diffusers", "transformers", "transformers.js"],
3912
3862
  "image-to-text": ["transformers", "transformers.js"],
3913
3863
  "image-to-video": ["diffusers"],
@@ -3954,7 +3904,7 @@ var TASKS_MODEL_LIBRARIES = {
3954
3904
  "text-to-3d": [],
3955
3905
  "image-to-3d": []
3956
3906
  };
3957
- function getData(type, partialTaskData = data_default15) {
3907
+ function getData(type, partialTaskData = data_default14) {
3958
3908
  return {
3959
3909
  ...partialTaskData,
3960
3910
  id: type,
@@ -3966,51 +3916,52 @@ var TASKS_DATA = {
3966
3916
  "audio-classification": getData("audio-classification", data_default),
3967
3917
  "audio-to-audio": getData("audio-to-audio", data_default2),
3968
3918
  "automatic-speech-recognition": getData("automatic-speech-recognition", data_default3),
3969
- conversational: getData("conversational", data_default4),
3970
- "depth-estimation": getData("depth-estimation", data_default14),
3971
- "document-question-answering": getData("document-question-answering", data_default5),
3972
- "feature-extraction": getData("feature-extraction", data_default6),
3973
- "fill-mask": getData("fill-mask", data_default7),
3919
+ "depth-estimation": getData("depth-estimation", data_default13),
3920
+ "document-question-answering": getData("document-question-answering", data_default4),
3921
+ "feature-extraction": getData("feature-extraction", data_default5),
3922
+ "fill-mask": getData("fill-mask", data_default6),
3974
3923
  "graph-ml": void 0,
3975
- "image-classification": getData("image-classification", data_default8),
3976
- "image-segmentation": getData("image-segmentation", data_default11),
3977
- "image-to-image": getData("image-to-image", data_default9),
3978
- "image-to-text": getData("image-to-text", data_default10),
3924
+ "image-classification": getData("image-classification", data_default7),
3925
+ "image-segmentation": getData("image-segmentation", data_default10),
3926
+ "image-text-to-text": void 0,
3927
+ "image-to-image": getData("image-to-image", data_default8),
3928
+ "image-to-text": getData("image-to-text", data_default9),
3979
3929
  "image-to-video": void 0,
3980
- "mask-generation": getData("mask-generation", data_default12),
3930
+ "mask-generation": getData("mask-generation", data_default11),
3981
3931
  "multiple-choice": void 0,
3982
- "object-detection": getData("object-detection", data_default13),
3983
- "video-classification": getData("video-classification", data_default31),
3932
+ "object-detection": getData("object-detection", data_default12),
3933
+ "video-classification": getData("video-classification", data_default30),
3984
3934
  other: void 0,
3985
- "question-answering": getData("question-answering", data_default17),
3986
- "reinforcement-learning": getData("reinforcement-learning", data_default16),
3935
+ "question-answering": getData("question-answering", data_default16),
3936
+ "reinforcement-learning": getData("reinforcement-learning", data_default15),
3987
3937
  robotics: void 0,
3988
- "sentence-similarity": getData("sentence-similarity", data_default18),
3989
- summarization: getData("summarization", data_default19),
3990
- "table-question-answering": getData("table-question-answering", data_default20),
3938
+ "sentence-similarity": getData("sentence-similarity", data_default17),
3939
+ summarization: getData("summarization", data_default18),
3940
+ "table-question-answering": getData("table-question-answering", data_default19),
3991
3941
  "table-to-text": void 0,
3992
- "tabular-classification": getData("tabular-classification", data_default21),
3993
- "tabular-regression": getData("tabular-regression", data_default22),
3942
+ "tabular-classification": getData("tabular-classification", data_default20),
3943
+ "tabular-regression": getData("tabular-regression", data_default21),
3994
3944
  "tabular-to-text": void 0,
3995
- "text-classification": getData("text-classification", data_default27),
3996
- "text-generation": getData("text-generation", data_default28),
3945
+ "text-classification": getData("text-classification", data_default26),
3946
+ "text-generation": getData("text-generation", data_default27),
3997
3947
  "text-retrieval": void 0,
3998
- "text-to-image": getData("text-to-image", data_default23),
3999
- "text-to-speech": getData("text-to-speech", data_default24),
3948
+ "text-to-image": getData("text-to-image", data_default22),
3949
+ "text-to-speech": getData("text-to-speech", data_default23),
4000
3950
  "text-to-audio": void 0,
4001
- "text-to-video": getData("text-to-video", data_default29),
3951
+ "text-to-video": getData("text-to-video", data_default28),
4002
3952
  "text2text-generation": void 0,
4003
3953
  "time-series-forecasting": void 0,
4004
- "token-classification": getData("token-classification", data_default25),
4005
- translation: getData("translation", data_default26),
4006
- "unconditional-image-generation": getData("unconditional-image-generation", data_default30),
4007
- "visual-question-answering": getData("visual-question-answering", data_default32),
3954
+ "token-classification": getData("token-classification", data_default24),
3955
+ translation: getData("translation", data_default25),
3956
+ "unconditional-image-generation": getData("unconditional-image-generation", data_default29),
3957
+ "visual-question-answering": getData("visual-question-answering", data_default31),
4008
3958
  "voice-activity-detection": void 0,
4009
- "zero-shot-classification": getData("zero-shot-classification", data_default33),
4010
- "zero-shot-image-classification": getData("zero-shot-image-classification", data_default34),
4011
- "zero-shot-object-detection": getData("zero-shot-object-detection", data_default35),
4012
- "text-to-3d": getData("text-to-3d", data_default15),
4013
- "image-to-3d": getData("image-to-3d", data_default15)
3959
+ "zero-shot-classification": getData("zero-shot-classification", data_default32),
3960
+ "zero-shot-image-classification": getData("zero-shot-image-classification", data_default33),
3961
+ "zero-shot-object-detection": getData("zero-shot-object-detection", data_default34),
3962
+ "text-to-3d": getData("text-to-3d", data_default14),
3963
+ "image-to-3d": getData("image-to-3d", data_default14),
3964
+ "image-feature-extraction": getData("image-feature-extraction", data_default14)
4014
3965
  };
4015
3966
 
4016
3967
  // src/model-libraries-snippets.ts
@@ -4816,6 +4767,18 @@ var InferenceDisplayability = /* @__PURE__ */ ((InferenceDisplayability2) => {
4816
4767
  return InferenceDisplayability2;
4817
4768
  })(InferenceDisplayability || {});
4818
4769
 
4770
+ // src/tokenizer-data.ts
4771
+ var SPECIAL_TOKENS_ATTRIBUTES = [
4772
+ "bos_token",
4773
+ "eos_token",
4774
+ "unk_token",
4775
+ "sep_token",
4776
+ "pad_token",
4777
+ "cls_token",
4778
+ "mask_token"
4779
+ // additional_special_tokens (TODO)
4780
+ ];
4781
+
4819
4782
  // src/snippets/index.ts
4820
4783
  var snippets_exports = {};
4821
4784
  __export(snippets_exports, {
@@ -4833,11 +4796,6 @@ __export(inputs_exports, {
4833
4796
  var inputsZeroShotClassification = () => `"Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!"`;
4834
4797
  var inputsTranslation = () => `"\u041C\u0435\u043D\u044F \u0437\u043E\u0432\u0443\u0442 \u0412\u043E\u043B\u044C\u0444\u0433\u0430\u043D\u0433 \u0438 \u044F \u0436\u0438\u0432\u0443 \u0432 \u0411\u0435\u0440\u043B\u0438\u043D\u0435"`;
4835
4798
  var inputsSummarization = () => `"The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."`;
4836
- var inputsConversational = () => `{
4837
- "past_user_inputs": ["Which movie is the best ?"],
4838
- "generated_responses": ["It is Die Hard for sure."],
4839
- "text": "Can you explain why ?"
4840
- }`;
4841
4799
  var inputsTableQuestionAnswering = () => `{
4842
4800
  "query": "How many stars does the transformers repository have?",
4843
4801
  "table": {
@@ -4889,7 +4847,6 @@ var modelInputSnippets = {
4889
4847
  "audio-to-audio": inputsAudioToAudio,
4890
4848
  "audio-classification": inputsAudioClassification,
4891
4849
  "automatic-speech-recognition": inputsAutomaticSpeechRecognition,
4892
- conversational: inputsConversational,
4893
4850
  "document-question-answering": inputsVisualQuestionAnswering,
4894
4851
  "feature-extraction": inputsFeatureExtraction,
4895
4852
  "fill-mask": inputsFillMask,
@@ -4969,7 +4926,6 @@ var curlSnippets = {
4969
4926
  "zero-shot-classification": snippetZeroShotClassification,
4970
4927
  translation: snippetBasic,
4971
4928
  summarization: snippetBasic,
4972
- conversational: snippetBasic,
4973
4929
  "feature-extraction": snippetBasic,
4974
4930
  "text-generation": snippetBasic,
4975
4931
  "text2text-generation": snippetBasic,
@@ -5104,7 +5060,6 @@ var pythonSnippets = {
5104
5060
  "zero-shot-classification": snippetZeroShotClassification2,
5105
5061
  translation: snippetBasic2,
5106
5062
  summarization: snippetBasic2,
5107
- conversational: snippetBasic2,
5108
5063
  "feature-extraction": snippetBasic2,
5109
5064
  "text-generation": snippetBasic2,
5110
5065
  "text2text-generation": snippetBasic2,
@@ -5254,7 +5209,6 @@ var jsSnippets = {
5254
5209
  "zero-shot-classification": snippetZeroShotClassification3,
5255
5210
  translation: snippetBasic3,
5256
5211
  summarization: snippetBasic3,
5257
- conversational: snippetBasic3,
5258
5212
  "feature-extraction": snippetBasic3,
5259
5213
  "text-generation": snippetBasic3,
5260
5214
  "text2text-generation": snippetBasic3,
@@ -5290,6 +5244,7 @@ function hasJsInferenceSnippet(model) {
5290
5244
  PIPELINE_DATA,
5291
5245
  PIPELINE_TYPES,
5292
5246
  PIPELINE_TYPES_SET,
5247
+ SPECIAL_TOKENS_ATTRIBUTES,
5293
5248
  SUBTASK_TYPES,
5294
5249
  TASKS_DATA,
5295
5250
  TASKS_MODEL_LIBRARIES,