@huggingface/tasks 0.12.1 → 0.12.2

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the versions exactly as they appear in their public registry.
Files changed (40)
  1. package/dist/index.cjs +120 -57
  2. package/dist/index.js +120 -57
  3. package/dist/src/model-libraries-snippets.d.ts +1 -0
  4. package/dist/src/model-libraries-snippets.d.ts.map +1 -1
  5. package/dist/src/model-libraries.d.ts +9 -2
  6. package/dist/src/model-libraries.d.ts.map +1 -1
  7. package/dist/src/tasks/audio-classification/data.d.ts.map +1 -1
  8. package/dist/src/tasks/audio-to-audio/data.d.ts.map +1 -1
  9. package/dist/src/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
  10. package/dist/src/tasks/document-question-answering/data.d.ts.map +1 -1
  11. package/dist/src/tasks/question-answering/data.d.ts.map +1 -1
  12. package/dist/src/tasks/text-classification/data.d.ts.map +1 -1
  13. package/dist/src/tasks/text-to-speech/data.d.ts.map +1 -1
  14. package/dist/src/tasks/token-classification/data.d.ts.map +1 -1
  15. package/dist/src/tasks/translation/data.d.ts.map +1 -1
  16. package/dist/src/tasks/zero-shot-classification/data.d.ts.map +1 -1
  17. package/package.json +1 -1
  18. package/src/model-libraries-snippets.ts +9 -0
  19. package/src/model-libraries.ts +7 -0
  20. package/src/tasks/audio-classification/data.ts +8 -4
  21. package/src/tasks/audio-to-audio/data.ts +5 -1
  22. package/src/tasks/automatic-speech-recognition/data.ts +6 -2
  23. package/src/tasks/document-question-answering/data.ts +7 -3
  24. package/src/tasks/fill-mask/data.ts +3 -3
  25. package/src/tasks/image-segmentation/data.ts +1 -1
  26. package/src/tasks/image-to-image/data.ts +1 -1
  27. package/src/tasks/image-to-text/data.ts +1 -1
  28. package/src/tasks/question-answering/data.ts +5 -1
  29. package/src/tasks/sentence-similarity/data.ts +3 -3
  30. package/src/tasks/summarization/data.ts +2 -2
  31. package/src/tasks/text-classification/data.ts +18 -6
  32. package/src/tasks/text-generation/data.ts +3 -3
  33. package/src/tasks/text-to-image/data.ts +1 -1
  34. package/src/tasks/text-to-speech/data.ts +7 -3
  35. package/src/tasks/token-classification/data.ts +11 -3
  36. package/src/tasks/translation/data.ts +9 -8
  37. package/src/tasks/video-classification/data.ts +3 -3
  38. package/src/tasks/visual-question-answering/data.ts +2 -2
  39. package/src/tasks/zero-shot-classification/data.ts +8 -4
  40. package/src/tasks/zero-shot-image-classification/data.ts +2 -2
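Most of the changes below keep the compiled bundles (dist/index.cjs, dist/index.js) in sync with their TypeScript sources: dataset and model ids move to their canonical namespaced form (e.g. "superb" → "s3prl/superb"), the recommended models and default widgetModels for many tasks are refreshed, and a YOLOv10 code snippet plus library entry is added. As a minimal sketch of how consumers read this task metadata (assuming the package's exported TASKS_DATA map, which is not itself part of this diff):

import { TASKS_DATA } from "@huggingface/tasks";

// Default widget model for a task; after this release, audio-classification
// resolves to "MIT/ast-finetuned-audioset-10-10-0.4593" (see the first hunk below).
const task = TASKS_DATA["audio-classification"];
console.log(task?.widgetModels);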
package/dist/index.cjs CHANGED
@@ -1429,7 +1429,11 @@ var taskData = {
   datasets: [
     {
       description: "A benchmark of 10 different audio tasks.",
-      id: "superb"
+      id: "s3prl/superb"
+    },
+    {
+      description: "A dataset of YouTube clips and their sound categories.",
+      id: "agkphysics/AudioSet"
     }
   ],
   demo: {
@@ -1475,11 +1479,11 @@ var taskData = {
   ],
   models: [
     {
-      description: "An easy-to-use model for Command Recognition.",
+      description: "An easy-to-use model for command recognition.",
       id: "speechbrain/google_speech_command_xvector"
     },
     {
-      description: "An Emotion Recognition model.",
+      description: "An emotion recognition model.",
       id: "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
     },
     {
@@ -1494,7 +1498,7 @@ var taskData = {
     }
   ],
   summary: "Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.",
-  widgetModels: ["facebook/mms-lid-126"],
+  widgetModels: ["MIT/ast-finetuned-audioset-10-10-0.4593"],
   youtubeId: "KWwzcmG98Ds"
 };
 var data_default = taskData;
@@ -1542,7 +1546,11 @@ var taskData2 = {
     },
     {
       description: "A speech enhancement model.",
-      id: "speechbrain/metricgan-plus-voicebank"
+      id: "ResembleAI/resemble-enhance"
+    },
+    {
+      description: "A model that can change the voice in a speech recording.",
+      id: "microsoft/speecht5_vc"
     }
   ],
   spaces: [
@@ -1569,8 +1577,8 @@ var taskData3 = {
       id: "mozilla-foundation/common_voice_17_0"
     },
     {
-      description: "An English dataset with 1,000 hours of data.",
-      id: "librispeech_asr"
+      description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
+      id: "parler-tts/mls_eng"
     },
     {
       description: "A multi-lingual audio dataset with 370K hours of audio.",
@@ -1615,6 +1623,10 @@ var taskData3 = {
     {
       description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
       id: "facebook/seamless-m4t-v2-large"
+    },
+    {
+      description: "Powerful speaker diarization model.",
+      id: "pyannote/speaker-diarization-3.1"
     }
   ],
   spaces: [
@@ -1681,11 +1693,15 @@ var taskData4 = {
   ],
   models: [
     {
-      description: "A LayoutLM model for the document QA task, fine-tuned on DocVQA and SQuAD2.0.",
+      description: "A robust document question answering model.",
       id: "impira/layoutlm-document-qa"
     },
     {
-      description: "A special model for OCR-free Document QA task.",
+      description: "A document question answering model specialized in invoices.",
+      id: "impira/layoutlm-invoices"
+    },
+    {
+      description: "A special model for OCR-free document question answering.",
       id: "microsoft/udop-large"
     },
     {
@@ -1708,7 +1724,7 @@ var taskData4 = {
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
-  widgetModels: ["impira/layoutlm-document-qa"],
+  widgetModels: ["impira/layoutlm-invoices"],
   youtubeId: ""
 };
 var data_default4 = taskData4;
@@ -1828,12 +1844,12 @@ var taskData6 = {
   ],
   models: [
     {
-      description: "A faster and smaller model than the famous BERT model.",
-      id: "distilbert-base-uncased"
+      description: "The famous BERT model.",
+      id: "google-bert/bert-base-uncased"
     },
     {
       description: "A multilingual model trained on 100 languages.",
-      id: "xlm-roberta-base"
+      id: "FacebookAI/xlm-roberta-base"
     }
   ],
   spaces: [],
@@ -2076,7 +2092,7 @@ var taskData9 = {
     }
   ],
   summary: "Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
-  widgetModels: ["lllyasviel/sd-controlnet-canny"],
+  widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
   youtubeId: ""
 };
 var data_default9 = taskData9;
@@ -2156,7 +2172,7 @@ var taskData10 = {
     }
   ],
   summary: "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
-  widgetModels: ["Salesforce/blip-image-captioning-base"],
+  widgetModels: ["Salesforce/blip-image-captioning-large"],
   youtubeId: ""
 };
 var data_default10 = taskData10;
@@ -2342,7 +2358,7 @@ var taskData12 = {
     }
   ],
   summary: "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.",
-  widgetModels: ["facebook/detr-resnet-50-panoptic"],
+  widgetModels: ["nvidia/segformer-b0-finetuned-ade-512-512"],
   youtubeId: "dKE8SIt9C-w"
 };
 var data_default12 = taskData12;
@@ -2682,7 +2698,11 @@ var taskData18 = {
       id: "deepset/roberta-base-squad2"
     },
     {
-      description: "A special model that can answer questions from tables!",
+      description: "Small yet robust model that can answer questions.",
+      id: "distilbert/distilbert-base-cased-distilled-squad"
+    },
+    {
+      description: "A special model that can answer questions from tables.",
       id: "google/tapas-base-finetuned-wtq"
     }
   ],
@@ -2765,8 +2785,8 @@ var taskData19 = {
       id: "sentence-transformers/all-mpnet-base-v2"
     },
     {
-      description: "A multilingual model trained for FAQ retrieval.",
-      id: "clips/mfaq"
+      description: "A multilingual robust sentence similarity model..",
+      id: "BAAI/bge-m3"
     }
   ],
   spaces: [
@@ -2788,7 +2808,7 @@ var taskData19 = {
     }
   ],
   summary: "Sentence Similarity is the task of determining how similar two texts are. Sentence similarity models convert input texts into vectors (embeddings) that capture semantic information and calculate how close (similar) they are between them. This task is particularly useful for information retrieval and clustering/grouping.",
-  widgetModels: ["sentence-transformers/all-MiniLM-L6-v2"],
+  widgetModels: ["BAAI/bge-small-en-v1.5"],
   youtubeId: "VCZq5AkbNEU"
 };
 var data_default19 = taskData19;
@@ -2835,7 +2855,7 @@ var taskData20 = {
     },
     {
       description: "A summarization model trained on medical articles.",
-      id: "google/bigbird-pegasus-large-pubmed"
+      id: "Falconsai/medical_summarization"
     }
   ],
   spaces: [
@@ -2857,7 +2877,7 @@ var taskData20 = {
     }
   ],
   summary: "Summarization is the task of producing a shorter version of a document while preserving its important information. Some models can extract text from the original input, while other models can generate entirely new text.",
-  widgetModels: ["sshleifer/distilbart-cnn-12-6"],
+  widgetModels: ["facebook/bart-large-cnn"],
   youtubeId: "yHnr5Dk2zCI"
 };
 var data_default20 = taskData20;
@@ -3128,7 +3148,7 @@ var taskData24 = {
     }
   ],
   summary: "Generates images from input text. These models can be used to generate and modify images based on text prompts.",
-  widgetModels: ["CompVis/stable-diffusion-v1-4"],
+  widgetModels: ["black-forest-labs/FLUX.1-dev"],
   youtubeId: ""
 };
 var data_default24 = taskData24;
@@ -3143,7 +3163,7 @@ var taskData25 = {
     },
     {
       description: "Multi-speaker English dataset.",
-      id: "LibriTTS"
+      id: "mythicinfinity/libritts_r"
     }
   ],
   demo: {
@@ -3170,11 +3190,15 @@ var taskData25 = {
   models: [
     {
       description: "A powerful TTS model.",
-      id: "suno/bark"
+      id: "parler-tts/parler-tts-large-v1"
     },
     {
       description: "A massively multi-lingual TTS model.",
-      id: "facebook/mms-tts"
+      id: "coqui/XTTS-v2"
+    },
+    {
+      description: "Robust TTS model.",
+      id: "metavoiceio/metavoice-1B-v0.1"
     },
     {
       description: "A prompt based, powerful TTS model.",
@@ -3206,11 +3230,11 @@ var taskData26 = {
   datasets: [
     {
       description: "A widely used dataset useful to benchmark named entity recognition models.",
-      id: "conll2003"
+      id: "eriktks/conll2003"
     },
     {
       description: "A multilingual dataset of Wikipedia articles annotated for named entity recognition in over 150 different languages.",
-      id: "wikiann"
+      id: "unimelb-nlp/wikiann"
     }
   ],
   demo: {
@@ -3263,6 +3287,14 @@ var taskData26 = {
       description: "A robust performance model to identify people, locations, organizations and names of miscellaneous entities.",
       id: "dslim/bert-base-NER"
     },
+    {
+      description: "A strong model to identify people, locations, organizations and names in multiple languages.",
+      id: "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
+    },
+    {
+      description: "A token classification model specialized on medical entity recognition.",
+      id: "blaze999/Medical-NER"
+    },
     {
       description: "Flair models are typically the state of the art in named entity recognition tasks.",
       id: "flair/ner-english"
@@ -3275,7 +3307,7 @@ var taskData26 = {
     }
   ],
   summary: "Token classification is a natural language understanding task in which a label is assigned to some tokens in a text. Some popular token classification subtasks are Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. NER models could be trained to identify specific entities in a text, such as dates, individuals and places; and PoS tagging would identify, for example, which words in a text are verbs, nouns, and punctuation marks.",
-  widgetModels: ["dslim/bert-base-NER"],
+  widgetModels: ["FacebookAI/xlm-roberta-large-finetuned-conll03-english"],
   youtubeId: "wVHdVlPScxA"
 };
 var data_default26 = taskData26;
@@ -3286,11 +3318,11 @@ var taskData27 = {
   datasets: [
     {
       description: "A dataset of copyright-free books translated into 16 different languages.",
-      id: "opus_books"
+      id: "Helsinki-NLP/opus_books"
     },
     {
      description: "An example of translation between programming languages. This dataset consists of functions in Java and C#.",
-      id: "code_x_glue_cc_code_to_code_trans"
+      id: "google/code_x_glue_cc_code_to_code_trans"
     }
   ],
   demo: {
@@ -3321,12 +3353,12 @@ var taskData27 = {
   ],
   models: [
     {
-      description: "A model that translates from English to French.",
-      id: "Helsinki-NLP/opus-mt-en-fr"
+      description: "Very powerful model that can translate many languages between each other, especially low-resource languages.",
+      id: "facebook/nllb-200-1.3B"
     },
     {
       description: "A general-purpose Transformer that can be used to translate from English to German, French, or Romanian.",
-      id: "t5-base"
+      id: "google-t5/t5-base"
     }
   ],
   spaces: [
@@ -3335,12 +3367,12 @@ var taskData27 = {
       id: "Iker/Translate-100-languages"
     },
     {
-      description: "An application that can translate between English, Spanish and Hindi.",
-      id: "EuroPython2022/Translate-with-Bloom"
+      description: "An application that can translate between many languages.",
+      id: "Geonmo/nllb-translation-demo"
     }
   ],
   summary: "Translation is the task of converting text from one language to another.",
-  widgetModels: ["t5-small"],
+  widgetModels: ["facebook/mbart-large-50-many-to-many-mmt"],
   youtubeId: "1JvfrvZgi6c"
 };
 var data_default27 = taskData27;
@@ -3350,11 +3382,11 @@ var taskData28 = {
   datasets: [
     {
       description: "A widely used dataset used to benchmark multiple variants of text classification.",
-      id: "glue"
+      id: "nyu-mll/glue"
    },
     {
       description: "A text classification dataset used to benchmark natural language inference models",
-      id: "snli"
+      id: "stanfordnlp/snli"
     }
   ],
   demo: {
@@ -3406,11 +3438,23 @@ var taskData28 = {
   models: [
     {
       description: "A robust model trained for sentiment analysis.",
-      id: "distilbert-base-uncased-finetuned-sst-2-english"
+      id: "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
+    },
+    {
+      description: "A sentiment analysis model specialized in financial sentiment.",
+      id: "ProsusAI/finbert"
+    },
+    {
+      description: "A sentiment analysis model specialized in analyzing tweets.",
+      id: "cardiffnlp/twitter-roberta-base-sentiment-latest"
+    },
+    {
+      description: "A model that can classify languages.",
+      id: "papluca/xlm-roberta-base-language-detection"
     },
     {
-      description: "Multi-genre natural language inference model.",
-      id: "roberta-large-mnli"
+      description: "A model that can classify text generation attacks.",
+      id: "meta-llama/Prompt-Guard-86M"
     }
   ],
   spaces: [
@@ -3428,7 +3472,7 @@ var taskData28 = {
     }
   ],
   summary: "Text Classification is the task of assigning a label or class to a given text. Some use cases are sentiment analysis, natural language inference, and assessing grammatical correctness.",
-  widgetModels: ["distilbert-base-uncased-finetuned-sst-2-english"],
+  widgetModels: ["distilbert/distilbert-base-uncased-finetuned-sst-2-english"],
   youtubeId: "leNG9fN9FQU"
 };
 var data_default28 = taskData28;
@@ -3527,8 +3571,8 @@ var taskData29 = {
       id: "HuggingFaceH4/zephyr-chat"
     },
     {
-      description: "An text generation application that combines OpenAI and Hugging Face models.",
-      id: "microsoft/HuggingGPT"
+      description: "A leaderboard that ranks text generation models based on blind votes from people.",
+      id: "lmsys/chatbot-arena-leaderboard"
     },
     {
       description: "An chatbot to converse with a very powerful text generation model.",
@@ -3536,7 +3580,7 @@ var taskData29 = {
     }
   ],
   summary: "Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
-  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
+  widgetModels: ["mistralai/Mistral-Nemo-Instruct-2407"],
   youtubeId: "e9gNEAlsOvU"
 };
 var data_default29 = taskData29;
@@ -3758,12 +3802,12 @@ var taskData32 = {
   models: [
     {
       // TO DO: write description
-      description: "Strong Video Classification model trained on the Kinects 400 dataset.",
-      id: "MCG-NJU/videomae-base-finetuned-kinetics"
+      description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
+      id: "google/vivit-b-16x2-kinetics400"
     },
     {
       // TO DO: write description
-      description: "Strong Video Classification model trained on the Kinects 400 dataset.",
+      description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
       id: "microsoft/xclip-base-patch32"
     }
   ],
@@ -3792,7 +3836,7 @@ var taskData33 = {
     },
     {
       description: "A dataset to benchmark visual reasoning based on text in images.",
-      id: "textvqa"
+      id: "facebook/textvqa"
     }
   ],
   demo: {
@@ -3845,7 +3889,7 @@ var taskData33 = {
     },
     {
       description: "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
-      id: "google/matcha-base "
+      id: "google/matcha-base"
     },
     {
       description: "A strong visual question answering that answers questions from book covers.",
@@ -3881,15 +3925,15 @@ var taskData34 = {
   datasets: [
     {
       description: "A widely used dataset used to benchmark multiple variants of text classification.",
-      id: "glue"
+      id: "nyu-mll/glue"
     },
     {
       description: "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
-      id: "MultiNLI"
+      id: "nyu-mll/multi_nli"
     },
     {
       description: "FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
-      id: "FEVER"
+      id: "fever/fever"
     }
   ],
   demo: {
@@ -3928,8 +3972,12 @@ var taskData34 = {
   metrics: [],
   models: [
     {
-      description: "Powerful zero-shot text classification model",
+      description: "Powerful zero-shot text classification model.",
       id: "facebook/bart-large-mnli"
+    },
+    {
+      description: "Powerful zero-shot multilingual text classification model that can accomplish multiple tasks.",
+      id: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
     }
   ],
   spaces: [],
@@ -3992,7 +4040,7 @@ var taskData35 = {
     },
     {
       description: "Strong zero-shot image classification model.",
-      id: "google/siglip-base-patch16-224"
+      id: "google/siglip-so400m-patch14-224"
     },
     {
       description: "Small yet powerful zero-shot image classification model that can run on edge devices.",
@@ -4014,7 +4062,7 @@ var taskData35 = {
     }
   ],
   summary: "Zero-shot image classification is the task of classifying previously unseen classes during training of a model.",
-  widgetModels: ["openai/clip-vit-large-patch14-336"],
+  widgetModels: ["google/siglip-so400m-patch14-224"],
   youtubeId: ""
 };
 var data_default35 = taskData35;
@@ -5128,6 +5176,14 @@ wavs = chat.infer(texts, )
 
 torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)`
 ];
+var yolov10 = (model) => [
+  `from ultralytics import YOLOv10
+
+model = YOLOv10.from_pretrained("${model.id}")
+source = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+model.predict(source=source, save=True)
+`
+];
 var birefnet = (model) => [
   `# Option 1: use with transformers
 
@@ -5814,6 +5870,13 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://github.com/jasonppy/VoiceCraft",
     snippets: voicecraft
   },
+  yolov10: {
+    prettyLabel: "YOLOv10",
+    repoName: "yolov10",
+    repoUrl: "https://github.com/THU-MIG/yolov10",
+    docsUrl: "https://github.com/THU-MIG/yolov10",
+    snippets: yolov10
+  },
   whisperkit: {
     prettyLabel: "WhisperKit",
     repoName: "WhisperKit",
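The main functional addition in this release is the yolov10 snippet generator and its MODEL_LIBRARIES_UI_ELEMENTS entry shown above. A minimal sketch of how the new entry might be consumed, assuming MODEL_LIBRARIES_UI_ELEMENTS and the ModelData type are exported from the package root, and using a hypothetical model id:

import { MODEL_LIBRARIES_UI_ELEMENTS, type ModelData } from "@huggingface/tasks";

const entry = MODEL_LIBRARIES_UI_ELEMENTS.yolov10;
console.log(entry.prettyLabel); // "YOLOv10"

// The generator above only reads model.id; "my-org/my-yolov10" is a
// hypothetical id used purely for illustration.
const snippets = entry.snippets?.({ id: "my-org/my-yolov10" } as ModelData);
console.log(snippets?.[0]); // Python snippet starting with "from ultralytics import YOLOv10"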