@huggingface/tasks 0.12.1 → 0.12.2

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the versions exactly as they appear in their public registry.
Files changed (40)
  1. package/dist/index.cjs +120 -57
  2. package/dist/index.js +120 -57
  3. package/dist/src/model-libraries-snippets.d.ts +1 -0
  4. package/dist/src/model-libraries-snippets.d.ts.map +1 -1
  5. package/dist/src/model-libraries.d.ts +9 -2
  6. package/dist/src/model-libraries.d.ts.map +1 -1
  7. package/dist/src/tasks/audio-classification/data.d.ts.map +1 -1
  8. package/dist/src/tasks/audio-to-audio/data.d.ts.map +1 -1
  9. package/dist/src/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
  10. package/dist/src/tasks/document-question-answering/data.d.ts.map +1 -1
  11. package/dist/src/tasks/question-answering/data.d.ts.map +1 -1
  12. package/dist/src/tasks/text-classification/data.d.ts.map +1 -1
  13. package/dist/src/tasks/text-to-speech/data.d.ts.map +1 -1
  14. package/dist/src/tasks/token-classification/data.d.ts.map +1 -1
  15. package/dist/src/tasks/translation/data.d.ts.map +1 -1
  16. package/dist/src/tasks/zero-shot-classification/data.d.ts.map +1 -1
  17. package/package.json +1 -1
  18. package/src/model-libraries-snippets.ts +9 -0
  19. package/src/model-libraries.ts +7 -0
  20. package/src/tasks/audio-classification/data.ts +8 -4
  21. package/src/tasks/audio-to-audio/data.ts +5 -1
  22. package/src/tasks/automatic-speech-recognition/data.ts +6 -2
  23. package/src/tasks/document-question-answering/data.ts +7 -3
  24. package/src/tasks/fill-mask/data.ts +3 -3
  25. package/src/tasks/image-segmentation/data.ts +1 -1
  26. package/src/tasks/image-to-image/data.ts +1 -1
  27. package/src/tasks/image-to-text/data.ts +1 -1
  28. package/src/tasks/question-answering/data.ts +5 -1
  29. package/src/tasks/sentence-similarity/data.ts +3 -3
  30. package/src/tasks/summarization/data.ts +2 -2
  31. package/src/tasks/text-classification/data.ts +18 -6
  32. package/src/tasks/text-generation/data.ts +3 -3
  33. package/src/tasks/text-to-image/data.ts +1 -1
  34. package/src/tasks/text-to-speech/data.ts +7 -3
  35. package/src/tasks/token-classification/data.ts +11 -3
  36. package/src/tasks/translation/data.ts +9 -8
  37. package/src/tasks/video-classification/data.ts +3 -3
  38. package/src/tasks/visual-question-answering/data.ts +2 -2
  39. package/src/tasks/zero-shot-classification/data.ts +8 -4
  40. package/src/tasks/zero-shot-image-classification/data.ts +2 -2
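Most of the changes below keep the compiled bundles (dist/index.cjs, dist/index.js) in sync with their TypeScript sources: dataset and model ids move to their canonical namespaced form (e.g. "superb" → "s3prl/superb"), the recommended models and default widgetModels for many tasks are refreshed, and a YOLOv10 code snippet plus library entry is added. As a minimal sketch of how consumers read this task metadata (assuming the package's exported TASKS_DATA map, which is not itself part of this diff):

import { TASKS_DATA } from "@huggingface/tasks";

// Default widget model for a task; after this release, audio-classification
// resolves to "MIT/ast-finetuned-audioset-10-10-0.4593" (see the first hunk below).
const task = TASKS_DATA["audio-classification"];
console.log(task?.widgetModels);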
package/dist/index.cjs CHANGED
@@ -1429,7 +1429,11 @@ var taskData = {
   datasets: [
     {
       description: "A benchmark of 10 different audio tasks.",
-      id: "superb"
+      id: "s3prl/superb"
+    },
+    {
+      description: "A dataset of YouTube clips and their sound categories.",
+      id: "agkphysics/AudioSet"
     }
   ],
   demo: {
@@ -1475,11 +1479,11 @@ var taskData = {
   ],
   models: [
     {
-      description: "An easy-to-use model for Command Recognition.",
+      description: "An easy-to-use model for command recognition.",
       id: "speechbrain/google_speech_command_xvector"
     },
     {
-      description: "An Emotion Recognition model.",
+      description: "An emotion recognition model.",
       id: "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
     },
     {
@@ -1494,7 +1498,7 @@ var taskData = {
     }
   ],
   summary: "Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.",
-  widgetModels: ["facebook/mms-lid-126"],
+  widgetModels: ["MIT/ast-finetuned-audioset-10-10-0.4593"],
   youtubeId: "KWwzcmG98Ds"
 };
 var data_default = taskData;
@@ -1542,7 +1546,11 @@ var taskData2 = {
     },
     {
       description: "A speech enhancement model.",
-      id: "speechbrain/metricgan-plus-voicebank"
+      id: "ResembleAI/resemble-enhance"
+    },
+    {
+      description: "A model that can change the voice in a speech recording.",
+      id: "microsoft/speecht5_vc"
     }
   ],
   spaces: [
@@ -1569,8 +1577,8 @@ var taskData3 = {
       id: "mozilla-foundation/common_voice_17_0"
     },
     {
-      description: "An English dataset with 1,000 hours of data.",
-      id: "librispeech_asr"
+      description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
+      id: "parler-tts/mls_eng"
     },
     {
       description: "A multi-lingual audio dataset with 370K hours of audio.",
@@ -1615,6 +1623,10 @@ var taskData3 = {
     {
       description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
       id: "facebook/seamless-m4t-v2-large"
+    },
+    {
+      description: "Powerful speaker diarization model.",
+      id: "pyannote/speaker-diarization-3.1"
     }
   ],
   spaces: [
@@ -1681,11 +1693,15 @@ var taskData4 = {
   ],
   models: [
     {
-      description: "A LayoutLM model for the document QA task, fine-tuned on DocVQA and SQuAD2.0.",
+      description: "A robust document question answering model.",
       id: "impira/layoutlm-document-qa"
     },
     {
-      description: "A special model for OCR-free Document QA task.",
+      description: "A document question answering model specialized in invoices.",
+      id: "impira/layoutlm-invoices"
+    },
+    {
+      description: "A special model for OCR-free document question answering.",
       id: "microsoft/udop-large"
     },
     {
@@ -1708,7 +1724,7 @@ var taskData4 = {
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
-  widgetModels: ["impira/layoutlm-document-qa"],
+  widgetModels: ["impira/layoutlm-invoices"],
   youtubeId: ""
 };
 var data_default4 = taskData4;
@@ -1828,12 +1844,12 @@ var taskData6 = {
   ],
   models: [
     {
-      description: "A faster and smaller model than the famous BERT model.",
-      id: "distilbert-base-uncased"
+      description: "The famous BERT model.",
+      id: "google-bert/bert-base-uncased"
     },
     {
       description: "A multilingual model trained on 100 languages.",
-      id: "xlm-roberta-base"
+      id: "FacebookAI/xlm-roberta-base"
     }
   ],
   spaces: [],
@@ -2076,7 +2092,7 @@ var taskData9 = {
     }
   ],
   summary: "Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
-  widgetModels: ["lllyasviel/sd-controlnet-canny"],
+  widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
   youtubeId: ""
 };
 var data_default9 = taskData9;
@@ -2156,7 +2172,7 @@ var taskData10 = {
     }
   ],
   summary: "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
-  widgetModels: ["Salesforce/blip-image-captioning-base"],
+  widgetModels: ["Salesforce/blip-image-captioning-large"],
   youtubeId: ""
 };
 var data_default10 = taskData10;
@@ -2342,7 +2358,7 @@ var taskData12 = {
     }
   ],
   summary: "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.",
-  widgetModels: ["facebook/detr-resnet-50-panoptic"],
+  widgetModels: ["nvidia/segformer-b0-finetuned-ade-512-512"],
   youtubeId: "dKE8SIt9C-w"
 };
 var data_default12 = taskData12;
@@ -2682,7 +2698,11 @@ var taskData18 = {
       id: "deepset/roberta-base-squad2"
     },
     {
-      description: "A special model that can answer questions from tables!",
+      description: "Small yet robust model that can answer questions.",
+      id: "distilbert/distilbert-base-cased-distilled-squad"
+    },
+    {
+      description: "A special model that can answer questions from tables.",
       id: "google/tapas-base-finetuned-wtq"
     }
   ],
@@ -2765,8 +2785,8 @@ var taskData19 = {
       id: "sentence-transformers/all-mpnet-base-v2"
     },
     {
-      description: "A multilingual model trained for FAQ retrieval.",
-      id: "clips/mfaq"
+      description: "A multilingual robust sentence similarity model..",
+      id: "BAAI/bge-m3"
     }
   ],
   spaces: [
@@ -2788,7 +2808,7 @@ var taskData19 = {
     }
   ],
   summary: "Sentence Similarity is the task of determining how similar two texts are. Sentence similarity models convert input texts into vectors (embeddings) that capture semantic information and calculate how close (similar) they are between them. This task is particularly useful for information retrieval and clustering/grouping.",
-  widgetModels: ["sentence-transformers/all-MiniLM-L6-v2"],
+  widgetModels: ["BAAI/bge-small-en-v1.5"],
   youtubeId: "VCZq5AkbNEU"
 };
 var data_default19 = taskData19;
@@ -2835,7 +2855,7 @@ var taskData20 = {
     },
     {
       description: "A summarization model trained on medical articles.",
-      id: "google/bigbird-pegasus-large-pubmed"
+      id: "Falconsai/medical_summarization"
     }
   ],
   spaces: [
@@ -2857,7 +2877,7 @@ var taskData20 = {
     }
   ],
   summary: "Summarization is the task of producing a shorter version of a document while preserving its important information. Some models can extract text from the original input, while other models can generate entirely new text.",
-  widgetModels: ["sshleifer/distilbart-cnn-12-6"],
+  widgetModels: ["facebook/bart-large-cnn"],
   youtubeId: "yHnr5Dk2zCI"
 };
 var data_default20 = taskData20;
@@ -3128,7 +3148,7 @@ var taskData24 = {
     }
   ],
   summary: "Generates images from input text. These models can be used to generate and modify images based on text prompts.",
-  widgetModels: ["CompVis/stable-diffusion-v1-4"],
+  widgetModels: ["black-forest-labs/FLUX.1-dev"],
   youtubeId: ""
 };
 var data_default24 = taskData24;
@@ -3143,7 +3163,7 @@ var taskData25 = {
     },
     {
       description: "Multi-speaker English dataset.",
-      id: "LibriTTS"
+      id: "mythicinfinity/libritts_r"
     }
   ],
   demo: {
@@ -3170,11 +3190,15 @@ var taskData25 = {
   models: [
     {
       description: "A powerful TTS model.",
-      id: "suno/bark"
+      id: "parler-tts/parler-tts-large-v1"
     },
     {
       description: "A massively multi-lingual TTS model.",
-      id: "facebook/mms-tts"
+      id: "coqui/XTTS-v2"
+    },
+    {
+      description: "Robust TTS model.",
+      id: "metavoiceio/metavoice-1B-v0.1"
     },
     {
       description: "A prompt based, powerful TTS model.",
@@ -3206,11 +3230,11 @@ var taskData26 = {
   datasets: [
     {
       description: "A widely used dataset useful to benchmark named entity recognition models.",
-      id: "conll2003"
+      id: "eriktks/conll2003"
     },
     {
       description: "A multilingual dataset of Wikipedia articles annotated for named entity recognition in over 150 different languages.",
-      id: "wikiann"
+      id: "unimelb-nlp/wikiann"
     }
   ],
   demo: {
@@ -3263,6 +3287,14 @@ var taskData26 = {
       description: "A robust performance model to identify people, locations, organizations and names of miscellaneous entities.",
       id: "dslim/bert-base-NER"
     },
+    {
+      description: "A strong model to identify people, locations, organizations and names in multiple languages.",
+      id: "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
+    },
+    {
+      description: "A token classification model specialized on medical entity recognition.",
+      id: "blaze999/Medical-NER"
+    },
     {
       description: "Flair models are typically the state of the art in named entity recognition tasks.",
       id: "flair/ner-english"
@@ -3275,7 +3307,7 @@ var taskData26 = {
     }
   ],
   summary: "Token classification is a natural language understanding task in which a label is assigned to some tokens in a text. Some popular token classification subtasks are Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. NER models could be trained to identify specific entities in a text, such as dates, individuals and places; and PoS tagging would identify, for example, which words in a text are verbs, nouns, and punctuation marks.",
-  widgetModels: ["dslim/bert-base-NER"],
+  widgetModels: ["FacebookAI/xlm-roberta-large-finetuned-conll03-english"],
   youtubeId: "wVHdVlPScxA"
 };
 var data_default26 = taskData26;
@@ -3286,11 +3318,11 @@ var taskData27 = {
   datasets: [
     {
       description: "A dataset of copyright-free books translated into 16 different languages.",
-      id: "opus_books"
+      id: "Helsinki-NLP/opus_books"
     },
     {
      description: "An example of translation between programming languages. This dataset consists of functions in Java and C#.",
-      id: "code_x_glue_cc_code_to_code_trans"
+      id: "google/code_x_glue_cc_code_to_code_trans"
     }
   ],
   demo: {
@@ -3321,12 +3353,12 @@ var taskData27 = {
   ],
   models: [
     {
-      description: "A model that translates from English to French.",
-      id: "Helsinki-NLP/opus-mt-en-fr"
+      description: "Very powerful model that can translate many languages between each other, especially low-resource languages.",
+      id: "facebook/nllb-200-1.3B"
     },
     {
       description: "A general-purpose Transformer that can be used to translate from English to German, French, or Romanian.",
-      id: "t5-base"
+      id: "google-t5/t5-base"
     }
   ],
   spaces: [
@@ -3335,12 +3367,12 @@ var taskData27 = {
       id: "Iker/Translate-100-languages"
     },
     {
-      description: "An application that can translate between English, Spanish and Hindi.",
-      id: "EuroPython2022/Translate-with-Bloom"
+      description: "An application that can translate between many languages.",
+      id: "Geonmo/nllb-translation-demo"
     }
   ],
   summary: "Translation is the task of converting text from one language to another.",
-  widgetModels: ["t5-small"],
+  widgetModels: ["facebook/mbart-large-50-many-to-many-mmt"],
   youtubeId: "1JvfrvZgi6c"
 };
 var data_default27 = taskData27;
@@ -3350,11 +3382,11 @@ var taskData28 = {
   datasets: [
     {
       description: "A widely used dataset used to benchmark multiple variants of text classification.",
-      id: "glue"
+      id: "nyu-mll/glue"
    },
     {
       description: "A text classification dataset used to benchmark natural language inference models",
-      id: "snli"
+      id: "stanfordnlp/snli"
     }
   ],
   demo: {
@@ -3406,11 +3438,23 @@ var taskData28 = {
   models: [
     {
       description: "A robust model trained for sentiment analysis.",
-      id: "distilbert-base-uncased-finetuned-sst-2-english"
+      id: "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
+    },
+    {
+      description: "A sentiment analysis model specialized in financial sentiment.",
+      id: "ProsusAI/finbert"
+    },
+    {
+      description: "A sentiment analysis model specialized in analyzing tweets.",
+      id: "cardiffnlp/twitter-roberta-base-sentiment-latest"
+    },
+    {
+      description: "A model that can classify languages.",
+      id: "papluca/xlm-roberta-base-language-detection"
     },
     {
-      description: "Multi-genre natural language inference model.",
-      id: "roberta-large-mnli"
+      description: "A model that can classify text generation attacks.",
+      id: "meta-llama/Prompt-Guard-86M"
     }
   ],
   spaces: [
@@ -3428,7 +3472,7 @@ var taskData28 = {
     }
   ],
   summary: "Text Classification is the task of assigning a label or class to a given text. Some use cases are sentiment analysis, natural language inference, and assessing grammatical correctness.",
-  widgetModels: ["distilbert-base-uncased-finetuned-sst-2-english"],
+  widgetModels: ["distilbert/distilbert-base-uncased-finetuned-sst-2-english"],
   youtubeId: "leNG9fN9FQU"
 };
 var data_default28 = taskData28;
@@ -3527,8 +3571,8 @@ var taskData29 = {
       id: "HuggingFaceH4/zephyr-chat"
     },
     {
-      description: "An text generation application that combines OpenAI and Hugging Face models.",
-      id: "microsoft/HuggingGPT"
+      description: "A leaderboard that ranks text generation models based on blind votes from people.",
+      id: "lmsys/chatbot-arena-leaderboard"
     },
     {
       description: "An chatbot to converse with a very powerful text generation model.",
@@ -3536,7 +3580,7 @@ var taskData29 = {
     }
   ],
   summary: "Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
-  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
+  widgetModels: ["mistralai/Mistral-Nemo-Instruct-2407"],
   youtubeId: "e9gNEAlsOvU"
 };
 var data_default29 = taskData29;
@@ -3758,12 +3802,12 @@ var taskData32 = {
   models: [
     {
       // TO DO: write description
-      description: "Strong Video Classification model trained on the Kinects 400 dataset.",
-      id: "MCG-NJU/videomae-base-finetuned-kinetics"
+      description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
+      id: "google/vivit-b-16x2-kinetics400"
     },
     {
       // TO DO: write description
-      description: "Strong Video Classification model trained on the Kinects 400 dataset.",
+      description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
       id: "microsoft/xclip-base-patch32"
     }
   ],
@@ -3792,7 +3836,7 @@ var taskData33 = {
     },
     {
       description: "A dataset to benchmark visual reasoning based on text in images.",
-      id: "textvqa"
+      id: "facebook/textvqa"
     }
   ],
   demo: {
@@ -3845,7 +3889,7 @@ var taskData33 = {
     },
     {
       description: "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
-      id: "google/matcha-base "
+      id: "google/matcha-base"
     },
     {
       description: "A strong visual question answering that answers questions from book covers.",
@@ -3881,15 +3925,15 @@ var taskData34 = {
   datasets: [
     {
       description: "A widely used dataset used to benchmark multiple variants of text classification.",
-      id: "glue"
+      id: "nyu-mll/glue"
     },
     {
       description: "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
-      id: "MultiNLI"
+      id: "nyu-mll/multi_nli"
     },
     {
       description: "FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
-      id: "FEVER"
+      id: "fever/fever"
     }
   ],
   demo: {
@@ -3928,8 +3972,12 @@ var taskData34 = {
   metrics: [],
   models: [
     {
-      description: "Powerful zero-shot text classification model",
+      description: "Powerful zero-shot text classification model.",
       id: "facebook/bart-large-mnli"
+    },
+    {
+      description: "Powerful zero-shot multilingual text classification model that can accomplish multiple tasks.",
+      id: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
     }
   ],
   spaces: [],
@@ -3992,7 +4040,7 @@ var taskData35 = {
     },
     {
       description: "Strong zero-shot image classification model.",
-      id: "google/siglip-base-patch16-224"
+      id: "google/siglip-so400m-patch14-224"
     },
     {
       description: "Small yet powerful zero-shot image classification model that can run on edge devices.",
@@ -4014,7 +4062,7 @@ var taskData35 = {
     }
   ],
   summary: "Zero-shot image classification is the task of classifying previously unseen classes during training of a model.",
-  widgetModels: ["openai/clip-vit-large-patch14-336"],
+  widgetModels: ["google/siglip-so400m-patch14-224"],
   youtubeId: ""
 };
 var data_default35 = taskData35;
@@ -5128,6 +5176,14 @@ wavs = chat.infer(texts, )
 
 torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)`
 ];
+var yolov10 = (model) => [
+  `from ultralytics import YOLOv10
+
+model = YOLOv10.from_pretrained("${model.id}")
+source = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+model.predict(source=source, save=True)
+`
+];
 var birefnet = (model) => [
   `# Option 1: use with transformers
 
@@ -5814,6 +5870,13 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://github.com/jasonppy/VoiceCraft",
     snippets: voicecraft
   },
+  yolov10: {
+    prettyLabel: "YOLOv10",
+    repoName: "yolov10",
+    repoUrl: "https://github.com/THU-MIG/yolov10",
+    docsUrl: "https://github.com/THU-MIG/yolov10",
+    snippets: yolov10
+  },
   whisperkit: {
     prettyLabel: "WhisperKit",
     repoName: "WhisperKit",
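The main functional addition in this release is the yolov10 snippet generator and its MODEL_LIBRARIES_UI_ELEMENTS entry shown above. A minimal sketch of how the new entry might be consumed, assuming MODEL_LIBRARIES_UI_ELEMENTS and the ModelData type are exported from the package root, and using a hypothetical model id:

import { MODEL_LIBRARIES_UI_ELEMENTS, type ModelData } from "@huggingface/tasks";

const entry = MODEL_LIBRARIES_UI_ELEMENTS.yolov10;
console.log(entry.prettyLabel); // "YOLOv10"

// The generator above only reads model.id; "my-org/my-yolov10" is a
// hypothetical id used purely for illustration.
const snippets = entry.snippets?.({ id: "my-org/my-yolov10" } as ModelData);
console.log(snippets?.[0]); // Python snippet starting with "from ultralytics import YOLOv10"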