@huggingface/tasks 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/assets/audio-classification/audio.wav +0 -0
  2. package/assets/audio-to-audio/input.wav +0 -0
  3. package/assets/audio-to-audio/label-0.wav +0 -0
  4. package/assets/audio-to-audio/label-1.wav +0 -0
  5. package/assets/automatic-speech-recognition/input.flac +0 -0
  6. package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
  7. package/assets/contribution-guide/anatomy.png +0 -0
  8. package/assets/contribution-guide/libraries.png +0 -0
  9. package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
  10. package/assets/depth-estimation/depth-estimation-output.png +0 -0
  11. package/assets/document-question-answering/document-question-answering-input.png +0 -0
  12. package/assets/image-classification/image-classification-input.jpeg +0 -0
  13. package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
  14. package/assets/image-segmentation/image-segmentation-output.png +0 -0
  15. package/assets/image-to-image/image-to-image-input.jpeg +0 -0
  16. package/assets/image-to-image/image-to-image-output.png +0 -0
  17. package/assets/image-to-image/pix2pix_examples.jpg +0 -0
  18. package/assets/image-to-text/savanna.jpg +0 -0
  19. package/assets/object-detection/object-detection-input.jpg +0 -0
  20. package/assets/object-detection/object-detection-output.jpg +0 -0
  21. package/assets/table-question-answering/tableQA.jpg +0 -0
  22. package/assets/text-to-image/image.jpeg +0 -0
  23. package/assets/text-to-speech/audio.wav +0 -0
  24. package/assets/text-to-video/text-to-video-output.gif +0 -0
  25. package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
  26. package/assets/video-classification/video-classification-input.gif +0 -0
  27. package/assets/visual-question-answering/elephant.jpeg +0 -0
  28. package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
  29. package/dist/index.cjs +3105 -0
  30. package/dist/index.d.cts +145 -0
  31. package/dist/index.d.ts +145 -0
  32. package/dist/index.js +3079 -0
  33. package/package.json +35 -0
  34. package/src/Types.ts +58 -0
  35. package/src/audio-classification/about.md +85 -0
  36. package/src/audio-classification/data.ts +77 -0
  37. package/src/audio-to-audio/about.md +55 -0
  38. package/src/audio-to-audio/data.ts +63 -0
  39. package/src/automatic-speech-recognition/about.md +86 -0
  40. package/src/automatic-speech-recognition/data.ts +77 -0
  41. package/src/const.ts +51 -0
  42. package/src/conversational/about.md +50 -0
  43. package/src/conversational/data.ts +62 -0
  44. package/src/depth-estimation/about.md +38 -0
  45. package/src/depth-estimation/data.ts +52 -0
  46. package/src/document-question-answering/about.md +54 -0
  47. package/src/document-question-answering/data.ts +67 -0
  48. package/src/feature-extraction/about.md +35 -0
  49. package/src/feature-extraction/data.ts +57 -0
  50. package/src/fill-mask/about.md +51 -0
  51. package/src/fill-mask/data.ts +77 -0
  52. package/src/image-classification/about.md +48 -0
  53. package/src/image-classification/data.ts +88 -0
  54. package/src/image-segmentation/about.md +63 -0
  55. package/src/image-segmentation/data.ts +96 -0
  56. package/src/image-to-image/about.md +81 -0
  57. package/src/image-to-image/data.ts +97 -0
  58. package/src/image-to-text/about.md +58 -0
  59. package/src/image-to-text/data.ts +87 -0
  60. package/src/index.ts +2 -0
  61. package/src/object-detection/about.md +36 -0
  62. package/src/object-detection/data.ts +73 -0
  63. package/src/placeholder/about.md +15 -0
  64. package/src/placeholder/data.ts +18 -0
  65. package/src/question-answering/about.md +56 -0
  66. package/src/question-answering/data.ts +69 -0
  67. package/src/reinforcement-learning/about.md +176 -0
  68. package/src/reinforcement-learning/data.ts +78 -0
  69. package/src/sentence-similarity/about.md +97 -0
  70. package/src/sentence-similarity/data.ts +100 -0
  71. package/src/summarization/about.md +57 -0
  72. package/src/summarization/data.ts +72 -0
  73. package/src/table-question-answering/about.md +43 -0
  74. package/src/table-question-answering/data.ts +63 -0
  75. package/src/tabular-classification/about.md +67 -0
  76. package/src/tabular-classification/data.ts +69 -0
  77. package/src/tabular-regression/about.md +91 -0
  78. package/src/tabular-regression/data.ts +58 -0
  79. package/src/tasksData.ts +104 -0
  80. package/src/text-classification/about.md +171 -0
  81. package/src/text-classification/data.ts +90 -0
  82. package/src/text-generation/about.md +128 -0
  83. package/src/text-generation/data.ts +124 -0
  84. package/src/text-to-image/about.md +65 -0
  85. package/src/text-to-image/data.ts +88 -0
  86. package/src/text-to-speech/about.md +63 -0
  87. package/src/text-to-speech/data.ts +70 -0
  88. package/src/text-to-video/about.md +36 -0
  89. package/src/text-to-video/data.ts +97 -0
  90. package/src/token-classification/about.md +78 -0
  91. package/src/token-classification/data.ts +83 -0
  92. package/src/translation/about.md +65 -0
  93. package/src/translation/data.ts +68 -0
  94. package/src/unconditional-image-generation/about.md +45 -0
  95. package/src/unconditional-image-generation/data.ts +66 -0
  96. package/src/video-classification/about.md +53 -0
  97. package/src/video-classification/data.ts +84 -0
  98. package/src/visual-question-answering/about.md +43 -0
  99. package/src/visual-question-answering/data.ts +90 -0
  100. package/src/zero-shot-classification/about.md +39 -0
  101. package/src/zero-shot-classification/data.ts +66 -0
  102. package/src/zero-shot-image-classification/about.md +68 -0
  103. package/src/zero-shot-image-classification/data.ts +79 -0
@@ -0,0 +1,53 @@
+ ## Use Cases
+ Video classification models can be used to categorize the content of a video.
+
+ ### Activity Recognition
+ Video classification models are used to perform activity recognition, which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals, especially when they're commuting.
+
+ ### Video Search
+ Models trained in video classification can improve user experience by organizing and categorizing video galleries on the phone or in the cloud, based on multiple keywords or tags.
+
+ ## Inference
+
+ Below you can find code for running inference with a pre-trained video classification model.
+
+ ```python
+ import torch
+ from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+ from pytorchvideo.transforms import UniformTemporalSubsample
+ from pytorchvideo.data.encoded_video import EncodedVideo
+
+ # Load the feature extractor and the model.
+ model_ckpt = "MCG-NJU/videomae-base-finetuned-kinetics"
+ feature_extractor = VideoMAEFeatureExtractor.from_pretrained(model_ckpt)
+ model = VideoMAEForVideoClassification.from_pretrained(model_ckpt)
+
+ # Load the video.
+ video = EncodedVideo.from_path("path_to_video.mp4")
+ video_data = video.get_clip(start_sec=0, end_sec=4.0)["video"]
+
+ # Sub-sample a fixed set of frames and convert them to a NumPy array.
+ num_frames = 16
+ subsampler = UniformTemporalSubsample(num_frames)
+ subsampled_frames = subsampler(video_data)
+ video_data_np = subsampled_frames.numpy().transpose(1, 2, 3, 0)
+
+ # Preprocess the video frames.
+ inputs = feature_extractor(list(video_data_np), return_tensors="pt")
+
+ # Run inference.
+ with torch.no_grad():
+     outputs = model(**inputs)
+     logits = outputs.logits
+
+ # The model predicts one of the 400 Kinetics-400 classes.
+ predicted_label = logits.argmax(-1).item()
+ print(model.config.id2label[predicted_label])
+ # `eating spaghetti` (if you chose this video:
+ # https://hf.co/datasets/nielsr/video-demo/resolve/main/eating_spaghetti.mp4)
+ ```
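+
+ Alternatively, the same checkpoint can be run through the higher-level `video-classification` pipeline. The snippet below is a minimal sketch: the video path is a placeholder, and the pipeline needs a video decoding backend (for example `decord`) to be installed.
+
+ ```python
+ from transformers import pipeline
+
+ video_cls = pipeline("video-classification", model="MCG-NJU/videomae-base-finetuned-kinetics")
+ video_cls("path_to_video.mp4")
+ # Returns a list of {"score", "label"} dictionaries, sorted by score.
+ ```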
+
+ ## Useful Resources
+
+ - [Developing a simple video classification model](https://keras.io/examples/vision/video_classification)
+ - [Video classification with Transformers](https://keras.io/examples/vision/video_transformers)
+ - [Building a video archive](https://www.youtube.com/watch?v=_IeS1m8r6SY)
+ - [Video classification task guide](https://huggingface.co/docs/transformers/tasks/video_classification)
+
+ ### Creating your own video classifier in minutes
+ - [Fine-tuning tutorial notebook (PyTorch)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)
@@ -0,0 +1,84 @@
+ import type { TaskDataCustom } from "../Types";
+
+ const taskData: TaskDataCustom = {
+ 	datasets: [
+ 		{
+ 			// TODO write proper description
+ 			description: "Benchmark dataset used for video classification with videos that belong to 400 classes.",
+ 			id: "kinetics400",
+ 		},
+ 	],
+ 	demo: {
+ 		inputs: [
+ 			{
+ 				filename: "video-classification-input.gif",
+ 				type: "img",
+ 			},
+ 		],
+ 		outputs: [
+ 			{
+ 				type: "chart",
+ 				data: [
+ 					{
+ 						label: "Playing Guitar",
+ 						score: 0.514,
+ 					},
+ 					{
+ 						label: "Playing Tennis",
+ 						score: 0.193,
+ 					},
+ 					{
+ 						label: "Cooking",
+ 						score: 0.068,
+ 					},
+ 				],
+ 			},
+ 		],
+ 	},
+ 	metrics: [
+ 		{
+ 			description: "",
+ 			id: "accuracy",
+ 		},
+ 		{
+ 			description: "",
+ 			id: "recall",
+ 		},
+ 		{
+ 			description: "",
+ 			id: "precision",
+ 		},
+ 		{
+ 			description: "",
+ 			id: "f1",
+ 		},
+ 	],
+ 	models: [
+ 		{
+ 			// TODO: write description
+ 			description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
+ 			id: "MCG-NJU/videomae-base-finetuned-kinetics",
+ 		},
+ 		{
+ 			// TODO: write description
+ 			description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
+ 			id: "microsoft/xclip-base-patch32",
+ 		},
+ 	],
+ 	spaces: [
+ 		{
+ 			description: "An application that classifies video at different timestamps.",
+ 			id: "nateraw/lavila",
+ 		},
+ 		{
+ 			description: "An application that classifies video.",
+ 			id: "fcakyon/video-classification",
+ 		},
+ 	],
+ 	summary: "Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.",
+ 	widgetModels: [],
+ 	youtubeId: "",
+ };
+
+ export default taskData;
@@ -0,0 +1,43 @@
+ ## Use Cases
+
+ ### Aiding Visually Impaired Persons
+ VQA models can be used to reduce visual barriers for visually impaired individuals by allowing them to get information about images from the web and the real world.
+
+ ### Education
+ VQA models can be used to improve experiences at museums by allowing observers to directly ask the questions they are interested in.
+
+ ### Improved Image Retrieval
+ Visual question answering models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images, as the sketch below shows.
+
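+ As a rough sketch, the `visual-question-answering` pipeline introduced in the Inference section below could drive such a filter (the image paths here are hypothetical):
+
+ ```python
+ from PIL import Image
+ from transformers import pipeline
+
+ vqa_pipeline = pipeline("visual-question-answering")
+
+ # Hypothetical set of local images to filter.
+ image_paths = ["cat.jpeg", "beach.jpeg", "elephant.jpeg"]
+ images_with_dogs = [
+     path
+     for path in image_paths
+     if vqa_pipeline(Image.open(path), "Is there a dog?", top_k=1)[0]["answer"] == "yes"
+ ]
+ ```
+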
+ ### Video Search
+ Specific snippets/timestamps of a video can be retrieved based on search queries. For example, the user can ask "At which part of the video does the guitar appear?" and get a specific timestamp range from the whole video.
+
+ ## Task Variants
+
+ ### Video Question Answering
+ Video Question Answering aims to answer questions asked about the content of a video.
+
+ ## Inference
+
+ You can run inference with Visual Question Answering models using the `vqa` (or `visual-question-answering`) pipeline. This pipeline requires [the Python Imaging Library (PIL)](https://pillow.readthedocs.io/en/stable/) to process images. You can install it with `pip install pillow`.
+
+ ```python
+ from PIL import Image
+ from transformers import pipeline
+
+ vqa_pipeline = pipeline("visual-question-answering")
+
+ image = Image.open("elephant.jpeg")
+ question = "Is there an elephant?"
+
+ vqa_pipeline(image, question, top_k=1)
+ # [{'score': 0.9998154044151306, 'answer': 'yes'}]
+ ```
+
+ ## Useful Resources
+
+ - [An introduction to Visual Question Answering - AllenAI](https://blog.allenai.org/vanilla-vqa-adcaaaa94336)
+ - [Multi Modal Framework (MMF) - Meta Research](https://mmf.sh/docs/getting_started/video_overview/)
+
+ The contents of this page are contributed by [Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo).
@@ -0,0 +1,90 @@
+ import type { TaskDataCustom } from "../Types";
+
+ const taskData: TaskDataCustom = {
+ 	datasets: [
+ 		{
+ 			description: "A widely used dataset containing questions (with answers) about images.",
+ 			id: "Graphcore/vqa",
+ 		},
+ 		{
+ 			description: "A dataset to benchmark visual reasoning based on text in images.",
+ 			id: "textvqa",
+ 		},
+ 	],
+ 	demo: {
+ 		inputs: [
+ 			{
+ 				filename: "elephant.jpeg",
+ 				type: "img",
+ 			},
+ 			{
+ 				label: "Question",
+ 				content: "What is in this image?",
+ 				type: "text",
+ 			},
+ 		],
+ 		outputs: [
+ 			{
+ 				type: "chart",
+ 				data: [
+ 					{
+ 						label: "elephant",
+ 						score: 0.97,
+ 					},
+ 					{
+ 						label: "elephants",
+ 						score: 0.06,
+ 					},
+ 					{
+ 						label: "animal",
+ 						score: 0.003,
+ 					},
+ 				],
+ 			},
+ 		],
+ 	},
+ 	isPlaceholder: false,
+ 	metrics: [
+ 		{
+ 			description: "",
+ 			id: "accuracy",
+ 		},
+ 		{
+ 			description: "Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.",
+ 			id: "wu-palmer similarity",
+ 		},
+ 	],
+ 	models: [
+ 		{
+ 			description: "A visual question answering model trained to convert charts and plots to text.",
+ 			id: "google/deplot",
+ 		},
+ 		{
+ 			description: "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
+ 			id: "google/matcha-base",
+ 		},
+ 		{
+ 			description: "A strong visual question answering model that answers questions from book covers.",
+ 			id: "google/pix2struct-ocrvqa-large",
+ 		},
+ 	],
+ 	spaces: [
+ 		{
+ 			description: "An application that can answer questions based on images.",
+ 			id: "nielsr/vilt-vqa",
+ 		},
+ 		{
+ 			description: "An application that can caption images and answer questions about a given image.",
+ 			id: "Salesforce/BLIP",
+ 		},
+ 		{
+ 			description: "An application that can caption images and answer questions about a given image.",
+ 			id: "vumichien/Img2Prompt",
+ 		},
+ 	],
+ 	summary: "Visual Question Answering is the task of answering open-ended questions based on an image. These models output natural language responses to natural language questions.",
+ 	widgetModels: ["dandelin/vilt-b32-finetuned-vqa"],
+ 	youtubeId: "",
+ };
+
+ export default taskData;
@@ -0,0 +1,39 @@
+ ## About the Task
+
+ Zero-shot classification is the task of predicting a class that wasn't seen by the model during training. This method, which leverages a pre-trained language model, can be thought of as an instance of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg), which generally refers to using a model trained for one task in a different application from the one it was originally trained for. This is particularly useful for situations where the amount of labeled data is small.
+
+ In zero-shot classification, we provide the model with a prompt and a sequence of text that describes what we want the model to do, in natural language. Zero-shot classification excludes any examples of the desired task being completed. This differs from single- or few-shot classification, which includes one or a few examples of the selected task.
+
+ Zero-, single- and few-shot classification seem to be an emergent feature of large language models. This feature seems to come about around model sizes of 100M+ parameters. The effectiveness of a model at a zero-, single- or few-shot task seems to scale with model size, meaning that larger models (models with more trainable parameters or layers) generally do better at this task.
+
+ Here is an example of a zero-shot prompt for classifying the sentiment of a sequence of text:
+ ```
+ Classify the following input text into one of the following three categories: [positive, negative, neutral]
+
+ Input Text: Hugging Face is awesome for making all of these
+ state of the art models available!
+ Sentiment: positive
+ ```
+
+ One great example of this task with a nice off-the-shelf model is available in the widget on this page, where the user can input a sequence of text and candidate labels to the model. This is a *word-level* example of zero-shot classification; more elaborate and lengthy generations are available with larger models. Testing these models out and getting a feel for prompt engineering is the best way to learn how to use them.
+
+ ## Inference
+ You can use the 🤗 Transformers library's `zero-shot-classification` pipeline to run inference with zero-shot text classification models.
+
+ ```python
+ from transformers import pipeline
+
+ pipe = pipeline(model="facebook/bart-large-mnli")
+ pipe("I have a problem with my iphone that needs to be resolved asap!",
+     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+ )
+ # output
+ # {'sequence': 'I have a problem with my iphone that needs to be resolved asap!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+ ```
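+
+ The pipeline also accepts a `hypothesis_template` to control how each candidate label is phrased, and `multi_label=True` to score every label independently. A minimal sketch (the template wording here is illustrative):
+
+ ```python
+ pipe(
+     "I have a problem with my iphone that needs to be resolved asap!",
+     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+     hypothesis_template="This text is about {}.",
+     multi_label=True,
+ )
+ # Each label now receives its own independent score between 0 and 1.
+ ```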
+
+ ## Useful Resources
+ - [Zero Shot Learning](https://joeddav.github.io/blog/2020/05/29/ZSL.html)
+ - [Hugging Face on Transfer Learning](https://huggingface.co/course/en/chapter1/4?fw=pt#transfer-learning)
+
@@ -0,0 +1,66 @@
+ import type { TaskDataCustom } from "../Types";
+
+ const taskData: TaskDataCustom = {
+ 	datasets: [
+ 		{
+ 			description: "A widely used dataset used to benchmark multiple variants of text classification.",
+ 			id: "glue",
+ 		},
+ 		{
+ 			description: "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
+ 			id: "MultiNLI",
+ 		},
+ 		{
+ 			description: "FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
+ 			id: "FEVER",
+ 		},
+ 	],
+ 	demo: {
+ 		inputs: [
+ 			{
+ 				label: "Text Input",
+ 				content: "Dune is the best movie ever.",
+ 				type: "text",
+ 			},
+ 			{
+ 				label: "Candidate Labels",
+ 				content: "CINEMA, ART, MUSIC",
+ 				type: "text",
+ 			},
+ 		],
+ 		outputs: [
+ 			{
+ 				type: "chart",
+ 				data: [
+ 					{
+ 						label: "CINEMA",
+ 						score: 0.90,
+ 					},
+ 					{
+ 						label: "ART",
+ 						score: 0.10,
+ 					},
+ 					{
+ 						label: "MUSIC",
+ 						score: 0.00,
+ 					},
+ 				],
+ 			},
+ 		],
+ 	},
+ 	metrics: [],
+ 	models: [
+ 		{
+ 			description:
+ 				"Powerful zero-shot text classification model.",
+ 			id: "facebook/bart-large-mnli",
+ 		},
+ 	],
+ 	spaces: [],
+ 	summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
+ 	widgetModels: ["facebook/bart-large-mnli"],
+ };
+
+ export default taskData;
@@ -0,0 +1,68 @@
+ ## About the Task
+
+ Zero-shot image classification is a computer vision task of classifying images into one of several classes without the model having been trained on labeled examples of those classes.
+
+ Zero-shot image classification works by transferring knowledge learned during the training of one model to classify novel classes that were not present in the training data, so it is a variation of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg). For instance, a model trained to differentiate cars from airplanes can be used to classify images of ships.
+
+ The data in this learning paradigm consists of:
+
+ - Seen data: images and their corresponding labels
+ - Unseen data: only labels and no images
+ - Auxiliary information: additional information given to the model during training that connects the unseen and seen data. This can be in the form of textual descriptions or word embeddings, as in the sketch below.
+
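+ Here is a minimal sketch of how a CLIP-style model uses such auxiliary text, scoring an image against candidate label descriptions (the checkpoint and file name below are illustrative; see also the pipeline example in the Inference section):
+
+ ```python
+ from PIL import Image
+ import torch
+ from transformers import CLIPModel, CLIPProcessor
+
+ checkpoint = "openai/clip-vit-base-patch16"
+ model = CLIPModel.from_pretrained(checkpoint)
+ processor = CLIPProcessor.from_pretrained(checkpoint)
+
+ image = Image.open("path_to_cat_and_dog_image.jpeg")
+ candidate_labels = ["cat and dog", "lion and cheetah", "rabbit and lion"]
+
+ # Embed the image and the label texts, then compare them in the shared embedding space.
+ inputs = processor(text=candidate_labels, images=image, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     outputs = model(**inputs)
+ probs = outputs.logits_per_image.softmax(dim=-1)[0]
+ for label, prob in zip(candidate_labels, probs.tolist()):
+     print(f"{label}: {prob:.3f}")
+ ```
+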
+ ## Use Cases
+
+ ### Image Retrieval
+ Zero-shot learning resolves several challenges in image retrieval systems. For example, with the rapid growth of categories on the web, it is challenging to index images based on unseen categories. With zero-shot learning, we can associate unseen categories with images by exploiting attributes to model the relationships among visual features and labels.
+
+ ### Action Recognition
+ Action recognition is the task of identifying when a person in an image or video is performing a given action from a set of actions. If not all of the possible actions are known beforehand, conventional deep learning models fail. With zero-shot learning, for a given domain of a set of actions, we can create a mapping connecting low-level features and a semantic description of auxiliary data to classify unknown classes of actions.
+
+ ## Task Variants
+
+ You can contribute variants of this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md).
+
+ ## Inference
+
+ The model can be loaded with the `zero-shot-image-classification` pipeline like so:
+ ```python
+ from transformers import pipeline
+
+ # More models in the model hub.
+ model_name = "openai/clip-vit-large-patch14-336"
+ classifier = pipeline("zero-shot-image-classification", model=model_name)
+ ```
+ You can then use this pipeline to classify images into any set of class names you specify, including more than two labels.
+ ```python
+ image_to_classify = "path_to_cat_and_dog_image.jpeg"
+ labels_for_classification = ["cat and dog", "lion and cheetah", "rabbit and lion"]
+ scores = classifier(image_to_classify, candidate_labels=labels_for_classification)
+ ```
+ After inference, the classifier returns a list of dictionaries, stored in the variable `scores` in the snippet above. `scores` would look as follows:
+ ```python
+ [{'score': 0.9950482249259949, 'label': 'cat and dog'},
+  {'score': 0.004863627254962921, 'label': 'rabbit and lion'},
+  {'score': 8.816882473183796e-05, 'label': 'lion and cheetah'}]
+ ```
+ The dictionary at the zeroth index of the list contains the label with the highest score.
+ ```python
+ print(f"The highest score is {scores[0]['score']:.3f} for the label {scores[0]['label']}")
+ ```
+ The output of the print statement above would look as follows:
+ ```
+ The highest score is 0.995 for the label cat and dog
+ ```
+
+ ## Useful Resources
+
+ You can contribute useful resources about this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md).
+
+ Check out the [zero-shot image classification task guide](https://huggingface.co/docs/transformers/tasks/zero_shot_image_classification).
+
+ This page was made possible thanks to the efforts of [Shamima Hossain](https://huggingface.co/Shamima), [Haider Zaidi](https://huggingface.co/chefhaider) and [Paarth Bhatnagar](https://huggingface.co/Paarth).
+
@@ -0,0 +1,79 @@
+ import type { TaskDataCustom } from "../Types";
+
+ const taskData: TaskDataCustom = {
+ 	datasets: [
+ 		{
+ 			// TODO write proper description
+ 			description: "",
+ 			id: "",
+ 		},
+ 	],
+ 	demo: {
+ 		inputs: [
+ 			{
+ 				filename: "image-classification-input.jpeg",
+ 				type: "img",
+ 			},
+ 			{
+ 				label: "Classes",
+ 				content: "cat, dog, bird",
+ 				type: "text",
+ 			},
+ 		],
+ 		outputs: [
+ 			{
+ 				type: "chart",
+ 				data: [
+ 					{
+ 						label: "Cat",
+ 						score: 0.664,
+ 					},
+ 					{
+ 						label: "Dog",
+ 						score: 0.329,
+ 					},
+ 					{
+ 						label: "Bird",
+ 						score: 0.008,
+ 					},
+ 				],
+ 			},
+ 		],
+ 	},
+ 	metrics: [
+ 		{
+ 			description:
+ 				"Computes the number of times the correct label appears among the top K predicted labels.",
+ 			id: "top-K accuracy",
+ 		},
+ 	],
+ 	models: [
+ 		{
+ 			description:
+ 				"Robust image classification model trained on publicly available image-caption data.",
+ 			id: "openai/clip-vit-base-patch16",
+ 		},
+ 		{
+ 			description:
+ 				"Robust image classification model trained on publicly available image-caption data and fine-tuned at a higher image resolution for better performance.",
+ 			id: "openai/clip-vit-large-patch14-336",
+ 		},
+ 		{
+ 			description:
+ 				"Strong image classification model for the biomedical domain.",
+ 			id: "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224",
+ 		},
+ 	],
+ 	spaces: [
+ 		{
+ 			description: "An application that leverages zero-shot image classification to find the best captions for generating an image.",
+ 			id: "pharma/CLIP-Interrogator",
+ 		},
+ 	],
+ 	summary:
+ 		"Zero-shot image classification is the task of classifying images into classes that were not seen during the training of the model.",
+ 	widgetModels: ["openai/clip-vit-large-patch14-336"],
+ 	youtubeId: "",
+ };
+
+ export default taskData;