@huggingface/tasks 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/audio-classification/audio.wav +0 -0
- package/assets/audio-to-audio/input.wav +0 -0
- package/assets/audio-to-audio/label-0.wav +0 -0
- package/assets/audio-to-audio/label-1.wav +0 -0
- package/assets/automatic-speech-recognition/input.flac +0 -0
- package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
- package/assets/contribution-guide/anatomy.png +0 -0
- package/assets/contribution-guide/libraries.png +0 -0
- package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
- package/assets/depth-estimation/depth-estimation-output.png +0 -0
- package/assets/document-question-answering/document-question-answering-input.png +0 -0
- package/assets/image-classification/image-classification-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-output.png +0 -0
- package/assets/image-to-image/image-to-image-input.jpeg +0 -0
- package/assets/image-to-image/image-to-image-output.png +0 -0
- package/assets/image-to-image/pix2pix_examples.jpg +0 -0
- package/assets/image-to-text/savanna.jpg +0 -0
- package/assets/object-detection/object-detection-input.jpg +0 -0
- package/assets/object-detection/object-detection-output.jpg +0 -0
- package/assets/table-question-answering/tableQA.jpg +0 -0
- package/assets/text-to-image/image.jpeg +0 -0
- package/assets/text-to-speech/audio.wav +0 -0
- package/assets/text-to-video/text-to-video-output.gif +0 -0
- package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
- package/assets/video-classification/video-classification-input.gif +0 -0
- package/assets/visual-question-answering/elephant.jpeg +0 -0
- package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
- package/dist/index.cjs +3105 -0
- package/dist/index.d.cts +145 -0
- package/dist/index.d.ts +145 -0
- package/dist/index.js +3079 -0
- package/package.json +35 -0
- package/src/Types.ts +58 -0
- package/src/audio-classification/about.md +85 -0
- package/src/audio-classification/data.ts +77 -0
- package/src/audio-to-audio/about.md +55 -0
- package/src/audio-to-audio/data.ts +63 -0
- package/src/automatic-speech-recognition/about.md +86 -0
- package/src/automatic-speech-recognition/data.ts +77 -0
- package/src/const.ts +51 -0
- package/src/conversational/about.md +50 -0
- package/src/conversational/data.ts +62 -0
- package/src/depth-estimation/about.md +38 -0
- package/src/depth-estimation/data.ts +52 -0
- package/src/document-question-answering/about.md +54 -0
- package/src/document-question-answering/data.ts +67 -0
- package/src/feature-extraction/about.md +35 -0
- package/src/feature-extraction/data.ts +57 -0
- package/src/fill-mask/about.md +51 -0
- package/src/fill-mask/data.ts +77 -0
- package/src/image-classification/about.md +48 -0
- package/src/image-classification/data.ts +88 -0
- package/src/image-segmentation/about.md +63 -0
- package/src/image-segmentation/data.ts +96 -0
- package/src/image-to-image/about.md +81 -0
- package/src/image-to-image/data.ts +97 -0
- package/src/image-to-text/about.md +58 -0
- package/src/image-to-text/data.ts +87 -0
- package/src/index.ts +2 -0
- package/src/object-detection/about.md +36 -0
- package/src/object-detection/data.ts +73 -0
- package/src/placeholder/about.md +15 -0
- package/src/placeholder/data.ts +18 -0
- package/src/question-answering/about.md +56 -0
- package/src/question-answering/data.ts +69 -0
- package/src/reinforcement-learning/about.md +176 -0
- package/src/reinforcement-learning/data.ts +78 -0
- package/src/sentence-similarity/about.md +97 -0
- package/src/sentence-similarity/data.ts +100 -0
- package/src/summarization/about.md +57 -0
- package/src/summarization/data.ts +72 -0
- package/src/table-question-answering/about.md +43 -0
- package/src/table-question-answering/data.ts +63 -0
- package/src/tabular-classification/about.md +67 -0
- package/src/tabular-classification/data.ts +69 -0
- package/src/tabular-regression/about.md +91 -0
- package/src/tabular-regression/data.ts +58 -0
- package/src/tasksData.ts +104 -0
- package/src/text-classification/about.md +171 -0
- package/src/text-classification/data.ts +90 -0
- package/src/text-generation/about.md +128 -0
- package/src/text-generation/data.ts +124 -0
- package/src/text-to-image/about.md +65 -0
- package/src/text-to-image/data.ts +88 -0
- package/src/text-to-speech/about.md +63 -0
- package/src/text-to-speech/data.ts +70 -0
- package/src/text-to-video/about.md +36 -0
- package/src/text-to-video/data.ts +97 -0
- package/src/token-classification/about.md +78 -0
- package/src/token-classification/data.ts +83 -0
- package/src/translation/about.md +65 -0
- package/src/translation/data.ts +68 -0
- package/src/unconditional-image-generation/about.md +45 -0
- package/src/unconditional-image-generation/data.ts +66 -0
- package/src/video-classification/about.md +53 -0
- package/src/video-classification/data.ts +84 -0
- package/src/visual-question-answering/about.md +43 -0
- package/src/visual-question-answering/data.ts +90 -0
- package/src/zero-shot-classification/about.md +39 -0
- package/src/zero-shot-classification/data.ts +66 -0
- package/src/zero-shot-image-classification/about.md +68 -0
- package/src/zero-shot-image-classification/data.ts +79 -0

package/src/conversational/data.ts
@@ -0,0 +1,62 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.",
      id: "blended_skill_talk",
    },
    {
      description: "ConvAI is a dataset of human-to-bot conversations labeled for quality. This data can be used to train a metric for evaluating dialogue systems.",
      id: "conv_ai_2",
    },
    {
      description: "EmpatheticDialogues is a dataset of 25k conversations grounded in emotional situations.",
      id: "empathetic_dialogues",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "Hey my name is Julien! How are you?",
        type: "text",
      },
    ],
    outputs: [
      {
        label: "Answer",
        content: "Hi Julien! My name is Julia! I am well.",
        type: "text",
      },
    ],
  },
  metrics: [
    {
      description: "BLEU score is calculated by counting the number of shared single or subsequent tokens between the generated sequence and the reference. Subsequent n tokens are called “n-grams”. Unigram refers to a single token, bi-gram refers to token pairs, and n-grams refer to n subsequent tokens. The score ranges from 0 to 1, where 1 means the generated text perfectly matches the reference and 0 means it does not match at all.",
      id: "bleu",
    },
  ],
  models: [
    {
      description: "A distilled version of Facebook's larger BlenderBot conversational model, smaller and faster to run.",
      id: "facebook/blenderbot-400M-distill",
    },
    {
      description: "DialoGPT is a large-scale pretrained dialogue response generation model for multi-turn conversations.",
      id: "microsoft/DialoGPT-large",
    },
  ],
  spaces: [
    {
      description: "A chatbot based on the Blender model.",
      id: "EXFINITE/BlenderBot-UI",
    },
  ],
  summary: "Conversational response modelling is the task of generating conversational text that is relevant, coherent, and knowledgeable given a prompt. These models have applications in chatbots and as part of voice assistants.",
  widgetModels: ["facebook/blenderbot-400M-distill"],
  youtubeId: "",
};

export default taskData;
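
As an editorial illustration (not part of the package), the widget model listed above can be exercised with the 🤗 Transformers `conversational` pipeline. This is a minimal sketch; note that the `Conversation` helper has since been deprecated in newer Transformers releases:

```python
from transformers import Conversation, pipeline

chatbot = pipeline("conversational", model="facebook/blenderbot-400M-distill")

# Wrap the demo prompt above in a Conversation object and generate a reply
conversation = Conversation("Hey my name is Julien! How are you?")
conversation = chatbot(conversation)
print(conversation.generated_responses[-1])
```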

package/src/depth-estimation/about.md
@@ -0,0 +1,38 @@
## Use Cases

Depth estimation models can be used to estimate the depth of different objects present in an image.

### Estimation of Volumetric Information

Depth estimation models are widely used to study the volumetric information of objects present inside an image. This is an important use case in the domain of computer graphics.

### 3D Representation

Depth estimation models can also be used to develop a 3D representation from a 2D image.

## Inference

With the `transformers` library, you can use the `depth-estimation` pipeline to infer with depth estimation models. You can initialize the pipeline with a model id from the Hub. If you do not provide a model id, it will initialize with [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) by default. When calling the pipeline you just need to specify a path, an HTTP link, or an image loaded in PIL. Additionally, you can find a comprehensive list of various depth estimation models at [this link](https://huggingface.co/models?pipeline_tag=depth-estimation).

```python
from transformers import pipeline

estimator = pipeline(task="depth-estimation", model="Intel/dpt-large")
result = estimator(images="http://images.cocodataset.org/val2017/000000039769.jpg")
result

# {'predicted_depth': tensor([[[ 6.3199,  6.3629,  6.4148,  ..., 10.4104, 10.5109, 10.3847],
#                              [ 6.3850,  6.3615,  6.4166,  ..., 10.4540, 10.4384, 10.4554],
#                              [ 6.3519,  6.3176,  6.3575,  ..., 10.4247, 10.4618, 10.4257],
#                              ...,
#                              [22.3772, 22.4624, 22.4227,  ..., 22.5207, 22.5593, 22.5293],
#                              [22.5073, 22.5148, 22.5114,  ..., 22.6604, 22.6344, 22.5871],
#                              [22.5176, 22.5275, 22.5218,  ..., 22.6282, 22.6216, 22.6108]]]),
#  'depth': <PIL.Image.Image image mode=L size=640x480 at 0x7F1A8BFE5D90>}

# You can visualize the result just by calling `result["depth"]`.
```
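
For readers who want more control than the pipeline offers, here is an editorial sketch of the equivalent lower-level usage, assuming the same `Intel/dpt-large` checkpoint; it resizes the raw prediction back to the input resolution and saves it as an image:

```python
import numpy as np
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
model = AutoModelForDepthEstimation.from_pretrained("Intel/dpt-large")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth

# Interpolate the prediction to the original image size and rescale to 0-255 for saving
depth = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1), size=image.size[::-1], mode="bicubic", align_corners=False
).squeeze()
depth_image = Image.fromarray((255 * depth / depth.max()).numpy().astype(np.uint8))
depth_image.save("depth-map.png")
```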

## Useful Resources

- [Monocular depth estimation task guide](https://huggingface.co/docs/transformers/tasks/monocular_depth_estimation)

package/src/depth-estimation/data.ts
@@ -0,0 +1,52 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "NYU Depth V2 Dataset: Video dataset containing both RGB and depth sensor data.",
      id: "sayakpaul/nyu_depth_v2",
    },
  ],
  demo: {
    inputs: [
      {
        filename: "depth-estimation-input.jpg",
        type: "img",
      },
    ],
    outputs: [
      {
        filename: "depth-estimation-output.png",
        type: "img",
      },
    ],
  },
  metrics: [],
  models: [
    {
      // TO DO: write description
      description: "Strong Depth Estimation model trained on 1.4 million images.",
      id: "Intel/dpt-large",
    },
    {
      // TO DO: write description
      description: "Strong Depth Estimation model trained on the KITTI dataset.",
      id: "vinvino02/glpn-kitti",
    },
  ],
  spaces: [
    {
      description: "An application that predicts the depth of an image and then reconstructs the 3D model as voxels.",
      id: "radames/dpt-depth-estimation-3d-voxels",
    },
    {
      description: "An application that can estimate the depth in a given image.",
      id: "keras-io/Monocular-Depth-Estimation",
    },
  ],
  summary: "Depth estimation is the task of predicting the depth of the objects present in an image.",
  widgetModels: [""],
  youtubeId: "",
};

export default taskData;

package/src/document-question-answering/about.md
@@ -0,0 +1,54 @@
## Use Cases

Document Question Answering models can be used to answer natural language questions about documents. Typically, document QA models consider textual, layout and potentially visual information. This is useful when the question requires some understanding of the visual aspects of the document.
Nevertheless, certain document QA models can work without document images. Hence the task is not limited to visually-rich documents and allows users to ask questions based on spreadsheets, text PDFs, etc!

### Document Parsing

One of the most popular use cases of document question answering models is the parsing of structured documents. For example, you can extract the name, address, and other information from a form. You can also use the model to extract information from a table, or even a resume.

### Invoice Information Extraction

Another very popular use case is invoice information extraction. For example, you can extract the invoice number, the invoice date, the total amount, the VAT number, and the invoice recipient.

## Inference

You can infer with Document QA models with the 🤗 Transformers library using the [`document-question-answering` pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline). If no model checkpoint is given, the pipeline will be initialized with [`impira/layoutlm-document-qa`](https://huggingface.co/impira/layoutlm-document-qa). This pipeline takes question(s) and document(s) as input, and returns the answer.
👉 Note that the question answering task solved here is extractive: the model extracts the answer from a context (the document).

```python
from transformers import pipeline
from PIL import Image

pipe = pipeline("document-question-answering", model="naver-clova-ix/donut-base-finetuned-docvqa")

question = "What is the purchase amount?"
image = Image.open("your-document.png")

pipe(image=image, question=question)

## [{'answer': '20,000$'}]
```
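
As an editorial aside, the default checkpoint mentioned above, `impira/layoutlm-document-qa`, reads the words and their bounding boxes from the page image through an OCR step, so the sketch below assumes `pytesseract` and the Tesseract binary are installed:

```python
from transformers import pipeline

# LayoutLM-based document QA runs OCR (pytesseract) under the hood to recover
# the words and their bounding boxes from the page image.
pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
pipe(image="your-document.png", question="What is the purchase amount?")
```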

## Useful Resources

Would you like to learn more about Document QA? Awesome! Here are some curated resources that you may find helpful!

- [Document Visual Question Answering (DocVQA) challenge](https://rrc.cvc.uab.es/?ch=17)
- [DocVQA: A Dataset for Document Visual Question Answering](https://arxiv.org/abs/2007.00398) (Dataset paper)
- [ICDAR 2021 Competition on Document Visual Question Answering](https://lilianweng.github.io/lil-log/2020/10/29/open-domain-question-answering.html) (Conference paper)
- [HuggingFace's Document Question Answering pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline)
- [Github repo: DocQuery - Document Query Engine Powered by Large Language Models](https://github.com/impira/docquery)

### Notebooks

- [Fine-tuning Donut on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/0ea77f29d01217587d7e32a848f3691d9c15d6ab/Donut/DocVQA)
- [Fine-tuning LayoutLMv2 on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/1b4bad710c41017d07a8f63b46a12523bfd2e835/LayoutLMv2/DocVQA)
- [Accelerating Document AI](https://huggingface.co/blog/document-ai)

### Documentation

- [Document question answering task guide](https://huggingface.co/docs/transformers/tasks/document_question_answering)

The contents of this page are contributed by [Eliott Zemour](https://huggingface.co/eliolio) and reviewed by [Kwadwo Agyapon-Ntra](https://huggingface.co/KayO) and [Ankur Goyal](https://huggingface.co/ankrgyl).

package/src/document-question-answering/data.ts
@@ -0,0 +1,67 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      // TODO write proper description
      description: "Dataset from the 2020 DocVQA challenge. The documents are taken from the UCSF Industry Documents Library.",
      id: "eliolio/docvqa",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Question",
        content: "What is the idea behind the consumer relations efficiency team?",
        type: "text",
      },
      {
        filename: "document-question-answering-input.png",
        type: "img",
      },
    ],
    outputs: [
      {
        label: "Answer",
        content: "Balance cost efficiency with quality customer service",
        type: "text",
      },
    ],
  },
  metrics: [
    {
      description: "The evaluation metric for the DocVQA challenge is the Average Normalized Levenshtein Similarity (ANLS). This metric is flexible to character recognition errors and compares the predicted answer with the ground truth answer.",
      id: "anls",
    },
    {
      description: "Exact Match is a metric based on the strict character match of the predicted answer and the right answer. For answers predicted correctly, the Exact Match will be 1. Even if only one character is different, Exact Match will be 0.",
      id: "exact-match",
    },
  ],
  models: [
    {
      description: "A LayoutLM model for the document QA task, fine-tuned on DocVQA and SQuAD2.0.",
      id: "impira/layoutlm-document-qa",
    },
    {
      description: "A special model for the OCR-free document QA task: a Donut model fine-tuned on DocVQA.",
      id: "naver-clova-ix/donut-base-finetuned-docvqa",
    },
  ],
  spaces: [
    {
      description: "A robust document question answering application.",
      id: "impira/docquery",
    },
    {
      description: "An application that can answer questions from invoices.",
      id: "impira/invoices",
    },
  ],
  summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding boxes) and image.",
  widgetModels: ["impira/layoutlm-document-qa"],
  youtubeId: "",
};

export default taskData;
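
The ANLS metric described above can be summarized in a few lines of code. The following is an illustrative sketch (not part of the package), using the conventional 0.5 similarity threshold from the DocVQA challenge:

```python
def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]


def anls(prediction: str, ground_truths: list, threshold: float = 0.5) -> float:
    # Normalized Levenshtein similarity against each reference answer; keep the best score
    best = 0.0
    for gt in ground_truths:
        nl = levenshtein(prediction.lower(), gt.lower()) / max(len(prediction), len(gt), 1)
        similarity = 1 - nl
        best = max(best, similarity if similarity >= threshold else 0.0)
    return best


print(anls("20,000$", ["$20,000", "20,000$"]))  # 1.0
```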

package/src/feature-extraction/about.md
@@ -0,0 +1,35 @@
## About the Task

Feature extraction is the task of building features intended to be informative from a given dataset, facilitating the subsequent learning and generalization steps in various domains of machine learning.

## Use Cases

Feature extraction can be used to do transfer learning in natural language processing, computer vision, and audio models.

## Inference

#### Feature Extraction

```python
from transformers import pipeline

checkpoint = "facebook/bart-base"
feature_extractor = pipeline("feature-extraction", framework="pt", model=checkpoint)
text = "Transformers is an awesome library!"

feature_extractor(text, return_tensors="pt")
# tensor([[[ 2.5834,  2.7571,  0.9024,  ...,  1.5036, -0.0435, -0.8603],
#          [-1.2850, -1.0094, -2.0826,  ...,  1.5993, -0.9017,  0.6426],
#          [ 0.9082,  0.3896, -0.6843,  ...,  0.7061,  0.6517,  1.0550],
#          ...,
#          [ 0.6919, -1.1946,  0.2438,  ...,  1.3646, -1.8661, -0.1642],
#          [-0.1701, -2.0019, -0.4223,  ...,  0.3680, -1.9704, -0.0068],
#          [ 0.2520, -0.6869, -1.0582,  ...,  0.5198, -2.2106,  0.4547]]])

# Averaging over the token dimension reduces this to a single 768-dimensional vector
feature_extractor(text, return_tensors="pt")[0].numpy().mean(axis=0)
```
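
To illustrate how such features are reused downstream (for example, in the transfer-learning use case mentioned above), here is a small editorial sketch, assuming the same `facebook/bart-base` checkpoint, that compares two sentences via the cosine similarity of their mean-pooled features:

```python
import numpy as np
from transformers import pipeline

extractor = pipeline("feature-extraction", framework="pt", model="facebook/bart-base")

def embed(text):
    # Mean-pool the per-token features into one fixed-size vector
    return extractor(text, return_tensors="pt")[0].numpy().mean(axis=0)

a = embed("Transformers is an awesome library!")
b = embed("I really like this library.")
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
```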

## Useful Resources

- [Documentation for the feature extractor of 🤗 Transformers](https://huggingface.co/docs/transformers/main_classes/feature_extractor)

package/src/feature-extraction/data.ts
@@ -0,0 +1,57 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "Wikipedia dataset containing cleaned articles of all languages. Can be used to train `feature-extraction` models.",
      id: "wikipedia",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "India, officially the Republic of India, is a country in South Asia.",
        type: "text",
      },
    ],
    outputs: [
      {
        table: [
          ["Dimension 1", "Dimension 2", "Dimension 3"],
          ["2.583383083343506", "2.757075071334839", "0.9023529887199402"],
          ["8.29393482208252", "1.1071064472198486", "2.03399395942688"],
          ["-0.7754912972450256", "-1.647324562072754", "-0.6113331913948059"],
          ["0.07087723910808563", "1.5942802429199219", "1.4610432386398315"],
        ],
        type: "tabular",
      },
    ],
  },
  metrics: [
    {
      description: "",
      id: "",
    },
  ],
  models: [
    {
      description: "A powerful feature extraction model for natural language processing tasks.",
      id: "facebook/bart-base",
    },
    {
      description: "A strong feature extraction model for coding tasks.",
      id: "microsoft/codebert-base",
    },
  ],
  spaces: [],
  summary: "Feature extraction refers to the process of transforming raw data into numerical features that can be processed while preserving the information in the original dataset.",
  widgetModels: ["facebook/bart-base"],
};

export default taskData;

package/src/fill-mask/about.md
@@ -0,0 +1,51 @@
## Use Cases

### Domain Adaptation 👩‍⚕️

Masked language models do not require labelled data! They are trained by masking some of the words in a sentence, and the model is expected to guess the masked words. This makes them very practical!

For example, masked language modeling is used to train large models for domain-specific problems. If you have to work on a domain-specific task, such as retrieving information from medical research papers, you can train a masked language model using those papers. 📄

The resulting model has a statistical understanding of the language used in medical research papers, and can be further trained in a process called fine-tuning to solve different tasks, such as [Text Classification](/tasks/text-classification) or [Question Answering](/tasks/question-answering), to build a medical research papers information extraction system. 👩‍⚕️ Pre-training on domain-specific data tends to yield better results (see [this paper](https://arxiv.org/abs/2007.15779) for an example).

If you don't have the data to train a masked language model, you can also use an existing [domain-specific masked language model](https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext) from the Hub and fine-tune it with your smaller task dataset. That's the magic of Open Source and sharing your work! 🎉
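
As a rough editorial sketch of the domain-adaptation workflow described above (the corpus file `medical_papers.txt` is a hypothetical placeholder), continued pre-training with the masked language modeling objective can look like this:

```python
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

# Hypothetical plain-text corpus of domain-specific documents, one passage per line
dataset = load_dataset("text", data_files={"train": "medical_papers.txt"})

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

# The collator masks 15% of the tokens on the fly; those are what the model learns to fill in
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="domain-adapted-mlm", num_train_epochs=1),
    train_dataset=tokenized["train"],
    data_collator=collator,
)
trainer.train()
```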

## Inference with Fill-Mask Pipeline

You can use the 🤗 Transformers library `fill-mask` pipeline to do inference with masked language models. If a model name is not provided, the pipeline will be initialized with [distilroberta-base](/distilroberta-base). You can provide masked text and it will return a list of possible mask values ranked according to the score.

```python
from transformers import pipeline

classifier = pipeline("fill-mask")
classifier("Paris is the <mask> of France.")

# [{'score': 0.7, 'sequence': 'Paris is the capital of France.'},
#  {'score': 0.2, 'sequence': 'Paris is the birthplace of France.'},
#  {'score': 0.1, 'sequence': 'Paris is the heart of France.'}]
```
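
The same pipeline also accepts a domain-specific checkpoint such as the one linked in the Domain Adaptation section; a brief sketch (note that BERT-style models use `[MASK]` rather than `<mask>` as the mask token):

```python
from transformers import pipeline

# Swap in the biomedical checkpoint linked above instead of the default model
biomed_classifier = pipeline("fill-mask", model="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
biomed_classifier("Aspirin reduces the risk of heart [MASK].")
```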

## Useful Resources

Would you like to learn more about the topic? Awesome! Here you can find some curated resources that can be helpful to you!

- [Course Chapter on Fine-tuning a Masked Language Model](https://huggingface.co/course/chapter7/3?fw=pt)
- [Workshop on Pretraining Language Models and CodeParrot](https://www.youtube.com/watch?v=ExUR7w6xe94)
- [BERT 101: State Of The Art NLP Model Explained](https://huggingface.co/blog/bert-101)
- [Nyströmformer: Approximating self-attention in linear time and memory via the Nyström method](https://huggingface.co/blog/nystromformer)

### Notebooks

- [Pre-training an MLM for JAX/Flax](https://github.com/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb)
- [Masked language modeling in TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb)
- [Masked language modeling in PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)

### Scripts for training

- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling)
- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling)
- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling)

### Documentation

- [Masked language modeling task guide](https://huggingface.co/docs/transformers/tasks/masked_language_modeling)

package/src/fill-mask/data.ts
@@ -0,0 +1,77 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "A common dataset that is used to train models for many languages.",
      id: "wikipedia",
    },
    {
      description: "A large English dataset with text crawled from the web.",
      id: "c4",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "The <mask> barked at me",
        type: "text",
      },
    ],
    outputs: [
      {
        type: "chart",
        data: [
          {
            label: "wolf",
            score: 0.487,
          },
          {
            label: "dog",
            score: 0.061,
          },
          {
            label: "cat",
            score: 0.058,
          },
          {
            label: "fox",
            score: 0.047,
          },
          {
            label: "squirrel",
            score: 0.025,
          },
        ],
      },
    ],
  },
  metrics: [
    {
      description: "Cross Entropy is a metric that calculates the difference between two probability distributions. Each probability distribution is the distribution of predicted words.",
      id: "cross_entropy",
    },
    {
      description: "Perplexity is the exponential of the cross-entropy loss. It evaluates the probabilities assigned to the next word by the model. Lower perplexity indicates better performance.",
      id: "perplexity",
    },
  ],
  models: [
    {
      description: "A faster and smaller model than the famous BERT model.",
      id: "distilbert-base-uncased",
    },
    {
      description: "A multilingual model trained on 100 languages.",
      id: "xlm-roberta-base",
    },
  ],
  spaces: [],
  summary: "Masked language modeling is the task of masking some of the words in a sentence and predicting which words should replace those masks. These models are useful when we want to get a statistical understanding of the language the model is trained in.",
  widgetModels: ["distilroberta-base"],
  youtubeId: "mqElG5QJWUg",
};

export default taskData;

package/src/image-classification/about.md
@@ -0,0 +1,48 @@
## Use Cases

Image classification models can be used when we are not interested in specific instances of objects with location information or their shape.

### Keyword Classification

Image classification models are used widely in stock photography to assign each image a keyword.

### Image Search

Models trained in image classification can improve user experience by organizing and categorizing photo galleries on the phone or in the cloud, on multiple keywords or tags.

## Inference

With the `transformers` library, you can use the `image-classification` pipeline to infer with image classification models. You can initialize the pipeline with a model id from the Hub. If you do not provide a model id, it will initialize with [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) by default. When calling the pipeline you just need to specify a path, an HTTP link, or an image loaded in PIL. You can also provide a `top_k` parameter which determines how many results it should return.

```python
from transformers import pipeline

clf = pipeline("image-classification")
clf("path_to_a_cat_image")

# [{'label': 'tabby cat', 'score': 0.731},
#  ...
# ]
```
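
To show the two options mentioned above, here is a small editorial sketch that picks an explicit checkpoint from the Hub and limits the output with `top_k`:

```python
from transformers import pipeline

# Explicitly choose a checkpoint and keep only the three highest-scoring labels
clf = pipeline("image-classification", model="microsoft/resnet-50")
clf("path_to_a_cat_image", top_k=3)
```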

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to classify images using models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.imageClassification({
  data: await (await fetch('https://picsum.photos/300/300')).blob(),
  model: 'microsoft/resnet-50',
})
```

## Useful Resources

- [Let's Play Pictionary with Machine Learning!](https://www.youtube.com/watch?v=LS9Y2wDVI0k)
- [Fine-Tune ViT for Image Classification with 🤗Transformers](https://huggingface.co/blog/fine-tune-vit)
- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
- [Computer Vision Study Group: Swin Transformer](https://www.youtube.com/watch?v=Ngikt-K1Ecc)
- [Computer Vision Study Group: Masked Autoencoders Paper Walkthrough](https://www.youtube.com/watch?v=Ngikt-K1Ecc)
- [Image classification task guide](https://huggingface.co/docs/transformers/tasks/image_classification)

### Creating your own image classifier in just a few minutes

With [HuggingPics](https://github.com/nateraw/huggingpics), you can fine-tune Vision Transformers for anything using images found on the web. This project downloads images of classes defined by you, trains a model, and pushes it to the Hub. You even get to try out the model directly with a working widget in the browser, ready to be shared with all your friends!

package/src/image-classification/data.ts
@@ -0,0 +1,88 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      // TODO write proper description
      description: "Benchmark dataset used for image classification with images that belong to 100 classes.",
      id: "cifar100",
    },
    {
      // TODO write proper description
      description: "Dataset consisting of images of garments.",
      id: "fashion_mnist",
    },
  ],
  demo: {
    inputs: [
      {
        filename: "image-classification-input.jpeg",
        type: "img",
      },
    ],
    outputs: [
      {
        type: "chart",
        data: [
          {
            label: "Egyptian cat",
            score: 0.514,
          },
          {
            label: "Tabby cat",
            score: 0.193,
          },
          {
            label: "Tiger cat",
            score: 0.068,
          },
        ],
      },
    ],
  },
  metrics: [
    {
      description: "",
      id: "accuracy",
    },
    {
      description: "",
      id: "recall",
    },
    {
      description: "",
      id: "precision",
    },
    {
      description: "",
      id: "f1",
    },
  ],
  models: [
    {
      description: "A strong image classification model.",
      id: "google/vit-base-patch16-224",
    },
    {
      description: "A robust image classification model.",
      id: "facebook/deit-base-distilled-patch16-224",
    },
    {
      description: "A strong image classification model.",
      id: "facebook/convnext-large-224",
    },
  ],
  spaces: [
    {
      // TO DO: write description
      description: "An application that classifies what a given image is about.",
      id: "nielsr/perceiver-image-classification",
    },
  ],
  summary: "Image classification is the task of assigning a label or class to an entire image. Images are expected to have only one class for each image. Image classification models take an image as input and return a prediction about which class the image belongs to.",
  widgetModels: ["google/vit-base-patch16-224"],
  youtubeId: "tjAIM7BOYhw",
};

export default taskData;
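
The accuracy, recall, precision, and F1 metrics listed (but not yet described) in this file are typically computed with the 🤗 Evaluate library; an illustrative sketch with toy labels, not part of the package:

```python
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# Toy ground-truth labels and predictions: 1 = "cat", 0 = "not cat"
references = [1, 0, 1, 1, 0]
predictions = [1, 0, 0, 1, 0]

print(accuracy.compute(references=references, predictions=predictions))  # {'accuracy': 0.8}
print(f1.compute(references=references, predictions=predictions))        # {'f1': 0.8}
```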