@huggingface/tasks 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/audio-classification/audio.wav +0 -0
- package/assets/audio-to-audio/input.wav +0 -0
- package/assets/audio-to-audio/label-0.wav +0 -0
- package/assets/audio-to-audio/label-1.wav +0 -0
- package/assets/automatic-speech-recognition/input.flac +0 -0
- package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
- package/assets/contribution-guide/anatomy.png +0 -0
- package/assets/contribution-guide/libraries.png +0 -0
- package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
- package/assets/depth-estimation/depth-estimation-output.png +0 -0
- package/assets/document-question-answering/document-question-answering-input.png +0 -0
- package/assets/image-classification/image-classification-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-output.png +0 -0
- package/assets/image-to-image/image-to-image-input.jpeg +0 -0
- package/assets/image-to-image/image-to-image-output.png +0 -0
- package/assets/image-to-image/pix2pix_examples.jpg +0 -0
- package/assets/image-to-text/savanna.jpg +0 -0
- package/assets/object-detection/object-detection-input.jpg +0 -0
- package/assets/object-detection/object-detection-output.jpg +0 -0
- package/assets/table-question-answering/tableQA.jpg +0 -0
- package/assets/text-to-image/image.jpeg +0 -0
- package/assets/text-to-speech/audio.wav +0 -0
- package/assets/text-to-video/text-to-video-output.gif +0 -0
- package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
- package/assets/video-classification/video-classification-input.gif +0 -0
- package/assets/visual-question-answering/elephant.jpeg +0 -0
- package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
- package/dist/index.cjs +3105 -0
- package/dist/index.d.cts +145 -0
- package/dist/index.d.ts +145 -0
- package/dist/index.js +3079 -0
- package/package.json +35 -0
- package/src/Types.ts +58 -0
- package/src/audio-classification/about.md +85 -0
- package/src/audio-classification/data.ts +77 -0
- package/src/audio-to-audio/about.md +55 -0
- package/src/audio-to-audio/data.ts +63 -0
- package/src/automatic-speech-recognition/about.md +86 -0
- package/src/automatic-speech-recognition/data.ts +77 -0
- package/src/const.ts +51 -0
- package/src/conversational/about.md +50 -0
- package/src/conversational/data.ts +62 -0
- package/src/depth-estimation/about.md +38 -0
- package/src/depth-estimation/data.ts +52 -0
- package/src/document-question-answering/about.md +54 -0
- package/src/document-question-answering/data.ts +67 -0
- package/src/feature-extraction/about.md +35 -0
- package/src/feature-extraction/data.ts +57 -0
- package/src/fill-mask/about.md +51 -0
- package/src/fill-mask/data.ts +77 -0
- package/src/image-classification/about.md +48 -0
- package/src/image-classification/data.ts +88 -0
- package/src/image-segmentation/about.md +63 -0
- package/src/image-segmentation/data.ts +96 -0
- package/src/image-to-image/about.md +81 -0
- package/src/image-to-image/data.ts +97 -0
- package/src/image-to-text/about.md +58 -0
- package/src/image-to-text/data.ts +87 -0
- package/src/index.ts +2 -0
- package/src/object-detection/about.md +36 -0
- package/src/object-detection/data.ts +73 -0
- package/src/placeholder/about.md +15 -0
- package/src/placeholder/data.ts +18 -0
- package/src/question-answering/about.md +56 -0
- package/src/question-answering/data.ts +69 -0
- package/src/reinforcement-learning/about.md +176 -0
- package/src/reinforcement-learning/data.ts +78 -0
- package/src/sentence-similarity/about.md +97 -0
- package/src/sentence-similarity/data.ts +100 -0
- package/src/summarization/about.md +57 -0
- package/src/summarization/data.ts +72 -0
- package/src/table-question-answering/about.md +43 -0
- package/src/table-question-answering/data.ts +63 -0
- package/src/tabular-classification/about.md +67 -0
- package/src/tabular-classification/data.ts +69 -0
- package/src/tabular-regression/about.md +91 -0
- package/src/tabular-regression/data.ts +58 -0
- package/src/tasksData.ts +104 -0
- package/src/text-classification/about.md +171 -0
- package/src/text-classification/data.ts +90 -0
- package/src/text-generation/about.md +128 -0
- package/src/text-generation/data.ts +124 -0
- package/src/text-to-image/about.md +65 -0
- package/src/text-to-image/data.ts +88 -0
- package/src/text-to-speech/about.md +63 -0
- package/src/text-to-speech/data.ts +70 -0
- package/src/text-to-video/about.md +36 -0
- package/src/text-to-video/data.ts +97 -0
- package/src/token-classification/about.md +78 -0
- package/src/token-classification/data.ts +83 -0
- package/src/translation/about.md +65 -0
- package/src/translation/data.ts +68 -0
- package/src/unconditional-image-generation/about.md +45 -0
- package/src/unconditional-image-generation/data.ts +66 -0
- package/src/video-classification/about.md +53 -0
- package/src/video-classification/data.ts +84 -0
- package/src/visual-question-answering/about.md +43 -0
- package/src/visual-question-answering/data.ts +90 -0
- package/src/zero-shot-classification/about.md +39 -0
- package/src/zero-shot-classification/data.ts +66 -0
- package/src/zero-shot-image-classification/about.md +68 -0
- package/src/zero-shot-image-classification/data.ts +79 -0

@@ -0,0 +1,63 @@
## Use Cases

Text-to-Speech (TTS) models can be used in any speech-enabled application that requires converting text to speech that imitates a human voice.

### Voice Assistants

TTS models are used to create voice assistants on smart devices. Compared to concatenative methods, where an assistant is assembled from pre-recorded sound fragments, TTS models produce outputs that preserve elements of natural speech such as emphasis.

### Announcement Systems

TTS models are widely used in airport and public transportation announcement systems to convert announcement text into speech.

## Inference API

The Hub contains over [1500 TTS models](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=downloads) that you can use right away by trying out the widgets directly in the browser or calling the models as a service using the Inference API. Here is a simple code snippet to get you started:

```python
import requests

headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/microsoft/speecht5_tts"

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response

output = query({"text_inputs": "This is a test"})
```
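
The Inference API responds with the generated audio as raw bytes in the response body. As a minimal sketch (assuming the request above succeeded; the exact container format, e.g. FLAC or WAV, depends on the model and endpoint), you could persist it like this:

```python
# `output` is the `requests.Response` object returned by `query` above.
with open("tts_output.flac", "wb") as f:
    f.write(output.content)  # raw audio bytes returned by the Inference API
```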

You can also use libraries such as [espnet](https://huggingface.co/models?library=espnet&pipeline_tag=text-to-speech&sort=downloads) or [transformers](https://huggingface.co/models?pipeline_tag=text-to-speech&library=transformers&sort=trending) if you want to handle inference directly.

## Direct Inference

You can also use the text-to-speech pipeline in 🤗 Transformers to synthesize high-quality speech locally.

```python
from transformers import pipeline

synthesizer = pipeline("text-to-speech", "suno/bark")

synthesizer("Look I am generating speech in three lines of code!")
```
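
The pipeline returns the raw waveform together with its sampling rate, so you can write the result to an audio file. A minimal sketch using `scipy` (assuming it is installed; the exact shape of the returned array can vary by model, hence the `squeeze`):

```python
import numpy as np
import scipy.io.wavfile
from transformers import pipeline

synthesizer = pipeline("text-to-speech", "suno/bark")
speech = synthesizer("Look I am generating speech in three lines of code!")

# The pipeline output is a dict with the waveform and its sampling rate.
audio = np.squeeze(speech["audio"])  # drop a leading batch/channel axis if present
scipy.io.wavfile.write("bark_output.wav", rate=speech["sampling_rate"], data=audio)
```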

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer text-to-speech models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.textToSpeech({
  model: 'facebook/mms-tts',
  inputs: "text to generate speech from"
})
```

## Useful Resources

- [ML for Audio Study Group - Text to Speech Deep Dive](https://www.youtube.com/watch?v=aLBedWj-5CQ)
- [An introduction to SpeechT5, a multi-purpose speech recognition and synthesis model](https://huggingface.co/blog/speecht5)
- [A guide on Fine-tuning Whisper For Multilingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper)
- [Speech Synthesis, Recognition, and More With SpeechT5](https://huggingface.co/blog/speecht5)

@@ -0,0 +1,70 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "Thousands of short audio clips of a single speaker.",
      id: "lj_speech",
    },
    {
      description: "Multi-speaker English dataset.",
      id: "LibriTTS",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "I love audio models on the Hub!",
        type: "text",
      },
    ],
    outputs: [
      {
        filename: "audio.wav",
        type: "audio",
      },
    ],
  },
  metrics: [
    {
      description: "The Mel Cepstral Distortion (MCD) metric is used to calculate the quality of generated speech.",
      id: "mel cepstral distortion",
    },
  ],
  models: [
    {
      description: "A powerful TTS model.",
      id: "suno/bark",
    },
    {
      description: "A massively multi-lingual TTS model.",
      id: "facebook/mms-tts",
    },
    {
      description: "An end-to-end speech synthesis model.",
      id: "microsoft/speecht5_tts",
    },
  ],
  spaces: [
    {
      description: "An application for generating highly realistic, multilingual speech.",
      id: "suno/bark",
    },
    {
      description: "An application that contains multiple speech synthesis models for various languages and accents.",
      id: "coqui/CoquiTTS",
    },
    {
      description: "An application that synthesizes speech for various speaker types.",
      id: "Matthijs/speecht5-tts-demo",
    },
  ],
  summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
  widgetModels: ["microsoft/speecht5_tts"],
  youtubeId: "NW62DpzJ274",
};

export default taskData;

@@ -0,0 +1,36 @@
## Use Cases

### Script-based Video Generation
Text-to-video models can be used to create short-form video content from a provided text script. These models can be used to create engaging and informative marketing videos. For example, a company could use a text-to-video model to create a video that explains how their product works.

### Content format conversion
Text-to-video models can be used to generate videos from long-form text, including blog posts, articles, and text files. They can also be used to create educational videos that are more engaging and interactive, for example a video that explains a complex concept from an article.

### Voice-overs and Speech
Text-to-video models can be used to create an AI newscaster that delivers daily news, or to help a film-maker create a short film or a music video.

## Task Variants
Text-to-video models have different variants based on their inputs and outputs.

### Text-to-video Editing
One variant is text-based editing of video style and local attributes. Text-to-video editing models can make it easier to perform tasks like cropping, stabilization, color correction, resizing and audio editing consistently.

### Text-to-video Search
Text-to-video search is the task of retrieving videos that are relevant to a given text query. This can be challenging, as videos are a complex medium that can contain a lot of information. Relevance can be determined by combining semantic analysis of the text query, visual analysis of the videos (for example, the objects and actions present in them), and temporal analysis of the relationships between those objects and actions.

### Text-driven Video Prediction
Text-driven video prediction is the task of generating a video sequence from a text description, which can be anything from a simple sentence to a detailed story. The goal is to generate a video that is both visually realistic and semantically consistent with the text description.

### Video Translation
Text-to-video translation models can translate videos from one language to another, or allow querying a multilingual text-video model with non-English sentences. This can be useful for people who want to watch videos in a language they don't understand, especially when multilingual captions are available for training.

## Inference
Contribute an inference snippet for text-to-video here!
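
In the meantime, here is a minimal sketch of local inference with the 🤗 Diffusers library, using the damo-vilab/text-to-video-ms-1.7b checkpoint listed below (the API follows Diffusers' text-to-video pipeline; treat it as an illustrative example rather than a canonical snippet, since output types can change between library versions):

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

# Load the text-to-video checkpoint in half precision and move it to the GPU.
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to("cuda")

# Generate a short clip and export the frames to a video file.
video_frames = pipe("Darth Vader is surfing on the waves.", num_inference_steps=25).frames
video_path = export_to_video(video_frames)
print(video_path)
```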

## Useful Resources

In this area, you can insert useful resources about how to train or use a model for this task.

- [Text-to-Video: The Task, Challenges and the Current State](https://huggingface.co/blog/text-to-video)

@@ -0,0 +1,97 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "Microsoft Research Video to Text is a large-scale dataset for open domain video captioning.",
      id: "iejMac/CLIP-MSR-VTT",
    },
    {
      description: "UCF101 Human Actions dataset consists of 13,320 video clips from YouTube, with 101 classes.",
      id: "quchenyuan/UCF101-ZIP",
    },
    {
      description: "A high-quality dataset for human action recognition in YouTube videos.",
      id: "nateraw/kinetics",
    },
    {
      description: "A dataset of video clips of humans performing pre-defined basic actions with everyday objects.",
      id: "HuggingFaceM4/something_something_v2",
    },
    {
      description: "This dataset consists of text-video pairs and contains noisy samples with irrelevant video descriptions.",
      id: "HuggingFaceM4/webvid",
    },
    {
      description: "A dataset of short Flickr videos for the temporal localization of events with descriptions.",
      id: "iejMac/CLIP-DiDeMo",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "Darth Vader is surfing on the waves.",
        type: "text",
      },
    ],
    outputs: [
      {
        filename: "text-to-video-output.gif",
        type: "img",
      },
    ],
  },
  metrics: [
    {
      description: "Inception Score uses an image classification model that predicts class labels and evaluates how distinct and diverse the images are. A higher score indicates better video generation.",
      id: "is",
    },
    {
      description: "Fréchet Inception Distance uses an image classification model to obtain image embeddings. The metric compares the mean and standard deviation of the embeddings of real and generated images. A smaller score indicates better video generation.",
      id: "fid",
    },
    {
      description: "Fréchet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.",
      id: "fvd",
    },
    {
      description: "CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.",
      id: "clipsim",
    },
  ],
  models: [
    {
      description: "A strong model for video generation.",
      id: "PAIR/text2video-zero-controlnet-canny-arcane",
    },
    {
      description: "A robust model for text-to-video generation.",
      id: "damo-vilab/text-to-video-ms-1.7b",
    },
    {
      description: "A text-to-video generation model with high quality and smooth outputs.",
      id: "cerspense/zeroscope_v2_576w",
    },
  ],
  spaces: [
    {
      description: "An application that generates video from text.",
      id: "fffiloni/zeroscope",
    },
    {
      description: "An application that generates video from image and text.",
      id: "TempoFunk/makeavid-sd-jax",
    },
    {
      description: "An application that generates videos from text and provides multi-model support.",
      id: "ArtGAN/Video-Diffusion-WebUI",
    },
  ],
  summary: "Text-to-video models can be used in any application that requires generating a consistent sequence of images from text.",
  widgetModels: [],
  youtubeId: undefined,
};

export default taskData;

@@ -0,0 +1,78 @@
## Use Cases

### Information Extraction from Invoices

You can extract entities of interest from invoices automatically using Named Entity Recognition (NER) models. Invoices can be read with Optical Character Recognition models and the output can be used to do inference with NER models. In this way, important information such as date, company name, and other named entities can be extracted.

## Task Variants

### Named Entity Recognition (NER)

NER is the task of recognizing named entities in a text. These entities can be the names of people, locations, or organizations. The task is formulated as labeling each token with a class for each named entity and a class named "O" for tokens that do not contain any entity. The input for this task is text and the output is the annotated text with named entities.

#### Inference

You can use the 🤗 Transformers library `ner` pipeline to infer with NER models.

```python
from transformers import pipeline

classifier = pipeline("ner")
classifier("Hello I'm Omar and I live in Zürich.")
```
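
The raw output of the `ner` pipeline is one prediction per (sub-word) token. If you prefer whole entity spans, the same pipeline accepts an `aggregation_strategy` argument; a minimal sketch (the example output is indicative, not exact):

```python
from transformers import pipeline

# Group sub-word tokens into whole entity spans instead of returning per-token labels.
classifier = pipeline("ner", aggregation_strategy="simple")
classifier("Hello I'm Omar and I live in Zürich.")
# e.g. [{'entity_group': 'PER', 'word': 'Omar', ...}, {'entity_group': 'LOC', 'word': 'Zürich', ...}]
```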

### Part-of-Speech (PoS) Tagging
In PoS tagging, the model recognizes parts of speech, such as nouns, pronouns, adjectives, or verbs, in a given text. The task is formulated as labeling each word with its part of speech.

#### Inference

You can use the 🤗 Transformers library `token-classification` pipeline with a PoS tagging model of your choice. The model will return JSON with a PoS tag for each token.

```python
from transformers import pipeline

classifier = pipeline("token-classification", model="vblagoje/bert-english-uncased-finetuned-pos")
classifier("Hello I'm Omar and I live in Zürich.")
```

This is not limited to transformers! You can also use other libraries such as Stanza, spaCy, and Flair to do inference. Here is an example using a canonical [spaCy](https://hf.co/blog/spacy) model.

```python
!pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("I'm Omar and I live in Zürich.")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.ent_type_)

## I PRON nsubj
## 'm AUX ROOT
## Omar PROPN attr PERSON
## ...
```

## Useful Resources

Would you like to learn more about token classification? Great! Here you can find some curated resources that you may find helpful!

- [Course Chapter on Token Classification](https://huggingface.co/course/chapter7/2?fw=pt)
- [Blog post: Welcome spaCy to the Hugging Face Hub](https://huggingface.co/blog/spacy)

### Notebooks

- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb)

### Scripts for training

- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification)
- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification)
- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification)

### Documentation

- [Token classification task guide](https://huggingface.co/docs/transformers/tasks/token_classification)

@@ -0,0 +1,83 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "A widely used dataset useful to benchmark named entity recognition models.",
      id: "conll2003",
    },
    {
      description: "A multilingual dataset of Wikipedia articles annotated for named entity recognition in over 150 different languages.",
      id: "wikiann",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "My name is Omar and I live in Zürich.",
        type: "text",
      },
    ],
    outputs: [
      {
        text: "My name is Omar and I live in Zürich.",
        tokens: [
          {
            type: "PERSON",
            start: 11,
            end: 15,
          },
          {
            type: "GPE",
            start: 30,
            end: 36,
          },
        ],
        type: "text-with-tokens",
      },
    ],
  },
  metrics: [
    {
      description: "",
      id: "accuracy",
    },
    {
      description: "",
      id: "recall",
    },
    {
      description: "",
      id: "precision",
    },
    {
      description: "",
      id: "f1",
    },
  ],
  models: [
    {
      description: "A robust model that identifies people, locations, organizations and names of miscellaneous entities.",
      id: "dslim/bert-base-NER",
    },
    {
      description: "Flair models are typically the state of the art in named entity recognition tasks.",
      id: "flair/ner-english",
    },
  ],
  spaces: [
    {
      description: "An application that can recognize entities, extract noun chunks and recognize various linguistic features of each token.",
      id: "spacy/gradio_pipeline_visualizer",
    },
  ],
  summary: "Token classification is a natural language understanding task in which a label is assigned to some tokens in a text. Some popular token classification subtasks are Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. NER models could be trained to identify specific entities in a text, such as dates, individuals and places; and PoS tagging would identify, for example, which words in a text are verbs, nouns, and punctuation marks.",
  widgetModels: ["dslim/bert-base-NER"],
  youtubeId: "wVHdVlPScxA",
};

export default taskData;

@@ -0,0 +1,65 @@
## Use Cases

You can find over a thousand Translation models on the Hub, but sometimes you might not find a model for the language pair you are interested in. When this happens, you can use a pretrained multilingual Translation model like [mBART](https://huggingface.co/facebook/mbart-large-cc25) and further train it on your own data in a process called fine-tuning.
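
Before fine-tuning, it can be worth checking whether an existing multilingual checkpoint already covers your language pair. As a minimal sketch, the related many-to-many checkpoint facebook/mbart-large-50-many-to-many-mmt (used here purely for illustration) can translate between its supported languages by setting the source language and forcing the target language token:

```python
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

tokenizer.src_lang = "en_XX"  # source language code
encoded = tokenizer("How old are you?", return_tensors="pt")
generated = model.generate(
    **encoded,
    forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"],  # target language code
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```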

### Multilingual conversational agents

Translation models can be used to build conversational agents across different languages. This can be done in two ways.

- **Translate the dataset to a new language.** You can translate a dataset of intents (inputs) and responses to the target language. You can then train a new intent classification model with this new dataset. This allows you to proofread responses in the target language and have better control of the chatbot's outputs.

- **Translate the input and output of the agent.** You can use a Translation model to translate user inputs so that the chatbot can process them, and then translate the chatbot's output back into the language of the user. This approach might be less reliable, as the chatbot will generate responses that were not defined beforehand.

## Inference

You can use the 🤗 Transformers library with the `translation_xx_to_yy` pattern, where `xx` is the source language code and `yy` is the target language code. The default model for the pipeline is [t5-base](https://huggingface.co/t5-base), which under the hood prepends a task prefix indicating the task itself, e.g. "translate English to French:".

```python
from transformers import pipeline
en_fr_translator = pipeline("translation_en_to_fr")
en_fr_translator("How old are you?")
## [{'translation_text': ' quel âge êtes-vous?'}]
```

If you’d like to use a model checkpoint trained for one specific language pair, you can also directly use the `translation` pipeline.

```python
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("How are you?")
# [{'translation_text': 'Comment allez-vous ?'}]
```

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer translation models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.translation({
  model: 't5-base',
  inputs: 'My name is Wolfgang and I live in Berlin'
})
```

## Useful Resources

Would you like to learn more about Translation? Great! Here you can find some curated resources that you may find helpful!

- [Course Chapter on Translation](https://huggingface.co/course/chapter7/4?fw=pt)

### Notebooks

- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb)
- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/translation-tf.ipynb)

### Scripts for training

- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation)
- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation)

### Documentation

- [Translation task guide](https://huggingface.co/docs/transformers/tasks/translation)

@@ -0,0 +1,68 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "A dataset of copyright-free books translated into 16 different languages.",
      id: "opus_books",
    },
    {
      description: "An example of translation between programming languages. This dataset consists of functions in Java and C#.",
      id: "code_x_glue_cc_code_to_code_trans",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "My name is Omar and I live in Zürich.",
        type: "text",
      },
    ],
    outputs: [
      {
        label: "Output",
        content: "Mein Name ist Omar und ich wohne in Zürich.",
        type: "text",
      },
    ],
  },
  metrics: [
    {
      description: "BLEU score is calculated by counting the number of shared single or subsequent tokens between the generated sequence and the reference. Subsequent n tokens are called “n-grams”. Unigram refers to a single token while bi-gram refers to token pairs and n-grams refer to n subsequent tokens. The score ranges from 0 to 1, where 1 means the translation perfectly matches the reference and 0 means it does not match at all.",
      id: "bleu",
    },
    {
      description: "",
      id: "sacrebleu",
    },
  ],
  models: [
    {
      description: "A model that translates from English to French.",
      id: "Helsinki-NLP/opus-mt-en-fr",
    },
    {
      description: "A general-purpose Transformer that can be used to translate from English to German, French, or Romanian.",
      id: "t5-base",
    },
  ],
  spaces: [
    {
      description: "An application that can translate between 100 languages.",
      id: "Iker/Translate-100-languages",
    },
    {
      description: "An application that can translate between English, Spanish and Hindi.",
      id: "EuroPython2022/Translate-with-Bloom",
    },
  ],
  summary: "Translation is the task of converting text from one language to another.",
  widgetModels: ["t5-small"],
  youtubeId: "1JvfrvZgi6c",
};

export default taskData;

@@ -0,0 +1,45 @@
## About the Task

Unconditional image generation is the task of generating new images without any specific input. The main goal of this is to create novel, original images that are not based on existing images.
This can be used for a variety of applications, such as creating new artistic images, improving image recognition algorithms, or generating photorealistic images for virtual reality environments.

Unconditional image generation models usually start from a *seed* that is used to generate a *random noise vector*. The model then uses this vector to create an output image similar to the images it was trained on.

An example of unconditional image generation would be generating the image of a face on a model trained with the [CelebA dataset](https://huggingface.co/datasets/huggan/CelebA-HQ) or [generating a butterfly](https://huggingface.co/spaces/huggan/butterfly-gan) on a model trained with the [Smithsonian Butterflies dataset](https://huggingface.co/datasets/ceyda/smithsonian_butterflies).

[Generative adversarial networks](https://en.wikipedia.org/wiki/Generative_adversarial_network) and [diffusion models](https://huggingface.co/docs/diffusers/index) are common architectures for this task.

## Use Cases

Unconditional image generation can be used for a variety of applications.

### Artistic Expression
Unconditional image generation can be used to create novel, original artwork that is not based on any existing images. This can be used to explore new creative possibilities and produce unique, imaginative images.

### Data Augmentation
Unconditional image generation models can be used to generate new images to improve the performance of image recognition algorithms. This makes algorithms more robust and able to handle a broader range of images.

### Virtual Reality
Unconditional image generation models can be used to create photorealistic images that can be used in virtual reality environments. This makes the VR experience more immersive and realistic.

### Medical Imaging
Unconditional image generation models can generate new medical images, such as CT or MRI scans, that can be used to train and evaluate medical imaging algorithms. This can improve the accuracy and reliability of these algorithms.

### Industrial Design
Unconditional image generation models can generate new designs for products, such as clothing or furniture, that are not based on any existing designs. This way, designers can explore new creative possibilities and produce unique, innovative designs.

## Model Hosting and Inference

This section should have useful information about Model Hosting and Inference.
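
In the meantime, here is a minimal sketch of local inference with the 🤗 Diffusers library, using the google/ddpm-celebahq-256 checkpoint listed on this page (illustrative only; the seed handling via a `torch.Generator` is an assumption that mirrors the demo's "Seed" input):

```python
import torch
from diffusers import DDPMPipeline

# Load an unconditional diffusion model trained on CelebA-HQ faces.
pipeline = DDPMPipeline.from_pretrained("google/ddpm-celebahq-256")

# Fix the random noise vector with a seed so the output is reproducible.
generator = torch.Generator().manual_seed(42)
image = pipeline(generator=generator).images[0]
image.save("generated_face.png")
```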

## Useful Resources

- [Hugging Face Diffusion Models Course](https://github.com/huggingface/diffusion-models-class)
- [Getting Started with Diffusers](https://huggingface.co/docs/diffusers/index)
- [Unconditional Image Generation Training](https://huggingface.co/docs/diffusers/training/unconditional_training)

### Training your own model in just a few seconds

In this area, you can insert useful information about training the model.

This page was made possible thanks to the efforts of [Someet Sahoo](https://huggingface.co/Someet24) and [Juan Carlos Piñeros](https://huggingface.co/juancopi81).

@@ -0,0 +1,66 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images per class.",
      id: "cifar100",
    },
    {
      description: "Multiple images of celebrities, used for facial expression translation.",
      id: "CelebA",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Seed",
        content: "42",
        type: "text",
      },
      {
        label: "Number of images to generate:",
        content: "4",
        type: "text",
      },
    ],
    outputs: [
      {
        filename: "unconditional-image-generation-output.jpeg",
        type: "img",
      },
    ],
  },
  metrics: [
    {
      description: "The inception score (IS) evaluates the quality of generated images. It measures the diversity of the generated images (the model predictions are evenly distributed across all possible labels) and their 'distinction' or 'sharpness' (the model confidently predicts a single label for each image).",
      id: "Inception score (IS)",
    },
    {
      description: "The Fréchet Inception Distance (FID) evaluates the quality of images created by a generative model by calculating the distance between feature vectors for real and generated images.",
      id: "Fréchet Inception Distance (FID)",
    },
  ],
  models: [
    {
      description: "High-quality image generation model trained on the CIFAR-10 dataset. It synthesizes images of the ten classes presented in the dataset using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
      id: "google/ddpm-cifar10-32",
    },
    {
      description: "High-quality image generation model trained on the 256x256 CelebA-HQ dataset. It synthesizes images of faces using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
      id: "google/ddpm-celebahq-256",
    },
  ],
  spaces: [
    {
      description: "An application that can generate realistic faces.",
      id: "CompVis/celeba-latent-diffusion",
    },
  ],
  summary: "Unconditional image generation is the task of generating images with no condition in any context (like a prompt text or another image). Once trained, the model will create images that resemble its training data distribution.",
  widgetModels: [""],
  // TODO: Add related video
  youtubeId: "",
};

export default taskData;