@huggingface/tasks 0.13.1-test → 0.13.1-test2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/src/dataset-libraries.ts +89 -0
- package/src/default-widget-inputs.ts +718 -0
- package/src/gguf.ts +40 -0
- package/src/hardware.ts +482 -0
- package/src/index.ts +59 -0
- package/src/library-to-tasks.ts +76 -0
- package/src/local-apps.ts +412 -0
- package/src/model-data.ts +149 -0
- package/src/model-libraries-downloads.ts +18 -0
- package/src/model-libraries-snippets.ts +1128 -0
- package/src/model-libraries.ts +820 -0
- package/src/pipelines.ts +698 -0
- package/src/snippets/common.ts +39 -0
- package/src/snippets/curl.spec.ts +94 -0
- package/src/snippets/curl.ts +120 -0
- package/src/snippets/index.ts +7 -0
- package/src/snippets/inputs.ts +167 -0
- package/src/snippets/js.spec.ts +148 -0
- package/src/snippets/js.ts +305 -0
- package/src/snippets/python.spec.ts +144 -0
- package/src/snippets/python.ts +321 -0
- package/src/snippets/types.ts +16 -0
- package/src/tasks/audio-classification/about.md +86 -0
- package/src/tasks/audio-classification/data.ts +81 -0
- package/src/tasks/audio-classification/inference.ts +52 -0
- package/src/tasks/audio-classification/spec/input.json +35 -0
- package/src/tasks/audio-classification/spec/output.json +11 -0
- package/src/tasks/audio-to-audio/about.md +56 -0
- package/src/tasks/audio-to-audio/data.ts +70 -0
- package/src/tasks/automatic-speech-recognition/about.md +90 -0
- package/src/tasks/automatic-speech-recognition/data.ts +82 -0
- package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/chat-completion/inference.ts +322 -0
- package/src/tasks/chat-completion/spec/input.json +350 -0
- package/src/tasks/chat-completion/spec/output.json +206 -0
- package/src/tasks/chat-completion/spec/stream_output.json +213 -0
- package/src/tasks/common-definitions.json +100 -0
- package/src/tasks/depth-estimation/about.md +45 -0
- package/src/tasks/depth-estimation/data.ts +70 -0
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/about.md +53 -0
- package/src/tasks/document-question-answering/data.ts +85 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/about.md +72 -0
- package/src/tasks/feature-extraction/data.ts +57 -0
- package/src/tasks/feature-extraction/inference.ts +40 -0
- package/src/tasks/feature-extraction/spec/input.json +47 -0
- package/src/tasks/feature-extraction/spec/output.json +15 -0
- package/src/tasks/fill-mask/about.md +51 -0
- package/src/tasks/fill-mask/data.ts +79 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/about.md +50 -0
- package/src/tasks/image-classification/data.ts +88 -0
- package/src/tasks/image-classification/inference.ts +52 -0
- package/src/tasks/image-classification/spec/input.json +35 -0
- package/src/tasks/image-classification/spec/output.json +11 -0
- package/src/tasks/image-feature-extraction/about.md +23 -0
- package/src/tasks/image-feature-extraction/data.ts +59 -0
- package/src/tasks/image-segmentation/about.md +63 -0
- package/src/tasks/image-segmentation/data.ts +99 -0
- package/src/tasks/image-segmentation/inference.ts +69 -0
- package/src/tasks/image-segmentation/spec/input.json +45 -0
- package/src/tasks/image-segmentation/spec/output.json +26 -0
- package/src/tasks/image-text-to-text/about.md +76 -0
- package/src/tasks/image-text-to-text/data.ts +102 -0
- package/src/tasks/image-to-3d/about.md +62 -0
- package/src/tasks/image-to-3d/data.ts +75 -0
- package/src/tasks/image-to-image/about.md +129 -0
- package/src/tasks/image-to-image/data.ts +101 -0
- package/src/tasks/image-to-image/inference.ts +68 -0
- package/src/tasks/image-to-image/spec/input.json +55 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/about.md +61 -0
- package/src/tasks/image-to-text/data.ts +82 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +312 -0
- package/src/tasks/keypoint-detection/about.md +57 -0
- package/src/tasks/keypoint-detection/data.ts +50 -0
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +55 -0
- package/src/tasks/object-detection/about.md +37 -0
- package/src/tasks/object-detection/data.ts +86 -0
- package/src/tasks/object-detection/inference.ts +75 -0
- package/src/tasks/object-detection/spec/input.json +31 -0
- package/src/tasks/object-detection/spec/output.json +50 -0
- package/src/tasks/placeholder/about.md +15 -0
- package/src/tasks/placeholder/data.ts +21 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/about.md +56 -0
- package/src/tasks/question-answering/data.ts +75 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/reinforcement-learning/about.md +167 -0
- package/src/tasks/reinforcement-learning/data.ts +75 -0
- package/src/tasks/sentence-similarity/about.md +97 -0
- package/src/tasks/sentence-similarity/data.ts +101 -0
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/about.md +58 -0
- package/src/tasks/summarization/data.ts +76 -0
- package/src/tasks/summarization/inference.ts +57 -0
- package/src/tasks/summarization/spec/input.json +42 -0
- package/src/tasks/summarization/spec/output.json +14 -0
- package/src/tasks/table-question-answering/about.md +43 -0
- package/src/tasks/table-question-answering/data.ts +59 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +65 -0
- package/src/tasks/tabular-classification/data.ts +68 -0
- package/src/tasks/tabular-regression/about.md +87 -0
- package/src/tasks/tabular-regression/data.ts +57 -0
- package/src/tasks/text-classification/about.md +173 -0
- package/src/tasks/text-classification/data.ts +103 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +11 -0
- package/src/tasks/text-generation/about.md +154 -0
- package/src/tasks/text-generation/data.ts +114 -0
- package/src/tasks/text-generation/inference.ts +200 -0
- package/src/tasks/text-generation/spec/input.json +219 -0
- package/src/tasks/text-generation/spec/output.json +179 -0
- package/src/tasks/text-generation/spec/stream_output.json +103 -0
- package/src/tasks/text-to-3d/about.md +62 -0
- package/src/tasks/text-to-3d/data.ts +56 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +96 -0
- package/src/tasks/text-to-image/data.ts +100 -0
- package/src/tasks/text-to-image/inference.ts +75 -0
- package/src/tasks/text-to-image/spec/input.json +63 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +63 -0
- package/src/tasks/text-to-speech/data.ts +79 -0
- package/src/tasks/text-to-speech/inference.ts +145 -0
- package/src/tasks/text-to-speech/spec/input.json +31 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text-to-video/about.md +41 -0
- package/src/tasks/text-to-video/data.ts +102 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/about.md +76 -0
- package/src/tasks/token-classification/data.ts +92 -0
- package/src/tasks/token-classification/inference.ts +85 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +37 -0
- package/src/tasks/translation/about.md +65 -0
- package/src/tasks/translation/data.ts +70 -0
- package/src/tasks/translation/inference.ts +67 -0
- package/src/tasks/translation/spec/input.json +50 -0
- package/src/tasks/translation/spec/output.json +14 -0
- package/src/tasks/unconditional-image-generation/about.md +50 -0
- package/src/tasks/unconditional-image-generation/data.ts +72 -0
- package/src/tasks/video-classification/about.md +37 -0
- package/src/tasks/video-classification/data.ts +84 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/video-text-to-text/about.md +98 -0
- package/src/tasks/video-text-to-text/data.ts +66 -0
- package/src/tasks/visual-question-answering/about.md +48 -0
- package/src/tasks/visual-question-answering/data.ts +97 -0
- package/src/tasks/visual-question-answering/inference.ts +62 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/about.md +40 -0
- package/src/tasks/zero-shot-classification/data.ts +70 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +11 -0
- package/src/tasks/zero-shot-image-classification/about.md +75 -0
- package/src/tasks/zero-shot-image-classification/data.ts +84 -0
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +45 -0
- package/src/tasks/zero-shot-object-detection/data.ts +67 -0
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/src/tokenizer-data.ts +32 -0
- package/src/widget-example.ts +125 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
## About the Task
|
|
2
|
+
|
|
3
|
+
Zero Shot Classification is the task of predicting a class that wasn't seen by the model during training. This method, which leverages a pre-trained language model, can be thought of as an instance of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg) which generally refers to using a model trained for one task in a different application than what it was originally trained for. This is particularly useful for situations where the amount of labeled data is small.
|
|
4
|
+
|
|
5
|
+
In zero shot classification, we provide the model with a prompt and a sequence of text that describes what we want our model to do, in natural language. Zero-shot classification excludes any examples of the desired task being completed. This differs from single or few-shot classification, as these tasks include a single or a few examples of the selected task.
|
|
6
|
+
|
|
7
|
+
Zero, single and few-shot classification seem to be an emergent feature of large language models. This feature seems to come about around model sizes of 100M+ parameters. The effectiveness of a model at a zero, single or few-shot task seems to scale with model size, meaning that larger models (models with more trainable parameters or layers) generally do better at this task.
|
|
8
|
+
|
|
9
|
+
Here is an example of a zero-shot prompt for classifying the sentiment of a sequence of text:
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Classify the following input text into one of the following three categories: [positive, negative, neutral]
|
|
13
|
+
|
|
14
|
+
Input Text: Hugging Face is awesome for making all of these
|
|
15
|
+
state of the art models available!
|
|
16
|
+
Sentiment: positive
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
One great example of this task with a nice off-the-shelf model is available at the widget of this page, where the user can input a sequence of text and candidate labels to the model. This is a _word level_ example of zero shot classification; more elaborate and lengthy generations are available with larger models. Testing these models out and getting a feel for prompt engineering is the best way to learn how to use them.
|
|
21
|
+
|
|
22
|
+
## Inference
|
|
23
|
+
|
|
24
|
+
You can use the 🤗 Transformers library zero-shot-classification pipeline to infer with zero shot text classification models.
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from transformers import pipeline
|
|
28
|
+
|
|
29
|
+
pipe = pipeline(model="facebook/bart-large-mnli")
|
|
30
|
+
pipe("I have a problem with my iphone that needs to be resolved asap!!",
|
|
31
|
+
candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
|
|
32
|
+
)
|
|
33
|
+
# output
|
|
34
|
+
>>> {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Useful Resources
|
|
38
|
+
|
|
39
|
+
- [Zero Shot Learning](https://joeddav.github.io/blog/2020/05/29/ZSL.html)
|
|
40
|
+
- [Hugging Face on Transfer Learning](https://huggingface.co/course/en/chapter1/4?fw=pt#transfer-learning)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description: "A widely used dataset used to benchmark multiple variants of text classification.",
|
|
7
|
+
id: "nyu-mll/glue",
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
description:
|
|
11
|
+
"The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
|
|
12
|
+
id: "nyu-mll/multi_nli",
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
description:
|
|
16
|
+
"FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
|
|
17
|
+
id: "fever/fever",
|
|
18
|
+
},
|
|
19
|
+
],
|
|
20
|
+
demo: {
|
|
21
|
+
inputs: [
|
|
22
|
+
{
|
|
23
|
+
label: "Text Input",
|
|
24
|
+
content: "Dune is the best movie ever.",
|
|
25
|
+
type: "text",
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
label: "Candidate Labels",
|
|
29
|
+
content: "CINEMA, ART, MUSIC",
|
|
30
|
+
type: "text",
|
|
31
|
+
},
|
|
32
|
+
],
|
|
33
|
+
outputs: [
|
|
34
|
+
{
|
|
35
|
+
type: "chart",
|
|
36
|
+
data: [
|
|
37
|
+
{
|
|
38
|
+
label: "CINEMA",
|
|
39
|
+
score: 0.9,
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
label: "ART",
|
|
43
|
+
score: 0.1,
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
label: "MUSIC",
|
|
47
|
+
score: 0.0,
|
|
48
|
+
},
|
|
49
|
+
],
|
|
50
|
+
},
|
|
51
|
+
],
|
|
52
|
+
},
|
|
53
|
+
metrics: [],
|
|
54
|
+
models: [
|
|
55
|
+
{
|
|
56
|
+
description: "Powerful zero-shot text classification model.",
|
|
57
|
+
id: "facebook/bart-large-mnli",
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
description: "Powerful zero-shot multilingual text classification model that can accomplish multiple tasks.",
|
|
61
|
+
id: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
|
|
62
|
+
},
|
|
63
|
+
],
|
|
64
|
+
spaces: [],
|
|
65
|
+
summary:
|
|
66
|
+
"Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
|
|
67
|
+
widgetModels: ["facebook/bart-large-mnli"],
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
export default taskData;
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Zero Shot Classification inference
|
|
8
|
+
*/
|
|
9
|
+
export interface ZeroShotClassificationInput {
|
|
10
|
+
/**
|
|
11
|
+
* The input text data, with candidate labels
|
|
12
|
+
*/
|
|
13
|
+
inputs: ZeroShotClassificationInputData;
|
|
14
|
+
/**
|
|
15
|
+
* Additional inference parameters
|
|
16
|
+
*/
|
|
17
|
+
parameters?: ZeroShotClassificationParameters;
|
|
18
|
+
[property: string]: unknown;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* The input text data, with candidate labels
|
|
22
|
+
*/
|
|
23
|
+
export interface ZeroShotClassificationInputData {
|
|
24
|
+
/**
|
|
25
|
+
* The set of possible class labels to classify the text into.
|
|
26
|
+
*/
|
|
27
|
+
candidateLabels: string[];
|
|
28
|
+
/**
|
|
29
|
+
* The text to classify
|
|
30
|
+
*/
|
|
31
|
+
text: string;
|
|
32
|
+
[property: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Additional inference parameters
|
|
36
|
+
*
|
|
37
|
+
* Additional inference parameters for Zero Shot Classification
|
|
38
|
+
*/
|
|
39
|
+
export interface ZeroShotClassificationParameters {
|
|
40
|
+
/**
|
|
41
|
+
* The sentence used in conjunction with candidateLabels to attempt the text classification
|
|
42
|
+
* by replacing the placeholder with the candidate labels.
|
|
43
|
+
*/
|
|
44
|
+
hypothesis_template?: string;
|
|
45
|
+
/**
|
|
46
|
+
* Whether multiple candidate labels can be true. If false, the scores are normalized such
|
|
47
|
+
* that the sum of the label likelihoods for each sequence is 1. If true, the labels are
|
|
48
|
+
* considered independent and probabilities are normalized for each candidate.
|
|
49
|
+
*/
|
|
50
|
+
multi_label?: boolean;
|
|
51
|
+
[property: string]: unknown;
|
|
52
|
+
}
|
|
53
|
+
export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
|
|
54
|
+
/**
|
|
55
|
+
* Outputs of inference for the Zero Shot Classification task
|
|
56
|
+
*/
|
|
57
|
+
export interface ZeroShotClassificationOutputElement {
|
|
58
|
+
/**
|
|
59
|
+
* The predicted class label.
|
|
60
|
+
*/
|
|
61
|
+
label: string;
|
|
62
|
+
/**
|
|
63
|
+
* The corresponding probability.
|
|
64
|
+
*/
|
|
65
|
+
score: number;
|
|
66
|
+
[property: string]: unknown;
|
|
67
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/zero-shot-classification/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Zero Shot Classification inference",
|
|
5
|
+
"title": "ZeroShotClassificationInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input text data, with candidate labels",
|
|
10
|
+
"type": "object",
|
|
11
|
+
"title": "ZeroShotClassificationInputData",
|
|
12
|
+
"properties": {
|
|
13
|
+
"text": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"description": "The text to classify"
|
|
16
|
+
},
|
|
17
|
+
"candidateLabels": {
|
|
18
|
+
"type": "array",
|
|
19
|
+
"description": "The set of possible class labels to classify the text into.",
|
|
20
|
+
"items": {
|
|
21
|
+
"type": "string"
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"required": ["text", "candidateLabels"]
|
|
26
|
+
},
|
|
27
|
+
"parameters": {
|
|
28
|
+
"description": "Additional inference parameters",
|
|
29
|
+
"$ref": "#/$defs/ZeroShotClassificationParameters"
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"$defs": {
|
|
33
|
+
"ZeroShotClassificationParameters": {
|
|
34
|
+
"title": "ZeroShotClassificationParameters",
|
|
35
|
+
"description": "Additional inference parameters for Zero Shot Classification",
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"hypothesis_template": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
|
|
41
|
+
},
|
|
42
|
+
"multi_label": {
|
|
43
|
+
"type": "boolean",
|
|
44
|
+
"description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"required": ["inputs"]
|
|
50
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/zero-shot-classification/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Zero Shot Classification task",
|
|
5
|
+
"title": "ZeroShotClassificationOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"type": "object",
|
|
9
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
|
|
10
|
+
}
|
|
11
|
+
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
## About the Task
|
|
2
|
+
|
|
3
|
+
Zero-shot image classification is a computer vision task to classify images into one of several classes, without any prior training or knowledge of the classes.
|
|
4
|
+
|
|
5
|
+
Zero shot image classification works by transferring knowledge learnt during training of one model to classify novel classes that were not present in the training data. So this is a variation of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg). For instance, a model trained to differentiate cars from airplanes can be used to classify images of ships.
|
|
6
|
+
|
|
7
|
+
The data in this learning paradigm consists of:
|
|
8
|
+
|
|
9
|
+
- Seen data - images and their corresponding labels
|
|
10
|
+
- Unseen data - only labels and no images
|
|
11
|
+
- Auxiliary information - additional information given to the model during training connecting the unseen and seen data. This can be in the form of textual description or word embeddings.
|
|
12
|
+
|
|
13
|
+
## Use Cases
|
|
14
|
+
|
|
15
|
+
### Image Retrieval
|
|
16
|
+
|
|
17
|
+
Zero-shot learning resolves several challenges in image retrieval systems. For example, with the rapid growth of categories on the web, it is challenging to index images based on unseen categories. With zero-shot learning we can associate unseen categories to images by exploiting attributes to model the relationships among visual features and labels.
|
|
18
|
+
|
|
19
|
+
### Action Recognition
|
|
20
|
+
|
|
21
|
+
Action recognition is the task of identifying when a person in an image/video is performing a given action from a set of actions. If all the possible actions are not known beforehand, conventional deep learning models fail. With zero-shot learning, for a given domain of a set of actions, we can create a mapping connecting low-level features and a semantic description of auxiliary data to classify unknown classes of actions.
|
|
22
|
+
|
|
23
|
+
## Task Variants
|
|
24
|
+
|
|
25
|
+
You can contribute variants of this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md).
|
|
26
|
+
|
|
27
|
+
## Inference
|
|
28
|
+
|
|
29
|
+
The model can be loaded with the zero-shot-image-classification pipeline like so:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from transformers import pipeline
|
|
33
|
+
# More models in the model hub.
|
|
34
|
+
model_name = "openai/clip-vit-large-patch14-336"
|
|
35
|
+
classifier = pipeline("zero-shot-image-classification", model = model_name)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
You can then use this pipeline to classify images into any of the class names you specify. You can specify more than two class labels too.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
image_to_classify = "path_to_cat_and_dog_image.jpeg"
|
|
42
|
+
labels_for_classification = ["cat and dog",
|
|
43
|
+
"lion and cheetah",
|
|
44
|
+
"rabbit and lion"]
|
|
45
|
+
scores = classifier(image_to_classify,
|
|
46
|
+
candidate_labels = labels_for_classification)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The classifier would return a list of dictionaries after the inference which is stored in the variable `scores` in the code snippet above. Variable `scores` would look as follows:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
[{'score': 0.9950482249259949, 'label': 'cat and dog'},
|
|
53
|
+
{'score': 0.004863627254962921, 'label': 'rabbit and lion'},
|
|
54
|
+
{'score': 8.816882473183796e-05, 'label': 'lion and cheetah'}]
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
The dictionary at the zeroth index of the list will contain the label with the highest score.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
print(f"The highest score is {scores[0]['score']:.3f} for the label {scores[0]['label']}")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The output from the print statement above would look as follows:
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
The highest score is 0.995 for the label cat and dog
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Useful Resources
|
|
70
|
+
|
|
71
|
+
- [Zero-shot image classification task guide](https://huggingface.co/docs/transformers/tasks/zero_shot_image_classification).
|
|
72
|
+
- [Image-text Similarity Search](https://huggingface.co/learn/cookbook/faiss_with_hf_datasets_and_clip)
|
|
73
|
+
|
|
74
|
+
This page was made possible thanks to the efforts of [Shamima Hossain](https://huggingface.co/Shamima), [Haider Zaidi
|
|
75
|
+
](https://huggingface.co/chefhaider) and [Paarth Bhatnagar](https://huggingface.co/Paarth).
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
// TODO write proper description
|
|
7
|
+
description: "",
|
|
8
|
+
id: "",
|
|
9
|
+
},
|
|
10
|
+
],
|
|
11
|
+
demo: {
|
|
12
|
+
inputs: [
|
|
13
|
+
{
|
|
14
|
+
filename: "image-classification-input.jpeg",
|
|
15
|
+
type: "img",
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
label: "Classes",
|
|
19
|
+
content: "cat, dog, bird",
|
|
20
|
+
type: "text",
|
|
21
|
+
},
|
|
22
|
+
],
|
|
23
|
+
outputs: [
|
|
24
|
+
{
|
|
25
|
+
type: "chart",
|
|
26
|
+
data: [
|
|
27
|
+
{
|
|
28
|
+
label: "Cat",
|
|
29
|
+
score: 0.664,
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
label: "Dog",
|
|
33
|
+
score: 0.329,
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
label: "Bird",
|
|
37
|
+
score: 0.008,
|
|
38
|
+
},
|
|
39
|
+
],
|
|
40
|
+
},
|
|
41
|
+
],
|
|
42
|
+
},
|
|
43
|
+
metrics: [
|
|
44
|
+
{
|
|
45
|
+
description: "Computes the number of times the correct label appears in top K labels predicted",
|
|
46
|
+
id: "top-K accuracy",
|
|
47
|
+
},
|
|
48
|
+
],
|
|
49
|
+
models: [
|
|
50
|
+
{
|
|
51
|
+
description: "Robust image classification model trained on publicly available image-caption data.",
|
|
52
|
+
id: "openai/clip-vit-base-patch16",
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
description: "Strong zero-shot image classification model.",
|
|
56
|
+
id: "google/siglip-so400m-patch14-224",
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
description: "Small yet powerful zero-shot image classification model that can run on edge devices.",
|
|
60
|
+
id: "apple/MobileCLIP-S1-OpenCLIP",
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
description: "Strong image classification model for biomedical domain.",
|
|
64
|
+
id: "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224",
|
|
65
|
+
},
|
|
66
|
+
],
|
|
67
|
+
spaces: [
|
|
68
|
+
{
|
|
69
|
+
description:
|
|
70
|
+
"An application that leverages zero-shot image classification to find best captions to generate an image. ",
|
|
71
|
+
id: "pharma/CLIP-Interrogator",
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
description: "An application to compare different zero-shot image classification models. ",
|
|
75
|
+
id: "merve/compare_clip_siglip",
|
|
76
|
+
},
|
|
77
|
+
],
|
|
78
|
+
summary:
|
|
79
|
+
"Zero-shot image classification is the task of classifying previously unseen classes during training of a model.",
|
|
80
|
+
widgetModels: ["google/siglip-so400m-patch14-224"],
|
|
81
|
+
youtubeId: "",
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
export default taskData;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Zero Shot Image Classification inference
|
|
8
|
+
*/
|
|
9
|
+
export interface ZeroShotImageClassificationInput {
|
|
10
|
+
/**
|
|
11
|
+
* The input image data, with candidate labels
|
|
12
|
+
*/
|
|
13
|
+
inputs: ZeroShotImageClassificationInputData;
|
|
14
|
+
/**
|
|
15
|
+
* Additional inference parameters
|
|
16
|
+
*/
|
|
17
|
+
parameters?: ZeroShotImageClassificationParameters;
|
|
18
|
+
[property: string]: unknown;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* The input image data, with candidate labels
|
|
22
|
+
*/
|
|
23
|
+
export interface ZeroShotImageClassificationInputData {
|
|
24
|
+
/**
|
|
25
|
+
* The candidate labels for this image
|
|
26
|
+
*/
|
|
27
|
+
candidateLabels: string[];
|
|
28
|
+
/**
|
|
29
|
+
* The image data to classify
|
|
30
|
+
*/
|
|
31
|
+
image: unknown;
|
|
32
|
+
[property: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Additional inference parameters
|
|
36
|
+
*
|
|
37
|
+
* Additional inference parameters for Zero Shot Image Classification
|
|
38
|
+
*/
|
|
39
|
+
export interface ZeroShotImageClassificationParameters {
|
|
40
|
+
/**
|
|
41
|
+
* The sentence used in conjunction with candidateLabels to attempt the image classification
|
|
42
|
+
* by replacing the placeholder with the candidate labels.
|
|
43
|
+
*/
|
|
44
|
+
hypothesis_template?: string;
|
|
45
|
+
[property: string]: unknown;
|
|
46
|
+
}
|
|
47
|
+
export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
|
|
48
|
+
/**
|
|
49
|
+
* Outputs of inference for the Zero Shot Image Classification task
|
|
50
|
+
*/
|
|
51
|
+
export interface ZeroShotImageClassificationOutputElement {
|
|
52
|
+
/**
|
|
53
|
+
* The predicted class label.
|
|
54
|
+
*/
|
|
55
|
+
label: string;
|
|
56
|
+
/**
|
|
57
|
+
* The corresponding probability.
|
|
58
|
+
*/
|
|
59
|
+
score: number;
|
|
60
|
+
[property: string]: unknown;
|
|
61
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/zero-shot-image-classification/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Zero Shot Image Classification inference",
|
|
5
|
+
"title": "ZeroShotImageClassificationInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input image data, with candidate labels",
|
|
10
|
+
"type": "object",
|
|
11
|
+
"title": "ZeroShotImageClassificationInputData",
|
|
12
|
+
"properties": {
|
|
13
|
+
"image": {
|
|
14
|
+
"description": "The image data to classify"
|
|
15
|
+
},
|
|
16
|
+
"candidateLabels": {
|
|
17
|
+
"description": "The candidate labels for this image",
|
|
18
|
+
"type": "array",
|
|
19
|
+
"items": {
|
|
20
|
+
"type": "string"
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
},
|
|
24
|
+
"required": ["image", "candidateLabels"]
|
|
25
|
+
},
|
|
26
|
+
"parameters": {
|
|
27
|
+
"description": "Additional inference parameters",
|
|
28
|
+
"$ref": "#/$defs/ZeroShotImageClassificationParameters"
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"$defs": {
|
|
32
|
+
"ZeroShotImageClassificationParameters": {
|
|
33
|
+
"title": "ZeroShotImageClassificationParameters",
|
|
34
|
+
"description": "Additional inference parameters for Zero Shot Image Classification",
|
|
35
|
+
"type": "object",
|
|
36
|
+
"properties": {
|
|
37
|
+
"hypothesis_template": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"description": "The sentence used in conjunction with candidateLabels to attempt the image classification by replacing the placeholder with the candidate labels."
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
},
|
|
44
|
+
"required": ["inputs"]
|
|
45
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/zero-shot-image-classification/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Zero Shot Image Classification task",
|
|
5
|
+
"title": "ZeroShotImageClassificationOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
|
|
9
|
+
}
|
|
10
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
Zero-shot object detection models can be used in any object detection application where the detection involves text queries for objects of interest.
|
|
4
|
+
|
|
5
|
+
### Object Search
|
|
6
|
+
|
|
7
|
+
Zero-shot object detection models can be used in image search. Smartphones, for example, use zero-shot object detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the internet.
|
|
8
|
+
|
|
9
|
+
### Object Counting
|
|
10
|
+
|
|
11
|
+
Zero-shot object detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores or the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.
|
|
12
|
+
|
|
13
|
+
### Object Tracking
|
|
14
|
+
|
|
15
|
+
Zero-shot object detectors can track objects in videos.
|
|
16
|
+
|
|
17
|
+
## Inference
|
|
18
|
+
|
|
19
|
+
You can run inference with zero-shot object detection models through the `zero-shot-object-detection` pipeline. When calling the pipeline, you just need to pass an image (a local path, an HTTP link, or an already loaded PIL image, as in the example below) and the candidate labels.
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from transformers import pipeline
|
|
23
|
+
from PIL import Image
|
|
24
|
+
|
|
25
|
+
image = Image.open("my-image.png").convert("RGB")
|
|
26
|
+
|
|
27
|
+
detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
|
|
28
|
+
|
|
29
|
+
predictions = detector(
|
|
30
|
+
image,
|
|
31
|
+
candidate_labels=["a photo of a cat", "a photo of a dog"],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# [{'score': 0.95,
|
|
35
|
+
# 'label': 'a photo of a cat',
|
|
36
|
+
# 'box': {'xmin': 180, 'ymin': 71, 'xmax': 271, 'ymax': 178}},
|
|
37
|
+
# ...
|
|
38
|
+
# ]
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Useful Resources
|
|
42
|
+
|
|
43
|
+
- [Zero-shot object detection task guide](https://huggingface.co/docs/transformers/tasks/zero_shot_object_detection)
|
|
44
|
+
|
|
45
|
+
This page was made possible thanks to the efforts of [Victor Guichard](https://huggingface.co/VictorGuichard).
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [],
|
|
5
|
+
demo: {
|
|
6
|
+
inputs: [
|
|
7
|
+
{
|
|
8
|
+
filename: "zero-shot-object-detection-input.jpg",
|
|
9
|
+
type: "img",
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
label: "Classes",
|
|
13
|
+
content: "cat, dog, bird",
|
|
14
|
+
type: "text",
|
|
15
|
+
},
|
|
16
|
+
],
|
|
17
|
+
outputs: [
|
|
18
|
+
{
|
|
19
|
+
filename: "zero-shot-object-detection-output.jpg",
|
|
20
|
+
type: "img",
|
|
21
|
+
},
|
|
22
|
+
],
|
|
23
|
+
},
|
|
24
|
+
metrics: [
|
|
25
|
+
{
|
|
26
|
+
description:
|
|
27
|
+
"The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). It is calculated for each class separately",
|
|
28
|
+
id: "Average Precision",
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
description: "The Mean Average Precision (mAP) metric is the overall average of the AP values",
|
|
32
|
+
id: "Mean Average Precision",
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
description:
|
|
36
|
+
"The APα metric is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75",
|
|
37
|
+
id: "APα",
|
|
38
|
+
},
|
|
39
|
+
],
|
|
40
|
+
models: [
|
|
41
|
+
{
|
|
42
|
+
description: "Solid zero-shot object detection model.",
|
|
43
|
+
id: "IDEA-Research/grounding-dino-base",
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
description: "Cutting-edge zero-shot object detection model.",
|
|
47
|
+
id: "google/owlv2-base-patch16-ensemble",
|
|
48
|
+
},
|
|
49
|
+
],
|
|
50
|
+
spaces: [
|
|
51
|
+
{
|
|
52
|
+
description: "A demo to try the state-of-the-art zero-shot object detection model, OWLv2.",
|
|
53
|
+
id: "merve/owlv2",
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
description:
|
|
57
|
+
"A demo that combines a zero-shot object detection and mask generation model for zero-shot segmentation.",
|
|
58
|
+
id: "merve/OWLSAM",
|
|
59
|
+
},
|
|
60
|
+
],
|
|
61
|
+
summary:
|
|
62
|
+
"Zero-shot object detection is a computer vision task to detect objects and their classes in images, without any prior training or knowledge of the classes. Zero-shot object detection models receive an image as input, as well as a list of candidate classes, and output the bounding boxes and labels where the objects have been detected.",
|
|
63
|
+
widgetModels: [],
|
|
64
|
+
youtubeId: "",
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
export default taskData;
|