@huggingface/tasks 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{index.mjs → index.cjs} +2695 -2497
- package/dist/index.d.ts +427 -65
- package/dist/index.js +2660 -2532
- package/package.json +13 -8
- package/src/index.ts +2 -5
- package/src/library-to-tasks.ts +1 -1
- package/src/model-data.ts +1 -1
- package/src/model-libraries-downloads.ts +20 -0
- package/src/{library-ui-elements.ts → model-libraries-snippets.ts} +50 -296
- package/src/model-libraries.ts +375 -44
- package/src/pipelines.ts +1 -1
- package/src/tasks/audio-classification/about.md +1 -1
- package/src/tasks/audio-classification/inference.ts +51 -0
- package/src/tasks/audio-classification/spec/input.json +34 -0
- package/src/tasks/audio-classification/spec/output.json +10 -0
- package/src/tasks/audio-to-audio/about.md +1 -1
- package/src/tasks/automatic-speech-recognition/about.md +4 -2
- package/src/tasks/automatic-speech-recognition/inference.ts +159 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/common-definitions.json +117 -0
- package/src/tasks/depth-estimation/data.ts +8 -4
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/inference.ts +22 -0
- package/src/tasks/feature-extraction/spec/input.json +26 -0
- package/src/tasks/feature-extraction/spec/output.json +7 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/inference.ts +51 -0
- package/src/tasks/image-classification/spec/input.json +34 -0
- package/src/tasks/image-classification/spec/output.json +10 -0
- package/src/tasks/image-segmentation/inference.ts +65 -0
- package/src/tasks/image-segmentation/spec/input.json +54 -0
- package/src/tasks/image-segmentation/spec/output.json +25 -0
- package/src/tasks/image-to-image/inference.ts +67 -0
- package/src/tasks/image-to-image/spec/input.json +54 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +5 -2
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +42 -5
- package/src/tasks/object-detection/inference.ts +62 -0
- package/src/tasks/object-detection/spec/input.json +30 -0
- package/src/tasks/object-detection/spec/output.json +46 -0
- package/src/tasks/placeholder/data.ts +3 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/sentence-similarity/about.md +2 -2
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/data.ts +1 -0
- package/src/tasks/summarization/inference.ts +59 -0
- package/src/tasks/summarization/spec/input.json +7 -0
- package/src/tasks/summarization/spec/output.json +7 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +1 -1
- package/src/tasks/tabular-regression/about.md +1 -1
- package/src/tasks/text-classification/about.md +1 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +10 -0
- package/src/tasks/text-generation/about.md +24 -13
- package/src/tasks/text-generation/data.ts +22 -38
- package/src/tasks/text-generation/inference.ts +194 -0
- package/src/tasks/text-generation/spec/input.json +90 -0
- package/src/tasks/text-generation/spec/output.json +120 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +11 -2
- package/src/tasks/text-to-image/data.ts +6 -2
- package/src/tasks/text-to-image/inference.ts +71 -0
- package/src/tasks/text-to-image/spec/input.json +59 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +4 -2
- package/src/tasks/text-to-speech/data.ts +1 -0
- package/src/tasks/text-to-speech/inference.ts +147 -0
- package/src/tasks/text-to-speech/spec/input.json +7 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/inference.ts +82 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +33 -0
- package/src/tasks/translation/data.ts +1 -0
- package/src/tasks/translation/inference.ts +59 -0
- package/src/tasks/translation/spec/input.json +7 -0
- package/src/tasks/translation/spec/output.json +7 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/visual-question-answering/inference.ts +63 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-image-classification/data.ts +8 -5
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +6 -0
- package/src/tasks/zero-shot-object-detection/data.ts +6 -1
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/tsconfig.json +3 -3

package/src/tasks/token-classification/spec/output.json

@@ -0,0 +1,33 @@
+{
+	"$id": "/inference/schemas/token-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Token Classification task",
+	"title": "TokenClassificationOutput",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"entity_group": {
+				"type": "string",
+				"description": "The predicted label for that group of tokens"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"word": {
+				"type": "string",
+				"description": "The corresponding text"
+			},
+			"start": {
+				"type": "integer",
+				"description": "The character position in the input where this group begins."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The character position in the input where this group ends."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
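
For orientation, an array item carrying the properties documented above might look like the following sketch (values are illustrative, not taken from a real model run):

// Illustrative element of a TokenClassificationOutput array (made-up values).
const exampleEntity = {
	entity_group: "PER", // predicted label for this group of tokens
	score: 0.997, // associated probability
	word: "Ada Lovelace", // corresponding text
	start: 0, // character position where the group begins
	end: 12, // character position where the group ends
};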

package/src/tasks/translation/inference.ts

@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Translation inference
+ *
+ * Inputs for Text2text Generation inference
+ */
+export interface TranslationInput {
+	/**
+	 * The input text data
+	 */
+	inputs: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Text2TextGenerationParameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	clean_up_tokenization_spaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generate_parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
+}
+
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs for Translation inference
+ *
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface TranslationOutput {
+	generatedText: unknown;
+	/**
+	 * The generated text.
+	 */
+	generated_text?: string;
+	[property: string]: unknown;
+}
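
A minimal usage sketch of these generated types follows; the import path assumes they are re-exported from the package root, which this diff does not show directly:

import type { TranslationInput, TranslationOutput } from "@huggingface/tasks";

// Hypothetical request/response pair; values are illustrative only.
const request: TranslationInput = {
	inputs: "Bonjour le monde",
	parameters: { clean_up_tokenization_spaces: true, truncation: "do_not_truncate" },
};

const response: TranslationOutput = {
	generatedText: undefined, // required by the generated type, left unset here
	generated_text: "Hello world",
};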

package/src/tasks/video-classification/inference.ts

@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Video Classification inference
+ */
+export interface VideoClassificationInput {
+	/**
+	 * The input video data
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Video Classification
+ */
+export interface VideoClassificationParameters {
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frame_sampling_rate?: number;
+	function_to_apply?: ClassificationOutputTransform;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	num_frames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	top_k?: number;
+	[property: string]: unknown;
+}
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+export type VideoClassificationOutput = VideoClassificationOutputElement[];
+/**
+ * Outputs of inference for the Video Classification task
+ */
+export interface VideoClassificationOutputElement {
+	/**
+	 * The predicted class label.
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
+}
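
Under the same import-path assumption, a video-classification request built from these types might look like:

import type { VideoClassificationInput } from "@huggingface/tasks";

// Hypothetical input; the video payload shape is left to the caller, hence `unknown`.
const videoInput: VideoClassificationInput = {
	inputs: "https://example.com/clip.mp4",
	parameters: {
		num_frames: 16, // number of sampled frames to classify
		frame_sampling_rate: 4, // take every 4th frame
		function_to_apply: "softmax",
		top_k: 5,
	},
};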

package/src/tasks/video-classification/spec/input.json

@@ -0,0 +1,42 @@
+{
+	"$id": "/inference/schemas/video-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Video Classification inference",
+	"title": "VideoClassificationInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input video data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/VideoClassificationParameters"
+		}
+	},
+	"$defs": {
+		"VideoClassificationParameters": {
+			"title": "VideoClassificationParameters",
+			"description": "Additional inference parameters for Video Classification",
+			"type": "object",
+			"properties": {
+				"function_to_apply": {
+					"title": "TextClassificationOutputTransform",
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
+				},
+				"num_frames": {
+					"type": "integer",
+					"description": "The number of sampled frames to consider for classification."
+				},
+				"frame_sampling_rate": {
+					"type": "integer",
+					"description": "The sampling rate used to select frames from the video."
+				},
+				"top_k": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/video-classification/spec/output.json

@@ -0,0 +1,10 @@
+{
+	"$id": "/inference/schemas/video-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Video Classification task",
+	"title": "VideoClassificationOutput",
+	"type": "array",
+	"items": {
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+	}
+}

package/src/tasks/visual-question-answering/inference.ts

@@ -0,0 +1,63 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Visual Question Answering inference
+ */
+export interface VisualQuestionAnsweringInput {
+	/**
+	 * One (image, question) pair to answer
+	 */
+	inputs: VisualQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: unknown;
+}
+/**
+ * One (image, question) pair to answer
+ */
+export interface VisualQuestionAnsweringInputData {
+	/**
+	 * The image.
+	 */
+	image: unknown;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: unknown;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Visual Question Answering
+ */
+export interface VisualQuestionAnsweringParameters {
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	top_k?: number;
+	[property: string]: unknown;
+}
+export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
+/**
+ * Outputs of inference for the Visual Question Answering task
+ */
+export interface VisualQuestionAnsweringOutputElement {
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
+}
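
A sketch of an (image, question) request and a possible answer list, under the same import-path assumption:

import type { VisualQuestionAnsweringInput, VisualQuestionAnsweringOutput } from "@huggingface/tasks";

// Illustrative values only.
const vqaInput: VisualQuestionAnsweringInput = {
	inputs: {
		image: "https://example.com/cats.jpg",
		question: "How many cats are in the picture?",
	},
	parameters: { top_k: 3 },
};

const vqaOutput: VisualQuestionAnsweringOutput = [{ answer: "2", label: undefined, score: 0.91 }];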

package/src/tasks/visual-question-answering/spec/input.json

@@ -0,0 +1,41 @@
+{
+	"$id": "/inference/schemas/visual-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Visual Question Answering inference",
+	"title": "VisualQuestionAnsweringInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One (image, question) pair to answer",
+			"type": "object",
+			"title": "VisualQuestionAnsweringInputData",
+			"properties": {
+				"image": {
+					"description": "The image."
+				},
+				"question": {
+					"description": "The question to answer based on the image."
+				}
+			},
+			"required": ["question", "image"]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
+		}
+	},
+	"$defs": {
+		"VisualQuestionAnsweringParameters": {
+			"title": "VisualQuestionAnsweringParameters",
+			"description": "Additional inference parameters for Visual Question Answering",
+			"type": "object",
+			"properties": {
+				"top_k": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/visual-question-answering/spec/output.json

@@ -0,0 +1,21 @@
+{
+	"$id": "/inference/schemas/visual-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Visual Question Answering task",
+	"title": "VisualQuestionAnsweringOutput",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["label", "score"]
+	}
+}

package/src/tasks/zero-shot-classification/inference.ts

@@ -0,0 +1,67 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Zero Shot Classification inference
+ */
+export interface ZeroShotClassificationInput {
+	/**
+	 * The input text data, with candidate labels
+	 */
+	inputs: ZeroShotClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: unknown;
+}
+/**
+ * The input text data, with candidate labels
+ */
+export interface ZeroShotClassificationInputData {
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Classification
+ */
+export interface ZeroShotClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesis_template?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multi_label?: boolean;
+	[property: string]: unknown;
+}
+export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
+/**
+ * Outputs of inference for the Zero Shot Classification task
+ */
+export interface ZeroShotClassificationOutputElement {
+	/**
+	 * The predicted class label.
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
+}
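
A sketch of a zero-shot text classification request built from these types; note the camelCase `candidateLabels` key required by the input schema (import path assumed as above):

import type { ZeroShotClassificationInput } from "@huggingface/tasks";

// Illustrative values only.
const zscInput: ZeroShotClassificationInput = {
	inputs: {
		text: "I really enjoyed this film.",
		candidateLabels: ["positive", "negative", "neutral"],
	},
	parameters: {
		hypothesis_template: "This review is {}.",
		multi_label: false,
	},
};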

package/src/tasks/zero-shot-classification/spec/input.json

@@ -0,0 +1,50 @@
+{
+	"$id": "/inference/schemas/zero-shot-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Classification inference",
+	"title": "ZeroShotClassificationInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input text data, with candidate labels",
+			"type": "object",
+			"title": "ZeroShotClassificationInputData",
+			"properties": {
+				"text": {
+					"type": "string",
+					"description": "The text to classify"
+				},
+				"candidateLabels": {
+					"type": "array",
+					"description": "The set of possible class labels to classify the text into.",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["text", "candidateLabels"]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotClassificationParameters"
+		}
+	},
+	"$defs": {
+		"ZeroShotClassificationParameters": {
+			"title": "ZeroShotClassificationParameters",
+			"description": "Additional inference parameters for Zero Shot Classification",
+			"type": "object",
+			"properties": {
+				"hypothesis_template": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+				},
+				"multi_label": {
+					"type": "boolean",
+					"description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/zero-shot-classification/spec/output.json

@@ -0,0 +1,10 @@
+{
+	"$id": "/inference/schemas/zero-shot-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Classification task",
+	"title": "ZeroShotClassificationOutput",
+	"type": "array",
+	"items": {
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+	}
+}

package/src/tasks/zero-shot-image-classification/data.ts

@@ -52,9 +52,8 @@ const taskData: TaskDataCustom = {
 			id: "openai/clip-vit-base-patch16",
 		},
 		{
-			description:
-
-			id: "openai/clip-vit-large-patch14-336",
+			description: "Strong zero-shot image classification model.",
+			id: "google/siglip-base-patch16-224",
 		},
 		{
 			description: "Strong image classification model for biomedical domain.",
@@ -64,12 +63,16 @@ const taskData: TaskDataCustom = {
 	spaces: [
 		{
 			description:
-				"An application that leverages zero
+				"An application that leverages zero-shot image classification to find best captions to generate an image. ",
 			id: "pharma/CLIP-Interrogator",
 		},
+		{
+			description: "An application to compare different zero-shot image classification models. ",
+			id: "merve/compare_clip_siglip",
+		},
 	],
 	summary:
-		"Zero
+		"Zero-shot image classification is the task of classifying previously unseen classes during training of a model.",
 	widgetModels: ["openai/clip-vit-large-patch14-336"],
 	youtubeId: "",
 };

package/src/tasks/zero-shot-image-classification/inference.ts

@@ -0,0 +1,61 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Zero Shot Image Classification inference
+ */
+export interface ZeroShotImageClassificationInput {
+	/**
+	 * The input image data, with candidate labels
+	 */
+	inputs: ZeroShotImageClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: unknown;
+}
+/**
+ * The input image data, with candidate labels
+ */
+export interface ZeroShotImageClassificationInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: unknown;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Image Classification
+ */
+export interface ZeroShotImageClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesis_template?: string;
+	[property: string]: unknown;
+}
+export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
+/**
+ * Outputs of inference for the Zero Shot Image Classification task
+ */
+export interface ZeroShotImageClassificationOutputElement {
+	/**
+	 * The predicted class label.
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
+}
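
And a matching sketch for the image variant, pairing an image with candidate labels (same import-path assumption):

import type { ZeroShotImageClassificationInput } from "@huggingface/tasks";

// Illustrative values only.
const zsicInput: ZeroShotImageClassificationInput = {
	inputs: {
		image: "https://example.com/pets.jpg",
		candidateLabels: ["cat", "dog", "bird"],
	},
	parameters: { hypothesis_template: "a photo of a {}" },
};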

package/src/tasks/zero-shot-image-classification/spec/input.json

@@ -0,0 +1,45 @@
+{
+	"$id": "/inference/schemas/zero-shot-image-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Image Classification inference",
+	"title": "ZeroShotImageClassificationInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input image data, with candidate labels",
+			"type": "object",
+			"title": "ZeroShotImageClassificationInputData",
+			"properties": {
+				"image": {
+					"description": "The image data to classify"
+				},
+				"candidateLabels": {
+					"description": "The candidate labels for this image",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["image", "candidateLabels"]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
+		}
+	},
+	"$defs": {
+		"ZeroShotImageClassificationParameters": {
+			"title": "ZeroShotImageClassificationParameters",
+			"description": "Additional inference parameters for Zero Shot Image Classification",
+			"type": "object",
+			"properties": {
+				"hypothesis_template": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/zero-shot-image-classification/spec/output.json

@@ -0,0 +1,10 @@
+{
+	"$id": "/inference/schemas/zero-shot-image-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Image Classification task",
+	"title": "ZeroShotImageClassificationOutput",
+	"type": "array",
+	"items": {
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+	}
+}

package/src/tasks/zero-shot-object-detection/about.md

@@ -1,5 +1,7 @@
 ## Use Cases

+Zero-shot object detection models can be used in any object detection application where the detection involves text queries for objects of interest.
+
 ### Object Search

 Zero-shot object detection models can be used in image search. Smartphones, for example, use zero-shot object detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the internet.
@@ -8,6 +10,10 @@ Zero-shot object detection models can be used in image search. Smartphones, for

 Zero-shot object detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores or the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.

+### Object Tracking
+
+Zero-shot object detectors can track objects in videos.
+
 ## Inference

 You can infer with zero-shot object detection models through the `zero-shot-object-detection` pipeline. When calling the pipeline, you just need to specify a path or HTTP link to an image and the candidate labels.

package/src/tasks/zero-shot-object-detection/data.ts

@@ -47,7 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "google/owlv2-base-patch16-ensemble",
 		},
 	],
-	spaces: [
+	spaces: [
+		{
+			description: "A demo to try the state-of-the-art zero-shot object detection model, OWLv2.",
+			id: "merve/owlv2",
+		},
+	],
 	summary:
 		"Zero-shot object detection is a computer vision task to detect objects and their classes in images, without any prior training or knowledge of the classes. Zero-shot object detection models receive an image as input, as well as a list of candidate classes, and output the bounding boxes and labels where the objects have been detected.",
 	widgetModels: [],