@huggingface/tasks 0.2.0 → 0.2.2
This diff compares the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
- package/dist/{index.mjs → index.cjs} +295 -134
- package/dist/index.d.ts +8 -6
- package/dist/index.js +260 -169
- package/package.json +13 -8
- package/src/library-to-tasks.ts +1 -1
- package/src/library-ui-elements.ts +24 -10
- package/src/model-data.ts +1 -1
- package/src/model-libraries.ts +3 -2
- package/src/pipelines.ts +1 -1
- package/src/tasks/audio-classification/about.md +1 -1
- package/src/tasks/audio-classification/inference.ts +51 -0
- package/src/tasks/audio-classification/spec/input.json +34 -0
- package/src/tasks/audio-classification/spec/output.json +21 -0
- package/src/tasks/audio-to-audio/about.md +1 -1
- package/src/tasks/automatic-speech-recognition/about.md +4 -2
- package/src/tasks/automatic-speech-recognition/inference.ts +154 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +36 -0
- package/src/tasks/common-definitions.json +109 -0
- package/src/tasks/depth-estimation/data.ts +8 -4
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +30 -0
- package/src/tasks/depth-estimation/spec/output.json +10 -0
- package/src/tasks/document-question-answering/inference.ts +102 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/inference.ts +22 -0
- package/src/tasks/feature-extraction/spec/input.json +26 -0
- package/src/tasks/feature-extraction/spec/output.json +7 -0
- package/src/tasks/fill-mask/inference.ts +61 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/inference.ts +51 -0
- package/src/tasks/image-classification/spec/input.json +34 -0
- package/src/tasks/image-classification/spec/output.json +10 -0
- package/src/tasks/image-segmentation/inference.ts +65 -0
- package/src/tasks/image-segmentation/spec/input.json +54 -0
- package/src/tasks/image-segmentation/spec/output.json +25 -0
- package/src/tasks/image-to-image/inference.ts +67 -0
- package/src/tasks/image-to-image/spec/input.json +52 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/inference.ts +138 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +17 -0
- package/src/tasks/index.ts +5 -2
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +55 -0
- package/src/tasks/object-detection/inference.ts +62 -0
- package/src/tasks/object-detection/spec/input.json +30 -0
- package/src/tasks/object-detection/spec/output.json +46 -0
- package/src/tasks/placeholder/data.ts +3 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/sentence-similarity/about.md +2 -2
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/data.ts +1 -0
- package/src/tasks/summarization/inference.ts +58 -0
- package/src/tasks/summarization/spec/input.json +7 -0
- package/src/tasks/summarization/spec/output.json +7 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +39 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +1 -1
- package/src/tasks/tabular-regression/about.md +1 -1
- package/src/tasks/text-classification/about.md +1 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +10 -0
- package/src/tasks/text-generation/about.md +24 -13
- package/src/tasks/text-generation/data.ts +22 -38
- package/src/tasks/text-generation/inference.ts +85 -0
- package/src/tasks/text-generation/spec/input.json +74 -0
- package/src/tasks/text-generation/spec/output.json +17 -0
- package/src/tasks/text-to-audio/inference.ts +138 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +20 -0
- package/src/tasks/text-to-image/about.md +11 -2
- package/src/tasks/text-to-image/data.ts +6 -2
- package/src/tasks/text-to-image/inference.ts +73 -0
- package/src/tasks/text-to-image/spec/input.json +57 -0
- package/src/tasks/text-to-image/spec/output.json +15 -0
- package/src/tasks/text-to-speech/about.md +4 -2
- package/src/tasks/text-to-speech/data.ts +1 -0
- package/src/tasks/text-to-speech/inference.ts +146 -0
- package/src/tasks/text-to-speech/spec/input.json +7 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text2text-generation/inference.ts +53 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +17 -0
- package/src/tasks/token-classification/inference.ts +82 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +33 -0
- package/src/tasks/translation/data.ts +1 -0
- package/src/tasks/translation/inference.ts +58 -0
- package/src/tasks/translation/spec/input.json +7 -0
- package/src/tasks/translation/spec/output.json +7 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/visual-question-answering/inference.ts +63 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-image-classification/data.ts +8 -5
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +45 -0
- package/src/tasks/zero-shot-object-detection/data.ts +62 -0
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/tsconfig.json +3 -3
package/src/tasks/image-segmentation/spec/input.json ADDED
@@ -0,0 +1,54 @@
+{
+	"$id": "/inference/schemas/image-segmentation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image Segmentation inference",
+	"title": "ImageSegmentationInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ImageSegmentationParameters"
+		}
+	},
+	"$defs": {
+		"ImageSegmentationParameters": {
+			"title": "ImageSegmentationParameters",
+			"description": "Additional inference parameters for Image Segmentation",
+			"type": "object",
+			"properties": {
+				"maskThreshold": {
+					"type": "number",
+					"description": "Threshold to use when turning the predicted masks into binary values."
+				},
+				"overlapMaskAreaThreshold": {
+					"type": "number",
+					"description": "Mask overlap threshold to eliminate small, disconnected segments."
+				},
+				"subtask": {
+					"title": "ImageSegmentationSubtask",
+					"type": "string",
+					"description": "Segmentation task to be performed, depending on model capabilities.",
+					"oneOf": [
+						{
+							"const": "instance"
+						},
+						{
+							"const": "panoptic"
+						},
+						{
+							"const": "semantic"
+						}
+					]
+				},
+				"threshold": {
+					"type": "number",
+					"description": "Probability threshold to filter out predicted masks."
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}
package/src/tasks/image-segmentation/spec/output.json ADDED
@@ -0,0 +1,25 @@
+{
+	"$id": "/inference/schemas/image-segmentation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image Segmentation task",
+	"title": "ImageSegmentationOutput",
+	"type": "array",
+	"items": {
+		"description": "A predicted mask / segment",
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The label of the predicted segment"
+			},
+			"mask": {
+				"description": "The corresponding mask as a black-and-white image"
+			},
+			"score": {
+				"type": "number",
+				"description": "The score or confidence degreee the model has"
+			}
+		},
+		"required": ["label", "mask"]
+	}
+}
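For orientation, here is a minimal sketch of payloads that conform to the two image-segmentation schemas above, written as plain TypeScript object literals. All values and placeholder strings are made up for illustration; `data` and `mask` are left untyped by the schemas, and the generated TypeScript types for this task live in `image-segmentation/inference.ts`, which is listed in the file summary but not shown in this excerpt.

```ts
// Hypothetical example payloads matching input.json / output.json above.
const imageSegmentationInput = {
	data: "<image bytes or URL; the schema leaves the type open>",
	parameters: {
		subtask: "panoptic", // "instance" | "panoptic" | "semantic"
		threshold: 0.9, // probability threshold to filter out predicted masks
		maskThreshold: 0.5, // binarization threshold for the predicted masks
		overlapMaskAreaThreshold: 0.5, // drops small, disconnected segments
	},
};

const imageSegmentationOutput = [
	// one entry per predicted mask / segment; "label" and "mask" are required
	{ label: "car", mask: "<black-and-white mask image>", score: 0.98 },
];
```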
package/src/tasks/image-to-image/inference.ts ADDED
@@ -0,0 +1,67 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Image To Image inference
+ */
+export interface ImageToImageInput {
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToImageParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Image
+ */
+export interface ImageToImageParameters {
+	/**
+	 * For diffusion models. A higher guidance scale value encourages the model to generate
+	 * images closely linked to the text prompt at the expense of lower image quality.
+	 */
+	guidanceScale?: number;
+	/**
+	 * One or several prompt to guide what NOT to include in image generation.
+	 */
+	negativePrompt?: string[];
+	/**
+	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+	 * a higher quality image at the expense of slower inference.
+	 */
+	numInferenceSteps?: number;
+	/**
+	 * The size in pixel of the output image
+	 */
+	targetSize?: TargetSize;
+	[property: string]: unknown;
+}
+
+/**
+ * The size in pixel of the output image
+ */
+export interface TargetSize {
+	height: number;
+	width: number;
+	[property: string]: unknown;
+}
+
+/**
+ * Outputs of inference for the Image To Image task
+ */
+export interface ImageToImageOutput {
+	/**
+	 * The output image
+	 */
+	image?: unknown;
+	[property: string]: unknown;
+}
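As a quick illustration of the generated types above, here is a hedged usage sketch. The relative import path assumes code sitting inside this repo next to `src/tasks/`; adjust it to however your project resolves the package, and treat the parameter values as placeholders.

```ts
import type { ImageToImageInput } from "./tasks/image-to-image/inference";

// Example request built against the generated ImageToImageInput interface.
const request: ImageToImageInput = {
	data: "<input image bytes>", // typed as `unknown` by the schema
	parameters: {
		guidanceScale: 7.5, // diffusion models: higher = closer to the prompt, lower image quality
		negativePrompt: ["blurry", "low quality"],
		numInferenceSteps: 30,
		targetSize: { width: 1024, height: 768 },
	},
};
```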
package/src/tasks/image-to-image/spec/input.json ADDED
@@ -0,0 +1,52 @@
+{
+	"$id": "/inference/schemas/image-to-image/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Image inference",
+	"title": "ImageToImageInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ImageToImageParameters"
+		}
+	},
+	"$defs": {
+		"ImageToImageParameters": {
+			"title": "ImageToImageParameters",
+			"description": "Additional inference parameters for Image To Image",
+			"type": "object",
+			"properties": {
+				"guidanceScale": {
+					"type": "number",
+					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+				},
+				"negativePrompt": {
+					"type": "array",
+					"items": { "type": "string" },
+					"description": "One or several prompt to guide what NOT to include in image generation."
+				},
+				"numInferenceSteps": {
+					"type": "integer",
+					"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"targetSize": {
+					"type": "object",
+					"description": "The size in pixel of the output image",
+					"properties": {
+						"width": {
+							"type": "integer"
+						},
+						"height": {
+							"type": "integer"
+						}
+					},
+					"required": ["width", "height"]
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}
package/src/tasks/image-to-image/spec/output.json ADDED
@@ -0,0 +1,12 @@
+{
+	"$id": "/inference/schemas/image-to-image/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Image task",
+	"title": "ImageToImageOutput",
+	"type": "object",
+	"properties": {
+		"image": {
+			"description": "The output image"
+		}
+	}
+}
package/src/tasks/image-to-text/inference.ts ADDED
@@ -0,0 +1,138 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Image To Text inference
+ */
+export interface ImageToTextInput {
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Text
+ */
+export interface ImageToTextParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	/**
+	 * The amount of maximum tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: unknown;
+}
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
+	[property: string]: unknown;
+}
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
+export type ImageToTextOutput = ImageToTextOutputElement[];
+/**
+ * Outputs of inference for the Image To Text task
+ */
+export interface ImageToTextOutputElement {
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
+}
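The `GenerationParameters` interface above mirrors the shared `common-definitions.json` schema that the spec below references. Here is a hedged sketch of a captioning request built with it; the relative import path and all parameter values are illustrative assumptions, not part of this diff.

```ts
import type { ImageToTextInput } from "./tasks/image-to-text/inference";

// Example request with sampling-based generation settings.
const request: ImageToTextInput = {
	data: "<input image bytes>",
	parameters: {
		maxNewTokens: 50,
		generate: {
			doSample: true, // sample instead of greedy decoding
			temperature: 0.7,
			topK: 50,
			topP: 0.95,
			earlyStopping: "never", // EarlyStoppingUnion: boolean | "never"
		},
	},
};
```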
package/src/tasks/image-to-text/spec/input.json ADDED
@@ -0,0 +1,34 @@
+{
+	"$id": "/inference/schemas/image-to-text/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Text inference",
+	"title": "ImageToTextInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ImageToTextParameters"
+		}
+	},
+	"$defs": {
+		"ImageToTextParameters": {
+			"title": "ImageToTextParameters",
+			"description": "Additional inference parameters for Image To Text",
+			"type": "object",
+			"properties": {
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The amount of maximum tokens to generate."
+				},
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}
package/src/tasks/image-to-text/spec/output.json ADDED
@@ -0,0 +1,17 @@
+{
+	"$id": "/inference/schemas/image-to-text/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Text task",
+	"title": "ImageToTextOutput",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text."
+			}
+		},
+		"required": ["generatedText"]
+	}
+}
package/src/tasks/index.ts CHANGED
@@ -11,6 +11,7 @@ import imageClassification from "./image-classification/data";
 import imageToImage from "./image-to-image/data";
 import imageToText from "./image-to-text/data";
 import imageSegmentation from "./image-segmentation/data";
+import maskGeneration from "./mask-generation/data";
 import objectDetection from "./object-detection/data";
 import depthEstimation from "./depth-estimation/data";
 import placeholder from "./placeholder/data";
@@ -33,6 +34,7 @@ import videoClassification from "./video-classification/data";
 import visualQuestionAnswering from "./visual-question-answering/data";
 import zeroShotClassification from "./zero-shot-classification/data";
 import zeroShotImageClassification from "./zero-shot-image-classification/data";
+import zeroShotObjectDetection from "./zero-shot-object-detection/data";
 
 import type { ModelLibraryKey } from "../model-libraries";
 
@@ -131,7 +133,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	"image-to-image": getData("image-to-image", imageToImage),
 	"image-to-text": getData("image-to-text", imageToText),
 	"image-to-video": undefined,
-	"mask-generation": getData("mask-generation",
+	"mask-generation": getData("mask-generation", maskGeneration),
 	"multiple-choice": undefined,
 	"object-detection": getData("object-detection", objectDetection),
 	"video-classification": getData("video-classification", videoClassification),
@@ -162,7 +164,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	"voice-activity-detection": undefined,
 	"zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
 	"zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
-	"zero-shot-object-detection": getData("zero-shot-object-detection",
+	"zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
 	"text-to-3d": getData("text-to-3d", placeholder),
 	"image-to-3d": getData("image-to-3d", placeholder),
 } as const;
@@ -216,6 +218,7 @@ export interface TaskData {
 	datasets: ExampleRepo[];
 	demo: TaskDemo;
 	id: PipelineType;
+	canonicalId?: PipelineType;
 	isPlaceholder?: boolean;
 	label: string;
 	libraries: ModelLibraryKey[];
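For context on what this wiring changes for downstream users: the `mask-generation` and `zero-shot-object-detection` entries of `TASKS_DATA` are now populated instead of left undefined, and `TaskData` gains an optional `canonicalId`. A rough consumer-side sketch, assuming `TASKS_DATA` is re-exported from the package root (the `dist/index.*` changes in the file summary suggest it is, but that export is not shown in this excerpt):

```ts
import { TASKS_DATA } from "@huggingface/tasks";

const maskGeneration = TASKS_DATA["mask-generation"];
if (maskGeneration) {
	console.log(maskGeneration.id); // "mask-generation"
	console.log(maskGeneration.label); // human-readable task label
	console.log(maskGeneration.libraries); // ModelLibraryKey[] associated with the task
}
```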
package/src/tasks/mask-generation/about.md ADDED
@@ -0,0 +1,65 @@
+## Use Cases
+
+### Filtering an Image
+
+When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
+
+### Masked Image Modelling
+
+Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
+
+### Human-in-the-loop Computer Vision Applications
+
+For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
+
+## Task Variants
+
+### Segmentation
+
+Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
+
+## Inference
+
+Mask generation models often work in two modes: segment everything or prompt mode.
+The example below works in segment-everything-mode, where many masks will be returned.
+
+```python
+from transformers import pipeline
+
+generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
+image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+outputs = generator(image_url)
+outputs["masks"]
+# array of multiple binary masks returned for each generated mask
+```
+
+Prompt mode takes in three types of prompts:
+
+- **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
+- **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
+- **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
+
+Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
+
+```python
+from transformers import SamModel, SamProcessor
+from PIL import Image
+import requests
+
+model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
+processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
+
+raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+# pointing to the car window
+input_points = [[[450, 600]]]
+inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+scores = outputs.iou_scores
+```
+
+## Useful Resources
+
+Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
+
+- [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)
package/src/tasks/mask-generation/data.ts ADDED
@@ -0,0 +1,55 @@
+import type { TaskDataCustom } from "..";
+
+const taskData: TaskDataCustom = {
+	datasets: [],
+	demo: {
+		inputs: [
+			{
+				filename: "mask-generation-input.png",
+				type: "img",
+			},
+		],
+		outputs: [
+			{
+				filename: "mask-generation-output.png",
+				type: "img",
+			},
+		],
+	},
+	metrics: [],
+	models: [
+		{
+			description: "Small yet powerful mask generation model.",
+			id: "Zigeng/SlimSAM-uniform-50",
+		},
+		{
+			description: "Very strong mask generation model.",
+			id: "facebook/sam-vit-huge",
+		},
+	],
+	spaces: [
+		{
+			description:
+				"An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
+			id: "SkalskiP/SAM_and_MetaCLIP",
+		},
+		{
+			description: "An application that compares the performance of a large and a small mask generation model.",
+			id: "merve/slimsam",
+		},
+		{
+			description: "An application based on an improved mask generation model.",
+			id: "linfanluntan/Grounded-SAM",
+		},
+		{
+			description: "An application to remove objects from videos using mask generation models.",
+			id: "SkalskiP/SAM_and_ProPainter",
+		},
+	],
+	summary:
+		"Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
+	widgetModels: [],
+	youtubeId: "",
+};
+
+export default taskData;
package/src/tasks/object-detection/inference.ts ADDED
@@ -0,0 +1,62 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Object Detection inference
+ */
+export interface ObjectDetectionInput {
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Object Detection
+ */
+export interface ObjectDetectionParameters {
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
+}
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
+}
+export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
+/**
+ * Outputs of inference for the Object Detection task
+ */
+export interface ObjectDetectionOutputElement {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
+}
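A hedged sketch of what a request and response shaped by the interfaces above might look like; the relative import path and all numeric values are illustrative placeholders, not part of this diff.

```ts
import type { ObjectDetectionInput, ObjectDetectionOutput } from "./tasks/object-detection/inference";

const request: ObjectDetectionInput = {
	data: "<input image bytes>",
	parameters: { threshold: 0.9 }, // minimum probability for a detection to be returned
};

const response: ObjectDetectionOutput = [
	{
		label: "cat",
		score: 0.97,
		// pixel coordinates, measured from the top-left corner of the input image
		box: { xmin: 12, ymin: 34, xmax: 210, ymax: 180 },
	},
];
```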
package/src/tasks/object-detection/spec/input.json ADDED
@@ -0,0 +1,30 @@
+{
+	"$id": "/inference/schemas/object-detection/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Object Detection inference",
+	"title": "ObjectDetectionInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ObjectDetectionParameters"
+		}
+	},
+	"$defs": {
+		"ObjectDetectionParameters": {
+			"title": "ObjectDetectionParameters",
+			"description": "Additional inference parameters for Object Detection",
+			"type": "object",
+			"properties": {
+				"threshold": {
+					"type": "number",
+					"description": "The probability necessary to make a prediction."
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}