@huggingface/tasks 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{index.mjs → index.cjs} +2695 -2497
- package/dist/index.d.ts +427 -65
- package/dist/index.js +2660 -2532
- package/package.json +13 -8
- package/src/index.ts +2 -5
- package/src/library-to-tasks.ts +1 -1
- package/src/model-data.ts +1 -1
- package/src/model-libraries-downloads.ts +20 -0
- package/src/{library-ui-elements.ts → model-libraries-snippets.ts} +50 -296
- package/src/model-libraries.ts +375 -44
- package/src/pipelines.ts +1 -1
- package/src/tasks/audio-classification/about.md +1 -1
- package/src/tasks/audio-classification/inference.ts +51 -0
- package/src/tasks/audio-classification/spec/input.json +34 -0
- package/src/tasks/audio-classification/spec/output.json +10 -0
- package/src/tasks/audio-to-audio/about.md +1 -1
- package/src/tasks/automatic-speech-recognition/about.md +4 -2
- package/src/tasks/automatic-speech-recognition/inference.ts +159 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/common-definitions.json +117 -0
- package/src/tasks/depth-estimation/data.ts +8 -4
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/inference.ts +22 -0
- package/src/tasks/feature-extraction/spec/input.json +26 -0
- package/src/tasks/feature-extraction/spec/output.json +7 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/inference.ts +51 -0
- package/src/tasks/image-classification/spec/input.json +34 -0
- package/src/tasks/image-classification/spec/output.json +10 -0
- package/src/tasks/image-segmentation/inference.ts +65 -0
- package/src/tasks/image-segmentation/spec/input.json +54 -0
- package/src/tasks/image-segmentation/spec/output.json +25 -0
- package/src/tasks/image-to-image/inference.ts +67 -0
- package/src/tasks/image-to-image/spec/input.json +54 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +5 -2
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +42 -5
- package/src/tasks/object-detection/inference.ts +62 -0
- package/src/tasks/object-detection/spec/input.json +30 -0
- package/src/tasks/object-detection/spec/output.json +46 -0
- package/src/tasks/placeholder/data.ts +3 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/sentence-similarity/about.md +2 -2
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/data.ts +1 -0
- package/src/tasks/summarization/inference.ts +59 -0
- package/src/tasks/summarization/spec/input.json +7 -0
- package/src/tasks/summarization/spec/output.json +7 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +1 -1
- package/src/tasks/tabular-regression/about.md +1 -1
- package/src/tasks/text-classification/about.md +1 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +10 -0
- package/src/tasks/text-generation/about.md +24 -13
- package/src/tasks/text-generation/data.ts +22 -38
- package/src/tasks/text-generation/inference.ts +194 -0
- package/src/tasks/text-generation/spec/input.json +90 -0
- package/src/tasks/text-generation/spec/output.json +120 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +11 -2
- package/src/tasks/text-to-image/data.ts +6 -2
- package/src/tasks/text-to-image/inference.ts +71 -0
- package/src/tasks/text-to-image/spec/input.json +59 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +4 -2
- package/src/tasks/text-to-speech/data.ts +1 -0
- package/src/tasks/text-to-speech/inference.ts +147 -0
- package/src/tasks/text-to-speech/spec/input.json +7 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/inference.ts +82 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +33 -0
- package/src/tasks/translation/data.ts +1 -0
- package/src/tasks/translation/inference.ts +59 -0
- package/src/tasks/translation/spec/input.json +7 -0
- package/src/tasks/translation/spec/output.json +7 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/visual-question-answering/inference.ts +63 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-image-classification/data.ts +8 -5
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +6 -0
- package/src/tasks/zero-shot-object-detection/data.ts +6 -1
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/tsconfig.json +3 -3
package/src/tasks/image-to-text/inference.ts
ADDED
@@ -0,0 +1,143 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Image To Text inference
+ */
+export interface ImageToTextInput {
+    /**
+     * The input image data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageToTextParameters;
+    [property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Text
+ */
+export interface ImageToTextParameters {
+    /**
+     * Parametrization of the text generation process
+     */
+    generate?: GenerationParameters;
+    /**
+     * The amount of maximum tokens to generate.
+     */
+    max_new_tokens?: number;
+    [property: string]: unknown;
+}
+
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+    /**
+     * Whether to use sampling instead of greedy decoding when generating new tokens.
+     */
+    do_sample?: boolean;
+    /**
+     * Controls the stopping condition for beam-based methods.
+     */
+    early_stopping?: EarlyStoppingUnion;
+    /**
+     * If set to float strictly between 0 and 1, only tokens with a conditional probability
+     * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+     * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+     * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+     */
+    epsilon_cutoff?: number;
+    /**
+     * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+     * float strictly between 0 and 1, a token is only considered if it is greater than either
+     * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+     * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+     * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+     * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+     * for more details.
+     */
+    eta_cutoff?: number;
+    /**
+     * The maximum length (in tokens) of the generated text, including the input.
+     */
+    max_length?: number;
+    /**
+     * The maximum number of tokens to generate. Takes precedence over maxLength.
+     */
+    max_new_tokens?: number;
+    /**
+     * The minimum length (in tokens) of the generated text, including the input.
+     */
+    min_length?: number;
+    /**
+     * The minimum number of tokens to generate. Takes precedence over maxLength.
+     */
+    min_new_tokens?: number;
+    /**
+     * Number of groups to divide num_beams into in order to ensure diversity among different
+     * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+     */
+    num_beam_groups?: number;
+    /**
+     * Number of beams to use for beam search.
+     */
+    num_beams?: number;
+    /**
+     * The value balances the model confidence and the degeneration penalty in contrastive
+     * search decoding.
+     */
+    penalty_alpha?: number;
+    /**
+     * The value used to modulate the next token probabilities.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    top_k?: number;
+    /**
+     * If set to float < 1, only the smallest set of most probable tokens with probabilities
+     * that add up to top_p or higher are kept for generation.
+     */
+    top_p?: number;
+    /**
+     * Local typicality measures how similar the conditional probability of predicting a target
+     * token next is to the expected conditional probability of predicting a random token next,
+     * given the partial text already generated. If set to float < 1, the smallest set of the
+     * most locally typical tokens with probabilities that add up to typical_p or higher are
+     * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+     */
+    typical_p?: number;
+    /**
+     * Whether the model should use the past last key/values attentions to speed up decoding
+     */
+    use_cache?: boolean;
+    [property: string]: unknown;
+}
+
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
+
+/**
+ * Outputs of inference for the Image To Text task
+ */
+export interface ImageToTextOutput {
+    generatedText: unknown;
+    /**
+     * The generated text.
+     */
+    generated_text?: string;
+    [property: string]: unknown;
+}
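For orientation, the generated types above can be consumed directly when building a request payload. The following is a minimal sketch, assuming a base64-encoded image string for the untyped `inputs` field and a local `./inference` import path; the helper name is illustrative and not part of the package.

```ts
import type { ImageToTextInput, GenerationParameters } from "./inference";

// Hypothetical helper: build a typed image-to-text request payload.
// The base64 string is only one possible shape for the deliberately untyped `inputs` field.
function buildImageToTextRequest(imageBase64: string): ImageToTextInput {
	const generate: GenerationParameters = {
		do_sample: false,
		num_beams: 4,
		max_new_tokens: 64,
	};
	return {
		inputs: imageBase64,
		parameters: { generate, max_new_tokens: 64 },
	};
}
```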
package/src/tasks/image-to-text/spec/input.json
ADDED
@@ -0,0 +1,34 @@
+{
+    "$id": "/inference/schemas/image-to-text/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Image To Text inference",
+    "title": "ImageToTextInput",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "The input image data"
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/$defs/ImageToTextParameters"
+        }
+    },
+    "$defs": {
+        "ImageToTextParameters": {
+            "title": "ImageToTextParameters",
+            "description": "Additional inference parameters for Image To Text",
+            "type": "object",
+            "properties": {
+                "max_new_tokens": {
+                    "type": "integer",
+                    "description": "The amount of maximum tokens to generate."
+                },
+                "generate": {
+                    "description": "Parametrization of the text generation process",
+                    "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
+                }
+            }
+        }
+    },
+    "required": ["inputs"]
+}
package/src/tasks/image-to-text/spec/output.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "$id": "/inference/schemas/image-to-text/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs of inference for the Image To Text task",
+    "title": "ImageToTextOutput",
+    "type": "object",
+    "properties": {
+        "generated_text": {
+            "type": "string",
+            "description": "The generated text."
+        }
+    },
+    "required": ["generatedText"]
+}
package/src/tasks/index.ts
CHANGED
@@ -11,6 +11,7 @@ import imageClassification from "./image-classification/data";
 import imageToImage from "./image-to-image/data";
 import imageToText from "./image-to-text/data";
 import imageSegmentation from "./image-segmentation/data";
+import maskGeneration from "./mask-generation/data";
 import objectDetection from "./object-detection/data";
 import depthEstimation from "./depth-estimation/data";
 import placeholder from "./placeholder/data";
@@ -33,6 +34,7 @@ import videoClassification from "./video-classification/data";
 import visualQuestionAnswering from "./visual-question-answering/data";
 import zeroShotClassification from "./zero-shot-classification/data";
 import zeroShotImageClassification from "./zero-shot-image-classification/data";
+import zeroShotObjectDetection from "./zero-shot-object-detection/data";
 
 import type { ModelLibraryKey } from "../model-libraries";
 
@@ -131,7 +133,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
     "image-to-image": getData("image-to-image", imageToImage),
     "image-to-text": getData("image-to-text", imageToText),
     "image-to-video": undefined,
-    "mask-generation": getData("mask-generation",
+    "mask-generation": getData("mask-generation", maskGeneration),
     "multiple-choice": undefined,
     "object-detection": getData("object-detection", objectDetection),
     "video-classification": getData("video-classification", videoClassification),
@@ -162,7 +164,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
     "voice-activity-detection": undefined,
     "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
     "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
-    "zero-shot-object-detection": getData("zero-shot-object-detection",
+    "zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
     "text-to-3d": getData("text-to-3d", placeholder),
     "image-to-3d": getData("image-to-3d", placeholder),
 } as const;
@@ -216,6 +218,7 @@ export interface TaskData {
     datasets: ExampleRepo[];
     demo: TaskDemo;
     id: PipelineType;
+    canonicalId?: PipelineType;
     isPlaceholder?: boolean;
     label: string;
     libraries: ModelLibraryKey[];
package/src/tasks/mask-generation/about.md
ADDED
@@ -0,0 +1,65 @@
+## Use Cases
+
+### Filtering an Image
+
+When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
+
+### Masked Image Modelling
+
+Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
+
+### Human-in-the-loop Computer Vision Applications
+
+For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
+
+## Task Variants
+
+### Segmentation
+
+Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
+
+## Inference
+
+Mask generation models often work in two modes: segment everything or prompt mode.
+The example below works in segment-everything-mode, where many masks will be returned.
+
+```python
+from transformers import pipeline
+
+generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
+image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+outputs = generator(image_url)
+outputs["masks"]
+# array of multiple binary masks returned for each generated mask
+```
+
+Prompt mode takes in three types of prompts:
+
+- **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
+- **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
+- **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
+
+Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
+
+```python
+from transformers import SamModel, SamProcessor
+from PIL import Image
+import requests
+
+model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
+processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
+
+raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+# pointing to the car window
+input_points = [[[450, 600]]]
+inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+scores = outputs.iou_scores
+```
+
+## Useful Resources
+
+Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
+
+- [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)
package/src/tasks/mask-generation/data.ts
CHANGED
@@ -3,14 +3,51 @@ import type { TaskDataCustom } from "..";
 const taskData: TaskDataCustom = {
     datasets: [],
     demo: {
-        inputs: [
-
+        inputs: [
+            {
+                filename: "mask-generation-input.png",
+                type: "img",
+            },
+        ],
+        outputs: [
+            {
+                filename: "mask-generation-output.png",
+                type: "img",
+            },
+        ],
     },
     metrics: [],
-    models: [
-
+    models: [
+        {
+            description: "Small yet powerful mask generation model.",
+            id: "Zigeng/SlimSAM-uniform-50",
+        },
+        {
+            description: "Very strong mask generation model.",
+            id: "facebook/sam-vit-huge",
+        },
+    ],
+    spaces: [
+        {
+            description:
+                "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
+            id: "SkalskiP/SAM_and_MetaCLIP",
+        },
+        {
+            description: "An application that compares the performance of a large and a small mask generation model.",
+            id: "merve/slimsam",
+        },
+        {
+            description: "An application based on an improved mask generation model.",
+            id: "linfanluntan/Grounded-SAM",
+        },
+        {
+            description: "An application to remove objects from videos using mask generation models.",
+            id: "SkalskiP/SAM_and_ProPainter",
+        },
+    ],
     summary:
-        "Mask generation is
+        "Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
     widgetModels: [],
     youtubeId: "",
 };
package/src/tasks/object-detection/inference.ts
ADDED
@@ -0,0 +1,62 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Object Detection inference
+ */
+export interface ObjectDetectionInput {
+    /**
+     * The input image data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ObjectDetectionParameters;
+    [property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Object Detection
+ */
+export interface ObjectDetectionParameters {
+    /**
+     * The probability necessary to make a prediction.
+     */
+    threshold?: number;
+    [property: string]: unknown;
+}
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
+}
+export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
+/**
+ * Outputs of inference for the Object Detection task
+ */
+export interface ObjectDetectionOutputElement {
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: BoundingBox;
+    /**
+     * The predicted label for the bounding box
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
+}
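The output types above lend themselves to simple post-processing on the caller side. A minimal sketch, assuming the caller already holds a parsed `ObjectDetectionOutput`; the score cutoff and helper name are illustrative, not part of the package.

```ts
import type { ObjectDetectionOutput, ObjectDetectionOutputElement } from "./inference";

// Illustrative post-processing: keep confident detections and report box areas in pixels.
function summarizeDetections(output: ObjectDetectionOutput, minScore = 0.9): string[] {
	return output
		.filter((det: ObjectDetectionOutputElement) => det.score >= minScore)
		.map((det) => {
			const area = (det.box.xmax - det.box.xmin) * (det.box.ymax - det.box.ymin);
			return `${det.label} (score ${det.score.toFixed(2)}, area ${area}px)`;
		});
}
```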
package/src/tasks/object-detection/spec/input.json
ADDED
@@ -0,0 +1,30 @@
+{
+    "$id": "/inference/schemas/object-detection/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Object Detection inference",
+    "title": "ObjectDetectionInput",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "The input image data"
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/$defs/ObjectDetectionParameters"
+        }
+    },
+    "$defs": {
+        "ObjectDetectionParameters": {
+            "title": "ObjectDetectionParameters",
+            "description": "Additional inference parameters for Object Detection",
+            "type": "object",
+            "properties": {
+                "threshold": {
+                    "type": "number",
+                    "description": "The probability necessary to make a prediction."
+                }
+            }
+        }
+    },
+    "required": ["inputs"]
+}
package/src/tasks/object-detection/spec/output.json
ADDED
@@ -0,0 +1,46 @@
+{
+    "$id": "/inference/schemas/object-detection/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs of inference for the Object Detection task",
+    "title": "ObjectDetectionOutput",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "label": {
+                "type": "string",
+                "description": "The predicted label for the bounding box"
+            },
+            "score": {
+                "type": "number",
+                "description": "The associated score / probability"
+            },
+            "box": {
+                "$ref": "#/$defs/BoundingBox",
+                "description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+            }
+        },
+        "required": ["box", "label", "score"]
+    },
+    "$defs": {
+        "BoundingBox": {
+            "type": "object",
+            "title": "BoundingBox",
+            "properties": {
+                "xmin": {
+                    "type": "integer"
+                },
+                "xmax": {
+                    "type": "integer"
+                },
+                "ymin": {
+                    "type": "integer"
+                },
+                "ymax": {
+                    "type": "integer"
+                }
+            },
+            "required": ["xmin", "xmax", "ymin", "ymax"]
+        }
+    }
+}
package/src/tasks/placeholder/data.ts
CHANGED
@@ -13,6 +13,9 @@ const taskData: TaskDataCustom = {
     summary: "",
     widgetModels: [],
     youtubeId: undefined,
+    /// If this is a subtask, link to the most general task ID
+    /// (eg, text2text-generation is the canonical ID of translation)
+    canonicalId: undefined,
 };
 
 export default taskData;
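The new `canonicalId` field ties a subtask to its most general task (per the comment above, translation points to text2text-generation). A hypothetical consumer could follow that link when a task entry only carries placeholder data. This sketch assumes `TASKS_DATA` and `PipelineType` are re-exported from the package root, which is an assumption made for illustration.

```ts
import { TASKS_DATA, type PipelineType } from "@huggingface/tasks"; // assumed root re-exports

// Hypothetical lookup: fall back to the canonical (most general) task when a
// subtask entry is only a placeholder.
function resolveTaskData(task: PipelineType) {
	const data = TASKS_DATA[task];
	if (data?.isPlaceholder && data.canonicalId) {
		return TASKS_DATA[data.canonicalId] ?? data;
	}
	return data;
}
```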
package/src/tasks/placeholder/spec/input.json
ADDED
@@ -0,0 +1,35 @@
+{
+    "$id": "/inference/schemas/<TASK_ID>/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for <TASK_ID> inference",
+    "title": "PlaceholderInput",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "TODO: describe the input here. This must be model & framework agnostic.",
+            "type": "string"
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/$defs/<TASK_ID>Parameters"
+        }
+    },
+    "$defs": {
+        "<TASK_ID>Parameters": {
+            "title": "<TASK_ID>Parameters",
+            "description": "TODO: describe additional parameters here.",
+            "type": "object",
+            "properties": {
+                "dummy_parameter_name": {
+                    "type": "boolean",
+                    "description": "TODO: describe the parameter here"
+                },
+                "dummy_parameter_name2": {
+                    "type": "integer",
+                    "description": "TODO: describe the parameter here"
+                }
+            }
+        }
+    },
+    "required": ["inputs"]
+}
package/src/tasks/placeholder/spec/output.json
ADDED
@@ -0,0 +1,17 @@
+{
+    "$id": "/inference/schemas/<TASK_ID>/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs for <TASK_ID> inference",
+    "title": "PlaceholderOutput",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "meaningful_output_name": {
+                "type": "string",
+                "description": "TODO: Describe what is outputed by the inference here"
+            }
+        },
+        "required": ["meaningfulOutputName"]
+    }
+}
package/src/tasks/question-answering/inference.ts
ADDED
@@ -0,0 +1,99 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Question Answering inference
+ */
+export interface QuestionAnsweringInput {
+    /**
+     * One (context, question) pair to answer
+     */
+    inputs: QuestionAnsweringInputData;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: QuestionAnsweringParameters;
+    [property: string]: unknown;
+}
+/**
+ * One (context, question) pair to answer
+ */
+export interface QuestionAnsweringInputData {
+    /**
+     * The context to be used for answering the question
+     */
+    context: string;
+    /**
+     * The question to be answered
+     */
+    question: string;
+    [property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Question Answering
+ */
+export interface QuestionAnsweringParameters {
+    /**
+     * Attempts to align the answer to real words. Improves quality on space separated
+     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+     */
+    align_to_words?: boolean;
+    /**
+     * If the context is too long to fit with the question for the model, it will be split in
+     * several chunks with some overlap. This argument controls the size of that overlap.
+     */
+    doc_stride?: number;
+    /**
+     * Whether to accept impossible as an answer.
+     */
+    handle_impossible_answer?: boolean;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    max_answer_len?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    max_question_len?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using docStride as
+     * overlap) if needed.
+     */
+    max_seq_len?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    top_k?: number;
+    [property: string]: unknown;
+}
+export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
+/**
+ * Outputs of inference for the Question Answering task
+ */
+export interface QuestionAnsweringOutputElement {
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    /**
+     * The character position in the input where the answer ends.
+     */
+    end: number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    /**
+     * The character position in the input where the answer begins.
+     */
+    start: number;
+    [property: string]: unknown;
+}
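These question-answering types can type-check a round trip to an inference endpoint. A minimal sketch, assuming a generic JSON-over-HTTP endpoint and the global `fetch`; the URL, token handling, sample texts, and helper name are illustrative and not part of this package.

```ts
import type { QuestionAnsweringInput, QuestionAnsweringOutput } from "./inference";

// Illustrative only: POST a typed question-answering payload to a hypothetical endpoint.
async function answer(endpoint: string, token: string): Promise<QuestionAnsweringOutput> {
	const payload: QuestionAnsweringInput = {
		inputs: {
			question: "What is my name?",
			context: "My name is Clara and I live in Berkeley.",
		},
		parameters: { top_k: 1, max_answer_len: 15 },
	};
	const res = await fetch(endpoint, {
		method: "POST",
		headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
		body: JSON.stringify(payload),
	});
	return (await res.json()) as QuestionAnsweringOutput;
}
```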