@huggingface/tasks 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. package/dist/{index.mjs → index.cjs} +295 -134
  2. package/dist/index.d.ts +8 -6
  3. package/dist/index.js +260 -169
  4. package/package.json +13 -8
  5. package/src/library-to-tasks.ts +1 -1
  6. package/src/library-ui-elements.ts +24 -10
  7. package/src/model-data.ts +1 -1
  8. package/src/model-libraries.ts +3 -2
  9. package/src/pipelines.ts +1 -1
  10. package/src/tasks/audio-classification/about.md +1 -1
  11. package/src/tasks/audio-classification/inference.ts +51 -0
  12. package/src/tasks/audio-classification/spec/input.json +34 -0
  13. package/src/tasks/audio-classification/spec/output.json +21 -0
  14. package/src/tasks/audio-to-audio/about.md +1 -1
  15. package/src/tasks/automatic-speech-recognition/about.md +4 -2
  16. package/src/tasks/automatic-speech-recognition/inference.ts +154 -0
  17. package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
  18. package/src/tasks/automatic-speech-recognition/spec/output.json +36 -0
  19. package/src/tasks/common-definitions.json +109 -0
  20. package/src/tasks/depth-estimation/data.ts +8 -4
  21. package/src/tasks/depth-estimation/inference.ts +35 -0
  22. package/src/tasks/depth-estimation/spec/input.json +30 -0
  23. package/src/tasks/depth-estimation/spec/output.json +10 -0
  24. package/src/tasks/document-question-answering/inference.ts +102 -0
  25. package/src/tasks/document-question-answering/spec/input.json +85 -0
  26. package/src/tasks/document-question-answering/spec/output.json +36 -0
  27. package/src/tasks/feature-extraction/inference.ts +22 -0
  28. package/src/tasks/feature-extraction/spec/input.json +26 -0
  29. package/src/tasks/feature-extraction/spec/output.json +7 -0
  30. package/src/tasks/fill-mask/inference.ts +61 -0
  31. package/src/tasks/fill-mask/spec/input.json +38 -0
  32. package/src/tasks/fill-mask/spec/output.json +29 -0
  33. package/src/tasks/image-classification/inference.ts +51 -0
  34. package/src/tasks/image-classification/spec/input.json +34 -0
  35. package/src/tasks/image-classification/spec/output.json +10 -0
  36. package/src/tasks/image-segmentation/inference.ts +65 -0
  37. package/src/tasks/image-segmentation/spec/input.json +54 -0
  38. package/src/tasks/image-segmentation/spec/output.json +25 -0
  39. package/src/tasks/image-to-image/inference.ts +67 -0
  40. package/src/tasks/image-to-image/spec/input.json +52 -0
  41. package/src/tasks/image-to-image/spec/output.json +12 -0
  42. package/src/tasks/image-to-text/inference.ts +138 -0
  43. package/src/tasks/image-to-text/spec/input.json +34 -0
  44. package/src/tasks/image-to-text/spec/output.json +17 -0
  45. package/src/tasks/index.ts +5 -2
  46. package/src/tasks/mask-generation/about.md +65 -0
  47. package/src/tasks/mask-generation/data.ts +55 -0
  48. package/src/tasks/object-detection/inference.ts +62 -0
  49. package/src/tasks/object-detection/spec/input.json +30 -0
  50. package/src/tasks/object-detection/spec/output.json +46 -0
  51. package/src/tasks/placeholder/data.ts +3 -0
  52. package/src/tasks/placeholder/spec/input.json +35 -0
  53. package/src/tasks/placeholder/spec/output.json +17 -0
  54. package/src/tasks/question-answering/inference.ts +99 -0
  55. package/src/tasks/question-answering/spec/input.json +67 -0
  56. package/src/tasks/question-answering/spec/output.json +29 -0
  57. package/src/tasks/sentence-similarity/about.md +2 -2
  58. package/src/tasks/sentence-similarity/inference.ts +32 -0
  59. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  60. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  61. package/src/tasks/summarization/data.ts +1 -0
  62. package/src/tasks/summarization/inference.ts +58 -0
  63. package/src/tasks/summarization/spec/input.json +7 -0
  64. package/src/tasks/summarization/spec/output.json +7 -0
  65. package/src/tasks/table-question-answering/inference.ts +61 -0
  66. package/src/tasks/table-question-answering/spec/input.json +39 -0
  67. package/src/tasks/table-question-answering/spec/output.json +40 -0
  68. package/src/tasks/tabular-classification/about.md +1 -1
  69. package/src/tasks/tabular-regression/about.md +1 -1
  70. package/src/tasks/text-classification/about.md +1 -0
  71. package/src/tasks/text-classification/inference.ts +51 -0
  72. package/src/tasks/text-classification/spec/input.json +35 -0
  73. package/src/tasks/text-classification/spec/output.json +10 -0
  74. package/src/tasks/text-generation/about.md +24 -13
  75. package/src/tasks/text-generation/data.ts +22 -38
  76. package/src/tasks/text-generation/inference.ts +85 -0
  77. package/src/tasks/text-generation/spec/input.json +74 -0
  78. package/src/tasks/text-generation/spec/output.json +17 -0
  79. package/src/tasks/text-to-audio/inference.ts +138 -0
  80. package/src/tasks/text-to-audio/spec/input.json +31 -0
  81. package/src/tasks/text-to-audio/spec/output.json +20 -0
  82. package/src/tasks/text-to-image/about.md +11 -2
  83. package/src/tasks/text-to-image/data.ts +6 -2
  84. package/src/tasks/text-to-image/inference.ts +73 -0
  85. package/src/tasks/text-to-image/spec/input.json +57 -0
  86. package/src/tasks/text-to-image/spec/output.json +15 -0
  87. package/src/tasks/text-to-speech/about.md +4 -2
  88. package/src/tasks/text-to-speech/data.ts +1 -0
  89. package/src/tasks/text-to-speech/inference.ts +146 -0
  90. package/src/tasks/text-to-speech/spec/input.json +7 -0
  91. package/src/tasks/text-to-speech/spec/output.json +7 -0
  92. package/src/tasks/text2text-generation/inference.ts +53 -0
  93. package/src/tasks/text2text-generation/spec/input.json +55 -0
  94. package/src/tasks/text2text-generation/spec/output.json +17 -0
  95. package/src/tasks/token-classification/inference.ts +82 -0
  96. package/src/tasks/token-classification/spec/input.json +65 -0
  97. package/src/tasks/token-classification/spec/output.json +33 -0
  98. package/src/tasks/translation/data.ts +1 -0
  99. package/src/tasks/translation/inference.ts +58 -0
  100. package/src/tasks/translation/spec/input.json +7 -0
  101. package/src/tasks/translation/spec/output.json +7 -0
  102. package/src/tasks/video-classification/inference.ts +59 -0
  103. package/src/tasks/video-classification/spec/input.json +42 -0
  104. package/src/tasks/video-classification/spec/output.json +10 -0
  105. package/src/tasks/visual-question-answering/inference.ts +63 -0
  106. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  107. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  108. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  109. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  110. package/src/tasks/zero-shot-classification/spec/output.json +10 -0
  111. package/src/tasks/zero-shot-image-classification/data.ts +8 -5
  112. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  113. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  114. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  115. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  116. package/src/tasks/zero-shot-object-detection/data.ts +62 -0
  117. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  118. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  119. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  120. package/tsconfig.json +3 -3

package/src/tasks/image-segmentation/spec/input.json
@@ -0,0 +1,54 @@
+ {
+   "$id": "/inference/schemas/image-segmentation/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Image Segmentation inference",
+   "title": "ImageSegmentationInput",
+   "type": "object",
+   "properties": {
+     "data": {
+       "description": "The input image data"
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ImageSegmentationParameters"
+     }
+   },
+   "$defs": {
+     "ImageSegmentationParameters": {
+       "title": "ImageSegmentationParameters",
+       "description": "Additional inference parameters for Image Segmentation",
+       "type": "object",
+       "properties": {
+         "maskThreshold": {
+           "type": "number",
+           "description": "Threshold to use when turning the predicted masks into binary values."
+         },
+         "overlapMaskAreaThreshold": {
+           "type": "number",
+           "description": "Mask overlap threshold to eliminate small, disconnected segments."
+         },
+         "subtask": {
+           "title": "ImageSegmentationSubtask",
+           "type": "string",
+           "description": "Segmentation task to be performed, depending on model capabilities.",
+           "oneOf": [
+             {
+               "const": "instance"
+             },
+             {
+               "const": "panoptic"
+             },
+             {
+               "const": "semantic"
+             }
+           ]
+         },
+         "threshold": {
+           "type": "number",
+           "description": "Probability threshold to filter out predicted masks."
+         }
+       }
+     }
+   },
+   "required": ["data"]
+ }

package/src/tasks/image-segmentation/spec/output.json
@@ -0,0 +1,25 @@
+ {
+   "$id": "/inference/schemas/image-segmentation/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Image Segmentation task",
+   "title": "ImageSegmentationOutput",
+   "type": "array",
+   "items": {
+     "description": "A predicted mask / segment",
+     "type": "object",
+     "properties": {
+       "label": {
+         "type": "string",
+         "description": "The label of the predicted segment"
+       },
+       "mask": {
+         "description": "The corresponding mask as a black-and-white image"
+       },
+       "score": {
+         "type": "number",
+         "description": "The score or confidence degreee the model has"
+       }
+     },
+     "required": ["label", "mask"]
+   }
+ }
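
For orientation, here is a minimal TypeScript sketch of a request and a response shaped like the two schemas above. It is illustrative only: the image URL, the parameter values, and the local `SegmentEntry` alias are placeholders; the generated types published by the package live in `package/src/tasks/image-segmentation/inference.ts`.

```ts
// Request shaped like ImageSegmentationInput (all values are placeholders).
const segmentationRequest = {
  data: "https://example.com/street.jpg", // the schema leaves the image payload type open
  parameters: {
    subtask: "panoptic",                  // one of "instance" | "panoptic" | "semantic"
    threshold: 0.9,                       // drop low-confidence masks
    maskThreshold: 0.5,                   // binarization threshold for predicted masks
    overlapMaskAreaThreshold: 0.8,        // eliminate small, disconnected segments
  },
};

// The output schema is an array of { label, mask, score? } entries.
type SegmentEntry = { label: string; mask: unknown; score?: number };
const segments: SegmentEntry[] = []; // would be filled in by an inference backend
```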

package/src/tasks/image-to-image/inference.ts
@@ -0,0 +1,67 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+
+ /**
+  * Inputs for Image To Image inference
+  */
+ export interface ImageToImageInput {
+   /**
+    * The input image data
+    */
+   data: unknown;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: ImageToImageParameters;
+   [property: string]: unknown;
+ }
+
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Image To Image
+  */
+ export interface ImageToImageParameters {
+   /**
+    * For diffusion models. A higher guidance scale value encourages the model to generate
+    * images closely linked to the text prompt at the expense of lower image quality.
+    */
+   guidanceScale?: number;
+   /**
+    * One or several prompt to guide what NOT to include in image generation.
+    */
+   negativePrompt?: string[];
+   /**
+    * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+    * a higher quality image at the expense of slower inference.
+    */
+   numInferenceSteps?: number;
+   /**
+    * The size in pixel of the output image
+    */
+   targetSize?: TargetSize;
+   [property: string]: unknown;
+ }
+
+ /**
+  * The size in pixel of the output image
+  */
+ export interface TargetSize {
+   height: number;
+   width: number;
+   [property: string]: unknown;
+ }
+
+ /**
+  * Outputs of inference for the Image To Image task
+  */
+ export interface ImageToImageOutput {
+   /**
+    * The output image
+    */
+   image?: unknown;
+   [property: string]: unknown;
+ }
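
As a quick illustration of the generated types above, here is a hedged usage sketch; the relative import path and every literal value are assumptions, not part of the diff.

```ts
// Sketch only: the import path and values are illustrative.
import type { ImageToImageInput } from "./src/tasks/image-to-image/inference";

const request: ImageToImageInput = {
  data: "https://example.com/sketch.png", // `data` is typed as unknown, so any image payload fits
  parameters: {
    guidanceScale: 7.5,                   // higher = follow the prompt more closely, at the cost of image quality
    negativePrompt: ["blurry", "low quality"],
    numInferenceSteps: 30,                // more denoising steps = better quality, slower inference
    targetSize: { width: 1024, height: 768 },
  },
};
```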

package/src/tasks/image-to-image/spec/input.json
@@ -0,0 +1,52 @@
+ {
+   "$id": "/inference/schemas/image-to-image/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Image To Image inference",
+   "title": "ImageToImageInput",
+   "type": "object",
+   "properties": {
+     "data": {
+       "description": "The input image data"
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ImageToImageParameters"
+     }
+   },
+   "$defs": {
+     "ImageToImageParameters": {
+       "title": "ImageToImageParameters",
+       "description": "Additional inference parameters for Image To Image",
+       "type": "object",
+       "properties": {
+         "guidanceScale": {
+           "type": "number",
+           "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+         },
+         "negativePrompt": {
+           "type": "array",
+           "items": { "type": "string" },
+           "description": "One or several prompt to guide what NOT to include in image generation."
+         },
+         "numInferenceSteps": {
+           "type": "integer",
+           "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+         },
+         "targetSize": {
+           "type": "object",
+           "description": "The size in pixel of the output image",
+           "properties": {
+             "width": {
+               "type": "integer"
+             },
+             "height": {
+               "type": "integer"
+             }
+           },
+           "required": ["width", "height"]
+         }
+       }
+     }
+   },
+   "required": ["data"]
+ }

package/src/tasks/image-to-image/spec/output.json
@@ -0,0 +1,12 @@
+ {
+   "$id": "/inference/schemas/image-to-image/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Image To Image task",
+   "title": "ImageToImageOutput",
+   "type": "object",
+   "properties": {
+     "image": {
+       "description": "The output image"
+     }
+   }
+ }

package/src/tasks/image-to-text/inference.ts
@@ -0,0 +1,138 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Image To Text inference
+  */
+ export interface ImageToTextInput {
+   /**
+    * The input image data
+    */
+   data: unknown;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: ImageToTextParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Image To Text
+  */
+ export interface ImageToTextParameters {
+   /**
+    * Parametrization of the text generation process
+    */
+   generate?: GenerationParameters;
+   /**
+    * The amount of maximum tokens to generate.
+    */
+   maxNewTokens?: number;
+   [property: string]: unknown;
+ }
+ /**
+  * Parametrization of the text generation process
+  *
+  * Ad-hoc parametrization of the text generation process
+  */
+ export interface GenerationParameters {
+   /**
+    * Whether to use sampling instead of greedy decoding when generating new tokens.
+    */
+   doSample?: boolean;
+   /**
+    * Controls the stopping condition for beam-based methods.
+    */
+   earlyStopping?: EarlyStoppingUnion;
+   /**
+    * If set to float strictly between 0 and 1, only tokens with a conditional probability
+    * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+    * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+    * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+    */
+   epsilonCutoff?: number;
+   /**
+    * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    * float strictly between 0 and 1, a token is only considered if it is greater than either
+    * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    * for more details.
+    */
+   etaCutoff?: number;
+   /**
+    * The maximum length (in tokens) of the generated text, including the input.
+    */
+   maxLength?: number;
+   /**
+    * The maximum number of tokens to generate. Takes precedence over maxLength.
+    */
+   maxNewTokens?: number;
+   /**
+    * The minimum length (in tokens) of the generated text, including the input.
+    */
+   minLength?: number;
+   /**
+    * The minimum number of tokens to generate. Takes precedence over maxLength.
+    */
+   minNewTokens?: number;
+   /**
+    * Number of groups to divide num_beams into in order to ensure diversity among different
+    * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    */
+   numBeamGroups?: number;
+   /**
+    * Number of beams to use for beam search.
+    */
+   numBeams?: number;
+   /**
+    * The value balances the model confidence and the degeneration penalty in contrastive
+    * search decoding.
+    */
+   penaltyAlpha?: number;
+   /**
+    * The value used to modulate the next token probabilities.
+    */
+   temperature?: number;
+   /**
+    * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+    */
+   topK?: number;
+   /**
+    * If set to float < 1, only the smallest set of most probable tokens with probabilities
+    * that add up to top_p or higher are kept for generation.
+    */
+   topP?: number;
+   /**
+    * Local typicality measures how similar the conditional probability of predicting a target
+    * token next is to the expected conditional probability of predicting a random token next,
+    * given the partial text already generated. If set to float < 1, the smallest set of the
+    * most locally typical tokens with probabilities that add up to typical_p or higher are
+    * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+    */
+   typicalP?: number;
+   /**
+    * Whether the model should use the past last key/values attentions to speed up decoding
+    */
+   useCache?: boolean;
+   [property: string]: unknown;
+ }
+ /**
+  * Controls the stopping condition for beam-based methods.
+  */
+ export type EarlyStoppingUnion = boolean | "never";
+ export type ImageToTextOutput = ImageToTextOutputElement[];
+ /**
+  * Outputs of inference for the Image To Text task
+  */
+ export interface ImageToTextOutputElement {
+   /**
+    * The generated text.
+    */
+   generatedText: string;
+   [property: string]: unknown;
+ }
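
The shared `GenerationParameters` object (referenced from `common-definitions.json` in the spec below) is easiest to read through an example. The sketch here assumes the generated types above are importable; the path and all values are placeholders.

```ts
// Illustrative import path and values.
import type { ImageToTextInput } from "./src/tasks/image-to-text/inference";

// Beam-search captioning: deterministic decoding that can stop early once beams converge.
const beamSearchCaption: ImageToTextInput = {
  data: "https://example.com/cat.png",
  parameters: {
    maxNewTokens: 40,
    generate: { doSample: false, numBeams: 4, earlyStopping: true },
  },
};

// Sampling-based captioning: temperature, top-k and top-p control the randomness.
const sampledCaption: ImageToTextInput = {
  data: "https://example.com/cat.png",
  parameters: {
    generate: { doSample: true, temperature: 0.7, topK: 50, topP: 0.95 },
  },
};
```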

package/src/tasks/image-to-text/spec/input.json
@@ -0,0 +1,34 @@
+ {
+   "$id": "/inference/schemas/image-to-text/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Image To Text inference",
+   "title": "ImageToTextInput",
+   "type": "object",
+   "properties": {
+     "data": {
+       "description": "The input image data"
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ImageToTextParameters"
+     }
+   },
+   "$defs": {
+     "ImageToTextParameters": {
+       "title": "ImageToTextParameters",
+       "description": "Additional inference parameters for Image To Text",
+       "type": "object",
+       "properties": {
+         "maxNewTokens": {
+           "type": "integer",
+           "description": "The amount of maximum tokens to generate."
+         },
+         "generate": {
+           "description": "Parametrization of the text generation process",
+           "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
+         }
+       }
+     }
+   },
+   "required": ["data"]
+ }

package/src/tasks/image-to-text/spec/output.json
@@ -0,0 +1,17 @@
+ {
+   "$id": "/inference/schemas/image-to-text/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Image To Text task",
+   "title": "ImageToTextOutput",
+   "type": "array",
+   "items": {
+     "type": "object",
+     "properties": {
+       "generatedText": {
+         "type": "string",
+         "description": "The generated text."
+       }
+     },
+     "required": ["generatedText"]
+   }
+ }

package/src/tasks/index.ts
@@ -11,6 +11,7 @@ import imageClassification from "./image-classification/data";
  import imageToImage from "./image-to-image/data";
  import imageToText from "./image-to-text/data";
  import imageSegmentation from "./image-segmentation/data";
+ import maskGeneration from "./mask-generation/data";
  import objectDetection from "./object-detection/data";
  import depthEstimation from "./depth-estimation/data";
  import placeholder from "./placeholder/data";
@@ -33,6 +34,7 @@ import videoClassification from "./video-classification/data";
  import visualQuestionAnswering from "./visual-question-answering/data";
  import zeroShotClassification from "./zero-shot-classification/data";
  import zeroShotImageClassification from "./zero-shot-image-classification/data";
+ import zeroShotObjectDetection from "./zero-shot-object-detection/data";
 
  import type { ModelLibraryKey } from "../model-libraries";
 
@@ -131,7 +133,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
    "image-to-image": getData("image-to-image", imageToImage),
    "image-to-text": getData("image-to-text", imageToText),
    "image-to-video": undefined,
-   "mask-generation": getData("mask-generation", placeholder),
+   "mask-generation": getData("mask-generation", maskGeneration),
    "multiple-choice": undefined,
    "object-detection": getData("object-detection", objectDetection),
    "video-classification": getData("video-classification", videoClassification),
@@ -162,7 +164,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
    "voice-activity-detection": undefined,
    "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
    "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
-   "zero-shot-object-detection": getData("zero-shot-object-detection", placeholder),
+   "zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
    "text-to-3d": getData("text-to-3d", placeholder),
    "image-to-3d": getData("image-to-3d", placeholder),
  } as const;
@@ -216,6 +218,7 @@ export interface TaskData {
    datasets: ExampleRepo[];
    demo: TaskDemo;
    id: PipelineType;
+   canonicalId?: PipelineType;
    isPlaceholder?: boolean;
    label: string;
    libraries: ModelLibraryKey[];
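
Seen from the consumer side, these wiring changes mean the two tasks now resolve to real task data. A sketch, assuming `TASKS_DATA` is exported from the package root as in other 0.2.x releases; the logging is illustrative.

```ts
import { TASKS_DATA } from "@huggingface/tasks";

// "mask-generation" and "zero-shot-object-detection" no longer point at the placeholder entry.
const maskGen = TASKS_DATA["mask-generation"];
if (maskGen) {
  console.log(maskGen.label, maskGen.models.map((m) => m.id));
  // canonicalId is the new optional field added to TaskData above.
  console.log(maskGen.canonicalId ?? "(no canonical task)");
}
```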

package/src/tasks/mask-generation/about.md
@@ -0,0 +1,65 @@
+ ## Use Cases
+
+ ### Filtering an Image
+
+ When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
+
+ ### Masked Image Modelling
+
+ Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
+
+ ### Human-in-the-loop Computer Vision Applications
+
+ For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
+
+ ## Task Variants
+
+ ### Segmentation
+
+ Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
+
+ ## Inference
+
+ Mask generation models often work in two modes: segment everything or prompt mode.
+ The example below works in segment-everything-mode, where many masks will be returned.
+
+ ```python
+ from transformers import pipeline
+
+ generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ outputs = generator(image_url)
+ outputs["masks"]
+ # array of multiple binary masks returned for each generated mask
+ ```
+
+ Prompt mode takes in three types of prompts:
+
+ - **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
+ - **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
+ - **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
+
+ Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
+
+ ```python
+ from transformers import SamModel, SamProcessor
+ from PIL import Image
+ import requests
+
+ model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
+ processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
+
+ raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+ # pointing to the car window
+ input_points = [[[450, 600]]]
+ inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
+ outputs = model(**inputs)
+ masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+ scores = outputs.iou_scores
+ ```
+
+ ## Useful Resources
+
+ Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
+
+ - [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)

package/src/tasks/mask-generation/data.ts
@@ -0,0 +1,55 @@
+ import type { TaskDataCustom } from "..";
+
+ const taskData: TaskDataCustom = {
+   datasets: [],
+   demo: {
+     inputs: [
+       {
+         filename: "mask-generation-input.png",
+         type: "img",
+       },
+     ],
+     outputs: [
+       {
+         filename: "mask-generation-output.png",
+         type: "img",
+       },
+     ],
+   },
+   metrics: [],
+   models: [
+     {
+       description: "Small yet powerful mask generation model.",
+       id: "Zigeng/SlimSAM-uniform-50",
+     },
+     {
+       description: "Very strong mask generation model.",
+       id: "facebook/sam-vit-huge",
+     },
+   ],
+   spaces: [
+     {
+       description:
+         "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
+       id: "SkalskiP/SAM_and_MetaCLIP",
+     },
+     {
+       description: "An application that compares the performance of a large and a small mask generation model.",
+       id: "merve/slimsam",
+     },
+     {
+       description: "An application based on an improved mask generation model.",
+       id: "linfanluntan/Grounded-SAM",
+     },
+     {
+       description: "An application to remove objects from videos using mask generation models.",
+       id: "SkalskiP/SAM_and_ProPainter",
+     },
+   ],
+   summary:
+     "Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
+   widgetModels: [],
+   youtubeId: "",
+ };
+
+ export default taskData;

package/src/tasks/object-detection/inference.ts
@@ -0,0 +1,62 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Object Detection inference
+  */
+ export interface ObjectDetectionInput {
+   /**
+    * The input image data
+    */
+   data: unknown;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: ObjectDetectionParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Object Detection
+  */
+ export interface ObjectDetectionParameters {
+   /**
+    * The probability necessary to make a prediction.
+    */
+   threshold?: number;
+   [property: string]: unknown;
+ }
+ /**
+  * The predicted bounding box. Coordinates are relative to the top left corner of the input
+  * image.
+  */
+ export interface BoundingBox {
+   xmax: number;
+   xmin: number;
+   ymax: number;
+   ymin: number;
+   [property: string]: unknown;
+ }
+ export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
+ /**
+  * Outputs of inference for the Object Detection task
+  */
+ export interface ObjectDetectionOutputElement {
+   /**
+    * The predicted bounding box. Coordinates are relative to the top left corner of the input
+    * image.
+    */
+   box: BoundingBox;
+   /**
+    * The predicted label for the bounding box
+    */
+   label: string;
+   /**
+    * The associated score / probability
+    */
+   score: number;
+   [property: string]: unknown;
+ }
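
A brief sketch of consuming the array-shaped output type above; the import path and the score cutoff are placeholders.

```ts
// Illustrative import path.
import type { ObjectDetectionOutput } from "./src/tasks/object-detection/inference";

// Keep confident detections and compute box areas from the absolute corner coordinates.
function summarize(detections: ObjectDetectionOutput, minScore = 0.8) {
  return detections
    .filter((d) => d.score >= minScore)
    .map((d) => ({
      label: d.label,
      area: (d.box.xmax - d.box.xmin) * (d.box.ymax - d.box.ymin),
    }));
}
```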

package/src/tasks/object-detection/spec/input.json
@@ -0,0 +1,30 @@
+ {
+   "$id": "/inference/schemas/object-detection/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Object Detection inference",
+   "title": "ObjectDetectionInput",
+   "type": "object",
+   "properties": {
+     "data": {
+       "description": "The input image data"
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ObjectDetectionParameters"
+     }
+   },
+   "$defs": {
+     "ObjectDetectionParameters": {
+       "title": "ObjectDetectionParameters",
+       "description": "Additional inference parameters for Object Detection",
+       "type": "object",
+       "properties": {
+         "threshold": {
+           "type": "number",
+           "description": "The probability necessary to make a prediction."
+         }
+       }
+     }
+   },
+   "required": ["data"]
+ }