@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,14 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-text/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image To Text task",
5
+ "title": "ImageToTextOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "generated_text": {
9
+ "type": "string",
10
+ "description": "The generated text."
11
+ }
12
+ },
13
+ "required": ["generated_text"]
14
+ }
@@ -0,0 +1,312 @@
1
+ import type { PipelineType } from "../pipelines.js";
2
+ import { PIPELINE_DATA } from "../pipelines.js";
3
+
4
+ import audioClassification from "./audio-classification/data.js";
5
+ import audioToAudio from "./audio-to-audio/data.js";
6
+ import automaticSpeechRecognition from "./automatic-speech-recognition/data.js";
7
+ import documentQuestionAnswering from "./document-question-answering/data.js";
8
+ import featureExtraction from "./feature-extraction/data.js";
9
+ import fillMask from "./fill-mask/data.js";
10
+ import imageClassification from "./image-classification/data.js";
11
+ import imageFeatureExtraction from "./image-feature-extraction/data.js";
12
+ import imageToImage from "./image-to-image/data.js";
13
+ import imageToText from "./image-to-text/data.js";
14
+ import imageTextToText from "./image-text-to-text/data.js";
15
+ import imageSegmentation from "./image-segmentation/data.js";
16
+ import maskGeneration from "./mask-generation/data.js";
17
+ import objectDetection from "./object-detection/data.js";
18
+ import depthEstimation from "./depth-estimation/data.js";
19
+ import placeholder from "./placeholder/data.js";
20
+ import reinforcementLearning from "./reinforcement-learning/data.js";
21
+ import questionAnswering from "./question-answering/data.js";
22
+ import sentenceSimilarity from "./sentence-similarity/data.js";
23
+ import summarization from "./summarization/data.js";
24
+ import tableQuestionAnswering from "./table-question-answering/data.js";
25
+ import tabularClassification from "./tabular-classification/data.js";
26
+ import tabularRegression from "./tabular-regression/data.js";
27
+ import textToImage from "./text-to-image/data.js";
28
+ import textToSpeech from "./text-to-speech/data.js";
29
+ import tokenClassification from "./token-classification/data.js";
30
+ import translation from "./translation/data.js";
31
+ import textClassification from "./text-classification/data.js";
32
+ import textGeneration from "./text-generation/data.js";
33
+ import textToVideo from "./text-to-video/data.js";
34
+ import unconditionalImageGeneration from "./unconditional-image-generation/data.js";
35
+ import videoClassification from "./video-classification/data.js";
36
+ import visualQuestionAnswering from "./visual-question-answering/data.js";
37
+ import zeroShotClassification from "./zero-shot-classification/data.js";
38
+ import zeroShotImageClassification from "./zero-shot-image-classification/data.js";
39
+ import zeroShotObjectDetection from "./zero-shot-object-detection/data.js";
40
+ import imageTo3D from "./image-to-3d/data.js";
41
+ import textTo3D from "./text-to-3d/data.js";
42
+ import keypointDetection from "./keypoint-detection/data.js";
43
+ import videoTextToText from "./video-text-to-text/data.js";
44
+
45
+ export type * from "./audio-classification/inference.js";
46
+ export type * from "./automatic-speech-recognition/inference.js";
47
+ export type {
48
+ ChatCompletionInput,
49
+ ChatCompletionInputMessage,
50
+ ChatCompletionOutput,
51
+ ChatCompletionOutputComplete,
52
+ ChatCompletionOutputMessage,
53
+ ChatCompletionStreamOutput,
54
+ ChatCompletionStreamOutputChoice,
55
+ ChatCompletionStreamOutputDelta,
56
+ } from "./chat-completion/inference.js";
57
+ export type * from "./document-question-answering/inference.js";
58
+ export type * from "./feature-extraction/inference.js";
59
+ export type * from "./fill-mask/inference.js";
60
+ export type {
61
+ ImageClassificationInput,
62
+ ImageClassificationOutput,
63
+ ImageClassificationOutputElement,
64
+ ImageClassificationParameters,
65
+ } from "./image-classification/inference.js";
66
+ export type * from "./image-to-image/inference.js";
67
+ export type { ImageToTextInput, ImageToTextOutput, ImageToTextParameters } from "./image-to-text/inference.js";
68
+ export type * from "./image-segmentation/inference.js";
69
+ export type * from "./object-detection/inference.js";
70
+ export type * from "./depth-estimation/inference.js";
71
+ export type * from "./question-answering/inference.js";
72
+ export type * from "./sentence-similarity/inference.js";
73
+ export type * from "./summarization/inference.js";
74
+ export type * from "./table-question-answering/inference.js";
75
+ export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference.js";
76
+ export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference.js";
77
+ export type * from "./token-classification/inference.js";
78
+ export type { TranslationInput, TranslationOutput } from "./translation/inference.js";
79
+ export type {
80
+ ClassificationOutputTransform,
81
+ TextClassificationInput,
82
+ TextClassificationOutput,
83
+ TextClassificationOutputElement,
84
+ TextClassificationParameters,
85
+ } from "./text-classification/inference.js";
86
+ export type {
87
+ TextGenerationOutputFinishReason,
88
+ TextGenerationOutputPrefillToken,
89
+ TextGenerationInput,
90
+ TextGenerationOutput,
91
+ TextGenerationOutputDetails,
92
+ TextGenerationInputGenerateParameters,
93
+ TextGenerationOutputBestOfSequence,
94
+ TextGenerationOutputToken,
95
+ TextGenerationStreamOutputStreamDetails,
96
+ TextGenerationStreamOutput,
97
+ } from "./text-generation/inference.js";
98
+ export type * from "./video-classification/inference.js";
99
+ export type * from "./visual-question-answering/inference.js";
100
+ export type * from "./zero-shot-classification/inference.js";
101
+ export type * from "./zero-shot-image-classification/inference.js";
102
+ export type {
103
+ BoundingBox,
104
+ ZeroShotObjectDetectionInput,
105
+ ZeroShotObjectDetectionInputData,
106
+ ZeroShotObjectDetectionOutput,
107
+ ZeroShotObjectDetectionOutputElement,
108
+ } from "./zero-shot-object-detection/inference.js";
109
+
110
+ import type { ModelLibraryKey } from "../model-libraries.js";
111
+
112
+ /**
113
+ * Model libraries compatible with each ML task
114
+ */
115
+ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
116
+ "audio-classification": ["speechbrain", "transformers", "transformers.js"],
117
+ "audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
118
+ "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
119
+ "depth-estimation": ["transformers", "transformers.js"],
120
+ "document-question-answering": ["transformers", "transformers.js"],
121
+ "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
122
+ "fill-mask": ["transformers", "transformers.js"],
123
+ "graph-ml": ["transformers"],
124
+ "image-classification": ["keras", "timm", "transformers", "transformers.js"],
125
+ "image-feature-extraction": ["timm", "transformers"],
126
+ "image-segmentation": ["transformers", "transformers.js"],
127
+ "image-text-to-text": ["transformers"],
128
+ "image-to-image": ["diffusers", "transformers", "transformers.js"],
129
+ "image-to-text": ["transformers", "transformers.js"],
130
+ "image-to-video": ["diffusers"],
131
+ "keypoint-detection": ["transformers"],
132
+ "video-classification": ["transformers"],
133
+ "mask-generation": ["transformers"],
134
+ "multiple-choice": ["transformers"],
135
+ "object-detection": ["transformers", "transformers.js"],
136
+ other: [],
137
+ "question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"],
138
+ robotics: [],
139
+ "reinforcement-learning": ["transformers", "stable-baselines3", "ml-agents", "sample-factory"],
140
+ "sentence-similarity": ["sentence-transformers", "spacy", "transformers.js"],
141
+ summarization: ["transformers", "transformers.js"],
142
+ "table-question-answering": ["transformers"],
143
+ "table-to-text": ["transformers"],
144
+ "tabular-classification": ["sklearn"],
145
+ "tabular-regression": ["sklearn"],
146
+ "tabular-to-text": ["transformers"],
147
+ "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
148
+ "text-generation": ["transformers", "transformers.js"],
149
+ "text-retrieval": [],
150
+ "text-to-image": ["diffusers"],
151
+ "text-to-speech": ["espnet", "tensorflowtts", "transformers", "transformers.js"],
152
+ "text-to-audio": ["transformers", "transformers.js"],
153
+ "text-to-video": ["diffusers"],
154
+ "text2text-generation": ["transformers", "transformers.js"],
155
+ "time-series-forecasting": [],
156
+ "token-classification": [
157
+ "adapter-transformers",
158
+ "flair",
159
+ "spacy",
160
+ "span-marker",
161
+ "stanza",
162
+ "transformers",
163
+ "transformers.js",
164
+ ],
165
+ translation: ["transformers", "transformers.js"],
166
+ "unconditional-image-generation": ["diffusers"],
167
+ "video-text-to-text": ["transformers"],
168
+ "visual-question-answering": ["transformers", "transformers.js"],
169
+ "voice-activity-detection": [],
170
+ "zero-shot-classification": ["transformers", "transformers.js"],
171
+ "zero-shot-image-classification": ["transformers", "transformers.js"],
172
+ "zero-shot-object-detection": ["transformers", "transformers.js"],
173
+ "text-to-3d": ["diffusers"],
174
+ "image-to-3d": ["diffusers"],
175
+ "any-to-any": ["transformers"],
176
+ };
177
+
178
+ /**
179
+ * Return the whole TaskData object for a certain task.
180
+ * If the partialTaskData argument is left undefined,
181
+ * the default placeholder data will be used.
182
+ */
183
+ function getData(type: PipelineType, partialTaskData: TaskDataCustom = placeholder): TaskData {
184
+ return {
185
+ ...partialTaskData,
186
+ id: type,
187
+ label: PIPELINE_DATA[type].name,
188
+ libraries: TASKS_MODEL_LIBRARIES[type],
189
+ };
190
+ }
191
+
192
+ // To make comparisons easier, task order is the same as in const.ts
193
+ // Tasks set to undefined won't have an associated task page.
194
+ // Tasks that call getData() without the second argument will
195
+ // have a "placeholder" page.
196
+ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
197
+ "any-to-any": getData("any-to-any", placeholder),
198
+ "audio-classification": getData("audio-classification", audioClassification),
199
+ "audio-to-audio": getData("audio-to-audio", audioToAudio),
200
+ "automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
201
+ "depth-estimation": getData("depth-estimation", depthEstimation),
202
+ "document-question-answering": getData("document-question-answering", documentQuestionAnswering),
203
+ "feature-extraction": getData("feature-extraction", featureExtraction),
204
+ "fill-mask": getData("fill-mask", fillMask),
205
+ "graph-ml": undefined,
206
+ "image-classification": getData("image-classification", imageClassification),
207
+ "image-feature-extraction": getData("image-feature-extraction", imageFeatureExtraction),
208
+ "image-segmentation": getData("image-segmentation", imageSegmentation),
209
+ "image-to-image": getData("image-to-image", imageToImage),
210
+ "image-text-to-text": getData("image-text-to-text", imageTextToText),
211
+ "image-to-text": getData("image-to-text", imageToText),
212
+ "image-to-video": undefined,
213
+ "keypoint-detection": getData("keypoint-detection", keypointDetection),
214
+ "mask-generation": getData("mask-generation", maskGeneration),
215
+ "multiple-choice": undefined,
216
+ "object-detection": getData("object-detection", objectDetection),
217
+ "video-classification": getData("video-classification", videoClassification),
218
+ other: undefined,
219
+ "question-answering": getData("question-answering", questionAnswering),
220
+ "reinforcement-learning": getData("reinforcement-learning", reinforcementLearning),
221
+ robotics: undefined,
222
+ "sentence-similarity": getData("sentence-similarity", sentenceSimilarity),
223
+ summarization: getData("summarization", summarization),
224
+ "table-question-answering": getData("table-question-answering", tableQuestionAnswering),
225
+ "table-to-text": undefined,
226
+ "tabular-classification": getData("tabular-classification", tabularClassification),
227
+ "tabular-regression": getData("tabular-regression", tabularRegression),
228
+ "tabular-to-text": undefined,
229
+ "text-classification": getData("text-classification", textClassification),
230
+ "text-generation": getData("text-generation", textGeneration),
231
+ "text-retrieval": undefined,
232
+ "text-to-image": getData("text-to-image", textToImage),
233
+ "text-to-speech": getData("text-to-speech", textToSpeech),
234
+ "text-to-audio": undefined,
235
+ "text-to-video": getData("text-to-video", textToVideo),
236
+ "text2text-generation": undefined,
237
+ "time-series-forecasting": undefined,
238
+ "token-classification": getData("token-classification", tokenClassification),
239
+ translation: getData("translation", translation),
240
+ "unconditional-image-generation": getData("unconditional-image-generation", unconditionalImageGeneration),
241
+ "video-text-to-text": getData("video-text-to-text", videoTextToText),
242
+ "visual-question-answering": getData("visual-question-answering", visualQuestionAnswering),
243
+ "voice-activity-detection": undefined,
244
+ "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
245
+ "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
246
+ "zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
247
+ "text-to-3d": getData("text-to-3d", textTo3D),
248
+ "image-to-3d": getData("image-to-3d", imageTo3D),
249
+ } as const;
250
+
251
+ export interface ExampleRepo {
252
+ description: string;
253
+ id: string;
254
+ }
255
+
256
+ export type TaskDemoEntry =
257
+ | {
258
+ filename: string;
259
+ type: "audio";
260
+ }
261
+ | {
262
+ data: Array<{
263
+ label: string;
264
+ score: number;
265
+ }>;
266
+ type: "chart";
267
+ }
268
+ | {
269
+ filename: string;
270
+ type: "img";
271
+ }
272
+ | {
273
+ table: string[][];
274
+ type: "tabular";
275
+ }
276
+ | {
277
+ content: string;
278
+ label: string;
279
+ type: "text";
280
+ }
281
+ | {
282
+ text: string;
283
+ tokens: Array<{
284
+ end: number;
285
+ start: number;
286
+ type: string;
287
+ }>;
288
+ type: "text-with-tokens";
289
+ };
290
+
291
+ export interface TaskDemo {
292
+ inputs: TaskDemoEntry[];
293
+ outputs: TaskDemoEntry[];
294
+ }
295
+
296
+ export interface TaskData {
297
+ datasets: ExampleRepo[];
298
+ demo: TaskDemo;
299
+ id: PipelineType;
300
+ canonicalId?: PipelineType;
301
+ isPlaceholder?: boolean;
302
+ label: string;
303
+ libraries: ModelLibraryKey[];
304
+ metrics: ExampleRepo[];
305
+ models: ExampleRepo[];
306
+ spaces: ExampleRepo[];
307
+ summary: string;
308
+ widgetModels: string[];
309
+ youtubeId?: string;
310
+ }
311
+
312
+ export type TaskDataCustom = Omit<TaskData, "id" | "label" | "libraries">;
@@ -0,0 +1,57 @@
1
+ ## Task Variants
2
+
3
+ ### Pose Estimation
4
+
5
+ Pose estimation is the process of determining the position and orientation of an object or a camera in a 3D space. It is a fundamental task in computer vision and is widely used in various applications such as robotics, augmented reality, and 3D reconstruction.
6
+
7
+ ## Use Cases for Keypoint Detection
8
+
9
+ ### Facial Landmark Estimation
10
+
11
+ Keypoint detection models can be used to estimate the position of facial landmarks. Facial landmarks are points on the face such as the corners of the mouth, the outer corners of the eyes, and the tip of the nose. These landmarks can be used for a variety of applications, such as facial expression recognition, 3D face reconstruction, and cinematic animation.
12
+
13
+ ### Fitness Tracking
14
+
15
+ Keypoint detection models can be used to track the movement of the human body, e.g. position of the joints in a 3D space. This can be used for a variety of applications, such as fitness tracking, sports analysis or virtual reality applications.
16
+
17
+ ## Inference Code
18
+
19
+ Below you can find an example of how to use a keypoint detection model and how to visualize the results.
20
+
21
+ ```python
22
+ from transformers import AutoImageProcessor, SuperPointForKeypointDetection
23
+ import torch
24
+ import matplotlib.pyplot as plt
25
+ from PIL import Image
26
+ import requests
27
+
28
+ url_image = "http://images.cocodataset.org/val2017/000000039769.jpg"
29
+ image = Image.open(requests.get(url_image, stream=True).raw)
30
+
31
+ # initialize the model and processor
32
+ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
33
+ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
34
+
35
+ # infer
36
+ inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)
37
+ outputs = model(**inputs)
38
+
39
+ # postprocess
40
+ image_sizes = [(image.size[1], image.size[0])]
41
+ outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
42
+ keypoints = outputs[0]["keypoints"].detach().numpy()
43
+ scores = outputs[0]["scores"].detach().numpy()
44
+ image_width, image_height = image.size
45
+
46
+ # plot
47
+ plt.axis('off')
48
+ plt.imshow(image)
49
+ plt.scatter(
50
+ keypoints[:, 0],
51
+ keypoints[:, 1],
52
+ s=scores * 100,
53
+ c='cyan',
54
+ alpha=0.4
55
+ )
56
+ plt.show()
57
+ ```
@@ -0,0 +1,50 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "A dataset of hand keypoints of over 500k examples.",
7
+ id: "Vincent-luo/hagrid-mediapipe-hands",
8
+ },
9
+ ],
10
+ demo: {
11
+ inputs: [
12
+ {
13
+ filename: "keypoint-detection-input.png",
14
+ type: "img",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "keypoint-detection-output.png",
20
+ type: "img",
21
+ },
22
+ ],
23
+ },
24
+ metrics: [],
25
+ models: [
26
+ {
27
+ description: "A robust keypoint detection model.",
28
+ id: "magic-leap-community/superpoint",
29
+ },
30
+ {
31
+ description: "Strong keypoint detection model used to detect human pose.",
32
+ id: "facebook/sapiens-pose-1b",
33
+ },
34
+ ],
35
+ spaces: [
36
+ {
37
+ description: "An application that detects hand keypoints in real-time.",
38
+ id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
39
+ },
40
+ {
41
+ description: "An application to try a universal keypoint detection model.",
42
+ id: "merve/SuperPoint",
43
+ },
44
+ ],
45
+ summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
46
+ widgetModels: [],
47
+ youtubeId: "",
48
+ };
49
+
50
+ export default taskData;
@@ -0,0 +1,65 @@
1
+ ## Use Cases
2
+
3
+ ### Filtering an Image
4
+
5
+ When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
6
+
7
+ ### Masked Image Modelling
8
+
9
+ Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
10
+
11
+ ### Human-in-the-loop Computer Vision Applications
12
+
13
+ For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
14
+
15
+ ## Task Variants
16
+
17
+ ### Segmentation
18
+
19
+ Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
20
+
21
+ ## Inference
22
+
23
+ Mask generation models often work in two modes: segment everything or prompt mode.
24
+ The example below works in segment-everything-mode, where many masks will be returned.
25
+
26
+ ```python
27
+ from transformers import pipeline
28
+
29
+ generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
30
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
31
+ outputs = generator(image_url)
32
+ outputs["masks"]
33
+ # array of multiple binary masks returned for each generated mask
34
+ ```
35
+
36
+ Prompt mode takes in three types of prompts:
37
+
38
+ - **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
39
+ - **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
40
+ - **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
41
+
42
+ Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
43
+
44
+ ```python
45
+ from transformers import SamModel, SamProcessor
46
+ from PIL import Image
47
+ import requests
48
+
49
+ model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
50
+ processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
51
+
52
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
53
+ # pointing to the car window
54
+ input_points = [[[450, 600]]]
55
+ inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
56
+ outputs = model(**inputs)
57
+ masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
58
+ scores = outputs.iou_scores
59
+ ```
60
+
61
+ ## Useful Resources
62
+
63
+ Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
64
+
65
+ - [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)
@@ -0,0 +1,55 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [],
5
+ demo: {
6
+ inputs: [
7
+ {
8
+ filename: "mask-generation-input.png",
9
+ type: "img",
10
+ },
11
+ ],
12
+ outputs: [
13
+ {
14
+ filename: "mask-generation-output.png",
15
+ type: "img",
16
+ },
17
+ ],
18
+ },
19
+ metrics: [],
20
+ models: [
21
+ {
22
+ description: "Small yet powerful mask generation model.",
23
+ id: "Zigeng/SlimSAM-uniform-50",
24
+ },
25
+ {
26
+ description: "Very strong mask generation model.",
27
+ id: "facebook/sam2-hiera-large",
28
+ },
29
+ ],
30
+ spaces: [
31
+ {
32
+ description:
33
+ "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
34
+ id: "merve/OWLSAM2",
35
+ },
36
+ {
37
+ description: "An application that compares the performance of a large and a small mask generation model.",
38
+ id: "merve/slimsam",
39
+ },
40
+ {
41
+ description: "An application based on an improved mask generation model.",
42
+ id: "SkalskiP/segment-anything-model-2",
43
+ },
44
+ {
45
+ description: "An application to remove objects from videos using mask generation models.",
46
+ id: "SkalskiP/SAM_and_ProPainter",
47
+ },
48
+ ],
49
+ summary:
50
+ "Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
51
+ widgetModels: [],
52
+ youtubeId: "",
53
+ };
54
+
55
+ export default taskData;
@@ -0,0 +1,37 @@
1
+ ## Use Cases
2
+
3
+ ### Autonomous Driving
4
+
5
+ Object Detection is widely used in computer vision for autonomous driving. Self-driving cars use Object Detection models to detect pedestrians, bicycles, traffic lights and road signs to decide which step to take.
6
+
7
+ ### Object Tracking in Matches
8
+
9
+ Object Detection models are widely used in sports where the ball or a player is tracked for monitoring and refereeing during matches.
10
+
11
+ ### Image Search
12
+
13
+ Object Detection models are widely used in image search. Smartphones use Object Detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the Internet.
14
+
15
+ ### Object Counting
16
+
17
+ Object Detection models are used to count instances of objects in a given image, this can include counting the objects in warehouses or stores, or counting the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.
18
+
19
+ ## Inference
20
+
21
+ You can infer with Object Detection models through the `object-detection` pipeline. When calling the pipeline you just need to specify a path or http link to an image.
22
+
23
+ ```python
24
+ model = pipeline("object-detection")
25
+
26
+ model("path_to_cat_image")
27
+
28
+ # [{'label': 'cat',
29
+ #  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117},
30
+ #  'score': 0.917},
31
+ #...]
32
+ ```
33
+
34
+ ## Useful Resources
35
+
36
+ - [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
37
+ - [Object detection task guide](https://huggingface.co/docs/transformers/tasks/object_detection)
@@ -0,0 +1,86 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Widely used benchmark dataset for multiple vision tasks.",
7
+ id: "merve/coco2017",
8
+ },
9
+ {
10
+ description: "Multi-task computer vision benchmark.",
11
+ id: "merve/pascal-voc",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "object-detection-input.jpg",
18
+ type: "img",
19
+ },
20
+ ],
21
+ outputs: [
22
+ {
23
+ filename: "object-detection-output.jpg",
24
+ type: "img",
25
+ },
26
+ ],
27
+ },
28
+ metrics: [
29
+ {
30
+ description:
31
+ "The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). It is calculated for each class separately",
32
+ id: "Average Precision",
33
+ },
34
+ {
35
+ description: "The Mean Average Precision (mAP) metric is the overall average of the AP values",
36
+ id: "Mean Average Precision",
37
+ },
38
+ {
39
+ description:
40
+ "The APα metric is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75",
41
+ id: "APα",
42
+ },
43
+ ],
44
+ models: [
45
+ {
46
+ description: "Solid object detection model pre-trained on the COCO 2017 dataset.",
47
+ id: "facebook/detr-resnet-50",
48
+ },
49
+ {
50
+ description: "Real-time and accurate object detection model.",
51
+ id: "jameslahm/yolov10x",
52
+ },
53
+ {
54
+ description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
55
+ id: "PekingU/rtdetr_r18vd_coco_o365",
56
+ },
57
+ ],
58
+ spaces: [
59
+ {
60
+ description: "Leaderboard to compare various object detection models across several metrics.",
61
+ id: "hf-vision/object_detection_leaderboard",
62
+ },
63
+ {
64
+ description: "An application that contains various object detection models to try from.",
65
+ id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
66
+ },
67
+ {
68
+ description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
69
+ id: "kadirnar/torchyolo",
70
+ },
71
+ {
72
+ description: "An object tracking, segmentation and inpainting application.",
73
+ id: "VIPLab/Track-Anything",
74
+ },
75
+ {
76
+ description: "Very fast object tracking application based on object detection.",
77
+ id: "merve/RT-DETR-tracking-coco",
78
+ },
79
+ ],
80
+ summary:
81
+ "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.",
82
+ widgetModels: ["facebook/detr-resnet-50"],
83
+ youtubeId: "WdAeKSOpxhw",
84
+ };
85
+
86
+ export default taskData;