@huggingface/tasks 0.11.11 → 0.11.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/dist/index.cjs +146 -17
  2. package/dist/index.js +146 -17
  3. package/dist/src/dataset-libraries.d.ts +6 -0
  4. package/dist/src/dataset-libraries.d.ts.map +1 -1
  5. package/dist/src/hardware.d.ts +8 -0
  6. package/dist/src/hardware.d.ts.map +1 -1
  7. package/dist/src/model-libraries-snippets.d.ts +3 -0
  8. package/dist/src/model-libraries-snippets.d.ts.map +1 -1
  9. package/dist/src/model-libraries.d.ts +34 -2
  10. package/dist/src/model-libraries.d.ts.map +1 -1
  11. package/dist/src/pipelines.d.ts +12 -2
  12. package/dist/src/pipelines.d.ts.map +1 -1
  13. package/dist/src/snippets/curl.d.ts.map +1 -1
  14. package/dist/src/snippets/js.d.ts.map +1 -1
  15. package/dist/src/snippets/python.d.ts.map +1 -1
  16. package/dist/src/tasks/audio-classification/inference.d.ts +3 -2
  17. package/dist/src/tasks/audio-classification/inference.d.ts.map +1 -1
  18. package/dist/src/tasks/automatic-speech-recognition/inference.d.ts +3 -2
  19. package/dist/src/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
  20. package/dist/src/tasks/image-classification/inference.d.ts +3 -2
  21. package/dist/src/tasks/image-classification/inference.d.ts.map +1 -1
  22. package/dist/src/tasks/image-segmentation/inference.d.ts +10 -6
  23. package/dist/src/tasks/image-segmentation/inference.d.ts.map +1 -1
  24. package/dist/src/tasks/image-to-image/inference.d.ts +6 -5
  25. package/dist/src/tasks/image-to-image/inference.d.ts.map +1 -1
  26. package/dist/src/tasks/index.d.ts +1 -1
  27. package/dist/src/tasks/index.d.ts.map +1 -1
  28. package/dist/src/tasks/keypoint-detection/data.d.ts +4 -0
  29. package/dist/src/tasks/keypoint-detection/data.d.ts.map +1 -0
  30. package/dist/src/tasks/object-detection/inference.d.ts +17 -4
  31. package/dist/src/tasks/object-detection/inference.d.ts.map +1 -1
  32. package/dist/src/tasks/summarization/inference.d.ts +13 -12
  33. package/dist/src/tasks/summarization/inference.d.ts.map +1 -1
  34. package/dist/src/tasks/text-to-image/inference.d.ts +2 -2
  35. package/dist/src/tasks/translation/inference.d.ts +21 -10
  36. package/dist/src/tasks/translation/inference.d.ts.map +1 -1
  37. package/package.json +1 -1
  38. package/src/dataset-libraries.ts +6 -0
  39. package/src/hardware.ts +8 -0
  40. package/src/local-apps.ts +1 -1
  41. package/src/model-libraries-snippets.ts +87 -6
  42. package/src/model-libraries.ts +32 -0
  43. package/src/pipelines.ts +12 -0
  44. package/src/snippets/curl.ts +3 -6
  45. package/src/snippets/js.ts +1 -2
  46. package/src/snippets/python.ts +1 -2
  47. package/src/tasks/audio-classification/inference.ts +3 -2
  48. package/src/tasks/audio-classification/spec/input.json +2 -1
  49. package/src/tasks/audio-classification/spec/output.json +1 -0
  50. package/src/tasks/automatic-speech-recognition/inference.ts +3 -2
  51. package/src/tasks/automatic-speech-recognition/spec/input.json +2 -1
  52. package/src/tasks/common-definitions.json +3 -20
  53. package/src/tasks/image-classification/inference.ts +3 -2
  54. package/src/tasks/image-classification/spec/input.json +2 -1
  55. package/src/tasks/image-classification/spec/output.json +1 -0
  56. package/src/tasks/image-segmentation/inference.ts +10 -6
  57. package/src/tasks/image-segmentation/spec/input.json +3 -12
  58. package/src/tasks/image-segmentation/spec/output.json +4 -3
  59. package/src/tasks/image-to-image/inference.ts +6 -5
  60. package/src/tasks/image-to-image/spec/input.json +3 -2
  61. package/src/tasks/image-to-image/spec/output.json +1 -1
  62. package/src/tasks/index.ts +3 -6
  63. package/src/tasks/keypoint-detection/about.md +59 -0
  64. package/src/tasks/keypoint-detection/data.ts +46 -0
  65. package/src/tasks/object-detection/inference.ts +17 -4
  66. package/src/tasks/object-detection/spec/input.json +2 -1
  67. package/src/tasks/object-detection/spec/output.json +10 -6
  68. package/src/tasks/summarization/inference.ts +13 -12
  69. package/src/tasks/summarization/spec/input.json +37 -2
  70. package/src/tasks/text-classification/spec/output.json +1 -0
  71. package/src/tasks/text-to-image/inference.ts +2 -2
  72. package/src/tasks/text-to-image/spec/input.json +1 -1
  73. package/src/tasks/text-to-image/spec/output.json +1 -1
  74. package/src/tasks/translation/inference.ts +21 -10
  75. package/src/tasks/translation/spec/input.json +45 -2
  76. package/src/tasks/zero-shot-classification/spec/output.json +1 -0
package/src/model-libraries-snippets.ts CHANGED
@@ -170,6 +170,87 @@ export const diffusers = (model: ModelData): string[] => {
   }
 };
 
+export const diffusionkit = (model: ModelData): string[] => {
+  const sd3Snippet = `# Pipeline for Stable Diffusion 3
+from diffusionkit.mlx import DiffusionPipeline
+
+pipeline = DiffusionPipeline(
+  shift=3.0,
+  use_t5=False,
+  model_version=${model.id},
+  low_memory_mode=True,
+  a16=True,
+  w16=True,
+)`;
+
+  const fluxSnippet = `# Pipeline for Flux
+from diffusionkit.mlx import FluxPipeline
+
+pipeline = FluxPipeline(
+  shift=1.0,
+  model_version=${model.id},
+  low_memory_mode=True,
+  a16=True,
+  w16=True,
+)`;
+
+  const generateSnippet = `# Image Generation
+HEIGHT = 512
+WIDTH = 512
+NUM_STEPS = ${model.tags.includes("flux") ? 4 : 50}
+CFG_WEIGHT = ${model.tags.includes("flux") ? 0 : 5}
+
+image, _ = pipeline.generate_image(
+  "a photo of a cat",
+  cfg_weight=CFG_WEIGHT,
+  num_steps=NUM_STEPS,
+  latent_size=(HEIGHT // 8, WIDTH // 8),
+)`;
+
+  const pipelineSnippet = model.tags.includes("flux") ? fluxSnippet : sd3Snippet;
+
+  return [pipelineSnippet, generateSnippet];
+};
+
+export const cartesia_pytorch = (model: ModelData): string[] => [
+  `# pip install --no-binary :all: cartesia-pytorch
+from cartesia_pytorch import ReneLMHeadModel
+from transformers import AutoTokenizer
+
+model = ReneLMHeadModel.from_pretrained("${model.id}")
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
+
+in_message = ["Rene Descartes was"]
+inputs = tokenizer(in_message, return_tensors="pt")
+
+outputs = model.generate(inputs.input_ids, max_length=50, top_k=100, top_p=0.99)
+out_message = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+
+print(out_message)
+)`,
+];
+
+export const cartesia_mlx = (model: ModelData): string[] => [
+  `import mlx.core as mx
+import cartesia_mlx as cmx
+
+model = cmx.from_pretrained("${model.id}")
+model.set_dtype(mx.float32)
+
+prompt = "Rene Descartes was"
+
+for text in model.generate(
+  prompt,
+  max_tokens=500,
+  eval_every_n=5,
+  verbose=True,
+  top_p=0.99,
+  temperature=0.85,
+):
+  print(text, end="", flush=True)
+`,
+];
+
 export const edsnlp = (model: ModelData): string[] => {
   const packageName = nameWithoutNamespace(model.id).replaceAll("-", "_");
   return [
@@ -270,12 +351,12 @@ llm = Llama.from_pretrained(
 )
 
 llm.create_chat_completion(
-  messages = [
-    {
-      "role": "user",
-      "content": "What is the capital of France?"
-    }
-  ]
+  messages = [
+    {
+      "role": "user",
+      "content": "What is the capital of France?"
+    }
+  ]
 )`,
 ];
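The new DiffusionKit snippet helper above branches on the model's tags: FLUX models get `FluxPipeline` with 4 inference steps and CFG weight 0, while SD3 models get `DiffusionPipeline` with 50 steps and CFG weight 5. A minimal standalone sketch of that branching — `MinimalModel` and the model id are stand-ins, not the package's `ModelData` type:

```ts
// Standalone sketch of the tag-based defaults used by the diffusionkit snippet.
// MinimalModel is a stand-in for the package's ModelData; the model id is hypothetical.
type MinimalModel = { id: string; tags: string[] };

const diffusionKitDefaults = (model: MinimalModel) => {
  const isFlux = model.tags.includes("flux");
  return {
    pipelineClass: isFlux ? "FluxPipeline" : "DiffusionPipeline",
    numSteps: isFlux ? 4 : 50,
    cfgWeight: isFlux ? 0 : 5,
  };
};

console.log(diffusionKitDefaults({ id: "some-org/some-flux-model", tags: ["flux"] }));
// -> { pipelineClass: "FluxPipeline", numSteps: 4, cfgWeight: 0 }
```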
package/src/model-libraries.ts CHANGED
@@ -150,6 +150,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
     filter: false,
     countDownloads: `path:"adapter_config.json"`,
   },
+  deepforest: {
+    prettyLabel: "DeepForest",
+    repoName: "deepforest",
+    docsUrl: "https://deepforest.readthedocs.io/en/latest/",
+    repoUrl: "https://github.com/weecology/DeepForest",
+    countDownloads: `path_extension:"pt" OR path_extension:"pl"`,
+  },
   "depth-anything-v2": {
     prettyLabel: "DepthAnythingV2",
     repoName: "Depth Anything V2",
@@ -174,11 +181,29 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
     filter: true,
     /// diffusers has its own more complex "countDownloads" query
   },
+  diffusionkit: {
+    prettyLabel: "DiffusionKit",
+    repoName: "DiffusionKit",
+    repoUrl: "https://github.com/argmaxinc/DiffusionKit",
+    snippets: snippets.diffusionkit,
+  },
   doctr: {
     prettyLabel: "docTR",
     repoName: "doctr",
     repoUrl: "https://github.com/mindee/doctr",
   },
+  cartesia_pytorch: {
+    prettyLabel: "Cartesia Pytorch",
+    repoName: "Cartesia Pytorch",
+    repoUrl: "https://github.com/cartesia-ai/cartesia_pytorch",
+    snippets: snippets.cartesia_pytorch,
+  },
+  cartesia_mlx: {
+    prettyLabel: "Cartesia MLX",
+    repoName: "Cartesia MLX",
+    repoUrl: "https://github.com/cartesia-ai/cartesia_mlx",
+    snippets: snippets.cartesia_mlx,
+  },
   edsnlp: {
     prettyLabel: "EDS-NLP",
     repoName: "edsnlp",
@@ -421,6 +446,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
     snippets: snippets.pyannote_audio,
     filter: true,
   },
+  "py-feat": {
+    prettyLabel: "Py-Feat",
+    repoName: "Py-Feat",
+    repoUrl: "https://github.com/cosanlab/py-feat",
+    docsUrl: "https://py-feat.org/",
+    filter: false,
+  },
   pythae: {
     prettyLabel: "pythae",
     repoName: "pythae",
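Each of the new integrations above (DeepForest, DiffusionKit, Cartesia Pytorch/MLX, Py-Feat) registers an entry in `MODEL_LIBRARIES_UI_ELEMENTS`. A rough sketch of the shape those entries follow, inferred only from the fields used in this diff — this is an illustration, not the package's exported type:

```ts
// Assumed shape of a library entry, inferred from the entries above; illustrative only.
interface LibraryUiElementSketch {
  prettyLabel: string; // display name, e.g. "DeepForest"
  repoName: string; // repository name shown on model pages
  repoUrl: string; // link to the library's source repository
  docsUrl?: string; // optional documentation link
  filter?: boolean; // whether the library appears as a Hub filter
  countDownloads?: string; // Hub search clause used to count downloads
  snippets?: (model: { id: string; tags: string[] }) => string[]; // code snippet generator
}

const deepforest: LibraryUiElementSketch = {
  prettyLabel: "DeepForest",
  repoName: "deepforest",
  docsUrl: "https://deepforest.readthedocs.io/en/latest/",
  repoUrl: "https://github.com/weecology/DeepForest",
  countDownloads: `path_extension:"pt" OR path_extension:"pl"`,
};

console.log(deepforest.prettyLabel);
```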
package/src/pipelines.ts CHANGED
@@ -656,6 +656,18 @@ export const PIPELINE_DATA = {
     name: "Video-Text-to-Text",
     modality: "multimodal",
     color: "blue",
+    hideInDatasets: false,
+  },
+  "keypoint-detection": {
+    name: "Keypoint Detection",
+    subtasks: [
+      {
+        type: "pose-estimation",
+        name: "Pose Estimation",
+      },
+    ],
+    modality: "cv",
+    color: "red",
     hideInDatasets: true,
   },
   other: {
package/src/snippets/curl.ts CHANGED
@@ -7,8 +7,7 @@ export const snippetBasic = (model: ModelDataMinimal, accessToken: string): stri
   -X POST \\
   -d '{"inputs": ${getModelInputSnippet(model, true)}}' \\
   -H 'Content-Type: application/json' \\
-  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"
-`;
+  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"`;
 
 export const snippetTextGeneration = (model: ModelDataMinimal, accessToken: string): string => {
   if (model.config?.tokenizer_config?.chat_template) {
@@ -33,15 +32,13 @@ export const snippetZeroShotClassification = (model: ModelDataMinimal, accessTok
   -X POST \\
   -d '{"inputs": ${getModelInputSnippet(model, true)}, "parameters": {"candidate_labels": ["refund", "legal", "faq"]}}' \\
   -H 'Content-Type: application/json' \\
-  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"
-`;
+  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"`;
 
 export const snippetFile = (model: ModelDataMinimal, accessToken: string): string =>
   `curl https://api-inference.huggingface.co/models/${model.id} \\
   -X POST \\
   --data-binary '@${getModelInputSnippet(model, true, true)}' \\
-  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"
-`;
+  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"`;
 
 export const curlSnippets: Partial<Record<PipelineType, (model: ModelDataMinimal, accessToken: string) => string>> = {
   // Same order as in js/src/lib/interfaces/Types.ts
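The curl templates above now end right after the last header instead of emitting a trailing newline. A standalone sketch of what a `snippetFile`-style command looks like once rendered — the model id, the empty token, and `sampleInput` (standing in for `getModelInputSnippet`) are placeholders:

```ts
// Standalone rendering of the file-upload curl template; placeholders only.
const modelId = "some-org/some-audio-model";
const accessToken = "";
const sampleInput = "sample1.flac"; // stand-in for getModelInputSnippet(model, true, true)

const curlCommand = `curl https://api-inference.huggingface.co/models/${modelId} \\
  -X POST \\
  --data-binary '@${sampleInput}' \\
  -H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}"`;

console.log(curlCommand);
```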
package/src/snippets/js.ts CHANGED
@@ -36,8 +36,7 @@ for await (const chunk of inference.chatCompletionStream({
   max_tokens: 500,
 })) {
   process.stdout.write(chunk.choices[0]?.delta?.content || "");
-}
-`;
+}`;
   } else {
     return snippetBasic(model, accessToken);
   }
package/src/snippets/python.ts CHANGED
@@ -15,8 +15,7 @@ for message in client.chat_completion(
   max_tokens=500,
   stream=True,
 ):
-  print(message.choices[0].delta.content, end="")
-`;
+  print(message.choices[0].delta.content, end="")`;
 
 export const snippetZeroShotClassification = (model: ModelDataMinimal): string =>
   `def query(payload):
package/src/tasks/audio-classification/inference.ts CHANGED
@@ -8,9 +8,10 @@
  */
 export interface AudioClassificationInput {
   /**
-   * The input audio data
+   * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+   * also provide the audio data as a raw bytes payload.
    */
-  inputs: unknown;
+  inputs: string;
   /**
    * Additional inference parameters
    */
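With `inputs` now typed as a base64 string, a JSON request body can be built along these lines (Node 18+ sketch; the model id is a placeholder, `HF_TOKEN` is assumed to be set, and `top_k` is shown only as an illustrative parameter):

```ts
// Sketch: posting base64-encoded audio in an AudioClassificationInput-shaped payload.
import { readFileSync } from "node:fs";

const payload = {
  inputs: readFileSync("sample1.flac").toString("base64"), // base64-encoded audio
  parameters: { top_k: 3 }, // illustrative parameter
};

const response = await fetch("https://api-inference.huggingface.co/models/some-org/some-audio-classifier", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.HF_TOKEN}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify(payload),
});

console.log(await response.json());
```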
package/src/tasks/audio-classification/spec/input.json CHANGED
@@ -6,7 +6,8 @@
   "type": "object",
   "properties": {
     "inputs": {
-      "description": "The input audio data"
+      "description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
+      "type": "string"
     },
     "parameters": {
       "description": "Additional inference parameters",
package/src/tasks/audio-classification/spec/output.json CHANGED
@@ -5,6 +5,7 @@
   "description": "Outputs for Audio Classification inference",
   "type": "array",
   "items": {
+    "type": "object",
     "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
   }
 }
package/src/tasks/automatic-speech-recognition/inference.ts CHANGED
@@ -9,9 +9,10 @@
  */
 export interface AutomaticSpeechRecognitionInput {
   /**
-   * The input audio data
+   * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+   * also provide the audio data as a raw bytes payload.
    */
-  inputs: unknown;
+  inputs: string;
   /**
    * Additional inference parameters
    */
package/src/tasks/automatic-speech-recognition/spec/input.json CHANGED
@@ -6,7 +6,8 @@
   "type": "object",
   "properties": {
     "inputs": {
-      "description": "The input audio data"
+      "description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
+      "type": "string"
     },
     "parameters": {
       "description": "Additional inference parameters",
package/src/tasks/common-definitions.json CHANGED
@@ -7,17 +7,7 @@
     "ClassificationOutputTransform": {
       "title": "ClassificationOutputTransform",
       "type": "string",
       "description": "The function to apply to the model outputs in order to retrieve the scores.",
-      "oneOf": [
-        {
-          "const": "sigmoid"
-        },
-        {
-          "const": "softmax"
-        },
-        {
-          "const": "none"
-        }
-      ]
+      "enum": ["sigmoid", "softmax", "none"]
     },
     "ClassificationOutput": {
       "title": "ClassificationOutput",
@@ -84,16 +74,9 @@
           "description": "Whether to use sampling instead of greedy decoding when generating new tokens."
         },
         "early_stopping": {
+          "type": ["boolean", "string"],
           "description": "Controls the stopping condition for beam-based methods.",
-          "oneOf": [
-            {
-              "type": "boolean"
-            },
-            {
-              "const": "never",
-              "type": "string"
-            }
-          ]
+          "enum": ["never", true, false]
        },
        "num_beams": {
          "type": "integer",
package/src/tasks/image-classification/inference.ts CHANGED
@@ -8,9 +8,10 @@
  */
 export interface ImageClassificationInput {
   /**
-   * The input image data
+   * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+   * also provide the image data as a raw bytes payload.
    */
-  inputs: unknown;
+  inputs: string;
   /**
    * Additional inference parameters
    */
package/src/tasks/image-classification/spec/input.json CHANGED
@@ -6,7 +6,8 @@
   "type": "object",
   "properties": {
     "inputs": {
-      "description": "The input image data"
+      "type": "string",
+      "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
     },
     "parameters": {
       "description": "Additional inference parameters",
package/src/tasks/image-classification/spec/output.json CHANGED
@@ -5,6 +5,7 @@
   "title": "ImageClassificationOutput",
   "type": "array",
   "items": {
+    "type": "object",
     "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
   }
 }
package/src/tasks/image-segmentation/inference.ts CHANGED
@@ -8,9 +8,10 @@
  */
 export interface ImageSegmentationInput {
   /**
-   * The input image data
+   * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+   * also provide the image data as a raw bytes payload.
    */
-  inputs: unknown;
+  inputs: string;
   /**
    * Additional inference parameters
    */
@@ -41,6 +42,9 @@ export interface ImageSegmentationParameters {
   threshold?: number;
   [property: string]: unknown;
 }
+/**
+ * Segmentation task to be performed, depending on model capabilities.
+ */
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
 export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
 /**
@@ -50,15 +54,15 @@ export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
  */
 export interface ImageSegmentationOutputElement {
   /**
-   * The label of the predicted segment
+   * The label of the predicted segment.
    */
   label: string;
   /**
-   * The corresponding mask as a black-and-white image
+   * The corresponding mask as a black-and-white image (base64-encoded).
    */
-  mask: unknown;
+  mask: string;
   /**
-   * The score or confidence degreee the model has
+   * The score or confidence degree the model has.
    */
   score?: number;
   [property: string]: unknown;
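Since `mask` is now a base64-encoded image string, each returned segment can be decoded and written to disk. A Node sketch — the `result` value is a made-up response with a truncated mask, not real API output:

```ts
// Sketch: decoding base64 masks from ImageSegmentationOutput-shaped data (Node).
import { writeFileSync } from "node:fs";

type SegmentSketch = { label: string; mask: string; score?: number };

const result: SegmentSketch[] = [
  { label: "cat", mask: "iVBORw0KGgo...", score: 0.98 }, // truncated base64, for illustration
];

for (const [i, segment] of result.entries()) {
  // Buffer decodes the base64 string into the raw image bytes of the mask.
  writeFileSync(`mask-${i}-${segment.label}.png`, Buffer.from(segment.mask, "base64"));
}
```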
package/src/tasks/image-segmentation/spec/input.json CHANGED
@@ -6,7 +6,8 @@
   "type": "object",
   "properties": {
     "inputs": {
-      "description": "The input image data"
+      "type": "string",
+      "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
     },
     "parameters": {
       "description": "Additional inference parameters",
@@ -31,17 +32,7 @@
       "title": "ImageSegmentationSubtask",
       "type": "string",
       "description": "Segmentation task to be performed, depending on model capabilities.",
-      "oneOf": [
-        {
-          "const": "instance"
-        },
-        {
-          "const": "panoptic"
-        },
-        {
-          "const": "semantic"
-        }
-      ]
+      "enum": ["instance", "panoptic", "semantic"]
     },
     "threshold": {
       "type": "number",
package/src/tasks/image-segmentation/spec/output.json CHANGED
@@ -10,14 +10,15 @@
     "properties": {
       "label": {
         "type": "string",
-        "description": "The label of the predicted segment"
+        "description": "The label of the predicted segment."
       },
       "mask": {
-        "description": "The corresponding mask as a black-and-white image"
+        "type": "string",
+        "description": "The corresponding mask as a black-and-white image (base64-encoded)."
       },
       "score": {
         "type": "number",
-        "description": "The score or confidence degreee the model has"
+        "description": "The score or confidence degree the model has."
       }
     },
     "required": ["label", "mask"]
package/src/tasks/image-to-image/inference.ts CHANGED
@@ -9,9 +9,10 @@
  */
 export interface ImageToImageInput {
   /**
-   * The input image data
+   * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+   * also provide the image data as a raw bytes payload.
    */
-  inputs: unknown;
+  inputs: string;
   /**
    * Additional inference parameters
    */
@@ -40,14 +41,14 @@ export interface ImageToImageParameters {
    */
   num_inference_steps?: number;
   /**
-   * The size in pixel of the output image
+   * The size in pixel of the output image.
    */
   target_size?: TargetSize;
   [property: string]: unknown;
 }
 
 /**
- * The size in pixel of the output image
+ * The size in pixel of the output image.
  */
 export interface TargetSize {
   height: number;
@@ -60,7 +61,7 @@ export interface TargetSize {
  */
 export interface ImageToImageOutput {
   /**
-   * The output image
+   * The output image returned as raw bytes in the payload.
    */
   image?: unknown;
   [property: string]: unknown;
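Image-to-image now takes a base64 `inputs` string and returns the generated image as raw bytes rather than JSON. A Node 18+ sketch of a round trip — the model id and file names are placeholders, `HF_TOKEN` is assumed to be set, and `target_size` is the optional parameter typed above:

```ts
// Sketch: calling an image-to-image endpoint and saving the raw-bytes response (Node 18+).
import { readFileSync, writeFileSync } from "node:fs";

const body = JSON.stringify({
  inputs: readFileSync("input.jpg").toString("base64"), // base64-encoded source image
  parameters: { target_size: { width: 512, height: 512 } },
});

const res = await fetch("https://api-inference.huggingface.co/models/some-org/some-img2img-model", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.HF_TOKEN}`,
    "Content-Type": "application/json",
  },
  body,
});

// The response payload is the image itself, not JSON.
writeFileSync("output.png", Buffer.from(await res.arrayBuffer()));
```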
package/src/tasks/image-to-image/spec/input.json CHANGED
@@ -6,7 +6,8 @@
   "type": "object",
   "properties": {
     "inputs": {
-      "description": "The input image data"
+      "type": "string",
+      "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
     },
     "parameters": {
       "description": "Additional inference parameters",
@@ -36,7 +37,7 @@
     },
     "target_size": {
       "type": "object",
-      "description": "The size in pixel of the output image",
+      "description": "The size in pixel of the output image.",
       "properties": {
         "width": {
           "type": "integer"
package/src/tasks/image-to-image/spec/output.json CHANGED
@@ -6,7 +6,7 @@
   "type": "object",
   "properties": {
     "image": {
-      "description": "The output image"
+      "description": "The output image returned as raw bytes in the payload."
     }
   }
 }
package/src/tasks/index.ts CHANGED
@@ -73,12 +73,7 @@ export type * from "./table-question-answering/inference";
 export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference";
 export type { TextToAudioParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference";
 export type * from "./token-classification/inference";
-export type {
-  Text2TextGenerationParameters,
-  Text2TextGenerationTruncationStrategy,
-  TranslationInput,
-  TranslationOutput,
-} from "./translation/inference";
+export type { TranslationInput, TranslationOutput } from "./translation/inference";
 export type {
   ClassificationOutputTransform,
   TextClassificationInput,
@@ -131,6 +126,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "image-to-image": ["diffusers", "transformers", "transformers.js"],
   "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
+  "keypoint-detection": ["transformers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
   "multiple-choice": ["transformers"],
@@ -210,6 +206,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
   "image-text-to-text": getData("image-text-to-text", imageTextToText),
   "image-to-text": getData("image-to-text", imageToText),
   "image-to-video": undefined,
+  "keypoint-detection": getData("keypoint-detection", placeholder),
   "mask-generation": getData("mask-generation", maskGeneration),
   "multiple-choice": undefined,
   "object-detection": getData("object-detection", objectDetection),
package/src/tasks/keypoint-detection/about.md ADDED
@@ -0,0 +1,59 @@
+## Task Variants
+
+### Pose Estimation
+
+Pose estimation is the process of determining the position and orientation of an object or a camera in a 3D space. It is a fundamental task in computer vision and is widely used in various applications such as robotics, augmented reality, and 3D reconstruction.
+
+## Use Cases for Keypoint Detection
+
+### Facial Landmark Estimation
+
+Keypoint detection models can be used to estimate the position of facial landmarks. Facial landmarks are points on the face such as the corners of the mouth, the outer corners of the eyes, and the tip of the nose. These landmarks can be used for a variety of applications, such as facial expression recognition, 3D face reconstruction, and cinematic animation.
+
+### Fitness Tracking
+
+Keypoint detection models can be used to track the movement of the human body, e.g. the position of the joints in 3D space. This can be used for a variety of applications, such as fitness tracking, sports analysis, or virtual reality applications.
+
+## Inference Code
+
+Below you can find an example of how to use a keypoint detection model and how to visualize the results.
+
+```python
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+import torch
+import matplotlib.pyplot as plt
+from PIL import Image
+import requests
+
+url_image = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url_image, stream=True).raw)
+
+# initialize the model and processor
+processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+
+# infer
+inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)
+outputs = model(**inputs)
+
+# visualize the output
+image_width, image_height = image.size
+image_mask = outputs.mask
+image_indices = torch.nonzero(image_mask).squeeze()
+
+image_scores = outputs.scores.squeeze()
+image_keypoints = outputs.keypoints.squeeze()
+keypoints = image_keypoints.detach().numpy()
+scores = image_scores.detach().numpy()
+
+plt.axis('off')
+plt.imshow(image)
+plt.scatter(
+  keypoints[:, 0],
+  keypoints[:, 1],
+  s=scores * 100,
+  c='cyan',
+  alpha=0.4
+)
+plt.show()
+```
package/src/tasks/keypoint-detection/data.ts ADDED
@@ -0,0 +1,46 @@
+import type { TaskDataCustom } from "..";
+
+const taskData: TaskDataCustom = {
+  datasets: [
+    {
+      description: "A dataset of hand keypoints of over 500k examples.",
+      id: "Vincent-luo/hagrid-mediapipe-hands",
+    },
+  ],
+  demo: {
+    inputs: [
+      {
+        filename: "keypoint-detection-input.png",
+        type: "img",
+      },
+    ],
+    outputs: [
+      {
+        filename: "keypoint-detection-output.png",
+        type: "img",
+      },
+    ],
+  },
+  metrics: [],
+  models: [
+    {
+      description: "A robust keypoint detection model.",
+      id: "magic-leap-community/superpoint",
+    },
+    {
+      description: "Strong keypoint detection model used to detect human pose.",
+      id: "qualcomm/MediaPipe-Pose-Estimation",
+    },
+  ],
+  spaces: [
+    {
+      description: "An application that detects hand keypoints in real-time.",
+      id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
+    },
+  ],
+  summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
+  widgetModels: [],
+  youtubeId: "",
+};
+
+export default taskData;