@huggingface/tasks 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/README.md +1 -1
  2. package/dist/{index.mjs → index.cjs} +2695 -2497
  3. package/dist/index.d.ts +427 -65
  4. package/dist/index.js +2660 -2532
  5. package/package.json +13 -8
  6. package/src/index.ts +2 -5
  7. package/src/library-to-tasks.ts +1 -1
  8. package/src/model-data.ts +1 -1
  9. package/src/model-libraries-downloads.ts +20 -0
  10. package/src/{library-ui-elements.ts → model-libraries-snippets.ts} +50 -296
  11. package/src/model-libraries.ts +375 -44
  12. package/src/pipelines.ts +1 -1
  13. package/src/tasks/audio-classification/about.md +1 -1
  14. package/src/tasks/audio-classification/inference.ts +51 -0
  15. package/src/tasks/audio-classification/spec/input.json +34 -0
  16. package/src/tasks/audio-classification/spec/output.json +10 -0
  17. package/src/tasks/audio-to-audio/about.md +1 -1
  18. package/src/tasks/automatic-speech-recognition/about.md +4 -2
  19. package/src/tasks/automatic-speech-recognition/inference.ts +159 -0
  20. package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
  21. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  22. package/src/tasks/common-definitions.json +117 -0
  23. package/src/tasks/depth-estimation/data.ts +8 -4
  24. package/src/tasks/depth-estimation/inference.ts +35 -0
  25. package/src/tasks/depth-estimation/spec/input.json +25 -0
  26. package/src/tasks/depth-estimation/spec/output.json +16 -0
  27. package/src/tasks/document-question-answering/inference.ts +110 -0
  28. package/src/tasks/document-question-answering/spec/input.json +85 -0
  29. package/src/tasks/document-question-answering/spec/output.json +36 -0
  30. package/src/tasks/feature-extraction/inference.ts +22 -0
  31. package/src/tasks/feature-extraction/spec/input.json +26 -0
  32. package/src/tasks/feature-extraction/spec/output.json +7 -0
  33. package/src/tasks/fill-mask/inference.ts +62 -0
  34. package/src/tasks/fill-mask/spec/input.json +38 -0
  35. package/src/tasks/fill-mask/spec/output.json +29 -0
  36. package/src/tasks/image-classification/inference.ts +51 -0
  37. package/src/tasks/image-classification/spec/input.json +34 -0
  38. package/src/tasks/image-classification/spec/output.json +10 -0
  39. package/src/tasks/image-segmentation/inference.ts +65 -0
  40. package/src/tasks/image-segmentation/spec/input.json +54 -0
  41. package/src/tasks/image-segmentation/spec/output.json +25 -0
  42. package/src/tasks/image-to-image/inference.ts +67 -0
  43. package/src/tasks/image-to-image/spec/input.json +54 -0
  44. package/src/tasks/image-to-image/spec/output.json +12 -0
  45. package/src/tasks/image-to-text/inference.ts +143 -0
  46. package/src/tasks/image-to-text/spec/input.json +34 -0
  47. package/src/tasks/image-to-text/spec/output.json +14 -0
  48. package/src/tasks/index.ts +5 -2
  49. package/src/tasks/mask-generation/about.md +65 -0
  50. package/src/tasks/mask-generation/data.ts +42 -5
  51. package/src/tasks/object-detection/inference.ts +62 -0
  52. package/src/tasks/object-detection/spec/input.json +30 -0
  53. package/src/tasks/object-detection/spec/output.json +46 -0
  54. package/src/tasks/placeholder/data.ts +3 -0
  55. package/src/tasks/placeholder/spec/input.json +35 -0
  56. package/src/tasks/placeholder/spec/output.json +17 -0
  57. package/src/tasks/question-answering/inference.ts +99 -0
  58. package/src/tasks/question-answering/spec/input.json +67 -0
  59. package/src/tasks/question-answering/spec/output.json +29 -0
  60. package/src/tasks/sentence-similarity/about.md +2 -2
  61. package/src/tasks/sentence-similarity/inference.ts +32 -0
  62. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  63. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  64. package/src/tasks/summarization/data.ts +1 -0
  65. package/src/tasks/summarization/inference.ts +59 -0
  66. package/src/tasks/summarization/spec/input.json +7 -0
  67. package/src/tasks/summarization/spec/output.json +7 -0
  68. package/src/tasks/table-question-answering/inference.ts +61 -0
  69. package/src/tasks/table-question-answering/spec/input.json +44 -0
  70. package/src/tasks/table-question-answering/spec/output.json +40 -0
  71. package/src/tasks/tabular-classification/about.md +1 -1
  72. package/src/tasks/tabular-regression/about.md +1 -1
  73. package/src/tasks/text-classification/about.md +1 -0
  74. package/src/tasks/text-classification/inference.ts +51 -0
  75. package/src/tasks/text-classification/spec/input.json +35 -0
  76. package/src/tasks/text-classification/spec/output.json +10 -0
  77. package/src/tasks/text-generation/about.md +24 -13
  78. package/src/tasks/text-generation/data.ts +22 -38
  79. package/src/tasks/text-generation/inference.ts +194 -0
  80. package/src/tasks/text-generation/spec/input.json +90 -0
  81. package/src/tasks/text-generation/spec/output.json +120 -0
  82. package/src/tasks/text-to-audio/inference.ts +143 -0
  83. package/src/tasks/text-to-audio/spec/input.json +31 -0
  84. package/src/tasks/text-to-audio/spec/output.json +17 -0
  85. package/src/tasks/text-to-image/about.md +11 -2
  86. package/src/tasks/text-to-image/data.ts +6 -2
  87. package/src/tasks/text-to-image/inference.ts +71 -0
  88. package/src/tasks/text-to-image/spec/input.json +59 -0
  89. package/src/tasks/text-to-image/spec/output.json +13 -0
  90. package/src/tasks/text-to-speech/about.md +4 -2
  91. package/src/tasks/text-to-speech/data.ts +1 -0
  92. package/src/tasks/text-to-speech/inference.ts +147 -0
  93. package/src/tasks/text-to-speech/spec/input.json +7 -0
  94. package/src/tasks/text-to-speech/spec/output.json +7 -0
  95. package/src/tasks/text2text-generation/inference.ts +55 -0
  96. package/src/tasks/text2text-generation/spec/input.json +55 -0
  97. package/src/tasks/text2text-generation/spec/output.json +14 -0
  98. package/src/tasks/token-classification/inference.ts +82 -0
  99. package/src/tasks/token-classification/spec/input.json +65 -0
  100. package/src/tasks/token-classification/spec/output.json +33 -0
  101. package/src/tasks/translation/data.ts +1 -0
  102. package/src/tasks/translation/inference.ts +59 -0
  103. package/src/tasks/translation/spec/input.json +7 -0
  104. package/src/tasks/translation/spec/output.json +7 -0
  105. package/src/tasks/video-classification/inference.ts +59 -0
  106. package/src/tasks/video-classification/spec/input.json +42 -0
  107. package/src/tasks/video-classification/spec/output.json +10 -0
  108. package/src/tasks/visual-question-answering/inference.ts +63 -0
  109. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  110. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  111. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  112. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  113. package/src/tasks/zero-shot-classification/spec/output.json +10 -0
  114. package/src/tasks/zero-shot-image-classification/data.ts +8 -5
  115. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  116. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  117. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  118. package/src/tasks/zero-shot-object-detection/about.md +6 -0
  119. package/src/tasks/zero-shot-object-detection/data.ts +6 -1
  120. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  121. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  122. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  123. package/tsconfig.json +3 -3
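The bulk of this release is a new machine-readable spec for each task: a `spec/input.json` and `spec/output.json` JSON Schema plus an `inference.ts` with TypeScript types generated from it (via `src/scripts/inference-codegen`). As a rough, hypothetical sketch of consuming those generated types, here is how a request/response pair for the image-to-text task (whose generated file is the first hunk below) could be typed; the deep import path is illustrative and depends on how the types end up being exposed:

```typescript
// Hypothetical consumer of the generated types; the deep import path is
// illustrative, not a documented entry point of the package.
import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks/src/tasks/image-to-text/inference";

const request: ImageToTextInput = {
	inputs: "<binary or base64-encoded image data>",
	parameters: { max_new_tokens: 50, generate: { num_beams: 3 } },
};

function caption(output: ImageToTextOutput): string {
	// generated_text is optional in the generated interface, so guard for it
	return output.generated_text ?? "(no text generated)";
}
```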
package/src/tasks/image-to-text/inference.ts
@@ -0,0 +1,143 @@
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+ /**
+ * Inputs for Image To Text inference
+ */
+ export interface ImageToTextInput {
+ /**
+ * The input image data
+ */
+ inputs: unknown;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: ImageToTextParameters;
+ [property: string]: unknown;
+ }
+
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Text
+ */
+ export interface ImageToTextParameters {
+ /**
+ * Parametrization of the text generation process
+ */
+ generate?: GenerationParameters;
+ /**
+ * The amount of maximum tokens to generate.
+ */
+ max_new_tokens?: number;
+ [property: string]: unknown;
+ }
+
+ /**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+ export interface GenerationParameters {
+ /**
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
+ */
+ do_sample?: boolean;
+ /**
+ * Controls the stopping condition for beam-based methods.
+ */
+ early_stopping?: EarlyStoppingUnion;
+ /**
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+ */
+ epsilon_cutoff?: number;
+ /**
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+ * for more details.
+ */
+ eta_cutoff?: number;
+ /**
+ * The maximum length (in tokens) of the generated text, including the input.
+ */
+ max_length?: number;
+ /**
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
+ */
+ max_new_tokens?: number;
+ /**
+ * The minimum length (in tokens) of the generated text, including the input.
+ */
+ min_length?: number;
+ /**
+ * The minimum number of tokens to generate. Takes precedence over maxLength.
+ */
+ min_new_tokens?: number;
+ /**
+ * Number of groups to divide num_beams into in order to ensure diversity among different
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+ */
+ num_beam_groups?: number;
+ /**
+ * Number of beams to use for beam search.
+ */
+ num_beams?: number;
+ /**
+ * The value balances the model confidence and the degeneration penalty in contrastive
+ * search decoding.
+ */
+ penalty_alpha?: number;
+ /**
+ * The value used to modulate the next token probabilities.
+ */
+ temperature?: number;
+ /**
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+ */
+ top_k?: number;
+ /**
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
+ * that add up to top_p or higher are kept for generation.
+ */
+ top_p?: number;
+ /**
+ * Local typicality measures how similar the conditional probability of predicting a target
+ * token next is to the expected conditional probability of predicting a random token next,
+ * given the partial text already generated. If set to float < 1, the smallest set of the
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+ */
+ typical_p?: number;
+ /**
+ * Whether the model should use the past last key/values attentions to speed up decoding
+ */
+ use_cache?: boolean;
+ [property: string]: unknown;
+ }
+
+ /**
+ * Controls the stopping condition for beam-based methods.
+ */
+ export type EarlyStoppingUnion = boolean | "never";
+
+ /**
+ * Outputs of inference for the Image To Text task
+ */
+ export interface ImageToTextOutput {
+ generatedText: unknown;
+ /**
+ * The generated text.
+ */
+ generated_text?: string;
+ [property: string]: unknown;
+ }
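The `GenerationParameters` block above is the shared, task-agnostic text-generation configuration (it is also referenced from `common-definitions.json`). A minimal sketch of how it might be populated, assuming the generated type is imported into scope; the values are illustrative, not package defaults:

```typescript
// Illustrative configurations only; values are not defaults from the package.
// The import path is hypothetical.
import type { GenerationParameters } from "@huggingface/tasks/src/tasks/image-to-text/inference";

const sampling: GenerationParameters = {
	do_sample: true,
	temperature: 0.7,
	top_k: 50,
	top_p: 0.9,
	max_new_tokens: 64,
};

const beamSearch: GenerationParameters = {
	do_sample: false,
	num_beams: 4,
	num_beam_groups: 2,
	early_stopping: "never",
	max_length: 256,
};
```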
package/src/tasks/image-to-text/spec/input.json
@@ -0,0 +1,34 @@
+ {
+ "$id": "/inference/schemas/image-to-text/input.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Inputs for Image To Text inference",
+ "title": "ImageToTextInput",
+ "type": "object",
+ "properties": {
+ "inputs": {
+ "description": "The input image data"
+ },
+ "parameters": {
+ "description": "Additional inference parameters",
+ "$ref": "#/$defs/ImageToTextParameters"
+ }
+ },
+ "$defs": {
+ "ImageToTextParameters": {
+ "title": "ImageToTextParameters",
+ "description": "Additional inference parameters for Image To Text",
+ "type": "object",
+ "properties": {
+ "max_new_tokens": {
+ "type": "integer",
+ "description": "The amount of maximum tokens to generate."
+ },
+ "generate": {
+ "description": "Parametrization of the text generation process",
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
+ }
+ }
+ }
+ },
+ "required": ["inputs"]
+ }
package/src/tasks/image-to-text/spec/output.json
@@ -0,0 +1,14 @@
+ {
+ "$id": "/inference/schemas/image-to-text/output.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Outputs of inference for the Image To Text task",
+ "title": "ImageToTextOutput",
+ "type": "object",
+ "properties": {
+ "generated_text": {
+ "type": "string",
+ "description": "The generated text."
+ }
+ },
+ "required": ["generatedText"]
+ }
package/src/tasks/index.ts
@@ -11,6 +11,7 @@ import imageClassification from "./image-classification/data";
  import imageToImage from "./image-to-image/data";
  import imageToText from "./image-to-text/data";
  import imageSegmentation from "./image-segmentation/data";
+ import maskGeneration from "./mask-generation/data";
  import objectDetection from "./object-detection/data";
  import depthEstimation from "./depth-estimation/data";
  import placeholder from "./placeholder/data";
@@ -33,6 +34,7 @@ import videoClassification from "./video-classification/data";
  import visualQuestionAnswering from "./visual-question-answering/data";
  import zeroShotClassification from "./zero-shot-classification/data";
  import zeroShotImageClassification from "./zero-shot-image-classification/data";
+ import zeroShotObjectDetection from "./zero-shot-object-detection/data";

  import type { ModelLibraryKey } from "../model-libraries";

@@ -131,7 +133,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
  "image-to-image": getData("image-to-image", imageToImage),
  "image-to-text": getData("image-to-text", imageToText),
  "image-to-video": undefined,
- "mask-generation": getData("mask-generation", placeholder),
+ "mask-generation": getData("mask-generation", maskGeneration),
  "multiple-choice": undefined,
  "object-detection": getData("object-detection", objectDetection),
  "video-classification": getData("video-classification", videoClassification),
@@ -162,7 +164,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
  "voice-activity-detection": undefined,
  "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
  "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
- "zero-shot-object-detection": getData("zero-shot-object-detection", placeholder),
+ "zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
  "text-to-3d": getData("text-to-3d", placeholder),
  "image-to-3d": getData("image-to-3d", placeholder),
  } as const;
@@ -216,6 +218,7 @@ export interface TaskData {
  datasets: ExampleRepo[];
  demo: TaskDemo;
  id: PipelineType;
+ canonicalId?: PipelineType;
  isPlaceholder?: boolean;
  label: string;
  libraries: ModelLibraryKey[];
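With the two wiring changes above, the mask-generation and zero-shot-object-detection entries now resolve to real task data instead of the placeholder. A quick sanity check, assuming `TASKS_DATA` is exported from the package entry point as it was in 0.2.x:

```typescript
import { TASKS_DATA } from "@huggingface/tasks";

// Both entries are typed as TaskData | undefined, hence the optional chaining.
const maskGeneration = TASKS_DATA["mask-generation"];
console.log(maskGeneration?.models?.map((m) => m.id));
// expected to include "Zigeng/SlimSAM-uniform-50" and "facebook/sam-vit-huge"

console.log(TASKS_DATA["zero-shot-object-detection"]?.id); // "zero-shot-object-detection"
```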
package/src/tasks/mask-generation/about.md
@@ -0,0 +1,65 @@
+ ## Use Cases
+
+ ### Filtering an Image
+
+ When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
+
+ ### Masked Image Modelling
+
+ Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
+
+ ### Human-in-the-loop Computer Vision Applications
+
+ For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
+
+ ## Task Variants
+
+ ### Segmentation
+
+ Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
+
+ ## Inference
+
+ Mask generation models often work in two modes: segment everything or prompt mode.
+ The example below works in segment-everything-mode, where many masks will be returned.
+
+ ```python
+ from transformers import pipeline
+
+ generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ outputs = generator(image_url)
+ outputs["masks"]
+ # array of multiple binary masks returned for each generated mask
+ ```
+
+ Prompt mode takes in three types of prompts:
+
+ - **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
+ - **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
+ - **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
+
+ Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
+
+ ```python
+ from transformers import SamModel, SamProcessor
+ from PIL import Image
+ import requests
+
+ model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
+ processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
+
+ raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+ # pointing to the car window
+ input_points = [[[450, 600]]]
+ inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
+ outputs = model(**inputs)
+ masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+ scores = outputs.iou_scores
+ ```
+
+ ## Useful Resources
+
+ Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
+
+ - [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)
package/src/tasks/mask-generation/data.ts
@@ -3,14 +3,51 @@ import type { TaskDataCustom } from "..";
  const taskData: TaskDataCustom = {
  datasets: [],
  demo: {
- inputs: [],
- outputs: [],
+ inputs: [
+ {
+ filename: "mask-generation-input.png",
+ type: "img",
+ },
+ ],
+ outputs: [
+ {
+ filename: "mask-generation-output.png",
+ type: "img",
+ },
+ ],
  },
  metrics: [],
- models: [],
- spaces: [],
+ models: [
+ {
+ description: "Small yet powerful mask generation model.",
+ id: "Zigeng/SlimSAM-uniform-50",
+ },
+ {
+ description: "Very strong mask generation model.",
+ id: "facebook/sam-vit-huge",
+ },
+ ],
+ spaces: [
+ {
+ description:
+ "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
+ id: "SkalskiP/SAM_and_MetaCLIP",
+ },
+ {
+ description: "An application that compares the performance of a large and a small mask generation model.",
+ id: "merve/slimsam",
+ },
+ {
+ description: "An application based on an improved mask generation model.",
+ id: "linfanluntan/Grounded-SAM",
+ },
+ {
+ description: "An application to remove objects from videos using mask generation models.",
+ id: "SkalskiP/SAM_and_ProPainter",
+ },
+ ],
  summary:
- "Mask generation is creating a binary image that identifies a specific object or region of interest in an input image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
+ "Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
  widgetModels: [],
  youtubeId: "",
  };
package/src/tasks/object-detection/inference.ts
@@ -0,0 +1,62 @@
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Object Detection inference
+ */
+ export interface ObjectDetectionInput {
+ /**
+ * The input image data
+ */
+ inputs: unknown;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: ObjectDetectionParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Object Detection
+ */
+ export interface ObjectDetectionParameters {
+ /**
+ * The probability necessary to make a prediction.
+ */
+ threshold?: number;
+ [property: string]: unknown;
+ }
+ /**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+ export interface BoundingBox {
+ xmax: number;
+ xmin: number;
+ ymax: number;
+ ymin: number;
+ [property: string]: unknown;
+ }
+ export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
+ /**
+ * Outputs of inference for the Object Detection task
+ */
+ export interface ObjectDetectionOutputElement {
+ /**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+ box: BoundingBox;
+ /**
+ * The predicted label for the bounding box
+ */
+ label: string;
+ /**
+ * The associated score / probability
+ */
+ score: number;
+ [property: string]: unknown;
+ }
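A small sketch of post-processing a typed detection result, consistent with the `threshold` semantics above. It assumes the generated `ObjectDetectionOutput` types are imported into scope; the helper is illustrative, not part of the package:

```typescript
// Illustrative helper, not part of @huggingface/tasks.
function keepConfident(detections: ObjectDetectionOutput, minScore = 0.9): ObjectDetectionOutputElement[] {
	return detections.filter((d) => d.score >= minScore);
}

const detections: ObjectDetectionOutput = [
	{ label: "car", score: 0.97, box: { xmin: 12, ymin: 30, xmax: 410, ymax: 250 } },
	{ label: "person", score: 0.42, box: { xmin: 5, ymin: 10, xmax: 60, ymax: 180 } },
];

keepConfident(detections); // keeps only the "car" detection
```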
package/src/tasks/object-detection/spec/input.json
@@ -0,0 +1,30 @@
+ {
+ "$id": "/inference/schemas/object-detection/input.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Inputs for Object Detection inference",
+ "title": "ObjectDetectionInput",
+ "type": "object",
+ "properties": {
+ "inputs": {
+ "description": "The input image data"
+ },
+ "parameters": {
+ "description": "Additional inference parameters",
+ "$ref": "#/$defs/ObjectDetectionParameters"
+ }
+ },
+ "$defs": {
+ "ObjectDetectionParameters": {
+ "title": "ObjectDetectionParameters",
+ "description": "Additional inference parameters for Object Detection",
+ "type": "object",
+ "properties": {
+ "threshold": {
+ "type": "number",
+ "description": "The probability necessary to make a prediction."
+ }
+ }
+ }
+ },
+ "required": ["inputs"]
+ }
package/src/tasks/object-detection/spec/output.json
@@ -0,0 +1,46 @@
+ {
+ "$id": "/inference/schemas/object-detection/output.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Outputs of inference for the Object Detection task",
+ "title": "ObjectDetectionOutput",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "label": {
+ "type": "string",
+ "description": "The predicted label for the bounding box"
+ },
+ "score": {
+ "type": "number",
+ "description": "The associated score / probability"
+ },
+ "box": {
+ "$ref": "#/$defs/BoundingBox",
+ "description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+ }
+ },
+ "required": ["box", "label", "score"]
+ },
+ "$defs": {
+ "BoundingBox": {
+ "type": "object",
+ "title": "BoundingBox",
+ "properties": {
+ "xmin": {
+ "type": "integer"
+ },
+ "xmax": {
+ "type": "integer"
+ },
+ "ymin": {
+ "type": "integer"
+ },
+ "ymax": {
+ "type": "integer"
+ }
+ },
+ "required": ["xmin", "xmax", "ymin", "ymax"]
+ }
+ }
+ }
package/src/tasks/placeholder/data.ts
@@ -13,6 +13,9 @@ const taskData: TaskDataCustom = {
  summary: "",
  widgetModels: [],
  youtubeId: undefined,
+ /// If this is a subtask, link to the most general task ID
+ /// (eg, text2text-generation is the canonical ID of translation)
+ canonicalId: undefined,
  };

  export default taskData;
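The new optional `canonicalId` field lets a subtask's data file point at its most general task, as the comment above describes. A hypothetical subtask data file using it might look like the sketch below; the actual one-line additions to `summarization/data.ts` and `translation/data.ts` are not shown in this diff:

```typescript
// Hypothetical data file for a subtask; field values are illustrative.
import type { TaskDataCustom } from "..";

const taskData: TaskDataCustom = {
	// translation is a specialization of text2text-generation
	canonicalId: "text2text-generation",
	datasets: [],
	demo: { inputs: [], outputs: [] },
	metrics: [],
	models: [],
	spaces: [],
	summary: "Translation is the task of converting text from one language to another.",
	widgetModels: [],
	youtubeId: undefined,
};

export default taskData;
```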
package/src/tasks/placeholder/spec/input.json
@@ -0,0 +1,35 @@
+ {
+ "$id": "/inference/schemas/<TASK_ID>/input.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Inputs for <TASK_ID> inference",
+ "title": "PlaceholderInput",
+ "type": "object",
+ "properties": {
+ "inputs": {
+ "description": "TODO: describe the input here. This must be model & framework agnostic.",
+ "type": "string"
+ },
+ "parameters": {
+ "description": "Additional inference parameters",
+ "$ref": "#/$defs/<TASK_ID>Parameters"
+ }
+ },
+ "$defs": {
+ "<TASK_ID>Parameters": {
+ "title": "<TASK_ID>Parameters",
+ "description": "TODO: describe additional parameters here.",
+ "type": "object",
+ "properties": {
+ "dummy_parameter_name": {
+ "type": "boolean",
+ "description": "TODO: describe the parameter here"
+ },
+ "dummy_parameter_name2": {
+ "type": "integer",
+ "description": "TODO: describe the parameter here"
+ }
+ }
+ }
+ },
+ "required": ["inputs"]
+ }
package/src/tasks/placeholder/spec/output.json
@@ -0,0 +1,17 @@
+ {
+ "$id": "/inference/schemas/<TASK_ID>/output.json",
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "description": "Outputs for <TASK_ID> inference",
+ "title": "PlaceholderOutput",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "meaningful_output_name": {
+ "type": "string",
+ "description": "TODO: Describe what is outputed by the inference here"
+ }
+ },
+ "required": ["meaningfulOutputName"]
+ }
+ }
package/src/tasks/question-answering/inference.ts
@@ -0,0 +1,99 @@
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Question Answering inference
+ */
+ export interface QuestionAnsweringInput {
+ /**
+ * One (context, question) pair to answer
+ */
+ inputs: QuestionAnsweringInputData;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: QuestionAnsweringParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * One (context, question) pair to answer
+ */
+ export interface QuestionAnsweringInputData {
+ /**
+ * The context to be used for answering the question
+ */
+ context: string;
+ /**
+ * The question to be answered
+ */
+ question: string;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Question Answering
+ */
+ export interface QuestionAnsweringParameters {
+ /**
+ * Attempts to align the answer to real words. Improves quality on space separated
+ * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+ */
+ align_to_words?: boolean;
+ /**
+ * If the context is too long to fit with the question for the model, it will be split in
+ * several chunks with some overlap. This argument controls the size of that overlap.
+ */
+ doc_stride?: number;
+ /**
+ * Whether to accept impossible as an answer.
+ */
+ handle_impossible_answer?: boolean;
+ /**
+ * The maximum length of predicted answers (e.g., only answers with a shorter length are
+ * considered).
+ */
+ max_answer_len?: number;
+ /**
+ * The maximum length of the question after tokenization. It will be truncated if needed.
+ */
+ max_question_len?: number;
+ /**
+ * The maximum length of the total sentence (context + question) in tokens of each chunk
+ * passed to the model. The context will be split in several chunks (using docStride as
+ * overlap) if needed.
+ */
+ max_seq_len?: number;
+ /**
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
+ * return less than topk answers if there are not enough options available within the
+ * context.
+ */
+ top_k?: number;
+ [property: string]: unknown;
+ }
+ export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
+ /**
+ * Outputs of inference for the Question Answering task
+ */
+ export interface QuestionAnsweringOutputElement {
+ /**
+ * The answer to the question.
+ */
+ answer: string;
+ /**
+ * The character position in the input where the answer ends.
+ */
+ end: number;
+ /**
+ * The probability associated to the answer.
+ */
+ score: number;
+ /**
+ * The character position in the input where the answer begins.
+ */
+ start: number;
+ [property: string]: unknown;
+ }
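To close, a sketch of a typed request/response pair for this task, assuming the generated types above are in scope; the values are illustrative:

```typescript
const request: QuestionAnsweringInput = {
	inputs: {
		question: "Where do the answer offsets point?",
		context: "start and end are character positions in the input context.",
	},
	parameters: { top_k: 2, handle_impossible_answer: false },
};

// A conforming response: start and end are character offsets into `context`.
const answers: QuestionAnsweringOutput = [
	{ answer: "character positions", start: 18, end: 37, score: 0.88 },
];
```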