@huggingface/tasks 0.11.12 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +74 -2
- package/dist/index.js +74 -2
- package/dist/src/model-libraries-snippets.d.ts +1 -0
- package/dist/src/model-libraries-snippets.d.ts.map +1 -1
- package/dist/src/model-libraries.d.ts +15 -2
- package/dist/src/model-libraries.d.ts.map +1 -1
- package/dist/src/pipelines.d.ts +18 -2
- package/dist/src/pipelines.d.ts.map +1 -1
- package/dist/src/tasks/audio-classification/inference.d.ts +3 -2
- package/dist/src/tasks/audio-classification/inference.d.ts.map +1 -1
- package/dist/src/tasks/automatic-speech-recognition/inference.d.ts +3 -2
- package/dist/src/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
- package/dist/src/tasks/image-classification/inference.d.ts +3 -2
- package/dist/src/tasks/image-classification/inference.d.ts.map +1 -1
- package/dist/src/tasks/image-segmentation/inference.d.ts +10 -6
- package/dist/src/tasks/image-segmentation/inference.d.ts.map +1 -1
- package/dist/src/tasks/image-to-image/inference.d.ts +6 -5
- package/dist/src/tasks/image-to-image/inference.d.ts.map +1 -1
- package/dist/src/tasks/index.d.ts +1 -1
- package/dist/src/tasks/index.d.ts.map +1 -1
- package/dist/src/tasks/keypoint-detection/data.d.ts +4 -0
- package/dist/src/tasks/keypoint-detection/data.d.ts.map +1 -0
- package/dist/src/tasks/object-detection/inference.d.ts +17 -4
- package/dist/src/tasks/object-detection/inference.d.ts.map +1 -1
- package/dist/src/tasks/summarization/inference.d.ts +13 -12
- package/dist/src/tasks/summarization/inference.d.ts.map +1 -1
- package/dist/src/tasks/text-to-image/inference.d.ts +11 -7
- package/dist/src/tasks/text-to-image/inference.d.ts.map +1 -1
- package/dist/src/tasks/translation/inference.d.ts +21 -10
- package/dist/src/tasks/translation/inference.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/model-libraries-snippets.ts +42 -0
- package/src/model-libraries.ts +13 -0
- package/src/pipelines.ts +18 -0
- package/src/tasks/audio-classification/inference.ts +3 -2
- package/src/tasks/audio-classification/spec/input.json +2 -1
- package/src/tasks/audio-classification/spec/output.json +1 -0
- package/src/tasks/automatic-speech-recognition/inference.ts +3 -2
- package/src/tasks/automatic-speech-recognition/spec/input.json +2 -1
- package/src/tasks/common-definitions.json +3 -20
- package/src/tasks/image-classification/inference.ts +3 -2
- package/src/tasks/image-classification/spec/input.json +2 -1
- package/src/tasks/image-classification/spec/output.json +1 -0
- package/src/tasks/image-segmentation/inference.ts +10 -6
- package/src/tasks/image-segmentation/spec/input.json +3 -12
- package/src/tasks/image-segmentation/spec/output.json +4 -3
- package/src/tasks/image-to-image/about.md +70 -21
- package/src/tasks/image-to-image/data.ts +1 -1
- package/src/tasks/image-to-image/inference.ts +6 -5
- package/src/tasks/image-to-image/spec/input.json +3 -2
- package/src/tasks/image-to-image/spec/output.json +1 -1
- package/src/tasks/index.ts +5 -6
- package/src/tasks/keypoint-detection/about.md +59 -0
- package/src/tasks/keypoint-detection/data.ts +46 -0
- package/src/tasks/object-detection/inference.ts +17 -4
- package/src/tasks/object-detection/spec/input.json +2 -1
- package/src/tasks/object-detection/spec/output.json +10 -6
- package/src/tasks/summarization/inference.ts +13 -12
- package/src/tasks/summarization/spec/input.json +37 -2
- package/src/tasks/text-classification/spec/output.json +1 -0
- package/src/tasks/text-to-image/inference.ts +11 -7
- package/src/tasks/text-to-image/spec/input.json +8 -4
- package/src/tasks/text-to-image/spec/output.json +1 -1
- package/src/tasks/translation/inference.ts +21 -10
- package/src/tasks/translation/spec/input.json +45 -2
- package/src/tasks/zero-shot-classification/spec/output.json +1 -0
|
@@ -1,15 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
### Style transfer
|
|
1
|
+
Image-to-image pipelines can also be used in text-to-image tasks, to provide visual guidance to the text-guided generation process.
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
## Task Variants
|
|
3
|
+
## Use Cases
|
|
8
4
|
|
|
9
5
|
### Image inpainting
|
|
10
6
|
|
|
11
|
-
Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor
|
|
12
|
-
dust.
|
|
7
|
+
Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor dust.
|
|
13
8
|
|
|
14
9
|
### Image colorization
|
|
15
10
|
|
|
@@ -24,18 +19,27 @@ Super-resolution models increase the resolution of an image, allowing for higher
|
|
|
24
19
|
You can use pipelines for image-to-image in 🧨diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.
|
|
25
20
|
|
|
26
21
|
```python
|
|
27
|
-
|
|
28
|
-
from diffusers import
|
|
22
|
+
import torch
|
|
23
|
+
from diffusers import AutoPipelineForImage2Image
|
|
24
|
+
from diffusers.utils import make_image_grid, load_image
|
|
29
25
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
26
|
+
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
|
27
|
+
"stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
|
28
|
+
)
|
|
33
29
|
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
# this helps us to reduce memory usage- since SDXL is a bit heavy, this could help by
|
|
31
|
+
# offloading the model to CPU w/o hurting performance.
|
|
32
|
+
pipeline.enable_model_cpu_offload()
|
|
36
33
|
|
|
37
|
-
|
|
38
|
-
images
|
|
34
|
+
# prepare image
|
|
35
|
+
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
|
|
36
|
+
init_image = load_image(url)
|
|
37
|
+
|
|
38
|
+
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
|
39
|
+
|
|
40
|
+
# pass prompt and image to pipeline
|
|
41
|
+
image = pipeline(prompt, image=init_image, strength=0.5).images[0]
|
|
42
|
+
make_image_grid([init_image, image], rows=1, cols=2)
|
|
39
43
|
```
|
|
40
44
|
|
|
41
45
|
You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.
|
|
@@ -53,13 +57,53 @@ await inference.imageToImage({
|
|
|
53
57
|
});
|
|
54
58
|
```
|
|
55
59
|
|
|
56
|
-
##
|
|
60
|
+
## Use Cases for Text Guided Image Generation
|
|
57
61
|
|
|
58
|
-
|
|
62
|
+
### Style Transfer
|
|
63
|
+
|
|
64
|
+
One of the most popular use cases of image-to-image is style transfer. With style transfer models:
|
|
59
65
|
|
|
60
|
-
|
|
66
|
+
- a regular photo can be transformed into a variety of artistic styles or genres, such as a watercolor painting, a comic book illustration and more.
|
|
67
|
+
- new images can be generated using a text prompt, in the style of a reference input image.
|
|
68
|
+
|
|
69
|
+
See 🧨diffusers example for style transfer with `AutoPipelineForText2Image` below.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from diffusers import AutoPipelineForText2Image
|
|
73
|
+
from diffusers.utils import load_image
|
|
74
|
+
import torch
|
|
75
|
+
|
|
76
|
+
# load pipeline
|
|
77
|
+
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
|
78
|
+
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
|
79
|
+
|
|
80
|
+
# set the adapter and scales - this is a component that lets us add the style control from an image to the text-to-image model
|
|
81
|
+
scale = {
|
|
82
|
+
"down": {"block_2": [0.0, 1.0]},
|
|
83
|
+
"up": {"block_0": [0.0, 1.0, 0.0]},
|
|
84
|
+
}
|
|
85
|
+
pipeline.set_ip_adapter_scale(scale)
|
|
86
|
+
|
|
87
|
+
style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
|
|
88
|
+
|
|
89
|
+
generator = torch.Generator(device="cpu").manual_seed(26)
|
|
90
|
+
image = pipeline(
|
|
91
|
+
prompt="a cat, masterpiece, best quality, high quality",
|
|
92
|
+
ip_adapter_image=style_image,
|
|
93
|
+
negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
|
|
94
|
+
guidance_scale=5,
|
|
95
|
+
num_inference_steps=30,
|
|
96
|
+
generator=generator,
|
|
97
|
+
).images[0]
|
|
98
|
+
image
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### ControlNet
|
|
102
|
+
|
|
103
|
+
Controlling the outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
|
|
104
|
+

|
|
61
105
|
|
|
62
|
-
##
|
|
106
|
+
## Pix2Pix
|
|
63
107
|
|
|
64
108
|
Pix2Pix is a popular model used for image-to-image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.
|
|
65
109
|
|
|
@@ -70,8 +114,13 @@ The images below show some examples extracted from the Pix2Pix paper. This model
|
|
|
70
114
|
## Useful Resources
|
|
71
115
|
|
|
72
116
|
- [Image-to-image guide with diffusers](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
|
|
117
|
+
- Image inpainting: [inpainting with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/inpaint), [demo](https://huggingface.co/spaces/diffusers/stable-diffusion-xl-inpainting)
|
|
118
|
+
- Colorization: [demo](https://huggingface.co/spaces/modelscope/old_photo_restoration)
|
|
119
|
+
- Super resolution: [image upscaling with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/upscale#super-resolution), [demo](https://huggingface.co/spaces/radames/Enhance-This-HiDiffusion-SDXL)
|
|
120
|
+
- [Style transfer and layout control with diffusers 🧨](https://huggingface.co/docs/diffusers/main/en/using-diffusers/ip_adapter#style--layout-control)
|
|
73
121
|
- [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
|
|
74
122
|
- [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)
|
|
123
|
+
- [List of ControlNets trained in the community JAX Diffusers sprint](https://huggingface.co/spaces/jax-diffusers-event/leaderboard)
|
|
75
124
|
|
|
76
125
|
## References
|
|
77
126
|
|
|
@@ -93,7 +93,7 @@ const taskData: TaskDataCustom = {
|
|
|
93
93
|
},
|
|
94
94
|
],
|
|
95
95
|
summary:
|
|
96
|
-
"Image-to-image is the task of transforming
|
|
96
|
+
"Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
|
|
97
97
|
widgetModels: ["lllyasviel/sd-controlnet-canny"],
|
|
98
98
|
youtubeId: "",
|
|
99
99
|
};
|
|
@@ -9,9 +9,10 @@
|
|
|
9
9
|
*/
|
|
10
10
|
export interface ImageToImageInput {
|
|
11
11
|
/**
|
|
12
|
-
* The input image data
|
|
12
|
+
* The input image data as a base64-encoded string. If no `parameters` are provided, you can
|
|
13
|
+
* also provide the image data as a raw bytes payload.
|
|
13
14
|
*/
|
|
14
|
-
inputs:
|
|
15
|
+
inputs: string;
|
|
15
16
|
/**
|
|
16
17
|
* Additional inference parameters
|
|
17
18
|
*/
|
|
@@ -40,14 +41,14 @@ export interface ImageToImageParameters {
|
|
|
40
41
|
*/
|
|
41
42
|
num_inference_steps?: number;
|
|
42
43
|
/**
|
|
43
|
-
* The size in pixel of the output image
|
|
44
|
+
* The size in pixel of the output image.
|
|
44
45
|
*/
|
|
45
46
|
target_size?: TargetSize;
|
|
46
47
|
[property: string]: unknown;
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
/**
|
|
50
|
-
* The size in pixel of the output image
|
|
51
|
+
* The size in pixel of the output image.
|
|
51
52
|
*/
|
|
52
53
|
export interface TargetSize {
|
|
53
54
|
height: number;
|
|
@@ -60,7 +61,7 @@ export interface TargetSize {
|
|
|
60
61
|
*/
|
|
61
62
|
export interface ImageToImageOutput {
|
|
62
63
|
/**
|
|
63
|
-
* The output image
|
|
64
|
+
* The output image returned as raw bytes in the payload.
|
|
64
65
|
*/
|
|
65
66
|
image?: unknown;
|
|
66
67
|
[property: string]: unknown;
|
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
"type": "object",
|
|
7
7
|
"properties": {
|
|
8
8
|
"inputs": {
|
|
9
|
-
"
|
|
9
|
+
"type": "string",
|
|
10
|
+
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
|
|
10
11
|
},
|
|
11
12
|
"parameters": {
|
|
12
13
|
"description": "Additional inference parameters",
|
|
@@ -36,7 +37,7 @@
|
|
|
36
37
|
},
|
|
37
38
|
"target_size": {
|
|
38
39
|
"type": "object",
|
|
39
|
-
"description": "The size in pixel of the output image",
|
|
40
|
+
"description": "The size in pixel of the output image.",
|
|
40
41
|
"properties": {
|
|
41
42
|
"width": {
|
|
42
43
|
"type": "integer"
|
package/src/tasks/index.ts
CHANGED
|
@@ -73,12 +73,7 @@ export type * from "./table-question-answering/inference";
|
|
|
73
73
|
export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference";
|
|
74
74
|
export type { TextToAudioParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference";
|
|
75
75
|
export type * from "./token-classification/inference";
|
|
76
|
-
export type {
|
|
77
|
-
Text2TextGenerationParameters,
|
|
78
|
-
Text2TextGenerationTruncationStrategy,
|
|
79
|
-
TranslationInput,
|
|
80
|
-
TranslationOutput,
|
|
81
|
-
} from "./translation/inference";
|
|
76
|
+
export type { TranslationInput, TranslationOutput } from "./translation/inference";
|
|
82
77
|
export type {
|
|
83
78
|
ClassificationOutputTransform,
|
|
84
79
|
TextClassificationInput,
|
|
@@ -131,6 +126,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
|
|
|
131
126
|
"image-to-image": ["diffusers", "transformers", "transformers.js"],
|
|
132
127
|
"image-to-text": ["transformers", "transformers.js"],
|
|
133
128
|
"image-to-video": ["diffusers"],
|
|
129
|
+
"keypoint-detection": ["transformers"],
|
|
134
130
|
"video-classification": ["transformers"],
|
|
135
131
|
"mask-generation": ["transformers"],
|
|
136
132
|
"multiple-choice": ["transformers"],
|
|
@@ -174,6 +170,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
|
|
|
174
170
|
"zero-shot-object-detection": ["transformers", "transformers.js"],
|
|
175
171
|
"text-to-3d": ["diffusers"],
|
|
176
172
|
"image-to-3d": ["diffusers"],
|
|
173
|
+
"any-to-any": ["transformers"],
|
|
177
174
|
};
|
|
178
175
|
|
|
179
176
|
/**
|
|
@@ -195,6 +192,7 @@ function getData(type: PipelineType, partialTaskData: TaskDataCustom = placehold
|
|
|
195
192
|
// Tasks that call getData() without the second argument will
|
|
196
193
|
// have a "placeholder" page.
|
|
197
194
|
export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
|
|
195
|
+
"any-to-any": getData("any-to-any", placeholder),
|
|
198
196
|
"audio-classification": getData("audio-classification", audioClassification),
|
|
199
197
|
"audio-to-audio": getData("audio-to-audio", audioToAudio),
|
|
200
198
|
"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
|
|
@@ -210,6 +208,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
|
|
|
210
208
|
"image-text-to-text": getData("image-text-to-text", imageTextToText),
|
|
211
209
|
"image-to-text": getData("image-to-text", imageToText),
|
|
212
210
|
"image-to-video": undefined,
|
|
211
|
+
"keypoint-detection": getData("keypoint-detection", placeholder),
|
|
213
212
|
"mask-generation": getData("mask-generation", maskGeneration),
|
|
214
213
|
"multiple-choice": undefined,
|
|
215
214
|
"object-detection": getData("object-detection", objectDetection),
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
## Task Variants
|
|
2
|
+
|
|
3
|
+
### Pose Estimation
|
|
4
|
+
|
|
5
|
+
Pose estimation is the process of determining the position and orientation of an object or a camera in a 3D space. It is a fundamental task in computer vision and is widely used in various applications such as robotics, augmented reality, and 3D reconstruction.
|
|
6
|
+
|
|
7
|
+
## Use Cases for Keypoint Detection
|
|
8
|
+
|
|
9
|
+
### Facial Landmark Estimation
|
|
10
|
+
|
|
11
|
+
Keypoint detection models can be used to estimate the position of facial landmarks. Facial landmarks are points on the face such as the corners of the mouth, the outer corners of the eyes, and the tip of the nose. These landmarks can be used for a variety of applications, such as facial expression recognition, 3D face reconstruction, and cinematic animation.
|
|
12
|
+
|
|
13
|
+
### Fitness Tracking
|
|
14
|
+
|
|
15
|
+
Keypoint detection models can be used to track the movement of the human body, e.g. the position of the joints in 3D space. This can be used for a variety of applications, such as fitness tracking, sports analysis, or virtual reality applications.
|
|
16
|
+
|
|
17
|
+
## Inference Code
|
|
18
|
+
|
|
19
|
+
Below you can find an example of how to use a keypoint detection model and how to visualize the results.
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from transformers import AutoImageProcessor, SuperPointForKeypointDetection
|
|
23
|
+
import torch
|
|
24
|
+
import matplotlib.pyplot as plt
|
|
25
|
+
from PIL import Image
|
|
26
|
+
import requests
|
|
27
|
+
|
|
28
|
+
url_image = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
29
|
+
image = Image.open(requests.get(url_image, stream=True).raw)
|
|
30
|
+
|
|
31
|
+
# initialize the model and processor
|
|
32
|
+
processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
|
|
33
|
+
model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
|
|
34
|
+
|
|
35
|
+
# infer
|
|
36
|
+
inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)
|
|
37
|
+
outputs = model(**inputs)
|
|
38
|
+
|
|
39
|
+
# visualize the output
|
|
40
|
+
image_width, image_height = image.size
|
|
41
|
+
image_mask = outputs.mask
|
|
42
|
+
image_indices = torch.nonzero(image_mask).squeeze()
|
|
43
|
+
|
|
44
|
+
image_scores = outputs.scores.squeeze()
|
|
45
|
+
image_keypoints = outputs.keypoints.squeeze()
|
|
46
|
+
keypoints = image_keypoints.detach().numpy()
|
|
47
|
+
scores = image_scores.detach().numpy()
|
|
48
|
+
|
|
49
|
+
plt.axis('off')
|
|
50
|
+
plt.imshow(image)
|
|
51
|
+
plt.scatter(
|
|
52
|
+
keypoints[:, 0],
|
|
53
|
+
keypoints[:, 1],
|
|
54
|
+
s=scores * 100,
|
|
55
|
+
c='cyan',
|
|
56
|
+
alpha=0.4
|
|
57
|
+
)
|
|
58
|
+
plt.show()
|
|
59
|
+
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "..";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description: "A dataset of hand keypoints of over 500k examples.",
|
|
7
|
+
id: "Vincent-luo/hagrid-mediapipe-hands",
|
|
8
|
+
},
|
|
9
|
+
],
|
|
10
|
+
demo: {
|
|
11
|
+
inputs: [
|
|
12
|
+
{
|
|
13
|
+
filename: "keypoint-detection-input.png",
|
|
14
|
+
type: "img",
|
|
15
|
+
},
|
|
16
|
+
],
|
|
17
|
+
outputs: [
|
|
18
|
+
{
|
|
19
|
+
filename: "keypoint-detection-output.png",
|
|
20
|
+
type: "img",
|
|
21
|
+
},
|
|
22
|
+
],
|
|
23
|
+
},
|
|
24
|
+
metrics: [],
|
|
25
|
+
models: [
|
|
26
|
+
{
|
|
27
|
+
description: "A robust keypoint detection model.",
|
|
28
|
+
id: "magic-leap-community/superpoint",
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
description: "Strong keypoint detection model used to detect human pose.",
|
|
32
|
+
id: "qualcomm/MediaPipe-Pose-Estimation",
|
|
33
|
+
},
|
|
34
|
+
],
|
|
35
|
+
spaces: [
|
|
36
|
+
{
|
|
37
|
+
description: "An application that detects hand keypoints in real-time.",
|
|
38
|
+
id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
|
|
39
|
+
},
|
|
40
|
+
],
|
|
41
|
+
summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
|
|
42
|
+
widgetModels: [],
|
|
43
|
+
youtubeId: "",
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
export default taskData;
|
|
@@ -8,9 +8,10 @@
|
|
|
8
8
|
*/
|
|
9
9
|
export interface ObjectDetectionInput {
|
|
10
10
|
/**
|
|
11
|
-
* The input image data
|
|
11
|
+
* The input image data as a base64-encoded string. If no `parameters` are provided, you can
|
|
12
|
+
* also provide the image data as a raw bytes payload.
|
|
12
13
|
*/
|
|
13
|
-
inputs:
|
|
14
|
+
inputs: string;
|
|
14
15
|
/**
|
|
15
16
|
* Additional inference parameters
|
|
16
17
|
*/
|
|
@@ -34,9 +35,21 @@ export interface ObjectDetectionParameters {
|
|
|
34
35
|
* image.
|
|
35
36
|
*/
|
|
36
37
|
export interface BoundingBox {
|
|
38
|
+
/**
|
|
39
|
+
* The x-coordinate of the bottom-right corner of the bounding box.
|
|
40
|
+
*/
|
|
37
41
|
xmax: number;
|
|
42
|
+
/**
|
|
43
|
+
* The x-coordinate of the top-left corner of the bounding box.
|
|
44
|
+
*/
|
|
38
45
|
xmin: number;
|
|
46
|
+
/**
|
|
47
|
+
* The y-coordinate of the bottom-right corner of the bounding box.
|
|
48
|
+
*/
|
|
39
49
|
ymax: number;
|
|
50
|
+
/**
|
|
51
|
+
* The y-coordinate of the top-left corner of the bounding box.
|
|
52
|
+
*/
|
|
40
53
|
ymin: number;
|
|
41
54
|
[property: string]: unknown;
|
|
42
55
|
}
|
|
@@ -51,11 +64,11 @@ export interface ObjectDetectionOutputElement {
|
|
|
51
64
|
*/
|
|
52
65
|
box: BoundingBox;
|
|
53
66
|
/**
|
|
54
|
-
* The predicted label for the bounding box
|
|
67
|
+
* The predicted label for the bounding box.
|
|
55
68
|
*/
|
|
56
69
|
label: string;
|
|
57
70
|
/**
|
|
58
|
-
* The associated score / probability
|
|
71
|
+
* The associated score / probability.
|
|
59
72
|
*/
|
|
60
73
|
score: number;
|
|
61
74
|
[property: string]: unknown;
|
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
"type": "object",
|
|
7
7
|
"properties": {
|
|
8
8
|
"inputs": {
|
|
9
|
-
"
|
|
9
|
+
"type": "string",
|
|
10
|
+
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
|
|
10
11
|
},
|
|
11
12
|
"parameters": {
|
|
12
13
|
"description": "Additional inference parameters",
|
|
@@ -9,11 +9,11 @@
|
|
|
9
9
|
"properties": {
|
|
10
10
|
"label": {
|
|
11
11
|
"type": "string",
|
|
12
|
-
"description": "The predicted label for the bounding box"
|
|
12
|
+
"description": "The predicted label for the bounding box."
|
|
13
13
|
},
|
|
14
14
|
"score": {
|
|
15
15
|
"type": "number",
|
|
16
|
-
"description": "The associated score / probability"
|
|
16
|
+
"description": "The associated score / probability."
|
|
17
17
|
},
|
|
18
18
|
"box": {
|
|
19
19
|
"$ref": "#/$defs/BoundingBox",
|
|
@@ -28,16 +28,20 @@
|
|
|
28
28
|
"title": "BoundingBox",
|
|
29
29
|
"properties": {
|
|
30
30
|
"xmin": {
|
|
31
|
-
"type": "integer"
|
|
31
|
+
"type": "integer",
|
|
32
|
+
"description": "The x-coordinate of the top-left corner of the bounding box."
|
|
32
33
|
},
|
|
33
34
|
"xmax": {
|
|
34
|
-
"type": "integer"
|
|
35
|
+
"type": "integer",
|
|
36
|
+
"description": "The x-coordinate of the bottom-right corner of the bounding box."
|
|
35
37
|
},
|
|
36
38
|
"ymin": {
|
|
37
|
-
"type": "integer"
|
|
39
|
+
"type": "integer",
|
|
40
|
+
"description": "The y-coordinate of the top-left corner of the bounding box."
|
|
38
41
|
},
|
|
39
42
|
"ymax": {
|
|
40
|
-
"type": "integer"
|
|
43
|
+
"type": "integer",
|
|
44
|
+
"description": "The y-coordinate of the bottom-right corner of the bounding box."
|
|
41
45
|
}
|
|
42
46
|
},
|
|
43
47
|
"required": ["xmin", "xmax", "ymin", "ymax"]
|
|
@@ -6,43 +6,44 @@
|
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* Inputs for Summarization inference
|
|
9
|
-
*
|
|
10
|
-
* Inputs for Text2text Generation inference
|
|
11
9
|
*/
|
|
12
10
|
export interface SummarizationInput {
|
|
13
11
|
/**
|
|
14
|
-
* The input text
|
|
12
|
+
* The input text to summarize.
|
|
15
13
|
*/
|
|
16
14
|
inputs: string;
|
|
17
15
|
/**
|
|
18
|
-
* Additional inference parameters
|
|
16
|
+
* Additional inference parameters.
|
|
19
17
|
*/
|
|
20
|
-
parameters?:
|
|
18
|
+
parameters?: SummarizationParameters;
|
|
21
19
|
[property: string]: unknown;
|
|
22
20
|
}
|
|
23
21
|
|
|
24
22
|
/**
|
|
25
|
-
* Additional inference parameters
|
|
23
|
+
* Additional inference parameters.
|
|
26
24
|
*
|
|
27
|
-
* Additional inference parameters for
|
|
25
|
+
* Additional inference parameters for summarization.
|
|
28
26
|
*/
|
|
29
|
-
export interface
|
|
27
|
+
export interface SummarizationParameters {
|
|
30
28
|
/**
|
|
31
29
|
* Whether to clean up the potential extra spaces in the text output.
|
|
32
30
|
*/
|
|
33
31
|
clean_up_tokenization_spaces?: boolean;
|
|
34
32
|
/**
|
|
35
|
-
* Additional parametrization of the text generation algorithm
|
|
33
|
+
* Additional parametrization of the text generation algorithm.
|
|
36
34
|
*/
|
|
37
35
|
generate_parameters?: { [key: string]: unknown };
|
|
38
36
|
/**
|
|
39
|
-
* The truncation strategy to use
|
|
37
|
+
* The truncation strategy to use.
|
|
40
38
|
*/
|
|
41
|
-
truncation?:
|
|
39
|
+
truncation?: SummarizationTruncationStrategy;
|
|
42
40
|
[property: string]: unknown;
|
|
43
41
|
}
|
|
44
42
|
|
|
45
|
-
|
|
43
|
+
/**
|
|
44
|
+
* The truncation strategy to use.
|
|
45
|
+
*/
|
|
46
|
+
export type SummarizationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
|
|
46
47
|
|
|
47
48
|
/**
|
|
48
49
|
* Outputs of inference for the Summarization task
|
|
@@ -1,7 +1,42 @@
|
|
|
1
1
|
{
|
|
2
|
-
"$ref": "/inference/schemas/text2text-generation/input.json",
|
|
3
2
|
"$id": "/inference/schemas/summarization/input.json",
|
|
4
3
|
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Summarization inference",
|
|
5
5
|
"title": "SummarizationInput",
|
|
6
|
-
"
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input text to summarize.",
|
|
10
|
+
"type": "string"
|
|
11
|
+
},
|
|
12
|
+
"parameters": {
|
|
13
|
+
"description": "Additional inference parameters.",
|
|
14
|
+
"$ref": "#/$defs/SummarizationParameters"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"$defs": {
|
|
18
|
+
"SummarizationParameters": {
|
|
19
|
+
"title": "SummarizationParameters",
|
|
20
|
+
"description": "Additional inference parameters for summarization.",
|
|
21
|
+
"type": "object",
|
|
22
|
+
"properties": {
|
|
23
|
+
"clean_up_tokenization_spaces": {
|
|
24
|
+
"type": "boolean",
|
|
25
|
+
"description": "Whether to clean up the potential extra spaces in the text output."
|
|
26
|
+
},
|
|
27
|
+
"truncation": {
|
|
28
|
+
"title": "SummarizationTruncationStrategy",
|
|
29
|
+
"type": "string",
|
|
30
|
+
"description": "The truncation strategy to use.",
|
|
31
|
+
"enum": ["do_not_truncate", "longest_first", "only_first", "only_second"]
|
|
32
|
+
},
|
|
33
|
+
"generate_parameters": {
|
|
34
|
+
"title": "generateParameters",
|
|
35
|
+
"type": "object",
|
|
36
|
+
"description": "Additional parametrization of the text generation algorithm."
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"required": ["inputs"]
|
|
7
42
|
}
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
export interface TextToImageInput {
|
|
11
11
|
/**
|
|
12
|
-
* The input text data (sometimes called "prompt"
|
|
12
|
+
* The input text data (sometimes called "prompt")
|
|
13
13
|
*/
|
|
14
14
|
inputs: string;
|
|
15
15
|
/**
|
|
@@ -26,8 +26,8 @@ export interface TextToImageInput {
|
|
|
26
26
|
*/
|
|
27
27
|
export interface TextToImageParameters {
|
|
28
28
|
/**
|
|
29
|
-
*
|
|
30
|
-
*
|
|
29
|
+
* A higher guidance scale value encourages the model to generate images closely linked to
|
|
30
|
+
* the text prompt, but values too high may cause saturation and other artifacts.
|
|
31
31
|
*/
|
|
32
32
|
guidance_scale?: number;
|
|
33
33
|
/**
|
|
@@ -35,14 +35,18 @@ export interface TextToImageParameters {
|
|
|
35
35
|
*/
|
|
36
36
|
negative_prompt?: string[];
|
|
37
37
|
/**
|
|
38
|
-
*
|
|
39
|
-
*
|
|
38
|
+
* The number of denoising steps. More denoising steps usually lead to a higher quality
|
|
39
|
+
* image at the expense of slower inference.
|
|
40
40
|
*/
|
|
41
41
|
num_inference_steps?: number;
|
|
42
42
|
/**
|
|
43
|
-
*
|
|
43
|
+
* Override the scheduler with a compatible one.
|
|
44
44
|
*/
|
|
45
45
|
scheduler?: string;
|
|
46
|
+
/**
|
|
47
|
+
* Seed for the random number generator.
|
|
48
|
+
*/
|
|
49
|
+
seed?: number;
|
|
46
50
|
/**
|
|
47
51
|
* The size in pixel of the output image
|
|
48
52
|
*/
|
|
@@ -64,7 +68,7 @@ export interface TargetSize {
|
|
|
64
68
|
*/
|
|
65
69
|
export interface TextToImageOutput {
|
|
66
70
|
/**
|
|
67
|
-
* The generated image
|
|
71
|
+
* The generated image returned as raw bytes in the payload.
|
|
68
72
|
*/
|
|
69
73
|
image: unknown;
|
|
70
74
|
[property: string]: unknown;
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"type": "object",
|
|
7
7
|
"properties": {
|
|
8
8
|
"inputs": {
|
|
9
|
-
"description": "The input text data (sometimes called \"prompt\"",
|
|
9
|
+
"description": "The input text data (sometimes called \"prompt\")",
|
|
10
10
|
"type": "string"
|
|
11
11
|
},
|
|
12
12
|
"parameters": {
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
"properties": {
|
|
23
23
|
"guidance_scale": {
|
|
24
24
|
"type": "number",
|
|
25
|
-
"description": "
|
|
25
|
+
"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
|
|
26
26
|
},
|
|
27
27
|
"negative_prompt": {
|
|
28
28
|
"type": "array",
|
|
@@ -33,7 +33,7 @@
|
|
|
33
33
|
},
|
|
34
34
|
"num_inference_steps": {
|
|
35
35
|
"type": "integer",
|
|
36
|
-
"description": "
|
|
36
|
+
"description": "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
|
|
37
37
|
},
|
|
38
38
|
"target_size": {
|
|
39
39
|
"type": "object",
|
|
@@ -50,7 +50,11 @@
|
|
|
50
50
|
},
|
|
51
51
|
"scheduler": {
|
|
52
52
|
"type": "string",
|
|
53
|
-
"description": "
|
|
53
|
+
"description": "Override the scheduler with a compatible one."
|
|
54
|
+
},
|
|
55
|
+
"seed": {
|
|
56
|
+
"type": "integer",
|
|
57
|
+
"description": "Seed for the random number generator."
|
|
54
58
|
}
|
|
55
59
|
}
|
|
56
60
|
}
|