@huggingface/tasks 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/audio-classification/audio.wav +0 -0
- package/assets/audio-to-audio/input.wav +0 -0
- package/assets/audio-to-audio/label-0.wav +0 -0
- package/assets/audio-to-audio/label-1.wav +0 -0
- package/assets/automatic-speech-recognition/input.flac +0 -0
- package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
- package/assets/contribution-guide/anatomy.png +0 -0
- package/assets/contribution-guide/libraries.png +0 -0
- package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
- package/assets/depth-estimation/depth-estimation-output.png +0 -0
- package/assets/document-question-answering/document-question-answering-input.png +0 -0
- package/assets/image-classification/image-classification-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-output.png +0 -0
- package/assets/image-to-image/image-to-image-input.jpeg +0 -0
- package/assets/image-to-image/image-to-image-output.png +0 -0
- package/assets/image-to-image/pix2pix_examples.jpg +0 -0
- package/assets/image-to-text/savanna.jpg +0 -0
- package/assets/object-detection/object-detection-input.jpg +0 -0
- package/assets/object-detection/object-detection-output.jpg +0 -0
- package/assets/table-question-answering/tableQA.jpg +0 -0
- package/assets/text-to-image/image.jpeg +0 -0
- package/assets/text-to-speech/audio.wav +0 -0
- package/assets/text-to-video/text-to-video-output.gif +0 -0
- package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
- package/assets/video-classification/video-classification-input.gif +0 -0
- package/assets/visual-question-answering/elephant.jpeg +0 -0
- package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
- package/dist/index.cjs +3105 -0
- package/dist/index.d.cts +145 -0
- package/dist/index.d.ts +145 -0
- package/dist/index.js +3079 -0
- package/package.json +35 -0
- package/src/Types.ts +58 -0
- package/src/audio-classification/about.md +85 -0
- package/src/audio-classification/data.ts +77 -0
- package/src/audio-to-audio/about.md +55 -0
- package/src/audio-to-audio/data.ts +63 -0
- package/src/automatic-speech-recognition/about.md +86 -0
- package/src/automatic-speech-recognition/data.ts +77 -0
- package/src/const.ts +51 -0
- package/src/conversational/about.md +50 -0
- package/src/conversational/data.ts +62 -0
- package/src/depth-estimation/about.md +38 -0
- package/src/depth-estimation/data.ts +52 -0
- package/src/document-question-answering/about.md +54 -0
- package/src/document-question-answering/data.ts +67 -0
- package/src/feature-extraction/about.md +35 -0
- package/src/feature-extraction/data.ts +57 -0
- package/src/fill-mask/about.md +51 -0
- package/src/fill-mask/data.ts +77 -0
- package/src/image-classification/about.md +48 -0
- package/src/image-classification/data.ts +88 -0
- package/src/image-segmentation/about.md +63 -0
- package/src/image-segmentation/data.ts +96 -0
- package/src/image-to-image/about.md +81 -0
- package/src/image-to-image/data.ts +97 -0
- package/src/image-to-text/about.md +58 -0
- package/src/image-to-text/data.ts +87 -0
- package/src/index.ts +2 -0
- package/src/object-detection/about.md +36 -0
- package/src/object-detection/data.ts +73 -0
- package/src/placeholder/about.md +15 -0
- package/src/placeholder/data.ts +18 -0
- package/src/question-answering/about.md +56 -0
- package/src/question-answering/data.ts +69 -0
- package/src/reinforcement-learning/about.md +176 -0
- package/src/reinforcement-learning/data.ts +78 -0
- package/src/sentence-similarity/about.md +97 -0
- package/src/sentence-similarity/data.ts +100 -0
- package/src/summarization/about.md +57 -0
- package/src/summarization/data.ts +72 -0
- package/src/table-question-answering/about.md +43 -0
- package/src/table-question-answering/data.ts +63 -0
- package/src/tabular-classification/about.md +67 -0
- package/src/tabular-classification/data.ts +69 -0
- package/src/tabular-regression/about.md +91 -0
- package/src/tabular-regression/data.ts +58 -0
- package/src/tasksData.ts +104 -0
- package/src/text-classification/about.md +171 -0
- package/src/text-classification/data.ts +90 -0
- package/src/text-generation/about.md +128 -0
- package/src/text-generation/data.ts +124 -0
- package/src/text-to-image/about.md +65 -0
- package/src/text-to-image/data.ts +88 -0
- package/src/text-to-speech/about.md +63 -0
- package/src/text-to-speech/data.ts +70 -0
- package/src/text-to-video/about.md +36 -0
- package/src/text-to-video/data.ts +97 -0
- package/src/token-classification/about.md +78 -0
- package/src/token-classification/data.ts +83 -0
- package/src/translation/about.md +65 -0
- package/src/translation/data.ts +68 -0
- package/src/unconditional-image-generation/about.md +45 -0
- package/src/unconditional-image-generation/data.ts +66 -0
- package/src/video-classification/about.md +53 -0
- package/src/video-classification/data.ts +84 -0
- package/src/visual-question-answering/about.md +43 -0
- package/src/visual-question-answering/data.ts +90 -0
- package/src/zero-shot-classification/about.md +39 -0
- package/src/zero-shot-classification/data.ts +66 -0
- package/src/zero-shot-image-classification/about.md +68 -0
- package/src/zero-shot-image-classification/data.ts +79 -0
@@ -0,0 +1,63 @@
## Use Cases

### Autonomous Driving

Segmentation models are used to identify road patterns such as lanes and obstacles for safer driving.

### Background Removal

Image Segmentation models are used in cameras to erase the background of certain objects and apply filters to them.

### Medical Imaging

Image Segmentation models are used to distinguish organs or tissues, improving medical imaging workflows. Models are used to segment dental instances, analyze X-Ray scans or even segment cells for pathological diagnosis. This [dataset](https://github.com/v7labs/covid-19-xray-dataset) contains images of lungs of healthy patients and patients with COVID-19 segmented with masks. Another [segmentation dataset](https://ivdm3seg.weebly.com/data.html) contains segmented MRI data of the lower spine to analyze the effect of spaceflight simulation.

## Task Variants

### Semantic Segmentation

Semantic Segmentation is the task of segmenting parts of an image that belong to the same class. Semantic Segmentation models make predictions for each pixel and return the probabilities of the classes for each pixel. These models are evaluated on Mean Intersection Over Union (Mean IoU).

### Instance Segmentation

Instance Segmentation is the variant of Image Segmentation where every distinct object is segmented, instead of one segment per class.

### Panoptic Segmentation

Panoptic Segmentation is the Image Segmentation task that segments the image both by instance and by class, assigning every pixel both a class label and an instance identifier.
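
Since Mean IoU comes up repeatedly as the evaluation metric for semantic segmentation, a minimal sketch of how it is computed may help. The NumPy snippet below is illustrative only (it is not part of this package) and assumes `pred` and `target` are integer label maps of the same shape.

```python
import numpy as np

def mean_iou(pred: np.ndarray, target: np.ndarray, num_classes: int) -> float:
    """Average the per-class Intersection over Union of two integer label maps."""
    ious = []
    for c in range(num_classes):
        pred_c, target_c = pred == c, target == c
        union = np.logical_or(pred_c, target_c).sum()
        if union == 0:
            continue  # class absent from both maps, so it does not contribute
        ious.append(np.logical_and(pred_c, target_c).sum() / union)
    return float(np.mean(ious))
```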

## Inference

You can infer with Image Segmentation models using the `image-segmentation` pipeline. You need to install [timm](https://github.com/rwightman/pytorch-image-models) first.

```python
!pip install timm
from transformers import pipeline

model = pipeline("image-segmentation")
model("cat.png")
# [{'label': 'cat',
#   'mask': mask_code,
#   'score': 0.999},
# ...]
```
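
As an illustrative follow-up (not part of the original page), the sketch below overlays the first predicted mask on the input photo, assuming the pipeline returns each `mask` as a single-channel PIL image (as recent versions of 🤗 Transformers do) and reusing the `cat.png` example above.

```python
import numpy as np
from PIL import Image

image = Image.open("cat.png").convert("RGB")
segments = model("cat.png")  # reuse the pipeline created above

# Tint the pixels covered by the first segment red, keeping half of the original color.
mask = np.array(segments[0]["mask"]) > 0
overlay = np.array(image).astype(np.float32)
overlay[mask] = 0.5 * overlay[mask] + 0.5 * np.array([255.0, 0.0, 0.0])
Image.fromarray(overlay.astype(np.uint8)).save("cat-overlay.png")
```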

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image segmentation models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.imageSegmentation({
  data: await (await fetch('https://picsum.photos/300/300')).blob(),
  model: 'facebook/detr-resnet-50-panoptic',
})
```

## Useful Resources

Would you like to learn more about image segmentation? Great! Here you can find some curated resources that you may find helpful!

- [Fine-Tune a Semantic Segmentation Model with a Custom Dataset](https://huggingface.co/blog/fine-tune-segformer)
- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
- [A Guide on Universal Image Segmentation with Mask2Former and OneFormer](https://huggingface.co/blog/mask2former)
- [Zero-shot image segmentation with CLIPSeg](https://huggingface.co/blog/clipseg-zero-shot)
- [Semantic segmentation task guide](https://huggingface.co/docs/transformers/tasks/semantic_segmentation)
@@ -0,0 +1,96 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Scene segmentation dataset.",
			id: "scene_parse_150",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-segmentation-input.jpeg",
				type: "img",
			},
		],
		outputs: [
			{
				filename: "image-segmentation-output.png",
				type: "img",
			},
		],
	},
	metrics: [
		{
			description: "Average Precision (AP) is the Area Under the PR Curve (AUC-PR). It is calculated for each semantic class separately",
			id: "Average Precision",
		},
		{
			description: "Mean Average Precision (mAP) is the overall average of the AP values",
			id: "Mean Average Precision",
		},
		{
			description: "Intersection over Union (IoU) is the overlap of segmentation masks. Mean IoU is the average of the IoU of all semantic classes",
			id: "Mean Intersection over Union",
		},
		{
			description: "APα is the Average Precision at the IoU threshold of an α value, for example, AP50 and AP75",
			id: "APα",
		},
	],
	models: [
		{
			// TO DO: write description
			description: "Solid panoptic segmentation model trained on the COCO 2017 benchmark dataset.",
			id: "facebook/detr-resnet-50-panoptic",
		},
		{
			description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
			id: "microsoft/beit-large-finetuned-ade-640-640",
		},
		{
			description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
			id: "nvidia/segformer-b0-finetuned-ade-512-512",
		},
		{
			description: "Semantic segmentation model trained on the Cityscapes dataset.",
			id: "facebook/mask2former-swin-large-cityscapes-semantic",
		},
		{
			description: "Panoptic segmentation model trained on the COCO (common objects) dataset.",
			id: "facebook/mask2former-swin-large-coco-panoptic",
		},
	],
	spaces: [
		{
			description: "A semantic segmentation application that can predict unseen instances out of the box.",
			id: "facebook/ov-seg",
		},
		{
			description: "One of the strongest segmentation applications.",
			id: "jbrinkma/segment-anything",
		},
		{
			description: "A semantic segmentation application that predicts human silhouettes.",
			id: "keras-io/Human-Part-Segmentation",
		},
		{
			description: "An instance segmentation application to predict neuronal cell types from microscopy images.",
			id: "rashmi/sartorius-cell-instance-segmentation",
		},
		{
			description: "An application that segments videos.",
			id: "ArtGAN/Segment-Anything-Video",
		},
		{
			description: "A panoptic segmentation application built for outdoor environments.",
			id: "segments/panoptic-segment-anything",
		},
	],
	summary: "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.",
	widgetModels: ["facebook/detr-resnet-50-panoptic"],
	youtubeId: "dKE8SIt9C-w",
};

export default taskData;
@@ -0,0 +1,81 @@
## Use Cases

### Style transfer

One of the most popular use cases of image-to-image is style transfer. Style transfer models can convert a regular photo into a painting in the style of a famous painter.

## Task Variants

### Image inpainting

Image inpainting is widely used during photo editing to remove unwanted objects, such as poles, wires or sensor dust.
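
As an illustrative sketch of this variant (not part of the original page), the snippet below runs Stable Diffusion inpainting from 🧨 Diffusers; the checkpoint choice and the local `street.png`/`street_mask.png` files are assumptions, and white pixels in the mask mark the region to repaint.

```python
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

image = Image.open("street.png").convert("RGB")  # photo with unwanted poles and wires
mask = Image.open("street_mask.png").convert("RGB")  # white where the objects should be removed
result = pipe(prompt="a clean empty street", image=image, mask_image=mask).images[0]
result.save("street_inpainted.png")
```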

### Image colorization

Old, black and white images can be brought back to life using an image colorization model.

### Super Resolution

Super resolution models increase the resolution of an image, allowing for higher quality viewing and printing.

## Inference

You can use pipelines for image-to-image in the 🧨 Diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.

```python
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

model_id_or_path = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

init_image = Image.open("mountains_image.jpeg").convert("RGB").resize((768, 512))
prompt = "A fantasy landscape, trending on artstation"

images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images[0].save("fantasy_landscape.png")
```

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.imageToImage({
  data: await (await fetch('image')).blob(),
  model: "timbrooks/instruct-pix2pix",
  parameters: {
    prompt: "Deblur this image"
  }
})
```

## ControlNet

Controlling the outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a type of neural network that provides an image-based control to diffusion models. These controls can be edges or landmarks in an image.

Many ControlNet models were trained in our community event, the JAX Diffusers sprint. You can see the full list of the ControlNet models available [here](https://huggingface.co/spaces/jax-diffusers-event/leaderboard).
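
To make the idea concrete, here is a minimal sketch (not from the original page) that pairs a Canny-edge ControlNet with Stable Diffusion v1.5 in 🧨 Diffusers; the prompt and the local `canny_edges.png` edge map are assumptions.

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image

# The control image is assumed to already be an edge map (e.g. produced with cv2.Canny).
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

edges = load_image("canny_edges.png")
image = pipe("a futuristic city at night", image=edges, num_inference_steps=20).images[0]
image.save("controlnet_output.png")
```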

## Most Used Model for the Task

Pix2Pix is a popular model used for image-to-image translation tasks. It is based on a conditional GAN (generative adversarial network) where, instead of a noise vector, a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.

The images below show some of the examples shared in the paper that can be obtained using Pix2Pix. There are various cases this model can be applied to. It is capable of relatively simpler things, e.g. converting a grayscale image to its colored version. But more importantly, it can generate realistic pictures from rough sketches (as seen in the purse example) or from painting-like images (as seen in the street and facade examples below).

<img src="/tasks/assets/image-to-image/pix2pix_examples.jpg" alt="Examples of image-to-image translations obtained with Pix2Pix">

## Useful Resources

- [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
- [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)

## References

[1] P. Isola, J.-Y. Zhu, T. Zhou and A. A. Efros, "Image-to-Image Translation with Conditional Adversarial Networks," 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017, pp. 5967-5976, doi: 10.1109/CVPR.2017.632.

This page was made possible thanks to the efforts of [Paul Gafton](https://github.com/Paul92) and [Osman Alenbey](https://huggingface.co/osman93).
@@ -0,0 +1,97 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Synthetic dataset for image relighting",
			id: "VIDIT",
		},
		{
			description: "Multiple images of celebrities, used for facial expression translation",
			id: "huggan/CelebA-faces",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-to-image-input.jpeg",
				type: "img",
			},
		],
		outputs: [
			{
				filename: "image-to-image-output.png",
				type: "img",
			},
		],
	},
	isPlaceholder: false,
	metrics: [
		{
			description: "Peak Signal to Noise Ratio (PSNR) is an approximation of the human perception, considering the ratio of the absolute intensity with respect to the variations. Measured in dB, a high value indicates a high fidelity.",
			id: "PSNR",
		},
		{
			description: "Structural Similarity Index (SSIM) is a perceptual metric which compares the luminance, contrast and structure of two images. The values of SSIM range between -1 and 1, and higher values indicate closer resemblance to the original image.",
			id: "SSIM",
		},
		{
			description: "Inception Score (IS) is an analysis of the labels predicted by an image classification model when presented with a sample of the generated images.",
			id: "IS",
		},
	],
	models: [
		{
			description: "A model that enhances images captured in low light conditions.",
			id: "keras-io/low-light-image-enhancement",
		},
		{
			description: "A model that increases the resolution of an image.",
			id: "keras-io/super-resolution",
		},
		{
			description: "A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.",
			id: "lambdalabs/sd-image-variations-diffusers",
		},
		{
			description: "A model that generates images based on segments in the input image and the text prompt.",
			id: "mfidabel/controlnet-segment-anything",
		},
		{
			description: "A model that takes an image and an instruction to edit the image.",
			id: "timbrooks/instruct-pix2pix",
		},
	],
	spaces: [
		{
			description: "Image enhancer application for low light.",
			id: "keras-io/low-light-image-enhancement",
		},
		{
			description: "Style transfer application.",
			id: "keras-io/neural-style-transfer",
		},
		{
			description: "An application that generates images based on segment control.",
			id: "mfidabel/controlnet-segment-anything",
		},
		{
			description: "Image generation application that takes image control and text prompt.",
			id: "hysts/ControlNet",
		},
		{
			description: "Colorize any image using this app.",
			id: "ioclab/brightness-controlnet",
		},
		{
			description: "Edit images with instructions.",
			id: "timbrooks/instruct-pix2pix",
		},
	],
	summary: "Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. Any image manipulation and enhancement is possible with image-to-image models.",
	widgetModels: ["lllyasviel/sd-controlnet-canny"],
	youtubeId: "",
};

export default taskData;
@@ -0,0 +1,58 @@
## Use Cases
### Image Captioning
Image Captioning is the process of generating a textual description of an image.
This can help visually impaired people understand what's happening in their surroundings.

### Optical Character Recognition (OCR)
OCR models convert the text present in an image, e.g. a scanned document, into machine-readable plain text.

## Pix2Struct

Pix2Struct is a state-of-the-art model built and released by Google AI. The model itself has to be trained on a downstream task to be used. These tasks include captioning UI components, captioning images that contain text, visual question answering over infographics, charts, scientific diagrams, and more. You can find these models among the recommended models on this page.
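
As a hedged illustration of using one such fine-tuned checkpoint (this snippet is not part of the original page), the sketch below captions an image that contains text with the TextCaps-tuned Pix2Struct model listed among the recommended models; the local `storefront.jpg` file is an assumed example input.

```python
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
from PIL import Image

# TextCaps checkpoint: generates captions for images that contain text (signs, covers, etc.).
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

image = Image.open("storefront.jpg")
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```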

## Inference
### Image Captioning
You can use the 🤗 Transformers library's `image-to-text` pipeline to generate a caption for an image input.
```python
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parrots.png")
## [{'generated_text': 'two birds are standing next to each other '}]
```

### OCR
This code snippet uses Microsoft's TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-line text images.
```python
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

# Load the scanned line of text as a PIL image before passing it to the processor.
image = Image.open("image.jpeg").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values

generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```

You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-text models on Hugging Face Hub.

```javascript
import { HfInference } from "@huggingface/inference";

const inference = new HfInference(HF_ACCESS_TOKEN);
await inference.imageToText({
  data: await (await fetch('https://picsum.photos/300/300')).blob(),
  model: 'Salesforce/blip-image-captioning-base',
})
```

## Useful Resources
- [Image Captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning)
- [Image captioning use case](https://blog.google/outreach-initiatives/accessibility/get-image-descriptions/)
- [Train Image Captioning model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb)
- [Train OCR model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR)

This page was made possible thanks to the efforts of [Sukesh Perla](https://huggingface.co/hitchhiker3010) and [Johannes Kolbe](https://huggingface.co/johko).
@@ -0,0 +1,87 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [
		{
			// TODO write proper description
			description: "Dataset of 12M image-text pairs from Reddit",
			id: "red_caps",
		},
		{
			// TODO write proper description
			description: "Dataset of 3.3M image-caption pairs from Google",
			id: "datasets/conceptual_captions",
		},
	],
	demo: {
		inputs: [
			{
				filename: "savanna.jpg",
				type: "img",
			},
		],
		outputs: [
			{
				label: "Detailed description",
				content: "a herd of giraffes and zebras grazing in a field",
				type: "text",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "A robust image captioning model.",
			id: "Salesforce/blip-image-captioning-large",
		},
		{
			description: "A strong image captioning model.",
			id: "nlpconnect/vit-gpt2-image-captioning",
		},
		{
			description: "A strong optical character recognition model.",
			id: "microsoft/trocr-base-printed",
		},
		{
			description: "A strong visual question answering model for scientific diagrams.",
			id: "google/pix2struct-ai2d-base",
		},
		{
			description: "A strong captioning model for UI components.",
			id: "google/pix2struct-widget-captioning-base",
		},
		{
			description: "A captioning model for images that contain text.",
			id: "google/pix2struct-textcaps-base",
		},
	],
	spaces: [
		{
			description: "A robust image captioning application.",
			id: "flax-community/image-captioning",
		},
		{
			description: "An application that transcribes handwriting into text.",
			id: "nielsr/TrOCR-handwritten",
		},
		{
			description: "An application that can caption images and answer questions about a given image.",
			id: "Salesforce/BLIP",
		},
		{
			description: "An application that can caption images and answer questions with a conversational agent.",
			id: "Salesforce/BLIP2",
		},
		{
			description: "An image captioning application that demonstrates the effect of noise on captions.",
			id: "johko/capdec-image-captioning",
		},
	],
	summary: "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
	widgetModels: ["Salesforce/blip-image-captioning-base"],
	youtubeId: "",
};

export default taskData;
package/src/object-detection/about.md
ADDED
@@ -0,0 +1,36 @@
## Use Cases

### Autonomous Driving

Object Detection is widely used in computer vision for autonomous driving. Self-driving cars use Object Detection models to detect pedestrians, bicycles, traffic lights and road signs to decide which step to take.

### Object Tracking in Matches

Object Detection models are widely used in sports where the ball or a player is tracked for monitoring and refereeing during matches.

### Image Search

Object Detection models are widely used in image search. Smartphones use Object Detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the Internet.

### Object Counting

Object Detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores, or the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.

## Inference

You can infer with Object Detection models through the `object-detection` pipeline. When calling the pipeline, you just need to specify a path or an HTTP link to an image.

```python
from transformers import pipeline

model = pipeline("object-detection")

model("path_to_cat_image")

# [{'label': 'blanket',
#   'score': 0.917,
#   'box': {'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...}},
# ...]
```
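
Each detection comes with a bounding box in pixel coordinates. As an illustrative follow-up (not part of the original page), the sketch below draws the predicted boxes and labels onto the input image with Pillow, reusing the pipeline and placeholder path from above.

```python
from PIL import Image, ImageDraw

image = Image.open("path_to_cat_image").convert("RGB")
draw = ImageDraw.Draw(image)

# Draw every detection returned by the pipeline as a labeled rectangle.
for detection in model("path_to_cat_image"):
    box = detection["box"]
    draw.rectangle((box["xmin"], box["ymin"], box["xmax"], box["ymax"]), outline="red", width=3)
    draw.text((box["xmin"], box["ymin"]), f'{detection["label"]} ({detection["score"]:.2f})', fill="red")

image.save("detections.png")
```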

## Useful Resources

- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
- [Object detection task guide](https://huggingface.co/docs/transformers/tasks/object_detection)
@@ -0,0 +1,73 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [
		{
			// TODO write proper description
			description: "Widely used benchmark dataset for multiple Vision tasks.",
			id: "merve/coco2017",
		},
	],
	demo: {
		inputs: [
			{
				filename: "object-detection-input.jpg",
				type: "img",
			},
		],
		outputs: [
			{
				filename: "object-detection-output.jpg",
				type: "img",
			},
		],
	},
	metrics: [
		{
			description: "The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). It is calculated for each class separately",
			id: "Average Precision",
		},
		{
			description: "The Mean Average Precision (mAP) metric is the overall average of the AP values",
			id: "Mean Average Precision",
		},
		{
			description: "The APα metric is the Average Precision at the IoU threshold of an α value, for example, AP50 and AP75",
			id: "APα",
		},
	],
	models: [
		{
			// TO DO: write description
			description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
			id: "facebook/detr-resnet-50",
		},
		{
			description: "Strong object detection model trained on ImageNet-21k dataset.",
			id: "microsoft/beit-base-patch16-224-pt22k-ft22k",
		},
	],
	spaces: [
		{
			description: "An object detection application that can detect unseen objects out of the box.",
			id: "adirik/OWL-ViT",
		},
		{
			description: "An application that contains various object detection models to try.",
			id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
		},
		{
			description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
			id: "kadirnar/torchyolo",
		},
		{
			description: "An object tracking, segmentation and inpainting application.",
			id: "VIPLab/Track-Anything",
		},
	],
	summary: "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.",
	widgetModels: ["facebook/detr-resnet-50"],
	youtubeId: "WdAeKSOpxhw",
};

export default taskData;
@@ -0,0 +1,15 @@
## Use Cases

You can contribute to this area with common use cases of the task!

## Task Variants

This place can be filled with variants of this task if there are any.

## Inference

This section should have useful information about how to pull a model from the Hugging Face Hub that is part of a library specialized in a task and how to use it.

## Useful Resources

In this area, you can insert useful resources about how to train or use a model for this task.
@@ -0,0 +1,18 @@
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [],
	demo: {
		inputs: [],
		outputs: [],
	},
	isPlaceholder: true,
	metrics: [],
	models: [],
	spaces: [],
	summary: "",
	widgetModels: [],
	youtubeId: undefined,
};

export default taskData;