@huggingface/tasks 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/assets/audio-classification/audio.wav +0 -0
  2. package/assets/audio-to-audio/input.wav +0 -0
  3. package/assets/audio-to-audio/label-0.wav +0 -0
  4. package/assets/audio-to-audio/label-1.wav +0 -0
  5. package/assets/automatic-speech-recognition/input.flac +0 -0
  6. package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
  7. package/assets/contribution-guide/anatomy.png +0 -0
  8. package/assets/contribution-guide/libraries.png +0 -0
  9. package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
  10. package/assets/depth-estimation/depth-estimation-output.png +0 -0
  11. package/assets/document-question-answering/document-question-answering-input.png +0 -0
  12. package/assets/image-classification/image-classification-input.jpeg +0 -0
  13. package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
  14. package/assets/image-segmentation/image-segmentation-output.png +0 -0
  15. package/assets/image-to-image/image-to-image-input.jpeg +0 -0
  16. package/assets/image-to-image/image-to-image-output.png +0 -0
  17. package/assets/image-to-image/pix2pix_examples.jpg +0 -0
  18. package/assets/image-to-text/savanna.jpg +0 -0
  19. package/assets/object-detection/object-detection-input.jpg +0 -0
  20. package/assets/object-detection/object-detection-output.jpg +0 -0
  21. package/assets/table-question-answering/tableQA.jpg +0 -0
  22. package/assets/text-to-image/image.jpeg +0 -0
  23. package/assets/text-to-speech/audio.wav +0 -0
  24. package/assets/text-to-video/text-to-video-output.gif +0 -0
  25. package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
  26. package/assets/video-classification/video-classification-input.gif +0 -0
  27. package/assets/visual-question-answering/elephant.jpeg +0 -0
  28. package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
  29. package/dist/index.cjs +3105 -0
  30. package/dist/index.d.cts +145 -0
  31. package/dist/index.d.ts +145 -0
  32. package/dist/index.js +3079 -0
  33. package/package.json +35 -0
  34. package/src/Types.ts +58 -0
  35. package/src/audio-classification/about.md +85 -0
  36. package/src/audio-classification/data.ts +77 -0
  37. package/src/audio-to-audio/about.md +55 -0
  38. package/src/audio-to-audio/data.ts +63 -0
  39. package/src/automatic-speech-recognition/about.md +86 -0
  40. package/src/automatic-speech-recognition/data.ts +77 -0
  41. package/src/const.ts +51 -0
  42. package/src/conversational/about.md +50 -0
  43. package/src/conversational/data.ts +62 -0
  44. package/src/depth-estimation/about.md +38 -0
  45. package/src/depth-estimation/data.ts +52 -0
  46. package/src/document-question-answering/about.md +54 -0
  47. package/src/document-question-answering/data.ts +67 -0
  48. package/src/feature-extraction/about.md +35 -0
  49. package/src/feature-extraction/data.ts +57 -0
  50. package/src/fill-mask/about.md +51 -0
  51. package/src/fill-mask/data.ts +77 -0
  52. package/src/image-classification/about.md +48 -0
  53. package/src/image-classification/data.ts +88 -0
  54. package/src/image-segmentation/about.md +63 -0
  55. package/src/image-segmentation/data.ts +96 -0
  56. package/src/image-to-image/about.md +81 -0
  57. package/src/image-to-image/data.ts +97 -0
  58. package/src/image-to-text/about.md +58 -0
  59. package/src/image-to-text/data.ts +87 -0
  60. package/src/index.ts +2 -0
  61. package/src/object-detection/about.md +36 -0
  62. package/src/object-detection/data.ts +73 -0
  63. package/src/placeholder/about.md +15 -0
  64. package/src/placeholder/data.ts +18 -0
  65. package/src/question-answering/about.md +56 -0
  66. package/src/question-answering/data.ts +69 -0
  67. package/src/reinforcement-learning/about.md +176 -0
  68. package/src/reinforcement-learning/data.ts +78 -0
  69. package/src/sentence-similarity/about.md +97 -0
  70. package/src/sentence-similarity/data.ts +100 -0
  71. package/src/summarization/about.md +57 -0
  72. package/src/summarization/data.ts +72 -0
  73. package/src/table-question-answering/about.md +43 -0
  74. package/src/table-question-answering/data.ts +63 -0
  75. package/src/tabular-classification/about.md +67 -0
  76. package/src/tabular-classification/data.ts +69 -0
  77. package/src/tabular-regression/about.md +91 -0
  78. package/src/tabular-regression/data.ts +58 -0
  79. package/src/tasksData.ts +104 -0
  80. package/src/text-classification/about.md +171 -0
  81. package/src/text-classification/data.ts +90 -0
  82. package/src/text-generation/about.md +128 -0
  83. package/src/text-generation/data.ts +124 -0
  84. package/src/text-to-image/about.md +65 -0
  85. package/src/text-to-image/data.ts +88 -0
  86. package/src/text-to-speech/about.md +63 -0
  87. package/src/text-to-speech/data.ts +70 -0
  88. package/src/text-to-video/about.md +36 -0
  89. package/src/text-to-video/data.ts +97 -0
  90. package/src/token-classification/about.md +78 -0
  91. package/src/token-classification/data.ts +83 -0
  92. package/src/translation/about.md +65 -0
  93. package/src/translation/data.ts +68 -0
  94. package/src/unconditional-image-generation/about.md +45 -0
  95. package/src/unconditional-image-generation/data.ts +66 -0
  96. package/src/video-classification/about.md +53 -0
  97. package/src/video-classification/data.ts +84 -0
  98. package/src/visual-question-answering/about.md +43 -0
  99. package/src/visual-question-answering/data.ts +90 -0
  100. package/src/zero-shot-classification/about.md +39 -0
  101. package/src/zero-shot-classification/data.ts +66 -0
  102. package/src/zero-shot-image-classification/about.md +68 -0
  103. package/src/zero-shot-image-classification/data.ts +79 -0
@@ -0,0 +1,63 @@
1
+ ## Use Cases
2
+
3
+ ### Autonomous Driving
4
+
5
+ Segmentation models are used to identify road patterns such as lanes and obstacles for safer driving.
6
+
7
+ ### Background Removal
8
+
9
+ Image Segmentation models are used in cameras to erase the background of certain objects and apply filters to them.
10
+
11
+ ### Medical Imaging
12
+
13
+ Image Segmentation models are used to distinguish organs or tissues, improving medical imaging workflows. Models are used to segment dental instances, analyze X-Ray scans or even segment cells for pathological diagnosis. This [dataset](https://github.com/v7labs/covid-19-xray-dataset) contains images of lungs of healthy patients and patients with COVID-19 segmented with masks. Another [segmentation dataset](https://ivdm3seg.weebly.com/data.html) contains segmented MRI data of the lower spine to analyze the effect of spaceflight simulation.
14
+
15
+ ## Task Variants
16
+
17
+ ### Semantic Segmentation
18
+
19
+ Semantic Segmentation is the task of segmenting parts of an image that belong to the same class. Semantic Segmentation models make predictions for each pixel and return the probabilities of the classes for each pixel. These models are evaluated on Mean Intersection Over Union (Mean IoU).
20
+
21
+ ### Instance Segmentation
22
+
23
+ Instance Segmentation is the variant of Image Segmentation where every distinct object is segmented, instead of one segment per class.
24
+
25
+ ### Panoptic Segmentation
26
+
27
+ Panoptic Segmentation is the Image Segmentation task that segments the image both by instance and by class, assigning each pixel a different instance of the class.
28
+
29
+ ## Inference
30
+
31
+ You can infer with Image Segmentation models using the `image-segmentation` pipeline. You need to install [timm](https://github.com/rwightman/pytorch-image-models) first.
32
+
33
+ ```python
34
+ !pip install timm
35
+ model = pipeline("image-segmentation")
36
+ model("cat.png")
37
+ #[{'label': 'cat',
38
+ # 'mask': mask_code,
39
+ # 'score': 0.999}
40
+ # ...]
41
+ ```
42
+
43
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image segmentation models on Hugging Face Hub.
44
+
45
+ ```javascript
46
+ import { HfInference } from "@huggingface/inference";
47
+
48
+ const inference = new HfInference(HF_ACCESS_TOKEN);
49
+ await inference.imageSegmentation({
50
+ data: await (await fetch('https://picsum.photos/300/300')).blob(),
51
+ model: 'facebook/detr-resnet-50-panoptic',
52
+ })
53
+ ```
54
+
55
+ ## Useful Resources
56
+
57
+ Would you like to learn more about image segmentation? Great! Here you can find some curated resources that you may find helpful!
58
+
59
+ - [Fine-Tune a Semantic Segmentation Model with a Custom Dataset](https://huggingface.co/blog/fine-tune-segformer)
60
+ - [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
61
+ - [A Guide on Universal Image Segmentation with Mask2Former and OneFormer](https://huggingface.co/blog/mask2former)
62
+ - [Zero-shot image segmentation with CLIPSeg](https://huggingface.co/blog/clipseg-zero-shot)
63
+ - [Semantic segmentation task guide](https://huggingface.co/docs/transformers/tasks/semantic_segmentation)
@@ -0,0 +1,96 @@
1
+ import type { TaskDataCustom } from "../Types";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Scene segmentation dataset.",
7
+ id: "scene_parse_150",
8
+ },
9
+ ],
10
+ demo: {
11
+ inputs: [
12
+ {
13
+ filename: "image-segmentation-input.jpeg",
14
+ type: "img",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "image-segmentation-output.png",
20
+ type: "img",
21
+ },
22
+ ],
23
+ },
24
+ metrics: [
25
+ {
26
+ description: "Average Precision (AP) is the Area Under the PR Curve (AUC-PR). It is calculated for each semantic class separately",
27
+ id: "Average Precision",
28
+ },
29
+ {
30
+ description: "Mean Average Precision (mAP) is the overall average of the AP values",
31
+ id: "Mean Average Precision",
32
+ },
33
+ {
34
+ description: "Intersection over Union (IoU) is the overlap of segmentation masks. Mean IoU is the average of the IoU of all semantic classes",
35
+ id: "Mean Intersection over Union",
36
+ },
37
+ {
38
+ description: "APα is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75",
39
+ id: "APα",
40
+ },
41
+ ],
42
+ models: [
43
+ {
44
+ // TO DO: write description
45
+ description: "Solid panoptic segmentation model trained on the COCO 2017 benchmark dataset.",
46
+ id: "facebook/detr-resnet-50-panoptic",
47
+ },
48
+ {
49
+ description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
50
+ id: "microsoft/beit-large-finetuned-ade-640-640",
51
+ },
52
+ {
53
+ description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
54
+ id: "nvidia/segformer-b0-finetuned-ade-512-512",
55
+ },
56
+ {
57
+ description: "Semantic segmentation model trained on the Cityscapes dataset.",
58
+ id: "facebook/mask2former-swin-large-cityscapes-semantic",
59
+ },
60
+ {
61
+ description: "Panoptic segmentation model trained on the COCO (common objects) dataset.",
62
+ id: "facebook/mask2former-swin-large-coco-panoptic",
63
+ },
64
+ ],
65
+ spaces: [
66
+ {
67
+ description: "A semantic segmentation application that can predict unseen instances out of the box.",
68
+ id: "facebook/ov-seg",
69
+ },
70
+ {
71
+ description: "One of the strongest segmentation applications.",
72
+ id: "jbrinkma/segment-anything",
73
+ },
74
+ {
75
+ description: "A semantic segmentation application that predicts human silhouettes.",
76
+ id: "keras-io/Human-Part-Segmentation",
77
+ },
78
+ {
79
+ description: "An instance segmentation application to predict neuronal cell types from microscopy images.",
80
+ id: "rashmi/sartorius-cell-instance-segmentation",
81
+ },
82
+ {
83
+ description: "An application that segments videos.",
84
+ id: "ArtGAN/Segment-Anything-Video",
85
+ },
86
+ {
87
+ description: "A panoptic segmentation application built for outdoor environments.",
88
+ id: "segments/panoptic-segment-anything",
89
+ },
90
+ ],
91
+ summary: "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.",
92
+ widgetModels: ["facebook/detr-resnet-50-panoptic"],
93
+ youtubeId: "dKE8SIt9C-w",
94
+ };
95
+
96
+ export default taskData;
@@ -0,0 +1,81 @@
1
+ ## Use Cases
2
+
3
+ ### Style transfer
4
+
5
+ One of the most popular use cases of image-to-image is style transfer. Style transfer models can convert a regular photograph into a painting in the style of a famous painter.
6
+
7
+ ## Task Variants
8
+
9
+ ### Image inpainting
10
+
11
+ Image inpainting is widely used during photo editing to remove unwanted objects, such as poles, wires or sensor
12
+ dust.
13
+
14
+ ### Image colorization
15
+
16
+ Old black-and-white images can be brought back to life using an image colorization model.
17
+
18
+ ### Super Resolution
19
+
20
+ Super resolution models increase the resolution of an image, allowing for higher quality viewing and printing.
21
+
22
+ ## Inference
23
+
24
+ You can use the image-to-image pipelines in the 🧨 Diffusers library to easily run image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.
25
+
26
+ ```python
27
+ from PIL import Image
28
+ from diffusers import StableDiffusionImg2ImgPipeline
29
+
30
+ model_id_or_path = "runwayml/stable-diffusion-v1-5"
31
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
32
+ pipe = pipe.to("cuda")
33
+
34
+ init_image = Image.open("mountains_image.jpeg").convert("RGB").resize((768, 512))
35
+ prompt = "A fantasy landscape, trending on artstation"
36
+
37
+ images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
38
+ images[0].save("fantasy_landscape.png")
39
+ ```
40
+
41
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.
42
+
43
+ ```javascript
44
+ import { HfInference } from "@huggingface/inference";
45
+
46
+ const inference = new HfInference(HF_ACCESS_TOKEN);
47
+ await inference.imageToImage({
48
+ data: await (await fetch('image')).blob(),
49
+ model: "timbrooks/instruct-pix2pix",
50
+ parameters: {
51
+ prompt: "Deblur this image"
52
+ }
53
+ })
54
+ ```
55
+
56
+ ## ControlNet
57
+
58
+ Controlling the outputs of diffusion models with only a text prompt is a challenging problem. ControlNet is a type of neural network that provides image-based control to diffusion models. These controls can be edges or landmarks in an image.
59
+
60
+ Many ControlNet models were trained in our community event, JAX Diffusers sprint. You can see the full list of the ControlNet models available [here](https://huggingface.co/spaces/jax-diffusers-event/leaderboard).
61
+
62
+ ## Most Used Model for the Task
63
+
64
+ Pix2Pix is a popular model used for image to image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.
65
+
66
+
67
+ The images below show some of the examples shared in the paper that can be obtained using Pix2Pix. There are various cases this model can be applied to. It is capable of relatively simple things, e.g. converting a grayscale image to its colored version. But more importantly, it can generate realistic pictures from rough sketches (as seen in the purse example) or from painting-like images (as seen in the street and facade examples below).
68
+
69
+ <img src="/tasks/assets/image-to-image/pix2pix_examples.jpg" alt="Examples of image-to-image translation with Pix2Pix" title="Pix2Pix examples">
70
+
71
+
72
+ ## Useful Resources
73
+
74
+ - [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
75
+ - [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)
76
+
77
+ ## References
78
+
79
+ [1] P. Isola, J. -Y. Zhu, T. Zhou and A. A. Efros, "Image-to-Image Translation with Conditional Adversarial Networks," 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017, pp. 5967-5976, doi: 10.1109/CVPR.2017.632.
80
+
81
+ This page was made possible thanks to the efforts of [Paul Gafton](https://github.com/Paul92) and [Osman Alenbey](https://huggingface.co/osman93).
@@ -0,0 +1,97 @@
1
+ import type { TaskDataCustom } from "../Types";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Synthetic dataset, for image relighting",
7
+ id: "VIDIT",
8
+ },
9
+ {
10
+ description: "Multiple images of celebrities, used for facial expression translation",
11
+ id: "huggan/CelebA-faces",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "image-to-image-input.jpeg",
18
+ type: "img",
19
+ },
20
+ ],
21
+ outputs: [
22
+ {
23
+ filename: "image-to-image-output.png",
24
+ type: "img",
25
+ },
26
+ ],
27
+ },
28
+ isPlaceholder: false,
29
+ metrics: [
30
+ {
31
+ description: "Peak Signal to Noise Ratio (PSNR) is an approximation of the human perception, considering the ratio of the absolute intensity with respect to the variations. Measured in dB, a high value indicates a high fidelity.",
32
+ id: "PSNR",
33
+ },
34
+ {
35
+ description: "Structural Similarity Index (SSIM) is a perceptual metric which compares the luminance, contrast and structure of two images. The values of SSIM range between -1 and 1, and higher values indicate closer resemblance to the original image.",
36
+ id: "SSIM",
37
+ },
38
+ {
39
+ description: "Inception Score (IS) is an analysis of the labels predicted by an image classification model when presented with a sample of the generated images.",
40
+ id: "IS",
41
+ },
42
+ ],
43
+ models: [
44
+ {
45
+ description: "A model that enhances images captured in low light conditions.",
46
+ id: "keras-io/low-light-image-enhancement",
47
+ },
48
+ {
49
+ description: "A model that increases the resolution of an image.",
50
+ id: "keras-io/super-resolution",
51
+ },
52
+ {
53
+ description: "A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.",
54
+ id: "lambdalabs/sd-image-variations-diffusers",
55
+ },
56
+ {
57
+ description: "A model that generates images based on segments in the input image and the text prompt.",
58
+ id: "mfidabel/controlnet-segment-anything",
59
+ },
60
+ {
61
+ description: "A model that takes an image and an instruction to edit the image.",
62
+ id: "timbrooks/instruct-pix2pix",
63
+ },
64
+ ],
65
+ spaces: [
66
+ {
67
+ description: "Image enhancer application for low light.",
68
+ id: "keras-io/low-light-image-enhancement",
69
+ },
70
+ {
71
+ description: "Style transfer application.",
72
+ id: "keras-io/neural-style-transfer",
73
+ },
74
+ {
75
+ description: "An application that generates images based on segment control.",
76
+ id: "mfidabel/controlnet-segment-anything",
77
+ },
78
+ {
79
+ description: "Image generation application that takes image control and text prompt.",
80
+ id: "hysts/ControlNet",
81
+ },
82
+ {
83
+ description: "Colorize any image using this app.",
84
+ id: "ioclab/brightness-controlnet",
85
+ },
86
+ {
87
+ description: "Edit images with instructions.",
88
+ id: "timbrooks/instruct-pix2pix",
89
+ },
90
+
91
+ ],
92
+ summary: "Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. Any image manipulation and enhancement is possible with image to image models.",
93
+ widgetModels: ["lllyasviel/sd-controlnet-canny"],
94
+ youtubeId: "",
95
+ };
96
+
97
+ export default taskData;
@@ -0,0 +1,58 @@
1
+ ## Use Cases
2
+ ### Image Captioning
3
+ Image Captioning is the process of generating a textual description of an image.
4
+ This can help visually impaired people understand what's happening in their surroundings.
5
+
6
+ ### Optical Character Recognition (OCR)
7
+ OCR models convert the text present in an image, e.g. a scanned document, to text.
8
+
9
+
10
+
11
+ ## Pix2Struct
12
+
13
+ Pix2Struct is a state-of-the-art model built and released by Google AI. The model itself has to be trained on a downstream task to be used. These tasks include captioning UI components and images containing text, as well as visual question answering on infographics, charts, scientific diagrams and more. You can find these models among the recommended models on this page.
14
+
15
+ ## Inference
16
+ ### Image Captioning
17
+ You can use the 🤗 Transformers library's `image-to-text` pipeline to generate a caption for the input image.
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ captioner = pipeline("image-to-text",model="Salesforce/blip-image-captioning-base")
22
+ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parrots.png")
23
+ ## [{'generated_text': 'two birds are standing next to each other '}]
24
+ ```
25
+
26
+ ### OCR
27
+ This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
28
+ ```python
29
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
30
+
31
+ processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
32
+ model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
33
+ pixel_values = processor(images="image.jpeg", return_tensors="pt").pixel_values
34
+
35
+ generated_ids = model.generate(pixel_values)
36
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
37
+
38
+ ```
39
+
40
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-text models on Hugging Face Hub.
41
+
42
+ ```javascript
43
+ import { HfInference } from "@huggingface/inference";
44
+
45
+ const inference = new HfInference(HF_ACCESS_TOKEN);
46
+ await inference.imageToText({
47
+ data: await (await fetch('https://picsum.photos/300/300')).blob(),
48
+ model: 'Salesforce/blip-image-captioning-base',
49
+ })
50
+ ```
51
+
52
+ ## Useful Resources
53
+ - [Image Captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning)
54
+ - [Image captioning use case](https://blog.google/outreach-initiatives/accessibility/get-image-descriptions/)
55
+ - [Train Image Captioning model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb)
56
+ - [Train OCR model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR)
57
+
58
+ This page was made possible thanks to the efforts of [Sukesh Perla](https://huggingface.co/hitchhiker3010) and [Johannes Kolbe](https://huggingface.co/johko).
@@ -0,0 +1,87 @@
1
+ import type { TaskDataCustom } from "../Types";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ // TODO write proper description
7
+ description: "Dataset from 12M image-text of Reddit",
8
+ id: "red_caps",
9
+ },
10
+ {
11
+ // TODO write proper description
12
+ description: "Dataset from 3.3M images of Google",
13
+ id: "datasets/conceptual_captions",
14
+ },
15
+
16
+ ],
17
+ demo: {
18
+ inputs: [
19
+ {
20
+ filename: "savanna.jpg",
21
+ type: "img",
22
+ },
23
+ ],
24
+ outputs: [
25
+ {
26
+ label: "Detailed description",
27
+ content: "a herd of giraffes and zebras grazing in a field",
28
+ type: "text",
29
+ },
30
+ ],
31
+ },
32
+ metrics: [],
33
+ models: [
34
+ {
35
+ description: "A robust image captioning model.",
36
+ id: "Salesforce/blip-image-captioning-large",
37
+ },
38
+ {
39
+ description: "A strong image captioning model.",
40
+ id: "nlpconnect/vit-gpt2-image-captioning",
41
+ },
42
+ {
43
+ description: "A strong optical character recognition model.",
44
+ id: "microsoft/trocr-base-printed",
45
+ },
46
+ {
47
+ description: "A strong visual question answering model for scientific diagrams.",
48
+ id: "google/pix2struct-ai2d-base",
49
+ },
50
+ {
51
+ description: "A strong captioning model for UI components.",
52
+ id: "google/pix2struct-widget-captioning-base",
53
+ },
54
+ {
55
+ description: "A captioning model for images that contain text.",
56
+ id: "google/pix2struct-textcaps-base",
57
+ },
58
+ ],
59
+ spaces: [
60
+ {
61
+ description: "A robust image captioning application.",
62
+ id: "flax-community/image-captioning",
63
+ },
64
+ {
65
+ description: "An application that transcribes handwritings into text.",
66
+ id: "nielsr/TrOCR-handwritten",
67
+ },
68
+ {
69
+ description: "An application that can caption images and answer questions about a given image.",
70
+ id: "Salesforce/BLIP",
71
+ },
72
+ {
73
+ description: "An application that can caption images and answer questions with a conversational agent.",
74
+ id: "Salesforce/BLIP2",
75
+ },
76
+ {
77
+ description: "An image captioning application that demonstrates the effect of noise on captions.",
78
+ id: "johko/capdec-image-captioning",
79
+ },
80
+ ],
81
+ summary: "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
82
+ widgetModels: ["Salesforce/blip-image-captioning-base"],
83
+ youtubeId: "",
84
+ };
85
+
86
+
87
+ export default taskData;
package/src/index.ts ADDED
@@ -0,0 +1,2 @@
1
+ export type { TaskData, TaskDemo, TaskDemoEntry } from "./Types";
2
+ export { TASKS_DATA } from "./tasksData";
@@ -0,0 +1,36 @@
1
+ ## Use Cases
2
+
3
+ ### Autonomous Driving
4
+
5
+ Object Detection is widely used in computer vision for autonomous driving. Self-driving cars use Object Detection models to detect pedestrians, bicycles, traffic lights and road signs to decide which step to take.
6
+
7
+ ### Object Tracking in Matches
8
+
9
+ Object Detection models are widely used in sports where the ball or a player is tracked for monitoring and refereeing during matches.
10
+
11
+ ### Image Search
12
+
13
+ Object Detection models are widely used in image search. Smartphones use Object Detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the Internet.
14
+
15
+ ### Object Counting
16
+
17
+ Object Detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores, or counting the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.
18
+
19
+ ## Inference
20
+
21
+ You can infer with Object Detection models through the `object-detection` pipeline. When calling the pipeline you just need to specify a path or http link to an image.
22
+
23
+ ```python
24
+ model = pipeline("object-detection")
25
+
26
+ model("path_to_cat_image")
27
+
28
+ # [{'label': 'blanket',
29
+ # 'box': {'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...},
30
+ # 'score': 0.917},
31
+ #...]
32
+ ```
33
+
34
+ ## Useful Resources
35
+ - [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
36
+ - [Object detection task guide](https://huggingface.co/docs/transformers/tasks/object_detection)
@@ -0,0 +1,73 @@
1
+ import type { TaskDataCustom } from "../Types";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ // TODO write proper description
7
+ description: "Widely used benchmark dataset for multiple Vision tasks.",
8
+ id: "merve/coco2017",
9
+ },
10
+ ],
11
+ demo: {
12
+ inputs: [
13
+ {
14
+ filename: "object-detection-input.jpg",
15
+ type: "img",
16
+ },
17
+ ],
18
+ outputs: [
19
+ {
20
+ filename: "object-detection-output.jpg",
21
+ type: "img",
22
+ },
23
+ ],
24
+ },
25
+ metrics: [
26
+ {
27
+ description: "The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). It is calculated for each class separately",
28
+ id: "Average Precision",
29
+ },
30
+ {
31
+ description: "The Mean Average Precision (mAP) metric is the overall average of the AP values",
32
+ id: "Mean Average Precision",
33
+ },
34
+ {
35
+ description: "The APα metric is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75",
36
+ id: "APα",
37
+ },
38
+ ],
39
+ models: [
40
+ {
41
+ // TO DO: write description
42
+ description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
43
+ id: "facebook/detr-resnet-50",
44
+ },
45
+ {
46
+ description: "Strong object detection model trained on ImageNet-21k dataset.",
47
+ id: "microsoft/beit-base-patch16-224-pt22k-ft22k",
48
+ },
49
+ ],
50
+ spaces: [
51
+ {
52
+ description: "An object detection application that can detect unseen objects out of the box.",
53
+ id: "adirik/OWL-ViT",
54
+ },
55
+ {
56
+ description: "An application that contains various object detection models to try from.",
57
+ id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
58
+ },
59
+ {
60
+ description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
61
+ id: "kadirnar/torchyolo",
62
+ },
63
+ {
64
+ description: "An object tracking, segmentation and inpainting application.",
65
+ id: "VIPLab/Track-Anything",
66
+ },
67
+ ],
68
+ summary: "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.",
69
+ widgetModels: ["facebook/detr-resnet-50"],
70
+ youtubeId: "WdAeKSOpxhw",
71
+ };
72
+
73
+ export default taskData;
@@ -0,0 +1,15 @@
1
+ ## Use Cases
2
+
3
+ You can contribute to this area with common use cases of the task!
4
+
5
+ ## Task Variants
6
+
7
+ This place can be filled with variants of this task if there are any.
8
+
9
+ ## Inference
10
+
11
+ This section should have useful information about how to pull a model from Hugging Face Hub that is a part of a library specialized in a task and use it.
12
+
13
+ ## Useful Resources
14
+
15
+ In this area, you can insert useful resources about how to train or use a model for this task.
@@ -0,0 +1,18 @@
1
+ import type { TaskDataCustom } from "../Types";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [],
5
+ demo: {
6
+ inputs: [],
7
+ outputs: [],
8
+ },
9
+ isPlaceholder: true,
10
+ metrics: [],
11
+ models: [],
12
+ spaces: [],
13
+ summary: "",
14
+ widgetModels: [],
15
+ youtubeId: undefined,
16
+ };
17
+
18
+ export default taskData;