@huggingface/tasks 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/LICENSE +21 -0
  2. package/README.md +20 -0
  3. package/dist/index.d.ts +368 -46
  4. package/dist/index.js +117 -41
  5. package/dist/{index.cjs → index.mjs} +84 -67
  6. package/package.json +43 -33
  7. package/src/Types.ts +49 -43
  8. package/src/audio-classification/about.md +5 -5
  9. package/src/audio-classification/data.ts +11 -11
  10. package/src/audio-to-audio/about.md +4 -3
  11. package/src/audio-to-audio/data.ts +18 -15
  12. package/src/automatic-speech-recognition/about.md +5 -4
  13. package/src/automatic-speech-recognition/data.ts +18 -17
  14. package/src/const.ts +52 -44
  15. package/src/conversational/about.md +9 -9
  16. package/src/conversational/data.ts +22 -18
  17. package/src/depth-estimation/about.md +1 -3
  18. package/src/depth-estimation/data.ts +11 -11
  19. package/src/document-question-answering/about.md +1 -2
  20. package/src/document-question-answering/data.ts +22 -19
  21. package/src/feature-extraction/about.md +2 -3
  22. package/src/feature-extraction/data.ts +12 -15
  23. package/src/fill-mask/about.md +1 -1
  24. package/src/fill-mask/data.ts +16 -14
  25. package/src/image-classification/about.md +5 -3
  26. package/src/image-classification/data.ts +15 -15
  27. package/src/image-segmentation/about.md +4 -4
  28. package/src/image-segmentation/data.ts +26 -23
  29. package/src/image-to-image/about.md +10 -12
  30. package/src/image-to-image/data.ts +31 -27
  31. package/src/image-to-text/about.md +13 -6
  32. package/src/image-to-text/data.ts +20 -21
  33. package/src/index.ts +11 -0
  34. package/src/modelLibraries.ts +43 -0
  35. package/src/object-detection/about.md +2 -1
  36. package/src/object-detection/data.ts +20 -17
  37. package/src/pipelines.ts +619 -0
  38. package/src/placeholder/about.md +3 -3
  39. package/src/placeholder/data.ts +8 -8
  40. package/src/question-answering/about.md +1 -1
  41. package/src/question-answering/data.ts +21 -19
  42. package/src/reinforcement-learning/about.md +167 -176
  43. package/src/reinforcement-learning/data.ts +75 -78
  44. package/src/sentence-similarity/data.ts +29 -28
  45. package/src/summarization/about.md +6 -5
  46. package/src/summarization/data.ts +23 -20
  47. package/src/table-question-answering/about.md +5 -5
  48. package/src/table-question-answering/data.ts +35 -39
  49. package/src/tabular-classification/about.md +4 -6
  50. package/src/tabular-classification/data.ts +11 -12
  51. package/src/tabular-regression/about.md +14 -18
  52. package/src/tabular-regression/data.ts +10 -11
  53. package/src/tasksData.ts +47 -50
  54. package/src/text-classification/about.md +5 -4
  55. package/src/text-classification/data.ts +21 -20
  56. package/src/text-generation/about.md +7 -6
  57. package/src/text-generation/data.ts +36 -34
  58. package/src/text-to-image/about.md +19 -18
  59. package/src/text-to-image/data.ts +32 -26
  60. package/src/text-to-speech/about.md +4 -5
  61. package/src/text-to-speech/data.ts +16 -17
  62. package/src/text-to-video/about.md +41 -36
  63. package/src/text-to-video/data.ts +43 -38
  64. package/src/token-classification/about.md +1 -3
  65. package/src/token-classification/data.ts +26 -25
  66. package/src/translation/about.md +4 -4
  67. package/src/translation/data.ts +21 -21
  68. package/src/unconditional-image-generation/about.md +10 -5
  69. package/src/unconditional-image-generation/data.ts +26 -20
  70. package/src/video-classification/about.md +5 -1
  71. package/src/video-classification/data.ts +14 -14
  72. package/src/visual-question-answering/about.md +8 -3
  73. package/src/visual-question-answering/data.ts +22 -19
  74. package/src/zero-shot-classification/about.md +5 -4
  75. package/src/zero-shot-classification/data.ts +20 -20
  76. package/src/zero-shot-image-classification/about.md +17 -9
  77. package/src/zero-shot-image-classification/data.ts +12 -14
  78. package/tsconfig.json +18 -0
  79. package/assets/audio-classification/audio.wav +0 -0
  80. package/assets/audio-to-audio/input.wav +0 -0
  81. package/assets/audio-to-audio/label-0.wav +0 -0
  82. package/assets/audio-to-audio/label-1.wav +0 -0
  83. package/assets/automatic-speech-recognition/input.flac +0 -0
  84. package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
  85. package/assets/contribution-guide/anatomy.png +0 -0
  86. package/assets/contribution-guide/libraries.png +0 -0
  87. package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
  88. package/assets/depth-estimation/depth-estimation-output.png +0 -0
  89. package/assets/document-question-answering/document-question-answering-input.png +0 -0
  90. package/assets/image-classification/image-classification-input.jpeg +0 -0
  91. package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
  92. package/assets/image-segmentation/image-segmentation-output.png +0 -0
  93. package/assets/image-to-image/image-to-image-input.jpeg +0 -0
  94. package/assets/image-to-image/image-to-image-output.png +0 -0
  95. package/assets/image-to-image/pix2pix_examples.jpg +0 -0
  96. package/assets/image-to-text/savanna.jpg +0 -0
  97. package/assets/object-detection/object-detection-input.jpg +0 -0
  98. package/assets/object-detection/object-detection-output.jpg +0 -0
  99. package/assets/table-question-answering/tableQA.jpg +0 -0
  100. package/assets/text-to-image/image.jpeg +0 -0
  101. package/assets/text-to-speech/audio.wav +0 -0
  102. package/assets/text-to-video/text-to-video-output.gif +0 -0
  103. package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
  104. package/assets/video-classification/video-classification-input.gif +0 -0
  105. package/assets/visual-question-answering/elephant.jpeg +0 -0
  106. package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
  107. package/dist/index.d.cts +0 -145
package/src/unconditional-image-generation/data.ts CHANGED
@@ -3,64 +3,70 @@ import type { TaskDataCustom } from "../Types";
  const taskData: TaskDataCustom = {
  datasets: [
  {
- description: "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images per class.",
- id: "cifar100",
+ description:
+ "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images per class.",
+ id: "cifar100",
  },
  {
  description: "Multiple images of celebrities, used for facial expression translation.",
- id: "CelebA",
+ id: "CelebA",
  },
  ],
  demo: {
  inputs: [
  {
- label: "Seed",
+ label: "Seed",
  content: "42",
- type: "text",
+ type: "text",
  },
  {
- label: "Number of images to generate:",
+ label: "Number of images to generate:",
  content: "4",
- type: "text",
+ type: "text",
  },
  ],
  outputs: [
  {
  filename: "unconditional-image-generation-output.jpeg",
- type: "img",
+ type: "img",
  },
  ],
  },
  metrics: [
  {
- description: "The inception score (IS) evaluates the quality of generated images. It measures the diversity of the generated images (the model predictions are evenly distributed across all possible labels) and their 'distinction' or 'sharpness' (the model confidently predicts a single label for each image).",
- id: "Inception score (IS)",
+ description:
+ "The inception score (IS) evaluates the quality of generated images. It measures the diversity of the generated images (the model predictions are evenly distributed across all possible labels) and their 'distinction' or 'sharpness' (the model confidently predicts a single label for each image).",
+ id: "Inception score (IS)",
  },
  {
- description: "The Fréchet Inception Distance (FID) evaluates the quality of images created by a generative model by calculating the distance between feature vectors for real and generated images.",
- id: "Frećhet Inception Distance (FID)",
+ description:
+ "The Fréchet Inception Distance (FID) evaluates the quality of images created by a generative model by calculating the distance between feature vectors for real and generated images.",
+ id: "Frećhet Inception Distance (FID)",
  },
  ],
  models: [
  {
- description: "High-quality image generation model trained on the CIFAR-10 dataset. It synthesizes images of the ten classes presented in the dataset using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
- id: "google/ddpm-cifar10-32",
+ description:
+ "High-quality image generation model trained on the CIFAR-10 dataset. It synthesizes images of the ten classes presented in the dataset using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
+ id: "google/ddpm-cifar10-32",
  },
  {
- description: "High-quality image generation model trained on the 256x256 CelebA-HQ dataset. It synthesizes images of faces using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
- id: "google/ddpm-celebahq-256",
+ description:
+ "High-quality image generation model trained on the 256x256 CelebA-HQ dataset. It synthesizes images of faces using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.",
+ id: "google/ddpm-celebahq-256",
  },
  ],
- spaces: [
+ spaces: [
  {
  description: "An application that can generate realistic faces.",
- id: "CompVis/celeba-latent-diffusion",
+ id: "CompVis/celeba-latent-diffusion",
  },
  ],
- summary: "Unconditional image generation is the task of generating images with no condition in any context (like a prompt text or another image). Once trained, the model will create images that resemble its training data distribution.",
+ summary:
+ "Unconditional image generation is the task of generating images with no condition in any context (like a prompt text or another image). Once trained, the model will create images that resemble its training data distribution.",
  widgetModels: [""],
  // TODO: Add related video
- youtubeId: "",
+ youtubeId: "",
  };

  export default taskData;
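Both model entries in this hunk are DDPM checkpoints. For readers unfamiliar with the task, here is a minimal sketch of unconditional sampling with one of them; it uses the separate diffusers Python library, which this package does not itself depend on, and the output filename is arbitrary:

```python
from diffusers import DDPMPipeline

# Unconditional generation: no prompt or input image, just sampling from the model
pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
image = pipe().images[0]  # a 32x32 PIL image resembling the CIFAR-10 training distribution
image.save("ddpm_sample.png")
```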
package/src/video-classification/about.md CHANGED
@@ -1,10 +1,13 @@
  ## Use Cases
+
  Video classification models can be used to categorize what a video is all about.

  ### Activity Recognition
- Video classification models are used to perform activity recognition which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals especially when they're commuting.
+
+ Video classification models are used to perform activity recognition which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals especially when they're commuting.

  ### Video Search
+
  Models trained in video classification can improve user experience by organizing and categorizing video galleries on the phone or in the cloud, on multiple keywords or tags.

  ## Inference
@@ -50,4 +53,5 @@ print(model.config.id2label[predicted_label])
  - [Video classification task guide](https://huggingface.co/docs/transformers/tasks/video_classification)

  ### Creating your own video classifier in minutes
+
  - [Fine-tuning tutorial notebook (PyTorch)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)
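The second hunk header above ends with `print(model.config.id2label[predicted_label])`, the tail of this page's inference example, which is not otherwise visible in the diff. Below is a minimal sketch in the same style, assuming the VideoMAE checkpoint listed in the data.ts diff that follows and using random arrays as a stand-in for real decoded video frames:

```python
import numpy as np
import torch
from transformers import AutoImageProcessor, VideoMAEForVideoClassification

ckpt = "MCG-NJU/videomae-base-finetuned-kinetics"
processor = AutoImageProcessor.from_pretrained(ckpt)
model = VideoMAEForVideoClassification.from_pretrained(ckpt)

# 16 RGB frames of 224x224; replace with frames decoded from a real video
video = list(np.random.rand(16, 3, 224, 224).astype(np.float32))

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```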
package/src/video-classification/data.ts CHANGED
@@ -5,14 +5,14 @@ const taskData: TaskDataCustom = {
  {
  // TODO write proper description
  description: "Benchmark dataset used for video classification with videos that belong to 400 classes.",
- id: "kinetics400",
+ id: "kinetics400",
  },
  ],
  demo: {
  inputs: [
  {
  filename: "video-classification-input.gif",
- type: "img",
+ type: "img",
  },
  ],
  outputs: [
@@ -38,47 +38,47 @@ const taskData: TaskDataCustom = {
  metrics: [
  {
  description: "",
- id: "accuracy",
+ id: "accuracy",
  },
  {
  description: "",
- id: "recall",
+ id: "recall",
  },
  {
  description: "",
- id: "precision",
+ id: "precision",
  },
  {
  description: "",
- id: "f1",
+ id: "f1",
  },
-
  ],
  models: [
  {
  // TO DO: write description
  description: "Strong Video Classification model trained on the Kinects 400 dataset.",
- id: "MCG-NJU/videomae-base-finetuned-kinetics",
+ id: "MCG-NJU/videomae-base-finetuned-kinetics",
  },
  {
  // TO DO: write description
  description: "Strong Video Classification model trained on the Kinects 400 dataset.",
- id: "microsoft/xclip-base-patch32",
+ id: "microsoft/xclip-base-patch32",
  },
  ],
- spaces: [
+ spaces: [
  {
  description: "An application that classifies video at different timestamps.",
- id: "nateraw/lavila",
+ id: "nateraw/lavila",
  },
  {
  description: "An application that classifies video.",
- id: "fcakyon/video-classification",
+ id: "fcakyon/video-classification",
  },
  ],
- summary: "Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.",
+ summary:
+ "Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.",
  widgetModels: [],
- youtubeId: "",
+ youtubeId: "",
  };

  export default taskData;
package/src/visual-question-answering/about.md CHANGED
@@ -1,20 +1,25 @@
  ## Use Cases

- ### Aid the Visually Impaired Persons
+ ### Aid the Visually Impaired Persons
+
  VQA models can be used to reduce visual barriers for visually impaired individuals by allowing them to get information about images from the web and the real world.

  ### Education
+
  VQA models can be used to improve experiences at museums by allowing observers to directly ask questions they interested in.

  ### Improved Image Retrieval
+
  Visual question answering models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images.

  ### Video Search
+
  Specific snippets/timestamps of a video can be retrieved based on search queries. For example, the user can ask "At which part of the video does the guitar appear?" and get a specific timestamp range from the whole video.

- ## Task Variants
+ ## Task Variants

  ### Video Question Answering
+
  Video Question Answering aims to answer questions asked about the content of a video.

  ## Inference
@@ -40,4 +45,4 @@ vqa_pipeline(image, question, top_k=1)
  - [Multi Modal Framework (MMF) - Meta Research](https://mmf.sh/docs/getting_started/video_overview/)

  The contents of this page are contributed by [
- Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo).
+ Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo).
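The second hunk header above ends with `vqa_pipeline(image, question, top_k=1)`, the tail of this page's inference example. A minimal sketch of that call, assuming the widget model from the data.ts diff below; the image path and question are the demo values from this diff and serve as placeholders:

```python
from transformers import pipeline

vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

image = "elephant.jpeg"            # placeholder: any local image path or URL
question = "What is in this image?"

# Returns a list of {"score": ..., "answer": ...} dicts, highest score first
print(vqa_pipeline(image, question, top_k=1))
```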
package/src/visual-question-answering/data.ts CHANGED
@@ -4,23 +4,23 @@ const taskData: TaskDataCustom = {
  datasets: [
  {
  description: "A widely used dataset containing questions (with answers) about images.",
- id: "Graphcore/vqa",
+ id: "Graphcore/vqa",
  },
  {
  description: "A dataset to benchmark visual reasoning based on text in images.",
  id: "textvqa",
- }
+ },
  ],
  demo: {
  inputs: [
  {
  filename: "elephant.jpeg",
- type: "img",
+ type: "img",
  },
  {
- label: "Question",
+ label: "Question",
  content: "What is in this image?",
- type: "text",
+ type: "text",
  },
  ],
  outputs: [
@@ -44,47 +44,50 @@ const taskData: TaskDataCustom = {
  ],
  },
  isPlaceholder: false,
- metrics: [
+ metrics: [
  {
  description: "",
- id: "accuracy",
+ id: "accuracy",
  },
  {
- description: "Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.",
- id: "wu-palmer similarity",
+ description:
+ "Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.",
+ id: "wu-palmer similarity",
  },
  ],
  models: [
  {
  description: "A visual question answering model trained to convert charts and plots to text.",
- id: "google/deplot",
+ id: "google/deplot",
  },
  {
- description: "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
- id: "google/matcha-base ",
+ description:
+ "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
+ id: "google/matcha-base ",
  },
  {
  description: "A strong visual question answering that answers questions from book covers.",
- id: "google/pix2struct-ocrvqa-large",
+ id: "google/pix2struct-ocrvqa-large",
  },
  ],
- spaces: [
+ spaces: [
  {
  description: "An application that can answer questions based on images.",
- id: "nielsr/vilt-vqa",
+ id: "nielsr/vilt-vqa",
  },
  {
  description: "An application that can caption images and answer questions about a given image. ",
- id: "Salesforce/BLIP",
+ id: "Salesforce/BLIP",
  },
  {
  description: "An application that can caption images and answer questions about a given image. ",
- id: "vumichien/Img2Prompt",
+ id: "vumichien/Img2Prompt",
  },
  ],
- summary: "Visual Question Answering is the task of answering open-ended questions based on an image. They output natural language responses to natural language questions.",
+ summary:
+ "Visual Question Answering is the task of answering open-ended questions based on an image. They output natural language responses to natural language questions.",
  widgetModels: ["dandelin/vilt-b32-finetuned-vqa"],
- youtubeId: "",
+ youtubeId: "",
  };

  export default taskData;
package/src/zero-shot-classification/about.md CHANGED
@@ -6,20 +6,21 @@ In zero shot classification, we provide the model with a prompt and a sequence o

  Zero, single and few-shot classification seem to be an emergent feature of large language models. This feature seems to come about around model sizes of +100M parameters. The effectiveness of a model at a zero, single or few-shot task seems to scale with model size, meaning that larger models (models with more trainable parameters or layers) generally do better at this task.

-
  Here is an example of a zero-shot prompt for classifying the sentiment of a sequence of text:
+
  ```
  Classify the following input text into one of the following three categories: [positive, negative, neutral]

- Input Text: Hugging Face is awesome for making all of these
+ Input Text: Hugging Face is awesome for making all of these
  state of the art models available!
  Sentiment: positive

  ```

- One great example of this task with a nice off-the-shelf model is available at the widget of this page, where the user can input a sequence of text and candidate labels to the model. This is a *word level* example of zero shot classification, more elaborate and lengthy generations are available with larger models. Testing these models out and getting a feel for prompt engineering is the best way to learn how to use them.
+ One great example of this task with a nice off-the-shelf model is available at the widget of this page, where the user can input a sequence of text and candidate labels to the model. This is a _word level_ example of zero shot classification, more elaborate and lengthy generations are available with larger models. Testing these models out and getting a feel for prompt engineering is the best way to learn how to use them.

  ## Inference
+
  You can use the 🤗 Transformers library zero-shot-classification pipeline to infer with zero shot text classification models.

  ```python
@@ -34,6 +35,6 @@ pipe("I have a problem with my iphone that needs to be resolved asap!",
  ```

  ## Useful Resources
+
  - [Zero Shot Learning](https://joeddav.github.io/blog/2020/05/29/ZSL.html)
  - [Hugging Face on Transfer Learning](https://huggingface.co/course/en/chapter1/4?fw=pt#transfer-learning)
-
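The last hunk header above shows only the opening line of the page's Python example (`pipe("I have a problem with my iphone that needs to be resolved asap!", ...)`). Here is a minimal sketch of the same pipeline usage; the candidate labels are illustrative, and the model id is taken from the data.ts diff below:

```python
from transformers import pipeline

pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels are supplied at inference time; no fine-tuning on them is needed
result = pipe(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)
# result is a dict with "sequence", "labels" (sorted by score) and "scores"
print(result["labels"][0], result["scores"][0])
```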
package/src/zero-shot-classification/data.ts CHANGED
@@ -1,34 +1,34 @@
  import type { TaskDataCustom } from "../Types";

  const taskData: TaskDataCustom = {
-
  datasets: [
  {
  description: "A widely used dataset used to benchmark multiple variants of text classification.",
- id: "glue",
+ id: "glue",
  },
  {
- description: "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
- id: "MultiNLI",
+ description:
+ "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.",
+ id: "MultiNLI",
  },
  {
- description: "FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
- id: "FEVER",
+ description:
+ "FEVER is a publicly available dataset for fact extraction and verification against textual sources.",
+ id: "FEVER",
  },
  ],
  demo: {
  inputs: [
  {
- label: "Text Input",
+ label: "Text Input",
  content: "Dune is the best movie ever.",
- type: "text",
+ type: "text",
  },
  {
- label: "Candidate Labels",
+ label: "Candidate Labels",
  content: "CINEMA, ART, MUSIC",
- type: "text",
+ type: "text",
  },
-
  ],
  outputs: [
  {
@@ -36,30 +36,30 @@ const taskData: TaskDataCustom = {
  data: [
  {
  label: "CINEMA",
- score: 0.90,
+ score: 0.9,
  },
  {
  label: "ART",
- score: 0.10,
+ score: 0.1,
  },
  {
  label: "MUSIC",
- score: 0.00,
+ score: 0.0,
  },
  ],
  },
  ],
  },
- metrics: [],
- models: [
+ metrics: [],
+ models: [
  {
- description:
- "Powerful zero-shot text classification model",
+ description: "Powerful zero-shot text classification model",
  id: "facebook/bart-large-mnli",
  },
  ],
- spaces: [],
- summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
+ spaces: [],
+ summary:
+ "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
  widgetModels: ["facebook/bart-large-mnli"],
  };
package/src/zero-shot-image-classification/about.md CHANGED
@@ -1,58 +1,67 @@
  ## About the Task

- Zero-shot image classification is a computer vision task to classify images into one of several classes, without any prior training or knowledge of the classes.
+ Zero-shot image classification is a computer vision task to classify images into one of several classes, without any prior training or knowledge of the classes.

  Zero shot image classification works by transferring knowledge learnt during training of one model, to classify novel classes that was not present in the training data. So this is a variation of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg). For instance, a model trained to differentiate cars from airplanes can be used to classify images of ships.

  The data in this learning paradigm consists of

  - Seen data - images and their corresponding labels
- - Unseen data - only labels and no images
+ - Unseen data - only labels and no images
  - Auxiliary information - additional information given to the model during training connecting the unseen and seen data. This can be in the form of textual description or word embeddings.

-
  ## Use Cases

  ### Image Retrieval
+
  Zero-shot learning resolves several challenges in image retrieval systems. For example, with the rapid growth of categories on the web, it is challenging to index images based on unseen categories. With zero-shot learning we can associate unseen categories to images by exploiting attributes to model the relationships among visual features and labels.

  ### Action Recognition
- Action recognition is the task of identifying when a person in an image/video is performing a given action from a set of actions. If all the possible actions are not known beforehand, conventional deep learning models fail. With zero-shot learning, for a given domain of a set of actions, we can create a mapping connecting low-level features and a semantic description of auxiliary data to classify unknown classes of actions.

+ Action recognition is the task of identifying when a person in an image/video is performing a given action from a set of actions. If all the possible actions are not known beforehand, conventional deep learning models fail. With zero-shot learning, for a given domain of a set of actions, we can create a mapping connecting low-level features and a semantic description of auxiliary data to classify unknown classes of actions.

- ## Task Variants
+ ## Task Variants

  You can contribute variants of this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md).

  ## Inference

  The model can be loaded with the zero-shot-image-classification pipeline like so:
+
  ```python
  from transformers import pipeline
  # More models in the model hub.
  model_name = "openai/clip-vit-large-patch14-336"
  classifier = pipeline("zero-shot-image-classification", model = model_name)
  ```
+
  You can then use this pipeline to classify images into any of the class names you specify. You can specify more than two class labels too.
+
  ```python
  image_to_classify = "path_to_cat_and_dog_image.jpeg"
- labels_for_classification = ["cat and dog",
- "lion and cheetah",
+ labels_for_classification = ["cat and dog",
+ "lion and cheetah",
  "rabbit and lion"]
- scores = classifier(image_to_classify,
+ scores = classifier(image_to_classify,
  candidate_labels = labels_for_classification)
  ```
+
  The classifier would return a list of dictionaries after the inference which is stored in the variable `scores` in the code snippet above. Variable `scores` would look as follows:
+
  ```python
  [{'score': 0.9950482249259949, 'label': 'cat and dog'},
  {'score': 0.004863627254962921, 'label': 'rabbit and lion'},
  {'score': 8.816882473183796e-05, 'label': 'lion and cheetah'}]
  ```
+
  The dictionary at the zeroth index of the list will contain the label with the highest score.
+
  ```python
  print(f"The highest score is {scores[0]['score']:.3f} for the label {scores[0]['label']}")
  ```
+
  The output from the print statement above would look as follows:
+
  ```
  The highest probability is 0.995 for the label cat and dog
  ```
@@ -65,4 +74,3 @@ Check out [Zero-shot image classification task guide](https://huggingface.co/doc

  This page was made possible thanks to the efforts of [Shamima Hossain](https://huggingface.co/Shamima), [Haider Zaidi
  ](https://huggingface.co/chefhaider) and [Paarth Bhatnagar](https://huggingface.co/Paarth).
-
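For convenience, the snippets spread across this about.md hunk combine into the following end-to-end sketch (the image path is a placeholder, as in the original):

```python
from transformers import pipeline

# More models in the model hub.
model_name = "openai/clip-vit-large-patch14-336"
classifier = pipeline("zero-shot-image-classification", model=model_name)

image_to_classify = "path_to_cat_and_dog_image.jpeg"
labels_for_classification = ["cat and dog", "lion and cheetah", "rabbit and lion"]

# Each candidate label is scored against the image; results come back sorted by score
scores = classifier(image_to_classify, candidate_labels=labels_for_classification)
print(f"The highest score is {scores[0]['score']:.3f} for the label {scores[0]['label']}")
```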
package/src/zero-shot-image-classification/data.ts CHANGED
@@ -5,19 +5,19 @@ const taskData: TaskDataCustom = {
  {
  // TODO write proper description
  description: "",
- id: "",
+ id: "",
  },
  ],
  demo: {
  inputs: [
  {
  filename: "image-classification-input.jpeg",
- type: "img",
+ type: "img",
  },
  {
- label: "Classes",
+ label: "Classes",
  content: "cat, dog, bird",
- type: "text",
+ type: "text",
  },
  ],
  outputs: [
@@ -42,15 +42,13 @@ const taskData: TaskDataCustom = {
  },
  metrics: [
  {
- description:
- "Computes the number of times the correct label appears in top K labels predicted",
+ description: "Computes the number of times the correct label appears in top K labels predicted",
  id: "top-K accuracy",
  },
  ],
  models: [
  {
- description:
- "Robust image classification model trained on publicly available image-caption data.",
+ description: "Robust image classification model trained on publicly available image-caption data.",
  id: "openai/clip-vit-base-patch16",
  },
  {
@@ -59,21 +57,21 @@ const taskData: TaskDataCustom = {
  id: "openai/clip-vit-large-patch14-336",
  },
  {
- description:
- "Strong image classification model for biomedical domain.",
+ description: "Strong image classification model for biomedical domain.",
  id: "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224",
  },
  ],
- spaces: [
+ spaces: [
  {
- description: "An application that leverages zero shot image classification to find best captions to generate an image. ",
- id: "pharma/CLIP-Interrogator",
+ description:
+ "An application that leverages zero shot image classification to find best captions to generate an image. ",
+ id: "pharma/CLIP-Interrogator",
  },
  ],
  summary:
  "Zero shot image classification is the task of classifying previously unseen classes during training of a model.",
  widgetModels: ["openai/clip-vit-large-patch14-336"],
- youtubeId: "",
+ youtubeId: "",
  };

  export default taskData;
package/tsconfig.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "compilerOptions": {
+ "allowSyntheticDefaultImports": true,
+ "lib": ["ES2022", "DOM"],
+ "module": "CommonJS",
+ "moduleResolution": "node",
+ "target": "ES2022",
+ "forceConsistentCasingInFileNames": true,
+ "strict": true,
+ "noImplicitAny": true,
+ "strictNullChecks": true,
+ "skipLibCheck": true,
+ "noImplicitOverride": true,
+ "outDir": "./dist"
+ },
+ "include": ["src"],
+ "exclude": ["dist"]
+ }