@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,37 @@
1
+ ## Use Cases
2
+
3
+ Video classification models can be used to categorize what a video is all about.
4
+
5
+ ### Activity Recognition
6
+
7
+ Video classification models are used to perform activity recognition which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals especially when they're commuting.
8
+
9
+ ### Video Search
10
+
11
+ Models trained in video classification can improve user experience by organizing and categorizing video galleries on the phone or in the cloud, on multiple keywords or tags.
12
+
13
+ ## Inference
14
+
15
+ Below you can find code for inferring with a pre-trained video classification model.
16
+
17
+ ```python
18
+ from transformers import pipeline
19
+
20
+ pipe = pipeline(task = "video-classification", model="nateraw/videomae-base-finetuned-ucf101-subset")
21
+ pipe("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/basketball.avi?download=true")
22
+
23
+ #[{'score': 0.90, 'label': 'BasketballDunk'},
24
+ # {'score': 0.02, 'label': 'BalanceBeam'},
25
+ # ... ]
26
+ ```
27
+
28
+ ## Useful Resources
29
+
30
+ - [Developing a simple video classification model](https://keras.io/examples/vision/video_classification)
31
+ - [Video classification with Transformers](https://keras.io/examples/vision/video_transformers)
32
+ - [Building a video archive](https://www.youtube.com/watch?v=_IeS1m8r6SY)
33
+ - [Video classification task guide](https://huggingface.co/docs/transformers/tasks/video_classification)
34
+
35
+ ### Creating your own video classifier in minutes
36
+
37
+ - [Fine-tuning tutorial notebook (PyTorch)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)
@@ -0,0 +1,84 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ // TODO write proper description
7
+ description: "Benchmark dataset used for video classification with videos that belong to 400 classes.",
8
+ id: "kinetics400",
9
+ },
10
+ ],
11
+ demo: {
12
+ inputs: [
13
+ {
14
+ filename: "video-classification-input.gif",
15
+ type: "img",
16
+ },
17
+ ],
18
+ outputs: [
19
+ {
20
+ type: "chart",
21
+ data: [
22
+ {
23
+ label: "Playing Guitar",
24
+ score: 0.514,
25
+ },
26
+ {
27
+ label: "Playing Tennis",
28
+ score: 0.193,
29
+ },
30
+ {
31
+ label: "Cooking",
32
+ score: 0.068,
33
+ },
34
+ ],
35
+ },
36
+ ],
37
+ },
38
+ metrics: [
39
+ {
40
+ description: "",
41
+ id: "accuracy",
42
+ },
43
+ {
44
+ description: "",
45
+ id: "recall",
46
+ },
47
+ {
48
+ description: "",
49
+ id: "precision",
50
+ },
51
+ {
52
+ description: "",
53
+ id: "f1",
54
+ },
55
+ ],
56
+ models: [
57
+ {
58
+ // TO DO: write description
59
+ description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
60
+ id: "google/vivit-b-16x2-kinetics400",
61
+ },
62
+ {
63
+ // TO DO: write description
64
+ description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
65
+ id: "microsoft/xclip-base-patch32",
66
+ },
67
+ ],
68
+ spaces: [
69
+ {
70
+ description: "An application that classifies video at different timestamps.",
71
+ id: "nateraw/lavila",
72
+ },
73
+ {
74
+ description: "An application that classifies video.",
75
+ id: "fcakyon/video-classification",
76
+ },
77
+ ],
78
+ summary:
79
+ "Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.",
80
+ widgetModels: [],
81
+ youtubeId: "",
82
+ };
83
+
84
+ export default taskData;
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Video Classification inference
8
+ */
9
+ export interface VideoClassificationInput {
10
+ /**
11
+ * The input video data
12
+ */
13
+ inputs: unknown;
14
+ /**
15
+ * Additional inference parameters
16
+ */
17
+ parameters?: VideoClassificationParameters;
18
+ [property: string]: unknown;
19
+ }
20
+ /**
21
+ * Additional inference parameters
22
+ *
23
+ * Additional inference parameters for Video Classification
24
+ */
25
+ export interface VideoClassificationParameters {
26
+ /**
27
+ * The sampling rate used to select frames from the video.
28
+ */
29
+ frame_sampling_rate?: number;
30
+ function_to_apply?: ClassificationOutputTransform;
31
+ /**
32
+ * The number of sampled frames to consider for classification.
33
+ */
34
+ num_frames?: number;
35
+ /**
36
+ * When specified, limits the output to the top K most probable classes.
37
+ */
38
+ top_k?: number;
39
+ [property: string]: unknown;
40
+ }
41
+ /**
42
+ * The function to apply to the model outputs in order to retrieve the scores.
43
+ */
44
+ export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
45
+ export type VideoClassificationOutput = VideoClassificationOutputElement[];
46
+ /**
47
+ * Outputs of inference for the Video Classification task
48
+ */
49
+ export interface VideoClassificationOutputElement {
50
+ /**
51
+ * The predicted class label.
52
+ */
53
+ label: string;
54
+ /**
55
+ * The corresponding probability.
56
+ */
57
+ score: number;
58
+ [property: string]: unknown;
59
+ }
@@ -0,0 +1,42 @@
1
+ {
2
+ "$id": "/inference/schemas/video-classification/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Video Classification inference",
5
+ "title": "VideoClassificationInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "The input video data"
10
+ },
11
+ "parameters": {
12
+ "description": "Additional inference parameters",
13
+ "$ref": "#/$defs/VideoClassificationParameters"
14
+ }
15
+ },
16
+ "$defs": {
17
+ "VideoClassificationParameters": {
18
+ "title": "VideoClassificationParameters",
19
+ "description": "Additional inference parameters for Video Classification",
20
+ "type": "object",
21
+ "properties": {
22
+ "function_to_apply": {
23
+ "title": "VideoClassificationOutputTransform",
24
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
25
+ },
26
+ "num_frames": {
27
+ "type": "integer",
28
+ "description": "The number of sampled frames to consider for classification."
29
+ },
30
+ "frame_sampling_rate": {
31
+ "type": "integer",
32
+ "description": "The sampling rate used to select frames from the video."
33
+ },
34
+ "top_k": {
35
+ "type": "integer",
36
+ "description": "When specified, limits the output to the top K most probable classes."
37
+ }
38
+ }
39
+ }
40
+ },
41
+ "required": ["inputs"]
42
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "$id": "/inference/schemas/video-classification/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Video Classification task",
5
+ "title": "VideoClassificationOutput",
6
+ "type": "array",
7
+ "items": {
8
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
9
+ }
10
+ }
@@ -0,0 +1,98 @@
1
+ Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input images and input videos within the text prompt.
2
+
3
+ ## Different Types of Video Language Models
4
+
5
+ Video language models come in three types:
6
+
7
+ - **Base:** Pre-trained models that can be fine-tuned.
8
+ - **Instruction:** Base models fine-tuned on video-instruction pairs and answers.
9
+ - **Chatty/Conversational:** Base models fine-tuned on video conversation datasets.
10
+
11
+ ## Use Cases
12
+
13
+ ### Video Question Answering
14
+
15
+ Video language models trained on video-question-answer pairs can be used for video question answering and generating captions for videos.
16
+
17
+ ### Video Chat
18
+
19
+ Video language models can be used to have a dialogue about a video.
20
+
21
+ ### Video Recognition with Instructions
22
+
23
+ Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, they can classify the entities in a video.
24
+
25
+ ## Inference
26
+
27
+ You can use the Transformers library to interact with video-language models.
28
+ Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run the snippet below, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`.
29
+
30
+ ```python
31
+ import uuid
32
+ import requests
33
+ import cv2
34
+ import torch
35
+ from PIL import Image
36
+ from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
37
+ device = "cuda" if torch.cuda.is_available() else "cpu"
38
+ model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
39
+
40
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
41
+ model_id,
42
+ torch_dtype=torch.float16,
43
+ low_cpu_mem_usage=True,
44
+ ).to(device)
45
+
46
+ processor = LlavaNextVideoProcessor.from_pretrained(model_id)
47
+
48
+ def sample_frames(url, num_frames):
49
+ response = requests.get(url)
50
+ path_id = str(uuid.uuid4())
51
+
52
+ path = f"./{path_id}.mp4"
53
+
54
+ with open(path, "wb") as f:
55
+ f.write(response.content)
56
+
57
+ video = cv2.VideoCapture(path)
58
+ total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
59
+ interval = total_frames // num_frames
60
+ frames = []
61
+ for i in range(total_frames):
62
+ ret, frame = video.read()
63
+ if not ret:
64
+ continue
65
+ if i % interval == 0:
66
+ pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
67
+ frames.append(pil_img)
68
+ video.release()
69
+ return frames
70
+
71
+ conversation = [
72
+ {
73
+
74
+ "role": "user",
75
+ "content": [
76
+ {"type": "text", "text": "Why is this video funny?"},
77
+ {"type": "video"},
78
+ ],
79
+ },
80
+ ]
81
+
82
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
83
+
84
+ video_url = "https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"
85
+ video = sample_frames(video, 8)
86
+
87
+ inputs = processor(text=prompt, videos=video, padding=True, return_tensors="pt").to(model.device)
88
+
89
+ output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
90
+ print(processor.decode(output[0][2:], skip_special_tokens=True))
91
+
92
+ # Why is this video funny? ASSISTANT: The humor in this video comes from the cat's facial expression and body language. The cat appears to be making a funny face, with its eyes squinted and mouth open, which can be interpreted as a playful or mischievous expression. Cats often make such faces when they are in a good mood or are playful, and this can be amusing to people who are familiar with their behavior. The combination of the cat's expression and the close-
93
+
94
+ ```
95
+
96
+ ## Useful Resources
97
+
98
+ - [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text)
@@ -0,0 +1,66 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Multiple-choice questions and answers about videos.",
7
+ id: "lmms-lab/Video-MME",
8
+ },
9
+ {
10
+ description: "A dataset of instructions and question-answer pairs about videos.",
11
+ id: "lmms-lab/VideoChatGPT",
12
+ },
13
+ {
14
+ description: "Large video understanding dataset.",
15
+ id: "HuggingFaceFV/finevideo",
16
+ },
17
+ ],
18
+ demo: {
19
+ inputs: [
20
+ {
21
+ filename: "video-text-to-text-input.gif",
22
+ type: "img",
23
+ },
24
+ {
25
+ label: "Text Prompt",
26
+ content: "What is happening in this video?",
27
+ type: "text",
28
+ },
29
+ ],
30
+ outputs: [
31
+ {
32
+ label: "Answer",
33
+ content:
34
+ "The video shows a series of images showing a fountain with water jets and a variety of colorful flowers and butterflies in the background.",
35
+ type: "text",
36
+ },
37
+ ],
38
+ },
39
+ metrics: [],
40
+ models: [
41
+ {
42
+ description: "A robust video-text-to-text model that can take in image and video inputs.",
43
+ id: "llava-hf/llava-onevision-qwen2-72b-ov-hf",
44
+ },
45
+ {
46
+ description: "Large and powerful video-text-to-text model that can take in image and video inputs.",
47
+ id: "llava-hf/LLaVA-NeXT-Video-34B-hf",
48
+ },
49
+ ],
50
+ spaces: [
51
+ {
52
+ description: "An application to chat with a video-text-to-text model.",
53
+ id: "llava-hf/video-llava",
54
+ },
55
+ {
56
+ description: "A leaderboard for various video-text-to-text models.",
57
+ id: "opencompass/openvlm_video_leaderboard",
58
+ },
59
+ ],
60
+ summary:
61
+ "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
62
+ widgetModels: [""],
63
+ youtubeId: "",
64
+ };
65
+
66
+ export default taskData;
@@ -0,0 +1,48 @@
1
+ ## Use Cases
2
+
3
+ ### Aid the Visually Impaired Persons
4
+
5
+ VQA models can be used to reduce visual barriers for visually impaired individuals by allowing them to get information about images from the web and the real world.
6
+
7
+ ### Education
8
+
9
+ VQA models can be used to improve experiences at museums by allowing observers to directly ask questions they are interested in.
10
+
11
+ ### Improved Image Retrieval
12
+
13
+ Visual question answering models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images.
14
+
15
+ ### Video Search
16
+
17
+ Specific snippets/timestamps of a video can be retrieved based on search queries. For example, the user can ask "At which part of the video does the guitar appear?" and get a specific timestamp range from the whole video.
18
+
19
+ ## Task Variants
20
+
21
+ ### Video Question Answering
22
+
23
+ Video Question Answering aims to answer questions asked about the content of a video.
24
+
25
+ ## Inference
26
+
27
+ You can infer with Visual Question Answering models using the `vqa` (or `visual-question-answering`) pipeline. This pipeline requires [the Python Image Library (PIL)](https://pillow.readthedocs.io/en/stable/) to process images. You can install it with (`pip install pillow`).
28
+
29
+ ```python
30
+ from PIL import Image
31
+ from transformers import pipeline
32
+
33
+ vqa_pipeline = pipeline("visual-question-answering")
34
+
35
+ image = Image.open("elephant.jpeg")
36
+ question = "Is there an elephant?"
37
+
38
+ vqa_pipeline(image, question, top_k=1)
39
+ #[{'score': 0.9998154044151306, 'answer': 'yes'}]
40
+ ```
41
+
42
+ ## Useful Resources
43
+
44
+ - [An introduction to Visual Question Answering - AllenAI](https://blog.allenai.org/vanilla-vqa-adcaaaa94336)
45
+ - [Multi Modal Framework (MMF) - Meta Research](https://mmf.sh/docs/getting_started/video_overview/)
46
+
47
+ The contents of this page are contributed by
48
+ [Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo).
@@ -0,0 +1,97 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "A widely used dataset containing questions (with answers) about images.",
7
+ id: "Graphcore/vqa",
8
+ },
9
+ {
10
+ description: "A dataset to benchmark visual reasoning based on text in images.",
11
+ id: "facebook/textvqa",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "elephant.jpeg",
18
+ type: "img",
19
+ },
20
+ {
21
+ label: "Question",
22
+ content: "What is in this image?",
23
+ type: "text",
24
+ },
25
+ ],
26
+ outputs: [
27
+ {
28
+ type: "chart",
29
+ data: [
30
+ {
31
+ label: "elephant",
32
+ score: 0.97,
33
+ },
34
+ {
35
+ label: "elephants",
36
+ score: 0.06,
37
+ },
38
+ {
39
+ label: "animal",
40
+ score: 0.003,
41
+ },
42
+ ],
43
+ },
44
+ ],
45
+ },
46
+ isPlaceholder: false,
47
+ metrics: [
48
+ {
49
+ description: "",
50
+ id: "accuracy",
51
+ },
52
+ {
53
+ description:
54
+ "Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.",
55
+ id: "wu-palmer similarity",
56
+ },
57
+ ],
58
+ models: [
59
+ {
60
+ description: "A visual question answering model trained to convert charts and plots to text.",
61
+ id: "google/deplot",
62
+ },
63
+ {
64
+ description:
65
+ "A visual question answering model trained for mathematical reasoning and chart derendering from images.",
66
+ id: "google/matcha-base",
67
+ },
68
+ {
69
+ description: "A strong visual question answering that answers questions from book covers.",
70
+ id: "google/pix2struct-ocrvqa-large",
71
+ },
72
+ ],
73
+ spaces: [
74
+ {
75
+ description: "An application that compares visual question answering models across different tasks.",
76
+ id: "merve/pix2struct",
77
+ },
78
+ {
79
+ description: "An application that can answer questions based on images.",
80
+ id: "nielsr/vilt-vqa",
81
+ },
82
+ {
83
+ description: "An application that can caption images and answer questions about a given image. ",
84
+ id: "Salesforce/BLIP",
85
+ },
86
+ {
87
+ description: "An application that can caption images and answer questions about a given image. ",
88
+ id: "vumichien/Img2Prompt",
89
+ },
90
+ ],
91
+ summary:
92
+ "Visual Question Answering is the task of answering open-ended questions based on an image. They output natural language responses to natural language questions.",
93
+ widgetModels: ["dandelin/vilt-b32-finetuned-vqa"],
94
+ youtubeId: "",
95
+ };
96
+
97
+ export default taskData;
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Visual Question Answering inference
8
+ */
9
+ export interface VisualQuestionAnsweringInput {
10
+ /**
11
+ * One (image, question) pair to answer
12
+ */
13
+ inputs: VisualQuestionAnsweringInputData;
14
+ /**
15
+ * Additional inference parameters
16
+ */
17
+ parameters?: VisualQuestionAnsweringParameters;
18
+ [property: string]: unknown;
19
+ }
20
+ /**
21
+ * One (image, question) pair to answer
22
+ */
23
+ export interface VisualQuestionAnsweringInputData {
24
+ /**
25
+ * The image.
26
+ */
27
+ image: unknown;
28
+ /**
29
+ * The question to answer based on the image.
30
+ */
31
+ question: unknown;
32
+ [property: string]: unknown;
33
+ }
34
+ /**
35
+ * Additional inference parameters
36
+ *
37
+ * Additional inference parameters for Visual Question Answering
38
+ */
39
+ export interface VisualQuestionAnsweringParameters {
40
+ /**
41
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
42
+ * return less than topk answers if there are not enough options available within the
43
+ * context.
44
+ */
45
+ top_k?: number;
46
+ [property: string]: unknown;
47
+ }
48
+ export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
49
+ /**
50
+ * Outputs of inference for the Visual Question Answering task
51
+ */
52
+ export interface VisualQuestionAnsweringOutputElement {
53
+ /**
54
+ * The answer to the question
55
+ */
56
+ answer?: string;
57
+ /**
58
+ * The associated score / probability
59
+ */
60
+ score: number;
61
+ [property: string]: unknown;
62
+ }
@@ -0,0 +1,41 @@
1
+ {
2
+ "$id": "/inference/schemas/visual-question-answering/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Visual Question Answering inference",
5
+ "title": "VisualQuestionAnsweringInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "One (image, question) pair to answer",
10
+ "type": "object",
11
+ "title": "VisualQuestionAnsweringInputData",
12
+ "properties": {
13
+ "image": {
14
+ "description": "The image."
15
+ },
16
+ "question": {
17
+ "description": "The question to answer based on the image."
18
+ }
19
+ },
20
+ "required": ["question", "image"]
21
+ },
22
+ "parameters": {
23
+ "description": "Additional inference parameters",
24
+ "$ref": "#/$defs/VisualQuestionAnsweringParameters"
25
+ }
26
+ },
27
+ "$defs": {
28
+ "VisualQuestionAnsweringParameters": {
29
+ "title": "VisualQuestionAnsweringParameters",
30
+ "description": "Additional inference parameters for Visual Question Answering",
31
+ "type": "object",
32
+ "properties": {
33
+ "top_k": {
34
+ "type": "integer",
35
+ "description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
36
+ }
37
+ }
38
+ }
39
+ },
40
+ "required": ["inputs"]
41
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "$id": "/inference/schemas/visual-question-answering/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Visual Question Answering task",
5
+ "title": "VisualQuestionAnsweringOutput",
6
+ "type": "array",
7
+ "items": {
8
+ "type": "object",
9
+ "properties": {
10
+ "answer": {
11
+ "type": "string",
12
+ "description": "The answer to the question"
13
+ },
14
+ "score": {
15
+ "type": "number",
16
+ "description": "The associated score / probability"
17
+ }
18
+ },
19
+ "required": ["score"]
20
+ }
21
+ }