@huggingface/tasks 0.13.1-test → 0.13.1-test2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/src/dataset-libraries.ts +89 -0
- package/src/default-widget-inputs.ts +718 -0
- package/src/gguf.ts +40 -0
- package/src/hardware.ts +482 -0
- package/src/index.ts +59 -0
- package/src/library-to-tasks.ts +76 -0
- package/src/local-apps.ts +412 -0
- package/src/model-data.ts +149 -0
- package/src/model-libraries-downloads.ts +18 -0
- package/src/model-libraries-snippets.ts +1128 -0
- package/src/model-libraries.ts +820 -0
- package/src/pipelines.ts +698 -0
- package/src/snippets/common.ts +39 -0
- package/src/snippets/curl.spec.ts +94 -0
- package/src/snippets/curl.ts +120 -0
- package/src/snippets/index.ts +7 -0
- package/src/snippets/inputs.ts +167 -0
- package/src/snippets/js.spec.ts +148 -0
- package/src/snippets/js.ts +305 -0
- package/src/snippets/python.spec.ts +144 -0
- package/src/snippets/python.ts +321 -0
- package/src/snippets/types.ts +16 -0
- package/src/tasks/audio-classification/about.md +86 -0
- package/src/tasks/audio-classification/data.ts +81 -0
- package/src/tasks/audio-classification/inference.ts +52 -0
- package/src/tasks/audio-classification/spec/input.json +35 -0
- package/src/tasks/audio-classification/spec/output.json +11 -0
- package/src/tasks/audio-to-audio/about.md +56 -0
- package/src/tasks/audio-to-audio/data.ts +70 -0
- package/src/tasks/automatic-speech-recognition/about.md +90 -0
- package/src/tasks/automatic-speech-recognition/data.ts +82 -0
- package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/chat-completion/inference.ts +322 -0
- package/src/tasks/chat-completion/spec/input.json +350 -0
- package/src/tasks/chat-completion/spec/output.json +206 -0
- package/src/tasks/chat-completion/spec/stream_output.json +213 -0
- package/src/tasks/common-definitions.json +100 -0
- package/src/tasks/depth-estimation/about.md +45 -0
- package/src/tasks/depth-estimation/data.ts +70 -0
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/about.md +53 -0
- package/src/tasks/document-question-answering/data.ts +85 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/about.md +72 -0
- package/src/tasks/feature-extraction/data.ts +57 -0
- package/src/tasks/feature-extraction/inference.ts +40 -0
- package/src/tasks/feature-extraction/spec/input.json +47 -0
- package/src/tasks/feature-extraction/spec/output.json +15 -0
- package/src/tasks/fill-mask/about.md +51 -0
- package/src/tasks/fill-mask/data.ts +79 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/about.md +50 -0
- package/src/tasks/image-classification/data.ts +88 -0
- package/src/tasks/image-classification/inference.ts +52 -0
- package/src/tasks/image-classification/spec/input.json +35 -0
- package/src/tasks/image-classification/spec/output.json +11 -0
- package/src/tasks/image-feature-extraction/about.md +23 -0
- package/src/tasks/image-feature-extraction/data.ts +59 -0
- package/src/tasks/image-segmentation/about.md +63 -0
- package/src/tasks/image-segmentation/data.ts +99 -0
- package/src/tasks/image-segmentation/inference.ts +69 -0
- package/src/tasks/image-segmentation/spec/input.json +45 -0
- package/src/tasks/image-segmentation/spec/output.json +26 -0
- package/src/tasks/image-text-to-text/about.md +76 -0
- package/src/tasks/image-text-to-text/data.ts +102 -0
- package/src/tasks/image-to-3d/about.md +62 -0
- package/src/tasks/image-to-3d/data.ts +75 -0
- package/src/tasks/image-to-image/about.md +129 -0
- package/src/tasks/image-to-image/data.ts +101 -0
- package/src/tasks/image-to-image/inference.ts +68 -0
- package/src/tasks/image-to-image/spec/input.json +55 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/about.md +61 -0
- package/src/tasks/image-to-text/data.ts +82 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +312 -0
- package/src/tasks/keypoint-detection/about.md +57 -0
- package/src/tasks/keypoint-detection/data.ts +50 -0
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +55 -0
- package/src/tasks/object-detection/about.md +37 -0
- package/src/tasks/object-detection/data.ts +86 -0
- package/src/tasks/object-detection/inference.ts +75 -0
- package/src/tasks/object-detection/spec/input.json +31 -0
- package/src/tasks/object-detection/spec/output.json +50 -0
- package/src/tasks/placeholder/about.md +15 -0
- package/src/tasks/placeholder/data.ts +21 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/about.md +56 -0
- package/src/tasks/question-answering/data.ts +75 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/reinforcement-learning/about.md +167 -0
- package/src/tasks/reinforcement-learning/data.ts +75 -0
- package/src/tasks/sentence-similarity/about.md +97 -0
- package/src/tasks/sentence-similarity/data.ts +101 -0
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/about.md +58 -0
- package/src/tasks/summarization/data.ts +76 -0
- package/src/tasks/summarization/inference.ts +57 -0
- package/src/tasks/summarization/spec/input.json +42 -0
- package/src/tasks/summarization/spec/output.json +14 -0
- package/src/tasks/table-question-answering/about.md +43 -0
- package/src/tasks/table-question-answering/data.ts +59 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +65 -0
- package/src/tasks/tabular-classification/data.ts +68 -0
- package/src/tasks/tabular-regression/about.md +87 -0
- package/src/tasks/tabular-regression/data.ts +57 -0
- package/src/tasks/text-classification/about.md +173 -0
- package/src/tasks/text-classification/data.ts +103 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +11 -0
- package/src/tasks/text-generation/about.md +154 -0
- package/src/tasks/text-generation/data.ts +114 -0
- package/src/tasks/text-generation/inference.ts +200 -0
- package/src/tasks/text-generation/spec/input.json +219 -0
- package/src/tasks/text-generation/spec/output.json +179 -0
- package/src/tasks/text-generation/spec/stream_output.json +103 -0
- package/src/tasks/text-to-3d/about.md +62 -0
- package/src/tasks/text-to-3d/data.ts +56 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +96 -0
- package/src/tasks/text-to-image/data.ts +100 -0
- package/src/tasks/text-to-image/inference.ts +75 -0
- package/src/tasks/text-to-image/spec/input.json +63 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +63 -0
- package/src/tasks/text-to-speech/data.ts +79 -0
- package/src/tasks/text-to-speech/inference.ts +145 -0
- package/src/tasks/text-to-speech/spec/input.json +31 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text-to-video/about.md +41 -0
- package/src/tasks/text-to-video/data.ts +102 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/about.md +76 -0
- package/src/tasks/token-classification/data.ts +92 -0
- package/src/tasks/token-classification/inference.ts +85 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +37 -0
- package/src/tasks/translation/about.md +65 -0
- package/src/tasks/translation/data.ts +70 -0
- package/src/tasks/translation/inference.ts +67 -0
- package/src/tasks/translation/spec/input.json +50 -0
- package/src/tasks/translation/spec/output.json +14 -0
- package/src/tasks/unconditional-image-generation/about.md +50 -0
- package/src/tasks/unconditional-image-generation/data.ts +72 -0
- package/src/tasks/video-classification/about.md +37 -0
- package/src/tasks/video-classification/data.ts +84 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/video-text-to-text/about.md +98 -0
- package/src/tasks/video-text-to-text/data.ts +66 -0
- package/src/tasks/visual-question-answering/about.md +48 -0
- package/src/tasks/visual-question-answering/data.ts +97 -0
- package/src/tasks/visual-question-answering/inference.ts +62 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/about.md +40 -0
- package/src/tasks/zero-shot-classification/data.ts +70 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +11 -0
- package/src/tasks/zero-shot-image-classification/about.md +75 -0
- package/src/tasks/zero-shot-image-classification/data.ts +84 -0
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +45 -0
- package/src/tasks/zero-shot-object-detection/data.ts +67 -0
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/src/tokenizer-data.ts +32 -0
- package/src/widget-example.ts +125 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
Video classification models can be used to categorize what a video is all about.
|
|
4
|
+
|
|
5
|
+
### Activity Recognition
|
|
6
|
+
|
|
7
|
+
Video classification models are used to perform activity recognition which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals especially when they're commuting.
|
|
8
|
+
|
|
9
|
+
### Video Search
|
|
10
|
+
|
|
11
|
+
Models trained in video classification can improve user experience by organizing and categorizing video galleries on the phone or in the cloud, on multiple keywords or tags.
|
|
12
|
+
|
|
13
|
+
## Inference
|
|
14
|
+
|
|
15
|
+
Below you can find code for inferring with a pre-trained video classification model.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from transformers import pipeline
|
|
19
|
+
|
|
20
|
+
pipe = pipeline(task = "video-classification", model="nateraw/videomae-base-finetuned-ucf101-subset")
|
|
21
|
+
pipe("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/basketball.avi?download=true")
|
|
22
|
+
|
|
23
|
+
#[{'score': 0.90, 'label': 'BasketballDunk'},
|
|
24
|
+
# {'score': 0.02, 'label': 'BalanceBeam'},
|
|
25
|
+
# ... ]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Useful Resources
|
|
29
|
+
|
|
30
|
+
- [Developing a simple video classification model](https://keras.io/examples/vision/video_classification)
|
|
31
|
+
- [Video classification with Transformers](https://keras.io/examples/vision/video_transformers)
|
|
32
|
+
- [Building a video archive](https://www.youtube.com/watch?v=_IeS1m8r6SY)
|
|
33
|
+
- [Video classification task guide](https://huggingface.co/docs/transformers/tasks/video_classification)
|
|
34
|
+
|
|
35
|
+
### Creating your own video classifier in minutes
|
|
36
|
+
|
|
37
|
+
- [Fine-tuning tutorial notebook (PyTorch)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
// TODO write proper description
|
|
7
|
+
description: "Benchmark dataset used for video classification with videos that belong to 400 classes.",
|
|
8
|
+
id: "kinetics400",
|
|
9
|
+
},
|
|
10
|
+
],
|
|
11
|
+
demo: {
|
|
12
|
+
inputs: [
|
|
13
|
+
{
|
|
14
|
+
filename: "video-classification-input.gif",
|
|
15
|
+
type: "img",
|
|
16
|
+
},
|
|
17
|
+
],
|
|
18
|
+
outputs: [
|
|
19
|
+
{
|
|
20
|
+
type: "chart",
|
|
21
|
+
data: [
|
|
22
|
+
{
|
|
23
|
+
label: "Playing Guitar",
|
|
24
|
+
score: 0.514,
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
label: "Playing Tennis",
|
|
28
|
+
score: 0.193,
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
label: "Cooking",
|
|
32
|
+
score: 0.068,
|
|
33
|
+
},
|
|
34
|
+
],
|
|
35
|
+
},
|
|
36
|
+
],
|
|
37
|
+
},
|
|
38
|
+
metrics: [
|
|
39
|
+
{
|
|
40
|
+
description: "",
|
|
41
|
+
id: "accuracy",
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
description: "",
|
|
45
|
+
id: "recall",
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
description: "",
|
|
49
|
+
id: "precision",
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
description: "",
|
|
53
|
+
id: "f1",
|
|
54
|
+
},
|
|
55
|
+
],
|
|
56
|
+
models: [
|
|
57
|
+
{
|
|
58
|
+
// TO DO: write description
|
|
59
|
+
description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
|
|
60
|
+
id: "google/vivit-b-16x2-kinetics400",
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
// TO DO: write description
|
|
64
|
+
description: "Strong Video Classification model trained on the Kinetics 400 dataset.",
|
|
65
|
+
id: "microsoft/xclip-base-patch32",
|
|
66
|
+
},
|
|
67
|
+
],
|
|
68
|
+
spaces: [
|
|
69
|
+
{
|
|
70
|
+
description: "An application that classifies video at different timestamps.",
|
|
71
|
+
id: "nateraw/lavila",
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
description: "An application that classifies video.",
|
|
75
|
+
id: "fcakyon/video-classification",
|
|
76
|
+
},
|
|
77
|
+
],
|
|
78
|
+
summary:
|
|
79
|
+
"Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.",
|
|
80
|
+
widgetModels: [],
|
|
81
|
+
youtubeId: "",
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
export default taskData;
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Video Classification inference
|
|
8
|
+
*/
|
|
9
|
+
export interface VideoClassificationInput {
|
|
10
|
+
/**
|
|
11
|
+
* The input video data
|
|
12
|
+
*/
|
|
13
|
+
inputs: unknown;
|
|
14
|
+
/**
|
|
15
|
+
* Additional inference parameters
|
|
16
|
+
*/
|
|
17
|
+
parameters?: VideoClassificationParameters;
|
|
18
|
+
[property: string]: unknown;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Additional inference parameters
|
|
22
|
+
*
|
|
23
|
+
* Additional inference parameters for Video Classification
|
|
24
|
+
*/
|
|
25
|
+
export interface VideoClassificationParameters {
|
|
26
|
+
/**
|
|
27
|
+
* The sampling rate used to select frames from the video.
|
|
28
|
+
*/
|
|
29
|
+
frame_sampling_rate?: number;
|
|
30
|
+
function_to_apply?: ClassificationOutputTransform;
|
|
31
|
+
/**
|
|
32
|
+
* The number of sampled frames to consider for classification.
|
|
33
|
+
*/
|
|
34
|
+
num_frames?: number;
|
|
35
|
+
/**
|
|
36
|
+
* When specified, limits the output to the top K most probable classes.
|
|
37
|
+
*/
|
|
38
|
+
top_k?: number;
|
|
39
|
+
[property: string]: unknown;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* The function to apply to the model outputs in order to retrieve the scores.
|
|
43
|
+
*/
|
|
44
|
+
export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
|
|
45
|
+
export type VideoClassificationOutput = VideoClassificationOutputElement[];
|
|
46
|
+
/**
|
|
47
|
+
* Outputs of inference for the Video Classification task
|
|
48
|
+
*/
|
|
49
|
+
export interface VideoClassificationOutputElement {
|
|
50
|
+
/**
|
|
51
|
+
* The predicted class label.
|
|
52
|
+
*/
|
|
53
|
+
label: string;
|
|
54
|
+
/**
|
|
55
|
+
* The corresponding probability.
|
|
56
|
+
*/
|
|
57
|
+
score: number;
|
|
58
|
+
[property: string]: unknown;
|
|
59
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/video-classification/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Video Classification inference",
|
|
5
|
+
"title": "VideoClassificationInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input video data"
|
|
10
|
+
},
|
|
11
|
+
"parameters": {
|
|
12
|
+
"description": "Additional inference parameters",
|
|
13
|
+
"$ref": "#/$defs/VideoClassificationParameters"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"$defs": {
|
|
17
|
+
"VideoClassificationParameters": {
|
|
18
|
+
"title": "VideoClassificationParameters",
|
|
19
|
+
"description": "Additional inference parameters for Video Classification",
|
|
20
|
+
"type": "object",
|
|
21
|
+
"properties": {
|
|
22
|
+
"function_to_apply": {
|
|
23
|
+
"title": "TextClassificationOutputTransform",
|
|
24
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
|
|
25
|
+
},
|
|
26
|
+
"num_frames": {
|
|
27
|
+
"type": "integer",
|
|
28
|
+
"description": "The number of sampled frames to consider for classification."
|
|
29
|
+
},
|
|
30
|
+
"frame_sampling_rate": {
|
|
31
|
+
"type": "integer",
|
|
32
|
+
"description": "The sampling rate used to select frames from the video."
|
|
33
|
+
},
|
|
34
|
+
"top_k": {
|
|
35
|
+
"type": "integer",
|
|
36
|
+
"description": "When specified, limits the output to the top K most probable classes."
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"required": ["inputs"]
|
|
42
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/video-classification/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Video Classification task",
|
|
5
|
+
"title": "VideoClassificationOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
|
|
9
|
+
}
|
|
10
|
+
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input images and input videos within the text prompt.
|
|
2
|
+
|
|
3
|
+
## Different Types of Video Language Models
|
|
4
|
+
|
|
5
|
+
Video language models come in three types:
|
|
6
|
+
|
|
7
|
+
- **Base:** Pre-trained models that can be fine-tuned.
|
|
8
|
+
- **Instruction:** Base models fine-tuned on video-instruction pairs and answers.
|
|
9
|
+
- **Chatty/Conversational:** Base models fine-tuned on video conversation datasets.
|
|
10
|
+
|
|
11
|
+
## Use Cases
|
|
12
|
+
|
|
13
|
+
### Video Question Answering
|
|
14
|
+
|
|
15
|
+
Video language models trained on video-question-answer pairs can be used for video question answering and generating captions for videos.
|
|
16
|
+
|
|
17
|
+
### Video Chat
|
|
18
|
+
|
|
19
|
+
Video language models can be used to have a dialogue about a video.
|
|
20
|
+
|
|
21
|
+
### Video Recognition with Instructions
|
|
22
|
+
|
|
23
|
+
Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, they can classify the entities in a video.
|
|
24
|
+
|
|
25
|
+
## Inference
|
|
26
|
+
|
|
27
|
+
You can use the Transformers library to interact with video-language models.
|
|
28
|
+
Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run the snippet below, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`.
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import uuid
|
|
32
|
+
import requests
|
|
33
|
+
import cv2
|
|
34
|
+
import torch
|
|
35
|
+
from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
|
|
36
|
+
|
|
37
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
38
|
+
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
|
|
39
|
+
|
|
40
|
+
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
|
41
|
+
model_id,
|
|
42
|
+
torch_dtype=torch.float16,
|
|
43
|
+
low_cpu_mem_usage=True,
|
|
44
|
+
).to(device)
|
|
45
|
+
|
|
46
|
+
processor = LlavaNextVideoProcessor.from_pretrained(model_id)
|
|
47
|
+
|
|
48
|
+
def sample_frames(url, num_frames):
|
|
49
|
+
response = requests.get(url)
|
|
50
|
+
path_id = str(uuid.uuid4())
|
|
51
|
+
|
|
52
|
+
path = f"./{path_id}.mp4"
|
|
53
|
+
|
|
54
|
+
with open(path, "wb") as f:
|
|
55
|
+
f.write(response.content)
|
|
56
|
+
|
|
57
|
+
video = cv2.VideoCapture(path)
|
|
58
|
+
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
59
|
+
interval = total_frames // num_frames
|
|
60
|
+
frames = []
|
|
61
|
+
for i in range(total_frames):
|
|
62
|
+
ret, frame = video.read()
|
|
63
|
+
if not ret:
|
|
64
|
+
continue
|
|
65
|
+
if i % interval == 0:
|
|
66
|
+
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
|
67
|
+
frames.append(pil_img)
|
|
68
|
+
video.release()
|
|
69
|
+
return frames
|
|
70
|
+
|
|
71
|
+
conversation = [
|
|
72
|
+
{
|
|
73
|
+
|
|
74
|
+
"role": "user",
|
|
75
|
+
"content": [
|
|
76
|
+
{"type": "text", "text": "Why is this video funny?"},
|
|
77
|
+
{"type": "video"},
|
|
78
|
+
],
|
|
79
|
+
},
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
|
|
83
|
+
|
|
84
|
+
video_url = "https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"
|
|
85
|
+
video = sample_frames(video, 8)
|
|
86
|
+
|
|
87
|
+
inputs = processor(text=prompt, videos=video, padding=True, return_tensors="pt").to(model.device)
|
|
88
|
+
|
|
89
|
+
output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
|
|
90
|
+
print(processor.decode(output[0][2:], skip_special_tokens=True))
|
|
91
|
+
|
|
92
|
+
# Why is this video funny? ASSISTANT: The humor in this video comes from the cat's facial expression and body language. The cat appears to be making a funny face, with its eyes squinted and mouth open, which can be interpreted as a playful or mischievous expression. Cats often make such faces when they are in a good mood or are playful, and this can be amusing to people who are familiar with their behavior. The combination of the cat's expression and the close-
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Useful Resources
|
|
97
|
+
|
|
98
|
+
- [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description: "Multiple-choice questions and answers about videos.",
|
|
7
|
+
id: "lmms-lab/Video-MME",
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
description: "A dataset of instructions and question-answer pairs about videos.",
|
|
11
|
+
id: "lmms-lab/VideoChatGPT",
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
description: "Large video understanding dataset.",
|
|
15
|
+
id: "HuggingFaceFV/finevideo",
|
|
16
|
+
},
|
|
17
|
+
],
|
|
18
|
+
demo: {
|
|
19
|
+
inputs: [
|
|
20
|
+
{
|
|
21
|
+
filename: "video-text-to-text-input.gif",
|
|
22
|
+
type: "img",
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
label: "Text Prompt",
|
|
26
|
+
content: "What is happening in this video?",
|
|
27
|
+
type: "text",
|
|
28
|
+
},
|
|
29
|
+
],
|
|
30
|
+
outputs: [
|
|
31
|
+
{
|
|
32
|
+
label: "Answer",
|
|
33
|
+
content:
|
|
34
|
+
"The video shows a series of images showing a fountain with water jets and a variety of colorful flowers and butterflies in the background.",
|
|
35
|
+
type: "text",
|
|
36
|
+
},
|
|
37
|
+
],
|
|
38
|
+
},
|
|
39
|
+
metrics: [],
|
|
40
|
+
models: [
|
|
41
|
+
{
|
|
42
|
+
description: "A robust video-text-to-text model that can take in image and video inputs.",
|
|
43
|
+
id: "llava-hf/llava-onevision-qwen2-72b-ov-hf",
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
description: "Large and powerful video-text-to-text model that can take in image and video inputs.",
|
|
47
|
+
id: "llava-hf/LLaVA-NeXT-Video-34B-hf",
|
|
48
|
+
},
|
|
49
|
+
],
|
|
50
|
+
spaces: [
|
|
51
|
+
{
|
|
52
|
+
description: "An application to chat with a video-text-to-text model.",
|
|
53
|
+
id: "llava-hf/video-llava",
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
description: "A leaderboard for various video-text-to-text models.",
|
|
57
|
+
id: "opencompass/openvlm_video_leaderboard",
|
|
58
|
+
},
|
|
59
|
+
],
|
|
60
|
+
summary:
|
|
61
|
+
"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
|
|
62
|
+
widgetModels: [""],
|
|
63
|
+
youtubeId: "",
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
export default taskData;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
### Aid the Visually Impaired Persons
|
|
4
|
+
|
|
5
|
+
VQA models can be used to reduce visual barriers for visually impaired individuals by allowing them to get information about images from the web and the real world.
|
|
6
|
+
|
|
7
|
+
### Education
|
|
8
|
+
|
|
9
|
+
VQA models can be used to improve experiences at museums by allowing observers to directly ask questions they interested in.
|
|
10
|
+
|
|
11
|
+
### Improved Image Retrieval
|
|
12
|
+
|
|
13
|
+
Visual question answering models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images.
|
|
14
|
+
|
|
15
|
+
### Video Search
|
|
16
|
+
|
|
17
|
+
Specific snippets/timestamps of a video can be retrieved based on search queries. For example, the user can ask "At which part of the video does the guitar appear?" and get a specific timestamp range from the whole video.
|
|
18
|
+
|
|
19
|
+
## Task Variants
|
|
20
|
+
|
|
21
|
+
### Video Question Answering
|
|
22
|
+
|
|
23
|
+
Video Question Answering aims to answer questions asked about the content of a video.
|
|
24
|
+
|
|
25
|
+
## Inference
|
|
26
|
+
|
|
27
|
+
You can infer with Visual Question Answering models using the `vqa` (or `visual-question-answering`) pipeline. This pipeline requires [the Python Image Library (PIL)](https://pillow.readthedocs.io/en/stable/) to process images. You can install it with (`pip install pillow`).
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from PIL import Image
|
|
31
|
+
from transformers import pipeline
|
|
32
|
+
|
|
33
|
+
vqa_pipeline = pipeline("visual-question-answering")
|
|
34
|
+
|
|
35
|
+
image = Image.open("elephant.jpeg")
|
|
36
|
+
question = "Is there an elephant?"
|
|
37
|
+
|
|
38
|
+
vqa_pipeline(image, question, top_k=1)
|
|
39
|
+
#[{'score': 0.9998154044151306, 'answer': 'yes'}]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Useful Resources
|
|
43
|
+
|
|
44
|
+
- [An introduction to Visual Question Answering - AllenAI](https://blog.allenai.org/vanilla-vqa-adcaaaa94336)
|
|
45
|
+
- [Multi Modal Framework (MMF) - Meta Research](https://mmf.sh/docs/getting_started/video_overview/)
|
|
46
|
+
|
|
47
|
+
The contents of this page are contributed by [
|
|
48
|
+
Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo).
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description: "A widely used dataset containing questions (with answers) about images.",
|
|
7
|
+
id: "Graphcore/vqa",
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
description: "A dataset to benchmark visual reasoning based on text in images.",
|
|
11
|
+
id: "facebook/textvqa",
|
|
12
|
+
},
|
|
13
|
+
],
|
|
14
|
+
demo: {
|
|
15
|
+
inputs: [
|
|
16
|
+
{
|
|
17
|
+
filename: "elephant.jpeg",
|
|
18
|
+
type: "img",
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
label: "Question",
|
|
22
|
+
content: "What is in this image?",
|
|
23
|
+
type: "text",
|
|
24
|
+
},
|
|
25
|
+
],
|
|
26
|
+
outputs: [
|
|
27
|
+
{
|
|
28
|
+
type: "chart",
|
|
29
|
+
data: [
|
|
30
|
+
{
|
|
31
|
+
label: "elephant",
|
|
32
|
+
score: 0.97,
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
label: "elephants",
|
|
36
|
+
score: 0.06,
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
label: "animal",
|
|
40
|
+
score: 0.003,
|
|
41
|
+
},
|
|
42
|
+
],
|
|
43
|
+
},
|
|
44
|
+
],
|
|
45
|
+
},
|
|
46
|
+
isPlaceholder: false,
|
|
47
|
+
metrics: [
|
|
48
|
+
{
|
|
49
|
+
description: "",
|
|
50
|
+
id: "accuracy",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
description:
|
|
54
|
+
"Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.",
|
|
55
|
+
id: "wu-palmer similarity",
|
|
56
|
+
},
|
|
57
|
+
],
|
|
58
|
+
models: [
|
|
59
|
+
{
|
|
60
|
+
description: "A visual question answering model trained to convert charts and plots to text.",
|
|
61
|
+
id: "google/deplot",
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
description:
|
|
65
|
+
"A visual question answering model trained for mathematical reasoning and chart derendering from images.",
|
|
66
|
+
id: "google/matcha-base",
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
description: "A strong visual question answering that answers questions from book covers.",
|
|
70
|
+
id: "google/pix2struct-ocrvqa-large",
|
|
71
|
+
},
|
|
72
|
+
],
|
|
73
|
+
spaces: [
|
|
74
|
+
{
|
|
75
|
+
description: "An application that compares visual question answering models across different tasks.",
|
|
76
|
+
id: "merve/pix2struct",
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
description: "An application that can answer questions based on images.",
|
|
80
|
+
id: "nielsr/vilt-vqa",
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
description: "An application that can caption images and answer questions about a given image. ",
|
|
84
|
+
id: "Salesforce/BLIP",
|
|
85
|
+
},
|
|
86
|
+
{
|
|
87
|
+
description: "An application that can caption images and answer questions about a given image. ",
|
|
88
|
+
id: "vumichien/Img2Prompt",
|
|
89
|
+
},
|
|
90
|
+
],
|
|
91
|
+
summary:
|
|
92
|
+
"Visual Question Answering is the task of answering open-ended questions based on an image. They output natural language responses to natural language questions.",
|
|
93
|
+
widgetModels: ["dandelin/vilt-b32-finetuned-vqa"],
|
|
94
|
+
youtubeId: "",
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
export default taskData;
|
|
@@ -0,0 +1,62 @@
|
|
|
/**
 * Inference code generated from the JSON schema spec in ./spec
 *
 * Using src/scripts/inference-codegen
 */
/**
 * Inputs for Visual Question Answering inference
 */
export interface VisualQuestionAnsweringInput {
	/**
	 * One (image, question) pair to answer
	 */
	inputs: VisualQuestionAnsweringInputData;
	/**
	 * Additional inference parameters
	 */
	parameters?: VisualQuestionAnsweringParameters;
	// Open index signature emitted by the codegen: extra keys are tolerated
	// and surface as `unknown` rather than being rejected.
	[property: string]: unknown;
}
|
20
|
+
/**
|
|
21
|
+
* One (image, question) pair to answer
|
|
22
|
+
*/
|
|
23
|
+
export interface VisualQuestionAnsweringInputData {
|
|
24
|
+
/**
|
|
25
|
+
* The image.
|
|
26
|
+
*/
|
|
27
|
+
image: unknown;
|
|
28
|
+
/**
|
|
29
|
+
* The question to answer based on the image.
|
|
30
|
+
*/
|
|
31
|
+
question: unknown;
|
|
32
|
+
[property: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Additional inference parameters
|
|
36
|
+
*
|
|
37
|
+
* Additional inference parameters for Visual Question Answering
|
|
38
|
+
*/
|
|
39
|
+
export interface VisualQuestionAnsweringParameters {
|
|
40
|
+
/**
|
|
41
|
+
* The number of answers to return (will be chosen by order of likelihood). Note that we
|
|
42
|
+
* return less than topk answers if there are not enough options available within the
|
|
43
|
+
* context.
|
|
44
|
+
*/
|
|
45
|
+
top_k?: number;
|
|
46
|
+
[property: string]: unknown;
|
|
47
|
+
}
|
|
48
|
+
export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
|
|
49
|
+
/**
|
|
50
|
+
* Outputs of inference for the Visual Question Answering task
|
|
51
|
+
*/
|
|
52
|
+
export interface VisualQuestionAnsweringOutputElement {
|
|
53
|
+
/**
|
|
54
|
+
* The answer to the question
|
|
55
|
+
*/
|
|
56
|
+
answer?: string;
|
|
57
|
+
/**
|
|
58
|
+
* The associated score / probability
|
|
59
|
+
*/
|
|
60
|
+
score: number;
|
|
61
|
+
[property: string]: unknown;
|
|
62
|
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/visual-question-answering/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Visual Question Answering inference",
|
|
5
|
+
"title": "VisualQuestionAnsweringInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "One (image, question) pair to answer",
|
|
10
|
+
"type": "object",
|
|
11
|
+
"title": "VisualQuestionAnsweringInputData",
|
|
12
|
+
"properties": {
|
|
13
|
+
"image": {
|
|
14
|
+
"description": "The image."
|
|
15
|
+
},
|
|
16
|
+
"question": {
|
|
17
|
+
"description": "The question to answer based on the image."
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"required": ["question", "image"]
|
|
21
|
+
},
|
|
22
|
+
"parameters": {
|
|
23
|
+
"description": "Additional inference parameters",
|
|
24
|
+
"$ref": "#/$defs/VisualQuestionAnsweringParameters"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"$defs": {
|
|
28
|
+
"VisualQuestionAnsweringParameters": {
|
|
29
|
+
"title": "VisualQuestionAnsweringParameters",
|
|
30
|
+
"description": "Additional inference parameters for Visual Question Answering",
|
|
31
|
+
"type": "object",
|
|
32
|
+
"properties": {
|
|
33
|
+
"top_k": {
|
|
34
|
+
"type": "integer",
|
|
35
|
+
"description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
"required": ["inputs"]
|
|
41
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/visual-question-answering/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Visual Question Answering task",
|
|
5
|
+
"title": "VisualQuestionAnsweringOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"type": "object",
|
|
9
|
+
"properties": {
|
|
10
|
+
"answer": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"description": "The answer to the question"
|
|
13
|
+
},
|
|
14
|
+
"score": {
|
|
15
|
+
"type": "number",
|
|
16
|
+
"description": "The associated score / probability"
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
"required": ["score"]
|
|
20
|
+
}
|
|
21
|
+
}
|