@huggingface/tasks 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -2237,7 +2237,7 @@ var taskData11 = {
   },
   {
     description: "Strong image-text-to-text model.",
-    id: "llava-hf/llava-v1.6-mistral-7b-hf"
+    id: "microsoft/Phi-3.5-vision-instruct"
   }
   ],
   spaces: [
package/dist/index.js CHANGED
@@ -2199,7 +2199,7 @@ var taskData11 = {
   },
   {
     description: "Strong image-text-to-text model.",
-    id: "llava-hf/llava-v1.6-mistral-7b-hf"
+    id: "microsoft/Phi-3.5-vision-instruct"
   }
   ],
   spaces: [
@@ -0,0 +1,4 @@
+ import type { TaskDataCustom } from "..";
+ declare const taskData: TaskDataCustom;
+ export default taskData;
+ //# sourceMappingURL=data.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AAEzC,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@huggingface/tasks",
   "packageManager": "pnpm@8.10.5",
-  "version": "0.12.2",
+  "version": "0.12.4",
   "description": "List of ML tasks for huggingface.co/tasks",
   "repository": "https://github.com/huggingface/huggingface.js.git",
   "publishConfig": {
@@ -60,7 +60,7 @@ const taskData: TaskDataCustom = {
   },
   {
     description: "Strong image-text-to-text model.",
-    id: "llava-hf/llava-v1.6-mistral-7b-hf",
+    id: "microsoft/Phi-3.5-vision-instruct",
   },
   ],
   spaces: [
@@ -0,0 +1,98 @@
+ Most video language models can take in a single video, multiple videos, a single image, or multiple images. Some of these models also accept interleaved inputs, where images and videos are embedded in the text, so you can refer to each input image or input video directly in the text prompt.
+
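As an illustration of the interleaved format described above (this sketch is not part of the file added in this diff), a conversation can mix video and image placeholders with text in a single user turn. It assumes a checkpoint whose processor supports interleaved image and video inputs, such as the LLaVA-OneVision model listed in this task's data:

```python
# Illustrative sketch only, not from the package. Assumes a processor with a
# chat template that understands interleaved "image"/"video" content entries,
# e.g. llava-hf/llava-onevision-qwen2-72b-ov-hf.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-72b-ov-hf")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video"},  # placeholder for the first visual input (a video)
            {"type": "text", "text": "Compare the clip above with the image below."},
            {"type": "image"},  # placeholder for the second visual input (an image)
            {"type": "text", "text": "Do they show the same animal?"},
        ],
    },
]

# The chat template expands the placeholders into the model's video/image tokens.
# The actual frames and image are passed separately, e.g.
# processor(text=prompt, videos=..., images=..., return_tensors="pt").
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
print(prompt)
```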
+ ## Different Types of Video Language Models
+
+ Video language models come in three types:
+
+ - **Base:** Pre-trained models that can be fine-tuned for downstream tasks.
+ - **Instruction:** Base models fine-tuned on video-instruction pairs and their answers.
+ - **Chatty/Conversational:** Base models fine-tuned on multi-turn video conversation datasets.
+
+ ## Use Cases
+
+ ### Video Question Answering
+
+ Video language models trained on video-question-answer pairs can be used to answer questions about videos and to generate video captions.
+
+ ### Video Chat
+
+ Video language models can be used to hold a multi-turn dialogue about a video, as in the sketch below.
+
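As a sketch of such a dialogue (not part of the file added in this diff), a second user turn can be appended to the conversation and re-templated. It assumes the `model`, `processor`, and sampled `video` frames from the Inference snippet further below, and the assistant reply shown is purely illustrative:

```python
# Illustrative sketch only, not from the package. Reuses `model`, `processor`
# and `video` (sampled frames) from the Inference example below; the assistant
# message is a made-up earlier reply, included only to show the turn structure.
conversation = [
    {"role": "user", "content": [{"type": "text", "text": "Why is this video funny?"}, {"type": "video"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "The cat makes an exaggerated, surprised face."}]},
    {"role": "user", "content": [{"type": "text", "text": "What could the cat be reacting to?"}]},
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=prompt, videos=video, padding=True, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
print(processor.decode(output[0], skip_special_tokens=True))
```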
+ ### Video Recognition with Instructions
+
+ Video language models can recognize entities through descriptions: given detailed descriptions of specific entities, they can classify which of those entities appear in a video.
+
+ ## Inference
+
+ You can use the Transformers library to interact with video language models.
+ Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a small utility to sample frames from a video, format the text prompt with the chat template, process the video together with the text prompt, and run inference. To run the snippet below, install [OpenCV](https://pypi.org/project/opencv-python/) with `pip install opencv-python`.
+
+ ```python
+ import uuid
+ import requests
+ import cv2
+ import torch
+ from PIL import Image
+ from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
+
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+     model_id,
+     torch_dtype=torch.float16,
+     low_cpu_mem_usage=True,
+ ).to(device)
+
+ processor = LlavaNextVideoProcessor.from_pretrained(model_id)
+
+ def sample_frames(url, num_frames):
+     # Download the video to a temporary local file.
+     response = requests.get(url)
+     path_id = str(uuid.uuid4())
+     path = f"./{path_id}.mp4"
+     with open(path, "wb") as f:
+         f.write(response.content)
+
+     # Sample at most `num_frames` frames, evenly spaced across the video.
+     video = cv2.VideoCapture(path)
+     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+     interval = max(total_frames // num_frames, 1)
+     frames = []
+     for i in range(total_frames):
+         ret, frame = video.read()
+         if not ret:
+             continue
+         if i % interval == 0 and len(frames) < num_frames:
+             pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+             frames.append(pil_img)
+     video.release()
+     return frames
+
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "Why is this video funny?"},
+             {"type": "video"},
+         ],
+     },
+ ]
+
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+ video_url = "https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"
+ video = sample_frames(video_url, 8)
+
+ inputs = processor(text=prompt, videos=video, padding=True, return_tensors="pt").to(model.device)
+
+ output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+ print(processor.decode(output[0][2:], skip_special_tokens=True))
+
+ # Why is this video funny? ASSISTANT: The humor in this video comes from the cat's facial expression and body language. The cat appears to be making a funny face, with its eyes squinted and mouth open, which can be interpreted as a playful or mischievous expression. Cats often make such faces when they are in a good mood or are playful, and this can be amusing to people who are familiar with their behavior. The combination of the cat's expression and the close-
+ ```
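One small follow-up to the snippet above (not part of the file added in this diff): `output[0][2:]` still contains the prompt tokens, which is why the question is echoed in the printed answer. A common alternative is to slice at the prompt length so that only the newly generated tokens are decoded:

```python
# Decode only the tokens generated after the prompt.
prompt_len = inputs["input_ids"].shape[-1]
print(processor.decode(output[0][prompt_len:], skip_special_tokens=True))
```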
+
+ ## Useful Resources
+
+ - [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text)
@@ -0,0 +1,58 @@
+ import type { TaskDataCustom } from "..";
+
+ const taskData: TaskDataCustom = {
+   datasets: [
+     {
+       description: "Multiple-choice questions and answers about videos.",
+       id: "lmms-lab/Video-MME",
+     },
+     {
+       description: "A dataset of instructions and question-answer pairs about videos.",
+       id: "lmms-lab/VideoChatGPT",
+     },
+   ],
+   demo: {
+     inputs: [
+       {
+         filename: "video-text-to-text-input.gif",
+         type: "img",
+       },
+       {
+         label: "Text Prompt",
+         content: "What is happening in this video?",
+         type: "text",
+       },
+     ],
+     outputs: [
+       {
+         label: "Answer",
+         content:
+           "The video shows a series of images showing a fountain with water jets and a variety of colorful flowers and butterflies in the background.",
+         type: "text",
+       },
+     ],
+   },
+   metrics: [],
+   models: [
+     {
+       description: "A robust video-text-to-text model that can take in image and video inputs.",
+       id: "llava-hf/llava-onevision-qwen2-72b-ov-hf",
+     },
+     {
+       description: "Large and powerful video-text-to-text model that can take in image and video inputs.",
+       id: "llava-hf/LLaVA-NeXT-Video-34B-hf",
+     },
+   ],
+   spaces: [
+     {
+       description: "An application to chat with a video-text-to-text model.",
+       id: "llava-hf/video-llava",
+     },
+   ],
+   summary:
+     "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
+   widgetModels: [""],
+   youtubeId: "",
+ };
+
+ export default taskData;