@huggingface/tasks 0.19.65 → 0.19.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/dist/commonjs/dataset-libraries.d.ts +6 -0
  2. package/dist/commonjs/dataset-libraries.d.ts.map +1 -1
  3. package/dist/commonjs/dataset-libraries.js +6 -0
  4. package/dist/commonjs/hardware.d.ts +4 -0
  5. package/dist/commonjs/hardware.d.ts.map +1 -1
  6. package/dist/commonjs/hardware.js +4 -0
  7. package/dist/commonjs/local-apps.d.ts +0 -7
  8. package/dist/commonjs/local-apps.d.ts.map +1 -1
  9. package/dist/commonjs/local-apps.js +0 -7
  10. package/dist/commonjs/model-libraries-snippets.d.ts +1 -0
  11. package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
  12. package/dist/commonjs/model-libraries-snippets.js +16 -3
  13. package/dist/commonjs/model-libraries.d.ts +15 -1
  14. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  15. package/dist/commonjs/model-libraries.js +14 -0
  16. package/dist/commonjs/pipelines.d.ts +9 -1
  17. package/dist/commonjs/pipelines.d.ts.map +1 -1
  18. package/dist/commonjs/pipelines.js +8 -0
  19. package/dist/commonjs/snippets/inputs.d.ts.map +1 -1
  20. package/dist/commonjs/snippets/inputs.js +10 -0
  21. package/dist/commonjs/tasks/image-text-to-image/data.d.ts +4 -0
  22. package/dist/commonjs/tasks/image-text-to-image/data.d.ts.map +1 -0
  23. package/dist/commonjs/tasks/image-text-to-image/data.js +50 -0
  24. package/dist/commonjs/tasks/image-text-to-image/inference.d.ts +76 -0
  25. package/dist/commonjs/tasks/image-text-to-image/inference.d.ts.map +1 -0
  26. package/dist/commonjs/tasks/image-text-to-image/inference.js +2 -0
  27. package/dist/commonjs/tasks/image-text-to-video/data.d.ts +4 -0
  28. package/dist/commonjs/tasks/image-text-to-video/data.d.ts.map +1 -0
  29. package/dist/commonjs/tasks/image-text-to-video/data.js +50 -0
  30. package/dist/commonjs/tasks/image-text-to-video/inference.d.ts +78 -0
  31. package/dist/commonjs/tasks/image-text-to-video/inference.d.ts.map +1 -0
  32. package/dist/commonjs/tasks/image-text-to-video/inference.js +2 -0
  33. package/dist/commonjs/tasks/index.d.ts +2 -0
  34. package/dist/commonjs/tasks/index.d.ts.map +1 -1
  35. package/dist/commonjs/tasks/index.js +72 -66
  36. package/dist/esm/dataset-libraries.d.ts +6 -0
  37. package/dist/esm/dataset-libraries.d.ts.map +1 -1
  38. package/dist/esm/dataset-libraries.js +6 -0
  39. package/dist/esm/hardware.d.ts +4 -0
  40. package/dist/esm/hardware.d.ts.map +1 -1
  41. package/dist/esm/hardware.js +4 -0
  42. package/dist/esm/local-apps.d.ts +0 -7
  43. package/dist/esm/local-apps.d.ts.map +1 -1
  44. package/dist/esm/local-apps.js +0 -7
  45. package/dist/esm/model-libraries-snippets.d.ts +1 -0
  46. package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
  47. package/dist/esm/model-libraries-snippets.js +12 -0
  48. package/dist/esm/model-libraries.d.ts +15 -1
  49. package/dist/esm/model-libraries.d.ts.map +1 -1
  50. package/dist/esm/model-libraries.js +14 -0
  51. package/dist/esm/pipelines.d.ts +9 -1
  52. package/dist/esm/pipelines.d.ts.map +1 -1
  53. package/dist/esm/pipelines.js +8 -0
  54. package/dist/esm/snippets/inputs.d.ts.map +1 -1
  55. package/dist/esm/snippets/inputs.js +10 -0
  56. package/dist/esm/tasks/image-text-to-image/data.d.ts +4 -0
  57. package/dist/esm/tasks/image-text-to-image/data.d.ts.map +1 -0
  58. package/dist/esm/tasks/image-text-to-image/data.js +48 -0
  59. package/dist/esm/tasks/image-text-to-image/inference.d.ts +76 -0
  60. package/dist/esm/tasks/image-text-to-image/inference.d.ts.map +1 -0
  61. package/dist/esm/tasks/image-text-to-image/inference.js +1 -0
  62. package/dist/esm/tasks/image-text-to-video/data.d.ts +4 -0
  63. package/dist/esm/tasks/image-text-to-video/data.d.ts.map +1 -0
  64. package/dist/esm/tasks/image-text-to-video/data.js +48 -0
  65. package/dist/esm/tasks/image-text-to-video/inference.d.ts +78 -0
  66. package/dist/esm/tasks/image-text-to-video/inference.d.ts.map +1 -0
  67. package/dist/esm/tasks/image-text-to-video/inference.js +1 -0
  68. package/dist/esm/tasks/index.d.ts +2 -0
  69. package/dist/esm/tasks/index.d.ts.map +1 -1
  70. package/dist/esm/tasks/index.js +6 -0
  71. package/package.json +1 -1
  72. package/src/dataset-libraries.ts +6 -0
  73. package/src/hardware.ts +4 -0
  74. package/src/local-apps.ts +0 -7
  75. package/src/model-libraries-snippets.ts +13 -0
  76. package/src/model-libraries.ts +14 -0
  77. package/src/pipelines.ts +8 -0
  78. package/src/snippets/inputs.ts +12 -0
  79. package/src/tasks/image-text-to-image/about.md +73 -0
  80. package/src/tasks/image-text-to-image/data.ts +54 -0
  81. package/src/tasks/image-text-to-image/inference.ts +75 -0
  82. package/src/tasks/image-text-to-image/spec/input.json +59 -0
  83. package/src/tasks/image-text-to-image/spec/output.json +13 -0
  84. package/src/tasks/image-text-to-video/about.md +71 -0
  85. package/src/tasks/image-text-to-video/data.ts +54 -0
  86. package/src/tasks/image-text-to-video/inference.ts +77 -0
  87. package/src/tasks/image-text-to-video/spec/input.json +63 -0
  88. package/src/tasks/image-text-to-video/spec/output.json +13 -0
  89. package/src/tasks/index.ts +16 -0
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Image Text To Image inference. Either inputs (image) or prompt (in parameters)
8
+ * must be provided, or both.
9
+ */
10
+ export interface ImageTextToImageInput {
11
+ /**
12
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the image data as a raw bytes payload. Either this or prompt must be
14
+ * provided.
15
+ */
16
+ inputs?: Blob;
17
+ /**
18
+ * Additional inference parameters for Image Text To Image
19
+ */
20
+ parameters?: ImageTextToImageParameters;
21
+ [property: string]: unknown;
22
+ }
23
+ /**
24
+ * Additional inference parameters for Image Text To Image
25
+ */
26
+ export interface ImageTextToImageParameters {
27
+ /**
28
+ * For diffusion models. A higher guidance scale value encourages the model to generate
29
+ * images closely linked to the text prompt at the expense of lower image quality.
30
+ */
31
+ guidance_scale?: number;
32
+ /**
33
+ * One prompt to guide what NOT to include in image generation.
34
+ */
35
+ negative_prompt?: string;
36
+ /**
37
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
38
+ * a higher quality image at the expense of slower inference.
39
+ */
40
+ num_inference_steps?: number;
41
+ /**
42
+ * The text prompt to guide the image generation. Either this or inputs (image) must be
43
+ * provided.
44
+ */
45
+ prompt?: string;
46
+ /**
47
+ * Seed for the random number generator.
48
+ */
49
+ seed?: number;
50
+ /**
51
+ * The size in pixels of the output image. This parameter is only supported by some
52
+ * providers and for specific models. It will be ignored when unsupported.
53
+ */
54
+ target_size?: TargetSize;
55
+ [property: string]: unknown;
56
+ }
57
+ /**
58
+ * The size in pixels of the output image. This parameter is only supported by some
59
+ * providers and for specific models. It will be ignored when unsupported.
60
+ */
61
+ export interface TargetSize {
62
+ height: number;
63
+ width: number;
64
+ [property: string]: unknown;
65
+ }
66
+ /**
67
+ * Outputs of inference for the Image Text To Image task
68
+ */
69
+ export interface ImageTextToImageOutput {
70
+ /**
71
+ * The generated image returned as raw bytes in the payload.
72
+ */
73
+ image: unknown;
74
+ [property: string]: unknown;
75
+ }
@@ -0,0 +1,59 @@
1
+ {
2
+ "$id": "/inference/schemas/image-text-to-image/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Image Text To Image inference. Either inputs (image) or prompt (in parameters) must be provided, or both.",
5
+ "title": "ImageTextToImageInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "type": "string",
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. Either this or prompt must be provided.",
11
+ "comment": "type=binary"
12
+ },
13
+ "parameters": {
14
+ "description": "Additional inference parameters for Image Text To Image",
15
+ "$ref": "#/$defs/ImageTextToImageParameters"
16
+ }
17
+ },
18
+ "$defs": {
19
+ "ImageTextToImageParameters": {
20
+ "title": "ImageTextToImageParameters",
21
+ "type": "object",
22
+ "properties": {
23
+ "prompt": {
24
+ "type": "string",
25
+ "description": "The text prompt to guide the image generation. Either this or inputs (image) must be provided."
26
+ },
27
+ "guidance_scale": {
28
+ "type": "number",
29
+ "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
30
+ },
31
+ "negative_prompt": {
32
+ "type": "string",
33
+ "description": "One prompt to guide what NOT to include in image generation."
34
+ },
35
+ "num_inference_steps": {
36
+ "type": "integer",
37
+ "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
38
+ },
39
+ "target_size": {
40
+ "type": "object",
41
+ "description": "The size in pixels of the output image. This parameter is only supported by some providers and for specific models. It will be ignored when unsupported.",
42
+ "properties": {
43
+ "width": {
44
+ "type": "integer"
45
+ },
46
+ "height": {
47
+ "type": "integer"
48
+ }
49
+ },
50
+ "required": ["width", "height"]
51
+ },
52
+ "seed": {
53
+ "type": "integer",
54
+ "description": "Seed for the random number generator."
55
+ }
56
+ }
57
+ }
58
+ }
59
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "$id": "/inference/schemas/image-text-to-image/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image Text To Image task",
5
+ "title": "ImageTextToImageOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "image": {
9
+ "description": "The generated image returned as raw bytes in the payload."
10
+ }
11
+ },
12
+ "required": ["image"]
13
+ }
@@ -0,0 +1,71 @@
1
+ ## Use Cases
2
+
3
+ ### Image Animation
4
+
5
+ Image-text-to-video models can be used to animate still images based on text descriptions. For example, you can provide a landscape photo and the instruction "A camera pan from left to right" to create a video with camera movement.
6
+
7
+ ### Dynamic Content Creation
8
+
9
+ Transform images into video by adding motion, transformations, or effects described in text prompts. This is useful for creating engaging social media content, presentations, or marketing materials.
10
+
11
+ ### Guided Video Generation
12
+
13
+ Use a reference image with text prompts to guide the video generation process. This provides more control over the visual style and composition compared to text-to-video models alone.
14
+
15
+ ### Story Visualization
16
+
17
+ Create video sequences from storyboards or concept art by providing scene descriptions. This can help filmmakers and animators visualize scenes before production.
18
+
19
+ ### Motion Control
20
+
21
+ Generate videos with specific camera movements, object motions, or scene transitions by combining reference images with detailed motion descriptions.
22
+
23
+ ## Task Variants
24
+
25
+ ### Image-to-Video with Motion Control
26
+
27
+ Models that generate videos from images while following specific motion instructions, such as camera movements, object animations, or scene dynamics.
28
+
29
+ ### Reference-guided Video Generation
30
+
31
+ Models that use a reference image to guide the visual style and composition of the generated video while incorporating text prompts for motion and transformation control.
32
+
33
+ ### Conditional Video Synthesis
34
+
35
+ Models that perform specific video transformations based on text conditions, such as adding weather effects, time-of-day changes, or environmental animations.
36
+
37
+ ## Inference
38
+
39
+ You can use the Diffusers library to interact with image-text-to-video models. Here's an example snippet using `LTXImageToVideoPipeline`.
40
+
41
+ ```python
42
+ import torch
43
+ from diffusers import LTXImageToVideoPipeline
44
+ from diffusers.utils import export_to_video, load_image
45
+
46
+ pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
47
+ pipe.to("cuda")
48
+
49
+ image = load_image(
50
+ "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
51
+ )
52
+ prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
53
+ negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
54
+
55
+ video = pipe(
56
+ image=image,
57
+ prompt=prompt,
58
+ negative_prompt=negative_prompt,
59
+ width=704,
60
+ height=480,
61
+ num_frames=161,
62
+ num_inference_steps=50,
63
+ ).frames[0]
64
+ export_to_video(video, "output.mp4", fps=24)
65
+ ```
66
+
67
+ ## Useful Resources
68
+
69
+ - [LTX-Video Model Card](https://huggingface.co/Lightricks/LTX-Video)
70
+ - [Text-to-Video: The Task, Challenges and the Current State](https://huggingface.co/blog/text-to-video)
71
+ - [Diffusers documentation on Video Generation](https://huggingface.co/docs/diffusers/using-diffusers/text-img2vid)
@@ -0,0 +1,54 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [],
5
+ demo: {
6
+ inputs: [
7
+ {
8
+ filename: "image-text-to-video-input.jpg",
9
+ type: "img",
10
+ },
11
+ {
12
+ label: "Input",
13
+ content: "Darth Vader is surfing on the waves.",
14
+ type: "text",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "image-text-to-video-output.gif",
20
+ type: "img",
21
+ },
22
+ ],
23
+ },
24
+ metrics: [
25
+ {
26
+ description:
27
+ "Frechet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.",
28
+ id: "fvd",
29
+ },
30
+ {
31
+ description:
32
+ "CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.",
33
+ id: "clipsim",
34
+ },
35
+ ],
36
+ models: [
37
+ {
38
+ description: "A powerful model for image-text-to-video generation.",
39
+ id: "Lightricks/LTX-Video",
40
+ },
41
+ ],
42
+ spaces: [
43
+ {
44
+ description: "An application for image-text-to-video generation.",
45
+ id: "Lightricks/ltx-video-distilled",
46
+ },
47
+ ],
48
+ summary:
49
+ "Image-text-to-video models take a reference image and a text instruction and generate a video based on them. These models are useful for animating still images, creating dynamic content from static references, and generating videos with specific motion or transformation guidance.",
50
+ widgetModels: ["Lightricks/LTX-Video"],
51
+ youtubeId: undefined,
52
+ };
53
+
54
+ export default taskData;
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Image Text To Video inference. Either inputs (image) or prompt (in parameters)
8
+ * must be provided, or both.
9
+ */
10
+ export interface ImageTextToVideoInput {
11
+ /**
12
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the image data as a raw bytes payload. Either this or prompt must be
14
+ * provided.
15
+ */
16
+ inputs?: Blob;
17
+ /**
18
+ * Additional inference parameters for Image Text To Video
19
+ */
20
+ parameters?: ImageTextToVideoParameters;
21
+ [property: string]: unknown;
22
+ }
23
+ /**
24
+ * Additional inference parameters for Image Text To Video
25
+ */
26
+ export interface ImageTextToVideoParameters {
27
+ /**
28
+ * For diffusion models. A higher guidance scale value encourages the model to generate
29
+ * videos closely linked to the text prompt at the expense of lower image quality.
30
+ */
31
+ guidance_scale?: number;
32
+ /**
33
+ * One prompt to guide what NOT to include in video generation.
34
+ */
35
+ negative_prompt?: string;
36
+ /**
37
+ * The num_frames parameter determines how many video frames are generated.
38
+ */
39
+ num_frames?: number;
40
+ /**
41
+ * The number of denoising steps. More denoising steps usually lead to a higher quality
42
+ * video at the expense of slower inference.
43
+ */
44
+ num_inference_steps?: number;
45
+ /**
46
+ * The text prompt to guide the video generation. Either this or inputs (image) must be
47
+ * provided.
48
+ */
49
+ prompt?: string;
50
+ /**
51
+ * Seed for the random number generator.
52
+ */
53
+ seed?: number;
54
+ /**
55
+ * The size in pixels of the output video frames.
56
+ */
57
+ target_size?: TargetSize;
58
+ [property: string]: unknown;
59
+ }
60
+ /**
61
+ * The size in pixels of the output video frames.
62
+ */
63
+ export interface TargetSize {
64
+ height: number;
65
+ width: number;
66
+ [property: string]: unknown;
67
+ }
68
+ /**
69
+ * Outputs of inference for the Image Text To Video task
70
+ */
71
+ export interface ImageTextToVideoOutput {
72
+ /**
73
+ * The generated video returned as raw bytes in the payload.
74
+ */
75
+ video: unknown;
76
+ [property: string]: unknown;
77
+ }
@@ -0,0 +1,63 @@
1
+ {
2
+ "$id": "/inference/schemas/image-text-to-video/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Image Text To Video inference. Either inputs (image) or prompt (in parameters) must be provided, or both.",
5
+ "title": "ImageTextToVideoInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "type": "string",
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. Either this or prompt must be provided.",
11
+ "comment": "type=binary"
12
+ },
13
+ "parameters": {
14
+ "description": "Additional inference parameters for Image Text To Video",
15
+ "$ref": "#/$defs/ImageTextToVideoParameters"
16
+ }
17
+ },
18
+ "$defs": {
19
+ "ImageTextToVideoParameters": {
20
+ "title": "ImageTextToVideoParameters",
21
+ "type": "object",
22
+ "properties": {
23
+ "prompt": {
24
+ "type": "string",
25
+ "description": "The text prompt to guide the video generation. Either this or inputs (image) must be provided."
26
+ },
27
+ "guidance_scale": {
28
+ "type": "number",
29
+ "description": "For diffusion models. A higher guidance scale value encourages the model to generate videos closely linked to the text prompt at the expense of lower image quality."
30
+ },
31
+ "negative_prompt": {
32
+ "type": "string",
33
+ "description": "One prompt to guide what NOT to include in video generation."
34
+ },
35
+ "num_inference_steps": {
36
+ "type": "integer",
37
+ "description": "The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference."
38
+ },
39
+ "num_frames": {
40
+ "type": "number",
41
+ "description": "The num_frames parameter determines how many video frames are generated."
42
+ },
43
+ "target_size": {
44
+ "type": "object",
45
+ "description": "The size in pixels of the output video frames.",
46
+ "properties": {
47
+ "width": {
48
+ "type": "integer"
49
+ },
50
+ "height": {
51
+ "type": "integer"
52
+ }
53
+ },
54
+ "required": ["width", "height"]
55
+ },
56
+ "seed": {
57
+ "type": "integer",
58
+ "description": "Seed for the random number generator."
59
+ }
60
+ }
61
+ }
62
+ }
63
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "$id": "/inference/schemas/image-text-to-video/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image Text To Video task",
5
+ "title": "ImageTextToVideoOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "video": {
9
+ "description": "The generated video returned as raw bytes in the payload."
10
+ }
11
+ },
12
+ "required": ["video"]
13
+ }
@@ -14,6 +14,8 @@ import imageFeatureExtraction from "./image-feature-extraction/data.js";
14
14
  import imageToImage from "./image-to-image/data.js";
15
15
  import imageToText from "./image-to-text/data.js";
16
16
  import imageTextToText from "./image-text-to-text/data.js";
17
+ import imageTextToImage from "./image-text-to-image/data.js";
18
+ import imageTextToVideo from "./image-text-to-video/data.js";
17
19
  import imageSegmentation from "./image-segmentation/data.js";
18
20
  import imageToVideo from "./image-to-video/data.js";
19
21
  import maskGeneration from "./mask-generation/data.js";
@@ -74,6 +76,16 @@ export type * from "./image-to-image/inference.js";
74
76
  export type { ImageToTextInput, ImageToTextOutput, ImageToTextParameters } from "./image-to-text/inference.js";
75
77
  export type * from "./image-segmentation/inference.js";
76
78
  export type { ImageToVideoInput, ImageToVideoOutput, ImageToVideoParameters } from "./image-to-video/inference.js";
79
+ export type {
80
+ ImageTextToImageInput,
81
+ ImageTextToImageOutput,
82
+ ImageTextToImageParameters,
83
+ } from "./image-text-to-image/inference.js";
84
+ export type {
85
+ ImageTextToVideoInput,
86
+ ImageTextToVideoOutput,
87
+ ImageTextToVideoParameters,
88
+ } from "./image-text-to-video/inference.js";
77
89
  export type * from "./object-detection/inference.js";
78
90
  export type * from "./depth-estimation/inference.js";
79
91
  export type * from "./question-answering/inference.js";
@@ -133,6 +145,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
133
145
  "image-feature-extraction": ["timm", "transformers"],
134
146
  "image-segmentation": ["transformers", "transformers.js"],
135
147
  "image-text-to-text": ["transformers"],
148
+ "image-text-to-image": ["diffusers"],
149
+ "image-text-to-video": ["diffusers"],
136
150
  "image-to-image": ["diffusers", "transformers", "transformers.js"],
137
151
  "image-to-text": ["transformers", "transformers.js"],
138
152
  "image-to-video": ["diffusers"],
@@ -220,6 +234,8 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
220
234
  "image-segmentation": getData("image-segmentation", imageSegmentation),
221
235
  "image-to-image": getData("image-to-image", imageToImage),
222
236
  "image-text-to-text": getData("image-text-to-text", imageTextToText),
237
+ "image-text-to-image": getData("image-text-to-image", imageTextToImage),
238
+ "image-text-to-video": getData("image-text-to-video", imageTextToVideo),
223
239
  "image-to-text": getData("image-to-text", imageToText),
224
240
  "image-to-video": getData("image-to-video", imageToVideo),
225
241
  "keypoint-detection": getData("keypoint-detection", keypointDetection),