@huggingface/tasks 0.19.7 → 0.19.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/dist/commonjs/hardware.d.ts +7 -0
  2. package/dist/commonjs/hardware.d.ts.map +1 -1
  3. package/dist/commonjs/hardware.js +7 -0
  4. package/dist/commonjs/local-apps.js +10 -10
  5. package/dist/commonjs/local-apps.spec.js +2 -3
  6. package/dist/commonjs/model-libraries.d.ts +16 -1
  7. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  8. package/dist/commonjs/model-libraries.js +16 -1
  9. package/dist/commonjs/snippets/inputs.d.ts.map +1 -1
  10. package/dist/commonjs/snippets/inputs.js +5 -0
  11. package/dist/commonjs/tasks/image-to-video/data.d.ts +4 -0
  12. package/dist/commonjs/tasks/image-to-video/data.d.ts.map +1 -0
  13. package/dist/commonjs/tasks/image-to-video/data.js +119 -0
  14. package/dist/commonjs/tasks/image-to-video/inference.d.ts +75 -0
  15. package/dist/commonjs/tasks/image-to-video/inference.d.ts.map +1 -0
  16. package/dist/commonjs/tasks/image-to-video/inference.js +2 -0
  17. package/dist/commonjs/tasks/index.d.ts +1 -0
  18. package/dist/commonjs/tasks/index.d.ts.map +1 -1
  19. package/dist/commonjs/tasks/index.js +63 -62
  20. package/dist/esm/hardware.d.ts +7 -0
  21. package/dist/esm/hardware.d.ts.map +1 -1
  22. package/dist/esm/hardware.js +7 -0
  23. package/dist/esm/local-apps.js +10 -10
  24. package/dist/esm/local-apps.spec.js +2 -3
  25. package/dist/esm/model-libraries.d.ts +16 -1
  26. package/dist/esm/model-libraries.d.ts.map +1 -1
  27. package/dist/esm/model-libraries.js +16 -1
  28. package/dist/esm/snippets/inputs.d.ts.map +1 -1
  29. package/dist/esm/snippets/inputs.js +5 -0
  30. package/dist/esm/tasks/image-to-video/data.d.ts +4 -0
  31. package/dist/esm/tasks/image-to-video/data.d.ts.map +1 -0
  32. package/dist/esm/tasks/image-to-video/data.js +117 -0
  33. package/dist/esm/tasks/image-to-video/inference.d.ts +75 -0
  34. package/dist/esm/tasks/image-to-video/inference.d.ts.map +1 -0
  35. package/dist/esm/tasks/image-to-video/inference.js +1 -0
  36. package/dist/esm/tasks/index.d.ts +1 -0
  37. package/dist/esm/tasks/index.d.ts.map +1 -1
  38. package/dist/esm/tasks/index.js +2 -1
  39. package/package.json +1 -1
  40. package/src/hardware.ts +7 -0
  41. package/src/local-apps.spec.ts +2 -3
  42. package/src/local-apps.ts +10 -10
  43. package/src/model-libraries.ts +16 -1
  44. package/src/snippets/inputs.ts +6 -0
  45. package/src/tasks/image-text-to-text/about.md +2 -1
  46. package/src/tasks/image-to-video/about.md +51 -0
  47. package/src/tasks/image-to-video/data.ts +126 -0
  48. package/src/tasks/image-to-video/inference.ts +74 -0
  49. package/src/tasks/image-to-video/spec/input.json +64 -0
  50. package/src/tasks/image-to-video/spec/output.json +13 -0
  51. package/src/tasks/index.ts +3 -1
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA+CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CAgEzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAwDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GA
CD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAgDpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,YAAY,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CAgEzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAwDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QAC
d,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
@@ -12,6 +12,7 @@ import imageToImage from "./image-to-image/data.js";
12
12
  import imageToText from "./image-to-text/data.js";
13
13
  import imageTextToText from "./image-text-to-text/data.js";
14
14
  import imageSegmentation from "./image-segmentation/data.js";
15
+ import imageToVideo from "./image-to-video/data.js";
15
16
  import maskGeneration from "./mask-generation/data.js";
16
17
  import objectDetection from "./object-detection/data.js";
17
18
  import depthEstimation from "./depth-estimation/data.js";
@@ -145,7 +146,7 @@ export const TASKS_DATA = {
145
146
  "image-to-image": getData("image-to-image", imageToImage),
146
147
  "image-text-to-text": getData("image-text-to-text", imageTextToText),
147
148
  "image-to-text": getData("image-to-text", imageToText),
148
- "image-to-video": undefined,
149
+ "image-to-video": getData("image-to-video", imageToVideo),
149
150
  "keypoint-detection": getData("keypoint-detection", keypointDetection),
150
151
  "mask-generation": getData("mask-generation", maskGeneration),
151
152
  "multiple-choice": undefined,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/tasks",
3
- "version": "0.19.7",
3
+ "version": "0.19.9",
4
4
  "description": "List of ML tasks for huggingface.co/tasks",
5
5
  "repository": "https://github.com/huggingface/huggingface.js.git",
6
6
  "publishConfig": {
package/src/hardware.ts CHANGED
@@ -100,6 +100,10 @@ export const SKUS = {
100
100
  tflops: 38.7,
101
101
  memory: [48],
102
102
  },
103
+ "RTX A5000": {
104
+ tflops: 34.1,
105
+ memory: [24],
106
+ },
103
107
  "RTX A4000": {
104
108
  tflops: 19.2,
105
109
  memory: [16],
@@ -533,6 +537,9 @@ export const SKUS = {
533
537
  "Intel Core Ultra 7 265KF": {
534
538
  tflops: 1.53,
535
539
  },
540
+ "Intel Core 14th Generation (i7)": {
541
+ tflops: 0.8,
542
+ },
536
543
  "Intel Core 13th Generation (i9)": {
537
544
  tflops: 0.85,
538
545
  },
@@ -13,7 +13,7 @@ describe("local-apps", () => {
13
13
  const snippet = snippetFunc(model);
14
14
 
15
15
  expect(snippet[0].content).toEqual(`# Load and run the model:
16
- llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`);
16
+ llama-server -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`);
17
17
  });
18
18
 
19
19
  it("llama.cpp non-conversational", async () => {
@@ -26,8 +26,7 @@ llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`);
26
26
  const snippet = snippetFunc(model);
27
27
 
28
28
  expect(snippet[0].content).toEqual(`# Load and run the model:
29
- llama-cli -hf mlabonne/gemma-2b-GGUF:{{QUANT_TAG}} \\
30
- -p "Once upon a time,"`);
29
+ llama-server -hf mlabonne/gemma-2b-GGUF:{{QUANT_TAG}}`);
31
30
  });
32
31
 
33
32
  it("vLLM conversational llm", async () => {
package/src/local-apps.ts CHANGED
@@ -108,18 +108,18 @@ function getQuantTag(filepath?: string): string {
108
108
  const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
109
109
  const command = (binary: string) => {
110
110
  const snippet = ["# Load and run the model:", `${binary} -hf ${model.id}${getQuantTag(filepath)}`];
111
- if (!model.tags.includes("conversational")) {
112
- // for non-conversational models, add a prompt
113
- snippet[snippet.length - 1] += " \\";
114
- snippet.push(' -p "Once upon a time,"');
115
- }
116
111
  return snippet.join("\n");
117
112
  };
118
113
  return [
119
114
  {
120
115
  title: "Install from brew",
121
116
  setup: "brew install llama.cpp",
122
- content: command("llama-cli"),
117
+ content: command("llama-server"),
118
+ },
119
+ {
120
+ title: "Install from WinGet (Windows)",
121
+ setup: "winget install llama.cpp",
122
+ content: command("llama-server"),
123
123
  },
124
124
  {
125
125
  title: "Use pre-built binary",
@@ -128,17 +128,17 @@ const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[]
128
128
  "# Download pre-built binary from:",
129
129
  "# https://github.com/ggerganov/llama.cpp/releases",
130
130
  ].join("\n"),
131
- content: command("./llama-cli"),
131
+ content: command("./llama-server"),
132
132
  },
133
133
  {
134
134
  title: "Build from source code",
135
135
  setup: [
136
136
  "git clone https://github.com/ggerganov/llama.cpp.git",
137
137
  "cd llama.cpp",
138
- "cmake -B build -DLLAMA_CURL=ON",
139
- "cmake --build build -j --target llama-cli",
138
+ "cmake -B build",
139
+ "cmake --build build -j --target llama-server",
140
140
  ].join("\n"),
141
- content: command("./build/bin/llama-cli"),
141
+ content: command("./build/bin/llama-server"),
142
142
  },
143
143
  ];
144
144
  };
@@ -116,6 +116,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
116
116
  countDownloads: `path_extension:"pth"`,
117
117
  snippets: snippets.audioseal,
118
118
  },
119
+ "bagel-mot": {
120
+ prettyLabel: "Bagel",
121
+ repoName: "Bagel",
122
+ repoUrl: "https://github.com/ByteDance-Seed/Bagel/",
123
+ filter: false,
124
+ countDownloads: `path:"llm_config.json"`,
125
+ },
119
126
  ben2: {
120
127
  prettyLabel: "BEN2",
121
128
  repoName: "BEN2",
@@ -929,7 +936,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
929
936
  repoName: "timesfm",
930
937
  repoUrl: "https://github.com/google-research/timesfm",
931
938
  filter: false,
932
- countDownloads: `path:"checkpoints/checkpoint_1100000/state/checkpoint"`,
939
+ countDownloads: `path:"checkpoints/checkpoint_1100000/state/checkpoint" OR path:"checkpoints/checkpoint_2150000/state/checkpoint" OR path_extension:"ckpt"`,
933
940
  },
934
941
  timm: {
935
942
  prettyLabel: "timm",
@@ -940,6 +947,14 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
940
947
  filter: true,
941
948
  countDownloads: `path:"pytorch_model.bin" OR path:"model.safetensors"`,
942
949
  },
950
+ torchgeo: {
951
+ prettyLabel: "TorchGeo",
952
+ repoName: "TorchGeo",
953
+ repoUrl: "https://github.com/microsoft/torchgeo",
954
+ docsUrl: "https://torchgeo.readthedocs.io/",
955
+ filter: false,
956
+ countDownloads: `path_extension:"pt" OR path_extension:"pth"`,
957
+ },
943
958
  transformers: {
944
959
  prettyLabel: "Transformers",
945
960
  repoName: "🤗/transformers",
@@ -91,6 +91,11 @@ const inputsImageToImage = () => `{
91
91
  "prompt": "Turn the cat into a tiger."
92
92
  }`;
93
93
 
94
+ const inputsImageToVideo = () => `{
95
+ "image": "cat.png",
96
+ "prompt": "The cat starts to dance"
97
+ }`;
98
+
94
99
  const inputsImageSegmentation = () => `"cats.jpg"`;
95
100
 
96
101
  const inputsObjectDetection = () => `"cats.jpg"`;
@@ -126,6 +131,7 @@ const modelInputSnippets: {
126
131
  "image-classification": inputsImageClassification,
127
132
  "image-to-text": inputsImageToText,
128
133
  "image-to-image": inputsImageToImage,
134
+ "image-to-video": inputsImageToVideo,
129
135
  "image-segmentation": inputsImageSegmentation,
130
136
  "object-detection": inputsObjectDetection,
131
137
  "question-answering": inputsQuestionAnswering,
@@ -85,9 +85,10 @@ curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-
85
85
 
86
86
  ## Useful Resources
87
87
 
88
+ - [Vision Language Models (Better, Faster, Stronger)](https://huggingface.co/blog/vlms-2025)
88
89
  - [Vision Language Models Explained](https://huggingface.co/blog/vlms)
89
90
  - [Welcome PaliGemma 2 – New vision language models by Google](https://huggingface.co/blog/paligemma2)
90
91
  - [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm)
91
92
  - [Multimodal RAG using ColPali and Qwen2-VL](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb)
92
- - [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
93
93
  - [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
94
+ - [Image-text-to-text task guide](https://huggingface.co/docs/transformers/tasks/image_text_to_text)
@@ -0,0 +1,51 @@
1
+ ## Use Cases
2
+
3
+ Image-to-video models transform a static image into a video sequence. This can be used for a variety of creative and practical applications.
4
+
5
+ ### Animated Images
6
+
7
+ Bring still photos to life by adding subtle motion or creating short animated clips. This is great for social media content or dynamic presentations.
8
+
9
+ ### Storytelling from a Single Frame
10
+
11
+ Expand on the narrative of an image by generating a short video that imagines what happened before or after the moment captured in the photo.
12
+
13
+ ### Video Generation with Visual Consistency
14
+
15
+ Use an input image as a strong visual anchor to guide the generation of a video, ensuring that the style, characters, or objects in the video remain consistent with the source image.
16
+
17
+ ### Controllable Motion
18
+
19
+ Some image-to-video models let you specify the direction or intensity of motion, or control the camera, giving you more fine-grained control over the generated animation.
20
+
21
+ ## Inference
22
+
23
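+ Running the model Wan 2.1 T2V 1.3B with diffusers. Note that this checkpoint is a text-to-video (T2V) model, so the snippet below is driven by a text prompt only and does not pass an input image: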
24
+
25
+ ```py
26
+ import torch
27
+ from diffusers import AutoencoderKLWan, WanPipeline
28
+ from diffusers.utils import export_to_video
29
+
30
+ model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
31
+ vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
32
+ pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
33
+ pipe.to("cuda")
34
+
35
+ prompt = "A cat walks on the grass, realistic"
36
+ negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
37
+
38
+ output = pipe(
39
+ prompt=prompt,
40
+ negative_prompt=negative_prompt,
41
+ height=480,
42
+ width=832,
43
+ num_frames=81,
44
+ guidance_scale=5.0
45
+ ).frames[0]
46
+ export_to_video(output, "output.mp4", fps=15)
47
+ ```
48
+
49
+ ## Useful Resources
50
+
51
+ To train image-to-video LoRAs, check out [finetrainers](https://github.com/a-r-r-o-w/finetrainers) and [musubi tuner](https://github.com/kohya-ss/musubi-tuner).
@@ -0,0 +1,126 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "A benchmark dataset for reference image controlled video generation.",
7
+ id: "ali-vilab/VACE-Benchmark",
8
+ },
9
+ {
10
+ description: "A dataset of video generation style preferences.",
11
+ id: "Rapidata/sora-video-generation-style-likert-scoring",
12
+ },
13
+ {
14
+ description: "A dataset with videos and captions throughout the videos.",
15
+ id: "BestWishYsh/ChronoMagic",
16
+ },
17
+ ],
18
+ demo: {
19
+ inputs: [
20
+ {
21
+ filename: "image-to-video-input.jpg",
22
+ type: "img",
23
+ },
24
+ {
25
+ label: "Optional Text Prompt",
26
+ content: "This penguin is dancing",
27
+ type: "text",
28
+ },
29
+ ],
30
+ outputs: [
31
+ {
32
+ filename: "image-to-video-output.gif",
33
+ type: "img",
34
+ },
35
+ ],
36
+ },
37
+ metrics: [
38
+ {
39
+ description:
40
+ "Fréchet Video Distance (FVD) measures the perceptual similarity between the distributions of generated videos and a set of real videos, assessing overall visual quality and temporal coherence of the video generated from an input image.",
41
+ id: "fvd",
42
+ },
43
+ {
44
+ description:
45
+ "CLIP Score measures the semantic similarity between a textual prompt (if provided alongside the input image) and the generated video frames. It evaluates how well the video's generated content and motion align with the textual description, conditioned on the initial image.",
46
+ id: "clip_score",
47
+ },
48
+ {
49
+ description:
50
+ "First Frame Fidelity, often measured using LPIPS (Learned Perceptual Image Patch Similarity), PSNR, or SSIM, quantifies how closely the first frame of the generated video matches the input conditioning image.",
51
+ id: "lpips",
52
+ },
53
+ {
54
+ description:
55
+ "Identity Preservation Score measures the consistency of identity (e.g., a person's face or a specific object's characteristics) between the input image and throughout the generated video frames, often calculated using features from specialized models like face recognition (e.g., ArcFace) or re-identification models.",
56
+ id: "identity_preservation",
57
+ },
58
+ {
59
+ description:
60
+ "Motion Score evaluates the quality, realism, and temporal consistency of motion in the video generated from a static image. This can be based on optical flow analysis (e.g., smoothness, magnitude), consistency of object trajectories, or specific motion plausibility assessments.",
61
+ id: "motion_score",
62
+ },
63
+ ],
64
+ models: [
65
+ {
66
+ description: "LTX-Video, a 13B parameter model for high quality video generation",
67
+ id: "Lightricks/LTX-Video-0.9.7-dev",
68
+ },
69
+ {
70
+ description: "A 14B parameter model for reference image controlled video generation",
71
+ id: "Wan-AI/Wan2.1-VACE-14B",
72
+ },
73
+ {
74
+ description: "An image-to-video generation model using FramePack F1 methodology with Hunyuan-DiT architecture",
75
+ id: "lllyasviel/FramePack_F1_I2V_HY_20250503",
76
+ },
77
+ {
78
+ description: "A distilled version of the LTX-Video-0.9.7-dev model for faster inference",
79
+ id: "Lightricks/LTX-Video-0.9.7-distilled",
80
+ },
81
+ {
82
+ description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 720p videos.",
83
+ id: "Skywork/SkyReels-V2-I2V-14B-720P",
84
+ },
85
+ {
86
+ description: "Image-to-video variant of Tencent's HunyuanVideo.",
87
+ id: "tencent/HunyuanVideo-I2V",
88
+ },
89
+ {
90
+ description: "A 14B parameter model for 720p image-to-video generation by Wan-AI.",
91
+ id: "Wan-AI/Wan2.1-I2V-14B-720P",
92
+ },
93
+ {
94
+ description: "A Diffusers version of the Wan2.1-I2V-14B-720P model for 720p image-to-video generation.",
95
+ id: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
96
+ },
97
+ ],
98
+ spaces: [
99
+ {
100
+ description: "An application to generate videos fast.",
101
+ id: "Lightricks/ltx-video-distilled",
102
+ },
103
+ {
104
+ description: "Generate videos with the FramePack-F1",
105
+ id: "linoyts/FramePack-F1",
106
+ },
107
+ {
108
+ description: "Generate videos with the FramePack",
109
+ id: "lisonallen/framepack-i2v",
110
+ },
111
+ {
112
+ description: "Wan2.1 with CausVid LoRA",
113
+ id: "multimodalart/wan2-1-fast",
114
+ },
115
+ {
116
+ description: "A demo for Stable Video Diffusion",
117
+ id: "multimodalart/stable-video-diffusion",
118
+ },
119
+ ],
120
+ summary:
121
+ "Image-to-video models take a still image as input and generate a video. These models can be guided by text prompts to influence the content and style of the output video.",
122
+ widgetModels: [],
123
+ youtubeId: undefined,
124
+ };
125
+
126
+ export default taskData;
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Image To Video inference
8
+ */
9
+ export interface ImageToVideoInput {
10
+ /**
11
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
12
+ * also provide the image data as a raw bytes payload.
13
+ */
14
+ inputs: Blob;
15
+ /**
16
+ * Additional inference parameters for Image To Video
17
+ */
18
+ parameters?: ImageToVideoParameters;
19
+ [property: string]: unknown;
20
+ }
21
+ /**
22
+ * Additional inference parameters for Image To Video
23
+ */
24
+ export interface ImageToVideoParameters {
25
+ /**
26
+ * For diffusion models. A higher guidance scale value encourages the model to generate
27
+ * videos closely linked to the text prompt at the expense of lower image quality.
28
+ */
29
+ guidance_scale?: number;
30
+ /**
31
+ * One prompt to guide what NOT to include in video generation.
32
+ */
33
+ negative_prompt?: string;
34
+ /**
35
+ * The num_frames parameter determines how many video frames are generated.
36
+ */
37
+ num_frames?: number;
38
+ /**
39
+ * The number of denoising steps. More denoising steps usually lead to a higher quality
40
+ * video at the expense of slower inference.
41
+ */
42
+ num_inference_steps?: number;
43
+ /**
44
+ * The text prompt to guide the video generation.
45
+ */
46
+ prompt?: string;
47
+ /**
48
+ * Seed for the random number generator.
49
+ */
50
+ seed?: number;
51
+ /**
52
+ * The size in pixels of the output video frames.
53
+ */
54
+ target_size?: TargetSize;
55
+ [property: string]: unknown;
56
+ }
57
+ /**
58
+ * The size in pixels of the output video frames.
59
+ */
60
+ export interface TargetSize {
61
+ height: number;
62
+ width: number;
63
+ [property: string]: unknown;
64
+ }
65
+ /**
66
+ * Outputs of inference for the Image To Video task
67
+ */
68
+ export interface ImageToVideoOutput {
69
+ /**
70
+ * The generated video returned as raw bytes in the payload.
71
+ */
72
+ video: unknown;
73
+ [property: string]: unknown;
74
+ }
@@ -0,0 +1,64 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-video/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Image To Video inference",
5
+ "title": "ImageToVideoInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "type": "string",
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
11
+ "comment": "type=binary"
12
+ },
13
+ "parameters": {
14
+ "description": "Additional inference parameters for Image To Video",
15
+ "$ref": "#/$defs/ImageToVideoParameters"
16
+ }
17
+ },
18
+ "$defs": {
19
+ "ImageToVideoParameters": {
20
+ "title": "ImageToVideoParameters",
21
+ "type": "object",
22
+ "properties": {
23
+ "prompt": {
24
+ "type": "string",
25
+ "description": "The text prompt to guide the video generation."
26
+ },
27
+ "guidance_scale": {
28
+ "type": "number",
29
+ "description": "For diffusion models. A higher guidance scale value encourages the model to generate videos closely linked to the text prompt at the expense of lower image quality."
30
+ },
31
+ "negative_prompt": {
32
+ "type": "string",
33
+ "description": "One prompt to guide what NOT to include in video generation."
34
+ },
35
+ "num_inference_steps": {
36
+ "type": "integer",
37
+ "description": "The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference."
38
+ },
39
+ "num_frames": {
40
+ "type": "number",
41
+ "description": "The num_frames parameter determines how many video frames are generated."
42
+ },
43
+ "target_size": {
44
+ "type": "object",
45
+ "description": "The size in pixels of the output video frames.",
46
+ "properties": {
47
+ "width": {
48
+ "type": "integer"
49
+ },
50
+ "height": {
51
+ "type": "integer"
52
+ }
53
+ },
54
+ "required": ["width", "height"]
55
+ },
56
+ "seed": {
57
+ "type": "integer",
58
+ "description": "Seed for the random number generator."
59
+ }
60
+ }
61
+ }
62
+ },
63
+ "required": ["inputs"]
64
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-video/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image To Video task",
5
+ "title": "ImageToVideoOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "video": {
9
+ "description": "The generated video returned as raw bytes in the payload."
10
+ }
11
+ },
12
+ "required": ["video"]
13
+ }
@@ -14,6 +14,7 @@ import imageToImage from "./image-to-image/data.js";
14
14
  import imageToText from "./image-to-text/data.js";
15
15
  import imageTextToText from "./image-text-to-text/data.js";
16
16
  import imageSegmentation from "./image-segmentation/data.js";
17
+ import imageToVideo from "./image-to-video/data.js";
17
18
  import maskGeneration from "./mask-generation/data.js";
18
19
  import objectDetection from "./object-detection/data.js";
19
20
  import depthEstimation from "./depth-estimation/data.js";
@@ -69,6 +70,7 @@ export type {
69
70
  export type * from "./image-to-image/inference.js";
70
71
  export type { ImageToTextInput, ImageToTextOutput, ImageToTextParameters } from "./image-to-text/inference.js";
71
72
  export type * from "./image-segmentation/inference.js";
73
+ export type { ImageToVideoInput, ImageToVideoOutput, ImageToVideoParameters } from "./image-to-video/inference.js";
72
74
  export type * from "./object-detection/inference.js";
73
75
  export type * from "./depth-estimation/inference.js";
74
76
  export type * from "./question-answering/inference.js";
@@ -217,7 +219,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
217
219
  "image-to-image": getData("image-to-image", imageToImage),
218
220
  "image-text-to-text": getData("image-text-to-text", imageTextToText),
219
221
  "image-to-text": getData("image-to-text", imageToText),
220
- "image-to-video": undefined,
222
+ "image-to-video": getData("image-to-video", imageToVideo),
221
223
  "keypoint-detection": getData("keypoint-detection", keypointDetection),
222
224
  "mask-generation": getData("mask-generation", maskGeneration),
223
225
  "multiple-choice": undefined,