@huggingface/tasks 0.19.65 → 0.19.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/dist/commonjs/dataset-libraries.d.ts +6 -0
  2. package/dist/commonjs/dataset-libraries.d.ts.map +1 -1
  3. package/dist/commonjs/dataset-libraries.js +6 -0
  4. package/dist/commonjs/hardware.d.ts +4 -0
  5. package/dist/commonjs/hardware.d.ts.map +1 -1
  6. package/dist/commonjs/hardware.js +4 -0
  7. package/dist/commonjs/local-apps.d.ts +0 -7
  8. package/dist/commonjs/local-apps.d.ts.map +1 -1
  9. package/dist/commonjs/local-apps.js +0 -7
  10. package/dist/commonjs/model-libraries-snippets.d.ts +1 -0
  11. package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
  12. package/dist/commonjs/model-libraries-snippets.js +16 -3
  13. package/dist/commonjs/model-libraries.d.ts +15 -1
  14. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  15. package/dist/commonjs/model-libraries.js +14 -0
  16. package/dist/commonjs/pipelines.d.ts +9 -1
  17. package/dist/commonjs/pipelines.d.ts.map +1 -1
  18. package/dist/commonjs/pipelines.js +8 -0
  19. package/dist/commonjs/snippets/inputs.d.ts.map +1 -1
  20. package/dist/commonjs/snippets/inputs.js +10 -0
  21. package/dist/commonjs/tasks/image-text-to-image/data.d.ts +4 -0
  22. package/dist/commonjs/tasks/image-text-to-image/data.d.ts.map +1 -0
  23. package/dist/commonjs/tasks/image-text-to-image/data.js +50 -0
  24. package/dist/commonjs/tasks/image-text-to-image/inference.d.ts +76 -0
  25. package/dist/commonjs/tasks/image-text-to-image/inference.d.ts.map +1 -0
  26. package/dist/commonjs/tasks/image-text-to-image/inference.js +2 -0
  27. package/dist/commonjs/tasks/image-text-to-video/data.d.ts +4 -0
  28. package/dist/commonjs/tasks/image-text-to-video/data.d.ts.map +1 -0
  29. package/dist/commonjs/tasks/image-text-to-video/data.js +50 -0
  30. package/dist/commonjs/tasks/image-text-to-video/inference.d.ts +78 -0
  31. package/dist/commonjs/tasks/image-text-to-video/inference.d.ts.map +1 -0
  32. package/dist/commonjs/tasks/image-text-to-video/inference.js +2 -0
  33. package/dist/commonjs/tasks/index.d.ts +2 -0
  34. package/dist/commonjs/tasks/index.d.ts.map +1 -1
  35. package/dist/commonjs/tasks/index.js +72 -66
  36. package/dist/esm/dataset-libraries.d.ts +6 -0
  37. package/dist/esm/dataset-libraries.d.ts.map +1 -1
  38. package/dist/esm/dataset-libraries.js +6 -0
  39. package/dist/esm/hardware.d.ts +4 -0
  40. package/dist/esm/hardware.d.ts.map +1 -1
  41. package/dist/esm/hardware.js +4 -0
  42. package/dist/esm/local-apps.d.ts +0 -7
  43. package/dist/esm/local-apps.d.ts.map +1 -1
  44. package/dist/esm/local-apps.js +0 -7
  45. package/dist/esm/model-libraries-snippets.d.ts +1 -0
  46. package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
  47. package/dist/esm/model-libraries-snippets.js +12 -0
  48. package/dist/esm/model-libraries.d.ts +15 -1
  49. package/dist/esm/model-libraries.d.ts.map +1 -1
  50. package/dist/esm/model-libraries.js +14 -0
  51. package/dist/esm/pipelines.d.ts +9 -1
  52. package/dist/esm/pipelines.d.ts.map +1 -1
  53. package/dist/esm/pipelines.js +8 -0
  54. package/dist/esm/snippets/inputs.d.ts.map +1 -1
  55. package/dist/esm/snippets/inputs.js +10 -0
  56. package/dist/esm/tasks/image-text-to-image/data.d.ts +4 -0
  57. package/dist/esm/tasks/image-text-to-image/data.d.ts.map +1 -0
  58. package/dist/esm/tasks/image-text-to-image/data.js +48 -0
  59. package/dist/esm/tasks/image-text-to-image/inference.d.ts +76 -0
  60. package/dist/esm/tasks/image-text-to-image/inference.d.ts.map +1 -0
  61. package/dist/esm/tasks/image-text-to-image/inference.js +1 -0
  62. package/dist/esm/tasks/image-text-to-video/data.d.ts +4 -0
  63. package/dist/esm/tasks/image-text-to-video/data.d.ts.map +1 -0
  64. package/dist/esm/tasks/image-text-to-video/data.js +48 -0
  65. package/dist/esm/tasks/image-text-to-video/inference.d.ts +78 -0
  66. package/dist/esm/tasks/image-text-to-video/inference.d.ts.map +1 -0
  67. package/dist/esm/tasks/image-text-to-video/inference.js +1 -0
  68. package/dist/esm/tasks/index.d.ts +2 -0
  69. package/dist/esm/tasks/index.d.ts.map +1 -1
  70. package/dist/esm/tasks/index.js +6 -0
  71. package/package.json +1 -1
  72. package/src/dataset-libraries.ts +6 -0
  73. package/src/hardware.ts +4 -0
  74. package/src/local-apps.ts +0 -7
  75. package/src/model-libraries-snippets.ts +13 -0
  76. package/src/model-libraries.ts +14 -0
  77. package/src/pipelines.ts +8 -0
  78. package/src/snippets/inputs.ts +12 -0
  79. package/src/tasks/image-text-to-image/about.md +73 -0
  80. package/src/tasks/image-text-to-image/data.ts +54 -0
  81. package/src/tasks/image-text-to-image/inference.ts +75 -0
  82. package/src/tasks/image-text-to-image/spec/input.json +59 -0
  83. package/src/tasks/image-text-to-image/spec/output.json +13 -0
  84. package/src/tasks/image-text-to-video/about.md +71 -0
  85. package/src/tasks/image-text-to-video/data.ts +54 -0
  86. package/src/tasks/image-text-to-video/inference.ts +77 -0
  87. package/src/tasks/image-text-to-video/spec/input.json +63 -0
  88. package/src/tasks/image-text-to-video/spec/output.json +13 -0
  89. package/src/tasks/index.ts +16 -0
@@ -1 +1 @@
1
- {"version":3,"file":"inputs.d.ts","sourceRoot":"","sources":["../../../src/snippets/inputs.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,0BAA0B,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAyJnD,wBAAgB,oBAAoB,CACnC,KAAK,EAAE,gBAAgB,EACvB,MAAM,UAAQ,EACd,QAAQ,UAAQ,GACd,MAAM,GAAG,0BAA0B,EAAE,CAmBvC"}
1
+ {"version":3,"file":"inputs.d.ts","sourceRoot":"","sources":["../../../src/snippets/inputs.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,0BAA0B,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAqKnD,wBAAgB,oBAAoB,CACnC,KAAK,EAAE,gBAAgB,EACvB,MAAM,UAAQ,EACd,QAAQ,UAAQ,GACd,MAAM,GAAG,0BAA0B,EAAE,CAmBvC"}
@@ -68,6 +68,14 @@ const inputsImageToVideo = () => `{
68
68
  "image": "cat.png",
69
69
  "prompt": "The cat starts to dance"
70
70
  }`;
71
+ const inputsImageTextToImage = () => `{
72
+ "image": "cat.png",
73
+ "prompt": "Turn the cat into a tiger."
74
+ }`;
75
+ const inputsImageTextToVideo = () => `{
76
+ "image": "cat.png",
77
+ "prompt": "The cat starts to dance"
78
+ }`;
71
79
  const inputsImageSegmentation = () => `"cats.jpg"`;
72
80
  const inputsObjectDetection = () => `"cats.jpg"`;
73
81
  const inputsAudioToAudio = () => `"sample1.flac"`;
@@ -90,6 +98,8 @@ const modelInputSnippets = {
90
98
  "image-to-text": inputsImageToText,
91
99
  "image-to-image": inputsImageToImage,
92
100
  "image-to-video": inputsImageToVideo,
101
+ "image-text-to-image": inputsImageTextToImage,
102
+ "image-text-to-video": inputsImageTextToVideo,
93
103
  "image-segmentation": inputsImageSegmentation,
94
104
  "object-detection": inputsObjectDetection,
95
105
  "question-answering": inputsQuestionAnswering,
@@ -0,0 +1,4 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+ declare const taskData: TaskDataCustom;
3
+ export default taskData;
4
+ //# sourceMappingURL=data.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-image/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -0,0 +1,48 @@
1
+ const taskData = {
2
+ datasets: [],
3
+ demo: {
4
+ inputs: [
5
+ {
6
+ filename: "image-text-to-image-input.jpeg",
7
+ type: "img",
8
+ },
9
+ {
10
+ label: "Input",
11
+ content: "A city above clouds, pastel colors, Victorian style",
12
+ type: "text",
13
+ },
14
+ ],
15
+ outputs: [
16
+ {
17
+ filename: "image-text-to-image-output.png",
18
+ type: "img",
19
+ },
20
+ ],
21
+ },
22
+ metrics: [
23
+ {
24
+ description: "The Fréchet Inception Distance (FID) calculates the distance between distributions between synthetic and real samples. A lower FID score indicates better similarity between the distributions of real and generated images.",
25
+ id: "FID",
26
+ },
27
+ {
28
+ description: "CLIP Score measures the similarity between the generated image and the text prompt using CLIP embeddings. A higher score indicates better alignment with the text prompt.",
29
+ id: "CLIP",
30
+ },
31
+ ],
32
+ models: [
33
+ {
34
+ description: "A powerful model for image-text-to-image generation.",
35
+ id: "black-forest-labs/FLUX.2-dev",
36
+ },
37
+ ],
38
+ spaces: [
39
+ {
40
+ description: "An application for image-text-to-image generation.",
41
+ id: "black-forest-labs/FLUX.2-dev",
42
+ },
43
+ ],
44
+ summary: "Image-text-to-image models take an image and a text prompt as input and generate a new image based on the reference image and text instructions. These models are useful for image editing, style transfer, image variations, and guided image generation tasks.",
45
+ widgetModels: ["black-forest-labs/FLUX.2-dev"],
46
+ youtubeId: undefined,
47
+ };
48
+ export default taskData;
@@ -0,0 +1,76 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Image Text To Image inference. Either inputs (image) or prompt (in parameters)
8
+ * must be provided, or both.
9
+ */
10
+ export interface ImageTextToImageInput {
11
+ /**
12
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the image data as a raw bytes payload. Either this or prompt must be
14
+ * provided.
15
+ */
16
+ inputs?: Blob;
17
+ /**
18
+ * Additional inference parameters for Image Text To Image
19
+ */
20
+ parameters?: ImageTextToImageParameters;
21
+ [property: string]: unknown;
22
+ }
23
+ /**
24
+ * Additional inference parameters for Image Text To Image
25
+ */
26
+ export interface ImageTextToImageParameters {
27
+ /**
28
+ * For diffusion models. A higher guidance scale value encourages the model to generate
29
+ * images closely linked to the text prompt at the expense of lower image quality.
30
+ */
31
+ guidance_scale?: number;
32
+ /**
33
+ * One prompt to guide what NOT to include in image generation.
34
+ */
35
+ negative_prompt?: string;
36
+ /**
37
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
38
+ * a higher quality image at the expense of slower inference.
39
+ */
40
+ num_inference_steps?: number;
41
+ /**
42
+ * The text prompt to guide the image generation. Either this or inputs (image) must be
43
+ * provided.
44
+ */
45
+ prompt?: string;
46
+ /**
47
+ * Seed for the random number generator.
48
+ */
49
+ seed?: number;
50
+ /**
51
+ * The size in pixels of the output image. This parameter is only supported by some
52
+ * providers and for specific models. It will be ignored when unsupported.
53
+ */
54
+ target_size?: TargetSize;
55
+ [property: string]: unknown;
56
+ }
57
+ /**
58
+ * The size in pixels of the output image. This parameter is only supported by some
59
+ * providers and for specific models. It will be ignored when unsupported.
60
+ */
61
+ export interface TargetSize {
62
+ height: number;
63
+ width: number;
64
+ [property: string]: unknown;
65
+ }
66
+ /**
67
+ * Outputs of inference for the Image Text To Image task
68
+ */
69
+ export interface ImageTextToImageOutput {
70
+ /**
71
+ * The generated image returned as raw bytes in the payload.
72
+ */
73
+ image: unknown;
74
+ [property: string]: unknown;
75
+ }
76
+ //# sourceMappingURL=inference.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-image/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACrC;;;;OAIG;IACH,MAAM,CAAC,EAAE,IAAI,CAAC;IACd;;OAEG;IACH,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,0BAA0B;IAC1C;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,WAAW,CAAC,EAAE,UAAU,CAAC;IACzB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACtC;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,4 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+ declare const taskData: TaskDataCustom;
3
+ export default taskData;
4
+ //# sourceMappingURL=data.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -0,0 +1,48 @@
1
+ const taskData = {
2
+ datasets: [],
3
+ demo: {
4
+ inputs: [
5
+ {
6
+ filename: "image-text-to-video-input.jpg",
7
+ type: "img",
8
+ },
9
+ {
10
+ label: "Input",
11
+ content: "Darth Vader is surfing on the waves.",
12
+ type: "text",
13
+ },
14
+ ],
15
+ outputs: [
16
+ {
17
+ filename: "image-text-to-video-output.gif",
18
+ type: "img",
19
+ },
20
+ ],
21
+ },
22
+ metrics: [
23
+ {
24
+ description: "Frechet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.",
25
+ id: "fvd",
26
+ },
27
+ {
28
+ description: "CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.",
29
+ id: "clipsim",
30
+ },
31
+ ],
32
+ models: [
33
+ {
34
+ description: "A powerful model for image-text-to-video generation.",
35
+ id: "Lightricks/LTX-Video",
36
+ },
37
+ ],
38
+ spaces: [
39
+ {
40
+ description: "An application for image-text-to-video generation.",
41
+ id: "Lightricks/ltx-video-distilled",
42
+ },
43
+ ],
44
+ summary: "Image-text-to-video models take an reference image and a text instructions as and generate a video based on them. These models are useful for animating still images, creating dynamic content from static references, and generating videos with specific motion or transformation guidance.",
45
+ widgetModels: ["Lightricks/LTX-Video"],
46
+ youtubeId: undefined,
47
+ };
48
+ export default taskData;
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Image Text To Video inference. Either inputs (image) or prompt (in parameters)
8
+ * must be provided, or both.
9
+ */
10
+ export interface ImageTextToVideoInput {
11
+ /**
12
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the image data as a raw bytes payload. Either this or prompt must be
14
+ * provided.
15
+ */
16
+ inputs?: Blob;
17
+ /**
18
+ * Additional inference parameters for Image Text To Video
19
+ */
20
+ parameters?: ImageTextToVideoParameters;
21
+ [property: string]: unknown;
22
+ }
23
+ /**
24
+ * Additional inference parameters for Image Text To Video
25
+ */
26
+ export interface ImageTextToVideoParameters {
27
+ /**
28
+ * For diffusion models. A higher guidance scale value encourages the model to generate
29
+ * videos closely linked to the text prompt at the expense of lower image quality.
30
+ */
31
+ guidance_scale?: number;
32
+ /**
33
+ * One prompt to guide what NOT to include in video generation.
34
+ */
35
+ negative_prompt?: string;
36
+ /**
37
+ * The num_frames parameter determines how many video frames are generated.
38
+ */
39
+ num_frames?: number;
40
+ /**
41
+ * The number of denoising steps. More denoising steps usually lead to a higher quality
42
+ * video at the expense of slower inference.
43
+ */
44
+ num_inference_steps?: number;
45
+ /**
46
+ * The text prompt to guide the video generation. Either this or inputs (image) must be
47
+ * provided.
48
+ */
49
+ prompt?: string;
50
+ /**
51
+ * Seed for the random number generator.
52
+ */
53
+ seed?: number;
54
+ /**
55
+ * The size in pixel of the output video frames.
56
+ */
57
+ target_size?: TargetSize;
58
+ [property: string]: unknown;
59
+ }
60
+ /**
61
+ * The size in pixel of the output video frames.
62
+ */
63
+ export interface TargetSize {
64
+ height: number;
65
+ width: number;
66
+ [property: string]: unknown;
67
+ }
68
+ /**
69
+ * Outputs of inference for the Image Text To Video task
70
+ */
71
+ export interface ImageTextToVideoOutput {
72
+ /**
73
+ * The generated video returned as raw bytes in the payload.
74
+ */
75
+ video: unknown;
76
+ [property: string]: unknown;
77
+ }
78
+ //# sourceMappingURL=inference.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-video/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACrC;;;;OAIG;IACH,MAAM,CAAC,EAAE,IAAI,CAAC;IACd;;OAEG;IACH,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,0BAA0B;IAC1C;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;OAEG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,WAAW,CAAC,EAAE,UAAU,CAAC;IACzB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,UAAU;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACtC;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -0,0 +1 @@
1
+ export {};
@@ -10,6 +10,8 @@ export type * from "./image-to-image/inference.js";
10
10
  export type { ImageToTextInput, ImageToTextOutput, ImageToTextParameters } from "./image-to-text/inference.js";
11
11
  export type * from "./image-segmentation/inference.js";
12
12
  export type { ImageToVideoInput, ImageToVideoOutput, ImageToVideoParameters } from "./image-to-video/inference.js";
13
+ export type { ImageTextToImageInput, ImageTextToImageOutput, ImageTextToImageParameters, } from "./image-text-to-image/inference.js";
14
+ export type { ImageTextToVideoInput, ImageTextToVideoOutput, ImageTextToVideoParameters, } from "./image-text-to-video/inference.js";
13
15
  export type * from "./object-detection/inference.js";
14
16
  export type * from "./depth-estimation/inference.js";
15
17
  export type * from "./question-answering/inference.js";
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAkDpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,mCAAmC,EACnC,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,YAAY,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAC7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CAgEzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAwDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,
MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAoDpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,mCAAmC,EACnC,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,YAAY,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,+BAA+B,CAAC;AACnH,YAAY,EACX,qBAAqB,EACrB,sBAAsB,EACtB,0BAA0B,GAC1B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,qBAAqB,EACrB,sBAAsB,EACtB,0BAA0B,GAC1B,MAAM,oCAAoC,CAAC;AAC5C,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAC7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CAkEzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CA0DxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAA
E,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
@@ -12,6 +12,8 @@ import imageFeatureExtraction from "./image-feature-extraction/data.js";
12
12
  import imageToImage from "./image-to-image/data.js";
13
13
  import imageToText from "./image-to-text/data.js";
14
14
  import imageTextToText from "./image-text-to-text/data.js";
15
+ import imageTextToImage from "./image-text-to-image/data.js";
16
+ import imageTextToVideo from "./image-text-to-video/data.js";
15
17
  import imageSegmentation from "./image-segmentation/data.js";
16
18
  import imageToVideo from "./image-to-video/data.js";
17
19
  import maskGeneration from "./mask-generation/data.js";
@@ -62,6 +64,8 @@ export const TASKS_MODEL_LIBRARIES = {
62
64
  "image-feature-extraction": ["timm", "transformers"],
63
65
  "image-segmentation": ["transformers", "transformers.js"],
64
66
  "image-text-to-text": ["transformers"],
67
+ "image-text-to-image": ["diffusers"],
68
+ "image-text-to-video": ["diffusers"],
65
69
  "image-to-image": ["diffusers", "transformers", "transformers.js"],
66
70
  "image-to-text": ["transformers", "transformers.js"],
67
71
  "image-to-video": ["diffusers"],
@@ -147,6 +151,8 @@ export const TASKS_DATA = {
147
151
  "image-segmentation": getData("image-segmentation", imageSegmentation),
148
152
  "image-to-image": getData("image-to-image", imageToImage),
149
153
  "image-text-to-text": getData("image-text-to-text", imageTextToText),
154
+ "image-text-to-image": getData("image-text-to-image", imageTextToImage),
155
+ "image-text-to-video": getData("image-text-to-video", imageTextToVideo),
150
156
  "image-to-text": getData("image-to-text", imageToText),
151
157
  "image-to-video": getData("image-to-video", imageToVideo),
152
158
  "keypoint-detection": getData("keypoint-detection", keypointDetection),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/tasks",
3
- "version": "0.19.65",
3
+ "version": "0.19.67",
4
4
  "description": "List of ML tasks for huggingface.co/tasks",
5
5
  "repository": "https://github.com/huggingface/huggingface.js.git",
6
6
  "publishConfig": {
@@ -83,6 +83,12 @@ export const DATASET_LIBRARIES_UI_ELEMENTS = {
83
83
  repoUrl: "https://github.com/duckdb/duckdb",
84
84
  docsUrl: "https://huggingface.co/docs/hub/datasets-duckdb",
85
85
  },
86
+ datadesigner: {
87
+ prettyLabel: "NeMo Data Designer",
88
+ repoName: "datadesigner",
89
+ repoUrl: "https://github.com/NVIDIA-NeMo/DataDesigner",
90
+ docsUrl: "https://nvidia-nemo.github.io/DataDesigner/",
91
+ },
86
92
  } satisfies Record<string, DatasetLibraryUiElement>;
87
93
 
88
94
  /// List of the dataset libraries supported by the Hub
package/src/hardware.ts CHANGED
@@ -758,6 +758,10 @@ export const SKUS = {
758
758
  tflops: 18.4,
759
759
  memory: [36, 48, 64, 96, 128, 256, 512],
760
760
  },
761
+ "Apple M5": {
762
+ tflops: 5.7,
763
+ memory: [16, 24, 32],
764
+ },
761
765
  },
762
766
  },
763
767
  } satisfies Record<string, Record<string, Record<string, HardwareSpec>>>;
package/src/local-apps.ts CHANGED
@@ -517,13 +517,6 @@ export const LOCAL_APPS = {
517
517
  model.tags.includes("coreml") && model.tags.includes("joyfusion") && model.pipeline_tag === "text-to-image",
518
518
  deeplink: (model) => new URL(`https://joyfusion.app/import_from_hf?repo_id=${model.id}`),
519
519
  },
520
- invoke: {
521
- prettyLabel: "Invoke",
522
- docsUrl: "https://github.com/invoke-ai/InvokeAI",
523
- mainTask: "text-to-image",
524
- displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
525
- deeplink: (model) => new URL(`https://models.invoke.ai/huggingface/${model.id}`),
526
- },
527
520
  ollama: {
528
521
  prettyLabel: "Ollama",
529
522
  docsUrl: "https://ollama.com",
@@ -331,6 +331,19 @@ output = model.generate(text)
331
331
  sf.write("simple.mp3", output, 44100)`,
332
332
  ];
333
333
 
334
+ export const dia2 = (model: ModelData): string[] => [
335
+ `from dia2 import Dia2, GenerationConfig, SamplingConfig
336
+
337
+ dia = Dia2.from_repo("${model.id}", device="cuda", dtype="bfloat16")
338
+ config = GenerationConfig(
339
+ cfg_scale=2.0,
340
+ audio=SamplingConfig(temperature=0.8, top_k=50),
341
+ use_cuda_graph=True,
342
+ )
343
+ result = dia.generate("[S1] Hello Dia2!", config=config, output_wav="hello.wav", verbose=True)
344
+ `,
345
+ ];
346
+
334
347
  export const describe_anything = (model: ModelData): string[] => [
335
348
  `# pip install git+https://github.com/NVlabs/describe-anything
336
349
  from huggingface_hub import snapshot_download
@@ -293,6 +293,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
293
293
  snippets: snippets.dia,
294
294
  filter: false,
295
295
  },
296
+ dia2: {
297
+ prettyLabel: "Dia2",
298
+ repoName: "Dia2",
299
+ repoUrl: "https://github.com/nari-labs/dia2",
300
+ snippets: snippets.dia2,
301
+ filter: false,
302
+ },
296
303
  "diff-interpretation-tuning": {
297
304
  prettyLabel: "Diff Interpretation Tuning",
298
305
  repoName: "Diff Interpretation Tuning",
@@ -413,6 +420,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
413
420
  filter: true,
414
421
  countDownloads: `path_extension:"bin"`,
415
422
  },
423
+ fixer: {
424
+ prettyLabel: "Fixer",
425
+ repoName: "Fixer",
426
+ repoUrl: "https://github.com/nv-tlabs/Fixer",
427
+ filter: false,
428
+ countDownloads: `path:"pretrained/pretrained_fixer.pkl"`,
429
+ },
416
430
  flair: {
417
431
  prettyLabel: "Flair",
418
432
  repoName: "Flair",
package/src/pipelines.ts CHANGED
@@ -557,6 +557,14 @@ export const PIPELINE_DATA = {
557
557
  name: "Image-Text-to-Text",
558
558
  modality: "multimodal",
559
559
  },
560
+ "image-text-to-image": {
561
+ name: "Image-Text-to-Image",
562
+ modality: "multimodal",
563
+ },
564
+ "image-text-to-video": {
565
+ name: "Image-Text-to-Video",
566
+ modality: "multimodal",
567
+ },
560
568
  "visual-question-answering": {
561
569
  name: "Visual Question Answering",
562
570
  subtasks: [
@@ -94,6 +94,16 @@ const inputsImageToVideo = () => `{
94
94
  "prompt": "The cat starts to dance"
95
95
  }`;
96
96
 
97
+ const inputsImageTextToImage = () => `{
98
+ "image": "cat.png",
99
+ "prompt": "Turn the cat into a tiger."
100
+ }`;
101
+
102
+ const inputsImageTextToVideo = () => `{
103
+ "image": "cat.png",
104
+ "prompt": "The cat starts to dance"
105
+ }`;
106
+
97
107
  const inputsImageSegmentation = () => `"cats.jpg"`;
98
108
 
99
109
  const inputsObjectDetection = () => `"cats.jpg"`;
@@ -130,6 +140,8 @@ const modelInputSnippets: {
130
140
  "image-to-text": inputsImageToText,
131
141
  "image-to-image": inputsImageToImage,
132
142
  "image-to-video": inputsImageToVideo,
143
+ "image-text-to-image": inputsImageTextToImage,
144
+ "image-text-to-video": inputsImageTextToVideo,
133
145
  "image-segmentation": inputsImageSegmentation,
134
146
  "object-detection": inputsObjectDetection,
135
147
  "question-answering": inputsQuestionAnswering,
@@ -0,0 +1,73 @@
1
+ ## Use Cases
2
+
3
+ ### Instruction-based Image Editing
4
+
5
+ Image-text-to-image models can be used to edit images based on natural language instructions. For example, you can provide an image of a summer landscape and the instruction "Make it winter, add snow" to generate a winter version of the same scene.
6
+
7
+ ### Style Transfer
8
+
9
+ These models can apply artistic styles or transformations to images based on text descriptions. For instance, you can transform a photo into a painting style by providing prompts like "Make it look like a Van Gogh painting" or "Convert to watercolor style."
10
+
11
+ ### Image Variations
12
+
13
+ Generate variations of an existing image by providing different text prompts. This is useful for creative workflows where you want to explore different versions of the same image with specific modifications.
14
+
15
+ ### Guided Image Generation
16
+
17
+ Use a reference image along with text prompts to guide the generation process. This allows for more controlled image generation compared to text-to-image models alone, as the reference image provides structural guidance.
18
+
19
+ ### Image Inpainting and Outpainting
20
+
21
+ Fill in missing or masked parts of an image based on text descriptions, or extend an image beyond its original boundaries with text-guided generation.
22
+
23
+ ## Task Variants
24
+
25
+ ### Instruction-based Editing
26
+
27
+ Models that follow natural language instructions to edit images, which can perform complex edits like object removal, color changes, and compositional modifications.
28
+
29
+ ### Reference-guided Generation
30
+
31
+ Models that use a reference image to guide the generation process while incorporating text prompts to control specific attributes or modifications.
32
+
33
+ ### Conditional Image-to-Image
34
+
35
+ Models that perform specific transformations based on text conditions, such as changing weather conditions, time of day, or seasonal variations.
36
+
37
+ ## Inference
38
+
39
+ You can use the Diffusers library to interact with image-text-to-image models.
40
+
41
+ ```python
42
+ import torch
43
+ from diffusers import Flux2Pipeline
44
+ from diffusers.utils import load_image
45
+
46
+ repo_id = "black-forest-labs/FLUX.2-dev"
47
+ device = "cuda:0"
48
+ torch_dtype = torch.bfloat16
49
+
50
+ pipe = Flux2Pipeline.from_pretrained(
51
+ repo_id, torch_dtype=torch_dtype
52
+ )
53
+ pipe.enable_model_cpu_offload()  # no need to do CPU offload for >80 GB VRAM cards like H200, B200, etc.; do a `pipe.to(device)` instead
54
+
55
+ prompt = "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text `BFL Diffusers` on it and it has a color gradient that starts with #FF5733 at the top and transitions to #33FF57 at the bottom."
56
+
57
+ #cat_image = load_image("https://huggingface.co/spaces/zerogpu-aoti/FLUX.1-Kontext-Dev-fp8-dynamic/resolve/main/cat.png")
58
+ image = pipe(
59
+ prompt=prompt,
60
+ # image=[cat_image],  # multi-image input (keep the trailing comma when uncommenting)
61
+ generator=torch.Generator(device=device).manual_seed(42),
62
+ num_inference_steps=50,
63
+ guidance_scale=4,
64
+ ).images[0]
65
+
66
+ image.save("flux2_output.png")
67
+ ```
68
+
69
+ ## Useful Resources
70
+
71
+ - [FLUX.2 Model Card](https://huggingface.co/black-forest-labs/FLUX.2-dev)
72
+ - [Diffusers documentation on Image-to-Image](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
73
+ - [ControlNet for Conditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)
@@ -0,0 +1,54 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [],
5
+ demo: {
6
+ inputs: [
7
+ {
8
+ filename: "image-text-to-image-input.jpeg",
9
+ type: "img",
10
+ },
11
+ {
12
+ label: "Input",
13
+ content: "A city above clouds, pastel colors, Victorian style",
14
+ type: "text",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "image-text-to-image-output.png",
20
+ type: "img",
21
+ },
22
+ ],
23
+ },
24
+ metrics: [
25
+ {
26
+ description:
27
+ "The Fréchet Inception Distance (FID) calculates the distance between distributions between synthetic and real samples. A lower FID score indicates better similarity between the distributions of real and generated images.",
28
+ id: "FID",
29
+ },
30
+ {
31
+ description:
32
+ "CLIP Score measures the similarity between the generated image and the text prompt using CLIP embeddings. A higher score indicates better alignment with the text prompt.",
33
+ id: "CLIP",
34
+ },
35
+ ],
36
+ models: [
37
+ {
38
+ description: "A powerful model for image-text-to-image generation.",
39
+ id: "black-forest-labs/FLUX.2-dev",
40
+ },
41
+ ],
42
+ spaces: [
43
+ {
44
+ description: "An application for image-text-to-image generation.",
45
+ id: "black-forest-labs/FLUX.2-dev",
46
+ },
47
+ ],
48
+ summary:
49
+ "Image-text-to-image models take an image and a text prompt as input and generate a new image based on the reference image and text instructions. These models are useful for image editing, style transfer, image variations, and guided image generation tasks.",
50
+ widgetModels: ["black-forest-labs/FLUX.2-dev"],
51
+ youtubeId: undefined,
52
+ };
53
+
54
+ export default taskData;