@huggingface/inference 2.2.2 → 2.3.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
package/README.md CHANGED
@@ -170,6 +170,14 @@ await hf.imageToText({
    model: 'nlpconnect/vit-gpt2-image-captioning'
  })
 
+ await hf.imageToImage({
+   inputs: readFileSync("test/stormtrooper_depth.png"),
+   parameters: {
+     prompt: "elmo's lecture",
+   },
+   model: "lllyasviel/sd-controlnet-depth",
+ });
+
  // Multimodal
 
  await hf.visualQuestionAnswering({
@@ -260,12 +268,15 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
  - [x] Image segmentation
  - [x] Text to image
  - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
+ - [x] Image to Image
 
  ### Multimodal
+
  - [x] Document question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
  - [x] Visual question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 
  ### Tabular
+
  - [x] Tabular regression
 
  ## Tree-shaking
@@ -288,7 +299,7 @@ This will enable tree-shaking by your bundler.
  ## Running tests
 
  ```console
- HF_ACCESS_TOKEN="your access token" npm run test
+ HF_ACCESS_TOKEN="your access token" pnpm run test
  ```
 
  ## Finding appropriate models
package/dist/index.js CHANGED
@@ -30,6 +30,7 @@ __export(src_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -62,6 +63,7 @@ __export(tasks_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -397,6 +399,48 @@ async function textToImage(args, options) {
    return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // ../shared/src/isBackend.ts
+ var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+ var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+ // src/tasks/cv/imageToImage.ts
+ async function imageToImage(args, options) {
+   let reqArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       )
+     };
+   }
+   const res = await request(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
+
  // src/tasks/nlp/conversational.ts
  async function conversational(args, options) {
    const res = await request(args, options);
@@ -561,31 +605,18 @@ async function zeroShotClassification(args, options) {
    return res;
  }
 
- // ../shared/src/base64FromBytes.ts
- function base64FromBytes(arr) {
-   if (globalThis.Buffer) {
-     return globalThis.Buffer.from(arr).toString("base64");
-   } else {
-     const bin = [];
-     arr.forEach((byte) => {
-       bin.push(String.fromCharCode(byte));
-     });
-     return globalThis.btoa(bin.join(""));
-   }
- }
-
- // ../shared/src/isBackend.ts
- var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
- var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
  // src/tasks/multimodal/documentQuestionAnswering.ts
  async function documentQuestionAnswering(args, options) {
    const reqArgs = {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = toArray(
@@ -604,8 +635,12 @@ async function visualQuestionAnswering(args, options) {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = (await request(reqArgs, options))?.[0];
@@ -677,6 +712,7 @@ var HfInferenceEndpoint = class {
    fillMask,
    imageClassification,
    imageSegmentation,
+   imageToImage,
    imageToText,
    objectDetection,
    questionAnswering,
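
Worth noting: the bundle now inlines `base64FromBytes`, which prefers Node's `Buffer` and falls back to `btoa` in browsers and web workers. A minimal sketch of the equivalence it relies on (the byte values below are illustrative, not from the package):

```ts
// Illustrative check of the two encoding paths used by base64FromBytes.
const bytes = new Uint8Array([72, 101, 108, 108, 111]); // "Hello"

// Node path (globalThis.Buffer is defined):
const viaBuffer = Buffer.from(bytes).toString("base64");

// Browser / web worker path:
const viaBtoa = btoa(Array.from(bytes, (b) => String.fromCharCode(b)).join(""));

console.log(viaBuffer, viaBuffer === viaBtoa); // "SGVsbG8=" true
```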
package/dist/index.mjs CHANGED
@@ -15,6 +15,7 @@ __export(tasks_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -350,6 +351,48 @@ async function textToImage(args, options) {
    return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // ../shared/src/isBackend.ts
+ var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+ var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+ // src/tasks/cv/imageToImage.ts
+ async function imageToImage(args, options) {
+   let reqArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       )
+     };
+   }
+   const res = await request(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
+
  // src/tasks/nlp/conversational.ts
  async function conversational(args, options) {
    const res = await request(args, options);
@@ -514,31 +557,18 @@ async function zeroShotClassification(args, options) {
    return res;
  }
 
- // ../shared/src/base64FromBytes.ts
- function base64FromBytes(arr) {
-   if (globalThis.Buffer) {
-     return globalThis.Buffer.from(arr).toString("base64");
-   } else {
-     const bin = [];
-     arr.forEach((byte) => {
-       bin.push(String.fromCharCode(byte));
-     });
-     return globalThis.btoa(bin.join(""));
-   }
- }
-
- // ../shared/src/isBackend.ts
- var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
- var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
  // src/tasks/multimodal/documentQuestionAnswering.ts
  async function documentQuestionAnswering(args, options) {
    const reqArgs = {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = toArray(
@@ -557,8 +587,12 @@ async function visualQuestionAnswering(args, options) {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = (await request(reqArgs, options))?.[0];
@@ -629,6 +663,7 @@ export {
    fillMask,
    imageClassification,
    imageSegmentation,
+   imageToImage,
    imageToText,
    objectDetection,
    questionAnswering,
package/package.json CHANGED
@@ -1,6 +1,7 @@
  {
    "name": "@huggingface/inference",
-   "version": "2.2.2",
+   "version": "2.3.0",
+   "packageManager": "pnpm@8.3.1",
    "license": "MIT",
    "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
    "description": "Typescript wrapper for the Hugging Face Inference API",
package/src/tasks/cv/imageToImage.ts ADDED
@@ -0,0 +1,83 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options, RequestArgs } from "../../types";
+ import { request } from "../custom/request";
+ import { base64FromBytes } from "@huggingface/shared";
+
+ export type ImageToImageArgs = BaseArgs & {
+   /**
+    * The initial image condition
+    *
+    **/
+   inputs: Blob | ArrayBuffer;
+
+   parameters?: {
+     /**
+      * The text prompt to guide the image generation.
+      */
+     prompt?: string;
+     /**
+      * The strength param only works for SD img2img and alt diffusion img2img models.
+      * Conceptually, it indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+      * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+      * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+      * be maximum and the denoising process will run for the full number of iterations specified in
+      * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+      **/
+     strength?: number;
+     /**
+      * An optional negative prompt for the image generation
+      */
+     negative_prompt?: string;
+     /**
+      * The height in pixels of the generated image
+      */
+     height?: number;
+     /**
+      * The width in pixels of the generated image
+      */
+     width?: number;
+     /**
+      * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
+      */
+     num_inference_steps?: number;
+     /**
+      * Guidance scale: a higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
+      */
+     guidance_scale?: number;
+     /**
+      * guess_mode only works for ControlNet models and defaults to false. In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+      * you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+      */
+     guess_mode?: boolean;
+   };
+ };
+
+ export type ImageToImageOutput = Blob;
+
+ /**
+  * This task takes an input image and outputs a new image conditioned on it.
+  * Recommended model: lllyasviel/sd-controlnet-depth
+  */
+ export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
+   let reqArgs: RequestArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs,
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       ),
+     };
+   }
+   const res = await request<ImageToImageOutput>(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
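
For context, a minimal usage sketch of the new task as exposed on the client. Node.js 18+, an access token in `HF_ACCESS_TOKEN`, and the depth-map file path are assumptions for illustration; the argument shapes come from the type above:

```ts
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);
const depthMap = new Blob([readFileSync("test/stormtrooper_depth.png")]); // assumed local test image

// Without `parameters`, the raw image bytes are sent as the request body.
const plain = await hf.imageToImage({
  inputs: depthMap,
  model: "lllyasviel/sd-controlnet-depth",
});

// With `parameters`, the image is base64-encoded and sent as JSON alongside the prompt.
const guided = await hf.imageToImage({
  inputs: depthMap,
  parameters: { prompt: "elmo's lecture" },
  model: "lllyasviel/sd-controlnet-depth",
});
// Both calls resolve to a Blob containing the generated image.
console.log(plain.size, guided.size);
```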
package/src/tasks/index.ts CHANGED
@@ -13,6 +13,7 @@ export * from "./cv/imageSegmentation";
  export * from "./cv/imageToText";
  export * from "./cv/objectDetection";
  export * from "./cv/textToImage";
+ export * from "./cv/imageToImage";
 
  // Natural Language Processing tasks
  export * from "./nlp/conversational";
package/src/tasks/multimodal/documentQuestionAnswering.ts CHANGED
@@ -12,7 +12,7 @@ export type DocumentQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-     image: Blob;
+     image: Blob | ArrayBuffer;
      question: string;
    };
  };
@@ -47,8 +47,12 @@ export async function documentQuestionAnswering(
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       ),
      },
    } as RequestArgs;
    const res = toArray(
package/src/tasks/multimodal/visualQuestionAnswering.ts CHANGED
@@ -10,7 +10,7 @@ export type VisualQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-     image: Blob;
+     image: Blob | ArrayBuffer;
      question: string;
    };
  };
@@ -37,8 +37,12 @@ export async function visualQuestionAnswering(
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       ),
      },
    } as RequestArgs;
    const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];
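
Finally, a hedged sketch of what the widened `image: Blob | ArrayBuffer` type allows on the caller side. The model names, file paths, and questions are illustrative assumptions; only the argument shapes come from the diff:

```ts
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);

// An ArrayBuffer is now accepted directly; it is base64-encoded internally before the request is sent.
const invoice: ArrayBuffer = await new Blob([readFileSync("test/invoice.png")]).arrayBuffer();
const docAnswer = await hf.documentQuestionAnswering({
  model: "impira/layoutlm-document-qa",
  inputs: { question: "Invoice number?", image: invoice },
});

// A Blob still works exactly as before.
const vqaAnswer = await hf.visualQuestionAnswering({
  model: "dandelin/vilt-b32-finetuned-vqa",
  inputs: { question: "How many cats are lying down?", image: new Blob([readFileSync("test/cats.png")]) },
});

console.log(docAnswer.answer, vqaAnswer.answer);
```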