@huggingface/inference 2.0.0-rc2 → 2.1.0

This diff shows the content of publicly released package versions as published to their respective public registries; it is provided for informational purposes only.
package/README.md CHANGED
@@ -124,8 +124,8 @@ await hf.sentenceSimilarity({
  })
 
  await hf.featureExtraction({
- model: "sentence-transformers/distilbert-base-nli-mean-tokens",
- inputs: "That is a happy person",
+ model: "sentence-transformers/distilbert-base-nli-mean-tokens",
+ inputs: "That is a happy person",
  });
 
  // Audio
@@ -170,6 +170,24 @@ await hf.imageToText({
  model: 'nlpconnect/vit-gpt2-image-captioning'
  })
 
+ // Multimodal
+
+ await hf.visualQuestionAnswering({
+   model: 'dandelin/vilt-b32-finetuned-vqa',
+   inputs: {
+     question: 'How many cats are lying down?',
+     image: await (await fetch('https://placekitten.com/300/300')).blob()
+   }
+ })
+
+ await hf.documentQuestionAnswering({
+   model: 'impira/layoutlm-document-qa',
+   inputs: {
+     question: 'Invoice number?',
+     image: await (await fetch('https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png')).blob(),
+   }
+ })
+
  // Custom call, for models with custom parameters / outputs
  await hf.request({
  model: 'my-custom-model',
@@ -227,6 +245,10 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
  - [x] Text to image
  - [x] Image to text
 
+ ### Multimodal
+ - [x] Document question answering
+ - [x] Visual question answering
+
  ## Tree-shaking
 
  You can import the functions you need directly from the module, rather than using the `HfInference` class:
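The tree-shaking note in the README also applies to the new multimodal tasks, which are exported as standalone functions. A minimal sketch using a direct import (the access token value is a placeholder):

import { visualQuestionAnswering } from "@huggingface/inference";

const { answer, score } = await visualQuestionAnswering({
  accessToken: "hf_...", // placeholder
  model: "dandelin/vilt-b32-finetuned-vqa",
  inputs: {
    question: "How many cats are lying down?",
    image: await (await fetch("https://placekitten.com/300/300")).blob(),
  },
});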
package/dist/index.js CHANGED
@@ -25,6 +25,7 @@ __export(src_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -43,6 +44,7 @@ __export(src_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
  module.exports = __toCommonJS(src_exports);
@@ -53,6 +55,7 @@ __export(tasks_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -71,6 +74,7 @@ __export(tasks_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
 
@@ -96,7 +100,7 @@ function makeRequestOptions(args, options) {
  headers["X-Load-Model"] = "0";
  }
  }
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info = {
  headers,
  method: "POST",
@@ -539,6 +543,55 @@ async function zeroShotClassification(args, options) {
  return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // src/tasks/multimodal/documentQuestionAnswering.ts
+ async function documentQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
+
+ // src/tasks/multimodal/visualQuestionAnswering.ts
+ async function visualQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
+
  // src/HfInference.ts
  var HfInference = class {
  accessToken;
@@ -585,6 +638,7 @@ var HfInferenceEndpoint = class {
  audioClassification,
  automaticSpeechRecognition,
  conversational,
+ documentQuestionAnswering,
  featureExtraction,
  fillMask,
  imageClassification,
@@ -603,5 +657,6 @@ var HfInferenceEndpoint = class {
  textToImage,
  tokenClassification,
  translation,
+ visualQuestionAnswering,
  zeroShotClassification
  });
package/dist/index.mjs CHANGED
@@ -10,6 +10,7 @@ __export(tasks_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -28,6 +29,7 @@ __export(tasks_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
 
@@ -53,7 +55,7 @@ function makeRequestOptions(args, options) {
  headers["X-Load-Model"] = "0";
  }
  }
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info = {
  headers,
  method: "POST",
@@ -496,6 +498,55 @@ async function zeroShotClassification(args, options) {
  return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // src/tasks/multimodal/documentQuestionAnswering.ts
+ async function documentQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
+
+ // src/tasks/multimodal/visualQuestionAnswering.ts
+ async function visualQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
+
  // src/HfInference.ts
  var HfInference = class {
  accessToken;
@@ -541,6 +592,7 @@ export {
  audioClassification,
  automaticSpeechRecognition,
  conversational,
+ documentQuestionAnswering,
  featureExtraction,
  fillMask,
  imageClassification,
@@ -559,5 +611,6 @@ export {
  textToImage,
  tokenClassification,
  translation,
+ visualQuestionAnswering,
  zeroShotClassification
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@huggingface/inference",
- "version": "2.0.0-rc2",
+ "version": "2.1.0",
  "license": "MIT",
  "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
  "description": "Typescript wrapper for the Hugging Face Inference API",
@@ -38,7 +38,7 @@ export function makeRequestOptions(
  }
  }
 
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info: RequestInit = {
  headers,
  method: "POST",
@@ -27,3 +27,7 @@ export * from "./nlp/textGenerationStream";
  export * from "./nlp/tokenClassification";
  export * from "./nlp/translation";
  export * from "./nlp/zeroShotClassification";
+
+ // Multimodal tasks
+ export * from "./multimodal/documentQuestionAnswering";
+ export * from "./multimodal/visualQuestionAnswering";
@@ -0,0 +1,63 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options } from "../../types";
+ import { request } from "../custom/request";
+ import type { RequestArgs } from "../../types";
+ import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+ export type DocumentQuestionAnsweringArgs = BaseArgs & {
+   inputs: {
+     /**
+      * Raw image
+      *
+      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+      **/
+     image: Blob;
+     question: string;
+   };
+ };
+
+ export interface DocumentQuestionAnsweringOutput {
+   /**
+    * A string that’s the answer within the document.
+    */
+   answer: string;
+   /**
+    * ?
+    */
+   end: number;
+   /**
+    * A float that represents how likely that the answer is correct
+    */
+   score: number;
+   /**
+    * ?
+    */
+   start: number;
+ }
+
+ /**
+  * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
+  */
+ export async function documentQuestionAnswering(
+   args: DocumentQuestionAnsweringArgs,
+   options?: Options
+ ): Promise<DocumentQuestionAnsweringOutput> {
+   const reqArgs: RequestArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+     },
+   } as RequestArgs;
+   const res = (await request<[DocumentQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+   const isValidOutput =
+     typeof res?.answer === "string" &&
+     typeof res.end === "number" &&
+     typeof res.score === "number" &&
+     typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
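A minimal usage sketch for the function added above, reusing the invoice image from the README example (the access token is a placeholder):

import { documentQuestionAnswering } from "@huggingface/inference";

const image = await (
  await fetch("https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png")
).blob();

const { answer, score, start, end } = await documentQuestionAnswering({
  accessToken: "hf_...", // placeholder
  model: "impira/layoutlm-document-qa",
  inputs: { question: "Invoice number?", image },
});
// answer and score plus the numeric start and end fields declared by DocumentQuestionAnsweringOutput.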
@@ -0,0 +1,50 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options, RequestArgs } from "../../types";
+ import { request } from "../custom/request";
+ import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+ export type VisualQuestionAnsweringArgs = BaseArgs & {
+   inputs: {
+     /**
+      * Raw image
+      *
+      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+      **/
+     image: Blob;
+     question: string;
+   };
+ };
+
+ export interface VisualQuestionAnsweringOutput {
+   /**
+    * A string that’s the answer to a visual question.
+    */
+   answer: string;
+   /**
+    * Answer correctness score.
+    */
+   score: number;
+ }
+
+ /**
+  * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
+  */
+ export async function visualQuestionAnswering(
+   args: VisualQuestionAnsweringArgs,
+   options?: Options
+ ): Promise<VisualQuestionAnsweringOutput> {
+   const reqArgs: RequestArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+     },
+   } as RequestArgs;
+   const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
@@ -4,12 +4,10 @@ import { request } from "../custom/request";
 
  export type SentenceSimilarityArgs = BaseArgs & {
    /**
-    * The inputs vary based on the model. For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will look like this:
+    * The inputs vary based on the model.
     *
-    * inputs: &#123;
-    *   "source_sentence": "That is a happy person",
-    *   "sentences": ["That is a happy dog", "That is a very happy person", "Today is a sunny day"]
-    * &#125;
+    * For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will have a `source_sentence` string and
+    * a `sentences` array of strings
     */
    inputs: Record<string, unknown> | Record<string, unknown>[];
  };
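Since the docstring now describes the inputs shape only in words, here is what such a call looks like in practice; a sketch based on the example removed from the comment (the access token is a placeholder):

import { sentenceSimilarity } from "@huggingface/inference";

const scores = await sentenceSimilarity({
  accessToken: "hf_...", // placeholder
  model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1",
  inputs: {
    source_sentence: "That is a happy person",
    sentences: ["That is a happy dog", "That is a very happy person", "Today is a sunny day"],
  },
});
// scores holds one similarity value per entry in `sentences`.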