npm - @huggingface/inference - Versions diffs - 3.0.0 → 3.1.0 - Mend

@huggingface/inference 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

package/README.md +11 -6
package/dist/index.cjs +193 -76
package/dist/index.js +193 -76
package/dist/src/lib/makeRequestOptions.d.ts.map +1 -1
package/dist/src/providers/fal-ai.d.ts.map +1 -1
package/dist/src/providers/replicate.d.ts.map +1 -1
package/dist/src/providers/together.d.ts.map +1 -1
package/dist/src/tasks/audio/audioClassification.d.ts +4 -18
package/dist/src/tasks/audio/audioClassification.d.ts.map +1 -1
package/dist/src/tasks/audio/audioToAudio.d.ts +10 -9
package/dist/src/tasks/audio/audioToAudio.d.ts.map +1 -1
package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts +3 -12
package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts.map +1 -1
package/dist/src/tasks/audio/textToSpeech.d.ts +4 -8
package/dist/src/tasks/audio/textToSpeech.d.ts.map +1 -1
package/dist/src/tasks/audio/utils.d.ts +11 -0
package/dist/src/tasks/audio/utils.d.ts.map +1 -0
package/dist/src/tasks/cv/imageClassification.d.ts +3 -17
package/dist/src/tasks/cv/imageClassification.d.ts.map +1 -1
package/dist/src/tasks/cv/imageSegmentation.d.ts +3 -21
package/dist/src/tasks/cv/imageSegmentation.d.ts.map +1 -1
package/dist/src/tasks/cv/imageToImage.d.ts +3 -49
package/dist/src/tasks/cv/imageToImage.d.ts.map +1 -1
package/dist/src/tasks/cv/imageToText.d.ts +3 -12
package/dist/src/tasks/cv/imageToText.d.ts.map +1 -1
package/dist/src/tasks/cv/objectDetection.d.ts +3 -26
package/dist/src/tasks/cv/objectDetection.d.ts.map +1 -1
package/dist/src/tasks/cv/textToImage.d.ts +3 -38
package/dist/src/tasks/cv/textToImage.d.ts.map +1 -1
package/dist/src/tasks/cv/textToVideo.d.ts +6 -0
package/dist/src/tasks/cv/textToVideo.d.ts.map +1 -0
package/dist/src/tasks/cv/utils.d.ts +11 -0
package/dist/src/tasks/cv/utils.d.ts.map +1 -0
package/dist/src/tasks/cv/zeroShotImageClassification.d.ts +7 -15
package/dist/src/tasks/cv/zeroShotImageClassification.d.ts.map +1 -1
package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts +5 -28
package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts.map +1 -1
package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts +5 -20
package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts.map +1 -1
package/dist/src/tasks/nlp/fillMask.d.ts +2 -21
package/dist/src/tasks/nlp/fillMask.d.ts.map +1 -1
package/dist/src/tasks/nlp/questionAnswering.d.ts +3 -25
package/dist/src/tasks/nlp/questionAnswering.d.ts.map +1 -1
package/dist/src/tasks/nlp/sentenceSimilarity.d.ts +2 -13
package/dist/src/tasks/nlp/sentenceSimilarity.d.ts.map +1 -1
package/dist/src/tasks/nlp/summarization.d.ts +2 -42
package/dist/src/tasks/nlp/summarization.d.ts.map +1 -1
package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts +3 -31
package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts.map +1 -1
package/dist/src/tasks/nlp/textClassification.d.ts +2 -16
package/dist/src/tasks/nlp/textClassification.d.ts.map +1 -1
package/dist/src/tasks/nlp/tokenClassification.d.ts +2 -45
package/dist/src/tasks/nlp/tokenClassification.d.ts.map +1 -1
package/dist/src/tasks/nlp/translation.d.ts +2 -13
package/dist/src/tasks/nlp/translation.d.ts.map +1 -1
package/dist/src/tasks/nlp/zeroShotClassification.d.ts +2 -22
package/dist/src/tasks/nlp/zeroShotClassification.d.ts.map +1 -1
package/dist/src/types.d.ts +4 -0
package/dist/src/types.d.ts.map +1 -1
package/package.json +2 -2
package/src/lib/makeRequestOptions.ts +7 -5
package/src/providers/fal-ai.ts +12 -0
package/src/providers/replicate.ts +6 -3
package/src/providers/together.ts +2 -0
package/src/tasks/audio/audioClassification.ts +7 -22
package/src/tasks/audio/audioToAudio.ts +43 -23
package/src/tasks/audio/automaticSpeechRecognition.ts +35 -23
package/src/tasks/audio/textToSpeech.ts +23 -14
package/src/tasks/audio/utils.ts +18 -0
package/src/tasks/cv/imageClassification.ts +5 -20
package/src/tasks/cv/imageSegmentation.ts +5 -24
package/src/tasks/cv/imageToImage.ts +4 -52
package/src/tasks/cv/imageToText.ts +6 -15
package/src/tasks/cv/objectDetection.ts +5 -30
package/src/tasks/cv/textToImage.ts +14 -50
package/src/tasks/cv/textToVideo.ts +67 -0
package/src/tasks/cv/utils.ts +13 -0
package/src/tasks/cv/zeroShotImageClassification.ts +32 -31
package/src/tasks/multimodal/documentQuestionAnswering.ts +25 -43
package/src/tasks/multimodal/visualQuestionAnswering.ts +20 -36
package/src/tasks/nlp/fillMask.ts +2 -22
package/src/tasks/nlp/questionAnswering.ts +22 -36
package/src/tasks/nlp/sentenceSimilarity.ts +12 -15
package/src/tasks/nlp/summarization.ts +2 -43
package/src/tasks/nlp/tableQuestionAnswering.ts +25 -41
package/src/tasks/nlp/textClassification.ts +3 -18
package/src/tasks/nlp/tokenClassification.ts +2 -47
package/src/tasks/nlp/translation.ts +3 -17
package/src/tasks/nlp/zeroShotClassification.ts +2 -24
package/src/types.ts +7 -1

package/src/tasks/cv/imageToImage.ts CHANGED

@@ -1,64 +1,16 @@
+import type { ImageToImageInput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
-export type ImageToImageArgs = BaseArgs & {
-	/**
-	 * The initial image condition
-	 *
-	 **/
-	inputs: Blob | ArrayBuffer;
-	parameters?: {
-		/**
-		 * The text prompt to guide the image generation.
-		 */
-		prompt?: string;
-		/**
-		 * strengh param only works for SD img2img and alt diffusion img2img models
-		 * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
-		 * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
-		 * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
-		 * be maximum and the denoising process will run for the full number of iterations specified in
-		 * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-		 **/
-		strength?: number;
-		/**
-		 * An optional negative prompt for the image generation
-		 */
-		negative_prompt?: string;
-		/**
-		 * The height in pixels of the generated image
-		 */
-		height?: number;
-		/**
-		 * The width in pixels of the generated image
-		 */
-		width?: number;
-		/**
-		 * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
-		 */
-		num_inference_steps?: number;
-		/**
-		 * Guidance scale: Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
-		 */
-		guidance_scale?: number;
-		/**
-		 * guess_mode only works for ControlNet models, defaults to False In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
-		 * you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
-		 */
-		guess_mode?: boolean;
-	};
-};
-export type ImageToImageOutput = Blob;
+export type ImageToImageArgs = BaseArgs & ImageToImageInput;
 /**
  * This task reads some text input and outputs an image.
  * Recommended model: lllyasviel/sd-controlnet-depth
  */
-export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
+export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<Blob> {
 	let reqArgs: RequestArgs;
 	if (!args.parameters) {
 		reqArgs = {
@@ -74,7 +26,7 @@ export async function imageToImage(args: ImageToImageArgs, options?: Options): P
 			),
 		};
 	}
-	const res = await request<ImageToImageOutput>(reqArgs, {
+	const res = await request<Blob>(reqArgs, {
 		...options,
 		taskHint: "image-to-image",
 	});

package/src/tasks/cv/imageToText.ts CHANGED

@@ -1,27 +1,18 @@
+import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
+import type { LegacyImageInput } from "./utils";
+import { preparePayload } from "./utils";
-export type ImageToTextArgs = BaseArgs & {
-	/**
-	 * Binary image data
-	 */
-	data: Blob | ArrayBuffer;
-};
-export interface ImageToTextOutput {
-	/**
-	 * The generated caption
-	 */
-	generated_text: string;
-}
+export type ImageToTextArgs = BaseArgs & (ImageToTextInput | LegacyImageInput);
 /**
  * This task reads some image input and outputs the text caption.
  */
 export async function imageToText(args: ImageToTextArgs, options?: Options): Promise<ImageToTextOutput> {
+	const payload = preparePayload(args);
 	const res = (
-		await request<[ImageToTextOutput]>(args, {
+		await request<[ImageToTextOutput]>(payload, {
 			...options,
 			taskHint: "image-to-text",
 		})

package/src/tasks/cv/objectDetection.ts CHANGED

@@ -1,43 +1,18 @@
 import { request } from "../custom/request";
 import type { BaseArgs, Options } from "../../types";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { ObjectDetectionInput, ObjectDetectionOutput } from "@huggingface/tasks";
+import { preparePayload, type LegacyImageInput } from "./utils";
-export type ObjectDetectionArgs = BaseArgs & {
-	/**
-	 * Binary image data
-	 */
-	data: Blob | ArrayBuffer;
-};
-export interface ObjectDetectionOutputValue {
-	/**
-	 * A dict (with keys [xmin,ymin,xmax,ymax]) representing the bounding box of a detected object.
-	 */
-	box: {
-		xmax: number;
-		xmin: number;
-		ymax: number;
-		ymin: number;
-	};
-	/**
-	 * The label for the class (model specific) of a detected object.
-	 */
-	label: string;
-	/**
-	 * A float that represents how likely it is that the detected object belongs to the given class.
-	 */
-	score: number;
-}
-export type ObjectDetectionOutput = ObjectDetectionOutputValue[];
+export type ObjectDetectionArgs = BaseArgs & (ObjectDetectionInput | LegacyImageInput);
 /**
  * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects.
  * Recommended model: facebook/detr-resnet-50
  */
 export async function objectDetection(args: ObjectDetectionArgs, options?: Options): Promise<ObjectDetectionOutput> {
-	const res = await request<ObjectDetectionOutput>(args, {
+	const payload = preparePayload(args);
+	const res = await request<ObjectDetectionOutput>(payload, {
 		...options,
 		taskHint: "object-detection",
 	});

package/src/tasks/cv/textToImage.ts CHANGED

@@ -1,47 +1,10 @@
+import type { TextToImageInput, TextToImageOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
+import { omit } from "../../utils/omit";
 import { request } from "../custom/request";
-export type TextToImageArgs = BaseArgs & {
-	/**
-	 * The text to generate an image from
-	 */
-	inputs: string;
-	/**
-	 * Same param but for external providers like Together, Replicate
-	 */
-	prompt?: string;
-	response_format?: "base64";
-	input?: {
-		prompt: string;
-	};
-	parameters?: {
-		/**
-		 * An optional negative prompt for the image generation
-		 */
-		negative_prompt?: string;
-		/**
-		 * The height in pixels of the generated image
-		 */
-		height?: number;
-		/**
-		 * The width in pixels of the generated image
-		 */
-		width?: number;
-		/**
-		 * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
-		 */
-		num_inference_steps?: number;
-		/**
-		 * Guidance scale: Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
-		 */
-		guidance_scale?: number;
-	};
-};
-export type TextToImageOutput = Blob;
+export type TextToImageArgs = BaseArgs & TextToImageInput;
 interface Base64ImageGeneration {
 	data: Array<{
@@ -56,16 +19,17 @@ interface OutputUrlImageGeneration {
  * This task reads some text input and outputs an image.
  * Recommended model: stabilityai/stable-diffusion-2
  */
-export async function textToImage(args: TextToImageArgs, options?: Options): Promise<TextToImageOutput> {
-	if (args.provider === "together" || args.provider === "fal-ai") {
-		args.prompt = args.inputs;
-		args.inputs = "";
-		args.response_format = "base64";
-	} else if (args.provider === "replicate") {
-		args.input = { prompt: args.inputs };
-		delete (args as unknown as { inputs: unknown }).inputs;
-	}
-	const res = await request<TextToImageOutput | Base64ImageGeneration | OutputUrlImageGeneration>(args, {
+export async function textToImage(args: TextToImageArgs, options?: Options): Promise<Blob> {
+	const payload =
+		args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate"
+			? {
+					...omit(args, ["inputs", "parameters"]),
+					...args.parameters,
+					...(args.provider !== "replicate" ? { response_format: "base64" } : undefined),
+					prompt: args.inputs,
+			  }
+			: args;
+	const res = await request<TextToImageOutput | Base64ImageGeneration | OutputUrlImageGeneration>(payload, {
 		...options,
 		taskHint: "text-to-image",
 	});

package/src/tasks/cv/textToVideo.ts ADDED

@@ -0,0 +1,67 @@
+import type { BaseArgs, InferenceProvider, Options } from "../../types";
+import type { TextToVideoInput } from "@huggingface/tasks";
+import { request } from "../custom/request";
+import { omit } from "../../utils/omit";
+import { isUrl } from "../../lib/isUrl";
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import { typedInclude } from "../../utils/typedInclude";
+export type TextToVideoArgs = BaseArgs & TextToVideoInput;
+export type TextToVideoOutput = Blob;
+interface FalAiOutput {
+	video: {
+		url: string;
+	};
+}
+interface ReplicateOutput {
+	output: string;
+}
+const SUPPORTED_PROVIDERS = ["fal-ai", "replicate"] as const satisfies readonly InferenceProvider[];
+export async function textToVideo(args: TextToVideoArgs, options?: Options): Promise<TextToVideoOutput> {
+	if (!args.provider || !typedInclude(SUPPORTED_PROVIDERS, args.provider)) {
+		throw new Error(
+			`textToVideo inference is only supported for the following providers: ${SUPPORTED_PROVIDERS.join(", ")}`
+		);
+	}
+	const payload =
+		args.provider === "fal-ai" || args.provider === "replicate"
+			? { ...omit(args, ["inputs", "parameters"]), ...args.parameters, prompt: args.inputs }
+			: args;
+	const res = await request<FalAiOutput | ReplicateOutput>(payload, {
+		...options,
+		taskHint: "text-to-video",
+	});
+	if (args.provider === "fal-ai") {
+		const isValidOutput =
+			typeof res === "object" &&
+			!!res &&
+			"video" in res &&
+			typeof res.video === "object" &&
+			!!res.video &&
+			"url" in res.video &&
+			typeof res.video.url === "string" &&
+			isUrl(res.video.url);
+		if (!isValidOutput) {
+			throw new InferenceOutputError("Expected { video: { url: string } }");
+		}
+		const urlResponse = await fetch(res.video.url);
+		return await urlResponse.blob();
+	} else {
+		/// TODO: Replicate: handle the case where the generation request "times out" / is async (ie output is null)
+		/// https://replicate.com/docs/topics/predictions/create-a-prediction
+		const isValidOutput =
+			typeof res === "object" && !!res && "output" in res && typeof res.output === "string" && isUrl(res.output);
+		if (!isValidOutput) {
+			throw new InferenceOutputError("Expected { output: string }");
+		}
+		const urlResponse = await fetch(res.output);
+		return await urlResponse.blob();
+	}
+}

package/src/tasks/cv/utils.ts ADDED

@@ -0,0 +1,13 @@
+import type { BaseArgs, RequestArgs } from "../../types";
+import { omit } from "../../utils/omit";
+/**
+ * @deprecated
+ */
+export interface LegacyImageInput {
+	data: Blob | ArrayBuffer;
+}
+export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyImageInput)): RequestArgs {
+	return "data" in args ? args : { ...omit(args, "inputs"), data: args.inputs };
+}

package/src/tasks/cv/zeroShotImageClassification.ts CHANGED

@@ -3,28 +3,39 @@ import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 import type { RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
+import type { ZeroShotImageClassificationInput, ZeroShotImageClassificationOutput } from "@huggingface/tasks";
-export type ZeroShotImageClassificationArgs = BaseArgs & {
-	inputs: {
-		/**
-		 * Binary image data
-		 */
-		image: Blob | ArrayBuffer;
-	};
-	parameters: {
-		/**
-		 * A list of strings that are potential classes for inputs. (max 10)
-		 */
-		candidate_labels: string[];
-	};
-};
-export interface ZeroShotImageClassificationOutputValue {
-	label: string;
-	score: number;
+/**
+ * @deprecated
+ */
+interface LegacyZeroShotImageClassificationInput {
+	inputs: { image: Blob | ArrayBuffer };
 }
-export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputValue[];
+export type ZeroShotImageClassificationArgs = BaseArgs &
+	(ZeroShotImageClassificationInput | LegacyZeroShotImageClassificationInput);
+async function preparePayload(args: ZeroShotImageClassificationArgs): Promise<RequestArgs> {
+	if (args.inputs instanceof Blob) {
+		return {
+			...args,
+			inputs: {
+				image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())),
+			},
+		};
+	} else {
+		return {
+			...args,
+			inputs: {
+				image: base64FromBytes(
+					new Uint8Array(
+						args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+					)
+				),
+			},
+		};
+	}
+}
 /**
  * Classify an image to specified classes.
@@ -34,18 +45,8 @@ export async function zeroShotImageClassification(
 	args: ZeroShotImageClassificationArgs,
 	options?: Options
 ): Promise<ZeroShotImageClassificationOutput> {
-	const reqArgs: RequestArgs = {
-		...args,
-		inputs: {
-			image: base64FromBytes(
-				new Uint8Array(
-					args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
-				)
-			),
-		},
-	} as RequestArgs;
-	const res = await request<ZeroShotImageClassificationOutput>(reqArgs, {
+	const payload = await preparePayload(args);
+	const res = await request<ZeroShotImageClassificationOutput>(payload, {
 		...options,
 		taskHint: "zero-shot-image-classification",
 	});

package/src/tasks/multimodal/documentQuestionAnswering.ts CHANGED

@@ -4,37 +4,15 @@ import { request } from "../custom/request";
 import type { RequestArgs } from "../../types";
 import { toArray } from "../../utils/toArray";
 import { base64FromBytes } from "../../utils/base64FromBytes";
+import type {
+	DocumentQuestionAnsweringInput,
+	DocumentQuestionAnsweringInputData,
+	DocumentQuestionAnsweringOutput,
+} from "@huggingface/tasks";
-export type DocumentQuestionAnsweringArgs = BaseArgs & {
-	inputs: {
-		/**
-		 * Raw image
-		 *
-		 * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
-		 **/
-		image: Blob | ArrayBuffer;
-		question: string;
-	};
-};
-export interface DocumentQuestionAnsweringOutput {
-	/**
-	 * A string that’s the answer within the document.
-	 */
-	answer: string;
-	/**
-	 * ?
-	 */
-	end?: number;
-	/**
-	 * A float that represents how likely that the answer is correct
-	 */
-	score?: number;
-	/**
-	 * ?
-	 */
-	start?: number;
-}
+/// Override the type to properly set inputs.image as Blob
+export type DocumentQuestionAnsweringArgs = BaseArgs &
+	DocumentQuestionAnsweringInput & { inputs: DocumentQuestionAnsweringInputData & { image: Blob } };
 /**
  * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
@@ -42,32 +20,36 @@ export interface DocumentQuestionAnsweringOutput {
 export async function documentQuestionAnswering(
 	args: DocumentQuestionAnsweringArgs,
 	options?: Options
-): Promise<DocumentQuestionAnsweringOutput> {
+): Promise<DocumentQuestionAnsweringOutput[number]> {
 	const reqArgs: RequestArgs = {
 		...args,
 		inputs: {
 			question: args.inputs.question,
 			// convert Blob or ArrayBuffer to base64
-			image: base64FromBytes(
-				new Uint8Array(
-					args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
-				)
-			),
+			image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
 		},
 	} as RequestArgs;
 	const res = toArray(
-		await request<[DocumentQuestionAnsweringOutput] | DocumentQuestionAnsweringOutput>(reqArgs, {
+		await request<DocumentQuestionAnsweringOutput | DocumentQuestionAnsweringOutput[number]>(reqArgs, {
 			...options,
 			taskHint: "document-question-answering",
 		})
-	)?.[0];
+	);
 	const isValidOutput =
-		typeof res?.answer === "string" &&
-		(typeof res.end === "number" || typeof res.end === "undefined") &&
-		(typeof res.score === "number" || typeof res.score === "undefined") &&
-		(typeof res.start === "number" || typeof res.start === "undefined");
+		Array.isArray(res) &&
+		res.every(
+			(elem) =>
+				typeof elem === "object" &&
+				!!elem &&
+				typeof elem?.answer === "string" &&
+				(typeof elem.end === "number" || typeof elem.end === "undefined") &&
+				(typeof elem.score === "number" || typeof elem.score === "undefined") &&
+				(typeof elem.start === "number" || typeof elem.start === "undefined")
+		);
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>");
 	}
-	return res;
+	return res[0];
 }

package/src/tasks/multimodal/visualQuestionAnswering.ts CHANGED

@@ -1,30 +1,16 @@
+import type {
+	VisualQuestionAnsweringInput,
+	VisualQuestionAnsweringInputData,
+	VisualQuestionAnsweringOutput,
+} from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
-export type VisualQuestionAnsweringArgs = BaseArgs & {
-	inputs: {
-		/**
-		 * Raw image
-		 *
-		 * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
-		 **/
-		image: Blob | ArrayBuffer;
-		question: string;
-	};
-};
-export interface VisualQuestionAnsweringOutput {
-	/**
-	 * A string that’s the answer to a visual question.
-	 */
-	answer: string;
-	/**
-	 * Answer correctness score.
-	 */
-	score: number;
-}
+/// Override the type to properly set inputs.image as Blob
+export type VisualQuestionAnsweringArgs = BaseArgs &
+	VisualQuestionAnsweringInput & { inputs: VisualQuestionAnsweringInputData & { image: Blob } };
 /**
  * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
@@ -32,28 +18,26 @@ export interface VisualQuestionAnsweringOutput {
 export async function visualQuestionAnswering(
 	args: VisualQuestionAnsweringArgs,
 	options?: Options
-): Promise<VisualQuestionAnsweringOutput> {
+): Promise<VisualQuestionAnsweringOutput[number]> {
 	const reqArgs: RequestArgs = {
 		...args,
 		inputs: {
 			question: args.inputs.question,
 			// convert Blob or ArrayBuffer to base64
-			image: base64FromBytes(
-				new Uint8Array(
-					args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
-				)
-			),
+			image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
 		},
 	} as RequestArgs;
-	const res = (
-		await request<[VisualQuestionAnsweringOutput]>(reqArgs, {
-			...options,
-			taskHint: "visual-question-answering",
-		})
-	)?.[0];
-	const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+	const res = await request<VisualQuestionAnsweringOutput>(reqArgs, {
+		...options,
+		taskHint: "visual-question-answering",
+	});
+	const isValidOutput =
+		Array.isArray(res) &&
+		res.every(
+			(elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number"
+		);
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
 	}
-	return res;
+	return res[0];
 }

package/src/tasks/nlp/fillMask.ts CHANGED

@@ -1,29 +1,9 @@
+import type { FillMaskInput, FillMaskOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
-export type FillMaskArgs = BaseArgs & {
-	inputs: string;
-};
-export type FillMaskOutput = {
-	/**
-	 * The probability for this token.
-	 */
-	score: number;
-	/**
-	 * The actual sequence of tokens that ran against the model (may contain special tokens)
-	 */
-	sequence: string;
-	/**
-	 * The id of the token
-	 */
-	token: number;
-	/**
-	 * The string representation of the token
-	 */
-	token_str: string;
-}[];
+export type FillMaskArgs = BaseArgs & FillMaskInput;
 /**
  * Tries to fill in a hole with a missing word (token to be precise). That’s the base task for BERT models.