@huggingface/inference 2.4.0 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -149,6 +149,16 @@ await hf.audioClassification({
   data: readFileSync('test/sample1.flac')
 })
 
+await hf.textToSpeech({
+  model: 'espnet/kan-bayashi_ljspeech_vits',
+  inputs: 'Hello world!'
+})
+
+await hf.audioToAudio({
+  model: 'speechbrain/sepformer-wham',
+  data: readFileSync('test/sample1.flac')
+})
+
 // Computer Vision
 
 await hf.imageClassification({
@@ -187,6 +197,16 @@ await hf.imageToImage({
   model: "lllyasviel/sd-controlnet-depth",
 });
 
+await hf.zeroShotImageClassification({
+  model: 'openai/clip-vit-large-patch14-336',
+  inputs: {
+    image: await (await fetch('https://placekitten.com/300/300')).blob()
+  },
+  parameters: {
+    candidate_labels: ['cat', 'dog']
+  }
+})
+
 // Multimodal
 
 await hf.visualQuestionAnswering({
@@ -288,6 +308,8 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 
 - [x] Automatic speech recognition
 - [x] Audio classification
+- [x] Text to speech
+- [x] Audio to audio
 
 ### Computer Vision
 
@@ -297,6 +319,7 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 - [x] Text to image
 - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
 - [x] Image to Image
+- [x] Zero-shot image classification
 
 ### Multimodal
 
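Note that audioToAudio (unlike textToSpeech, which resolves to a Blob) returns base64-encoded audio entries, as the AudioToAudioOutputValue typings below show. A minimal Node sketch of writing the returned tracks to disk — the output file names and the number of returned entries are illustrative:

import { readFileSync, writeFileSync } from 'node:fs'

const outputs = await hf.audioToAudio({
  model: 'speechbrain/sepformer-wham',
  data: readFileSync('test/sample1.flac')
})

for (const [i, out] of outputs.entries()) {
  // out.label is model specific, out.blob is base64 audio, out['content-type'] is e.g. 'audio/flac'
  writeFileSync(`source-${i}.flac`, Buffer.from(out.blob, 'base64'))
}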
package/dist/index.d.ts CHANGED
@@ -26,6 +26,8 @@ export interface Options {
   fetch?: typeof fetch;
 }
 
+export type InferenceTask = "text-classification" | "feature-extraction" | "sentence-similarity";
+
 export interface BaseArgs {
   /**
    * The access token to use. Without it, you'll get rate-limited quickly.
@@ -72,6 +74,34 @@ export function audioClassification(
   args: AudioClassificationArgs,
   options?: Options
 ): Promise<AudioClassificationReturn>;
+export type AudioToAudioArgs = BaseArgs & {
+  /**
+   * Binary audio data
+   */
+  data: Blob | ArrayBuffer;
+};
+export type AudioToAudioReturn = AudioToAudioOutputValue[];
+export interface AudioToAudioOutputValue {
+  /**
+   * The label for the audio output (model specific)
+   */
+  label: string;
+
+  /**
+   * Base64 encoded audio output.
+   */
+  blob: string;
+
+  /**
+   * Content-type for blob, e.g. audio/flac
+   */
+  "content-type": string;
+}
+/**
+ * This task reads some audio input and outputs one or multiple audio files.
+ * Example model: speechbrain/sepformer-wham does audio source separation.
+ */
+export function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioReturn>;
 export type AutomaticSpeechRecognitionArgs = BaseArgs & {
   /**
    * Binary audio data
@@ -112,6 +142,8 @@ export function request<T>(
   options?: Options & {
     /** For internal HF use, which is why it's not exposed in {@link Options} */
     includeCredentials?: boolean;
+    /** When a model can be used for multiple tasks, and we want to run a non-default task */
+    task?: string | InferenceTask;
   }
 ): Promise<T>;
 /**
@@ -122,6 +154,8 @@ export function streamingRequest<T>(
   options?: Options & {
     /** For internal HF use, which is why it's not exposed in {@link Options} */
     includeCredentials?: boolean;
+    /** When a model can be used for multiple tasks, and we want to run a non-default task */
+    task?: string | InferenceTask;
   }
 ): AsyncGenerator<T>;
 export type ImageClassificationArgs = BaseArgs & {
@@ -133,11 +167,11 @@ export type ImageClassificationArgs = BaseArgs & {
 export type ImageClassificationOutput = ImageClassificationOutputValue[];
 export interface ImageClassificationOutputValue {
   /**
-   * A float that represents how likely it is that the image file belongs to this class.
+   * The label for the class (model specific)
    */
   label: string;
   /**
-   * The label for the class (model specific)
+   * A float that represents how likely it is that the image file belongs to this class.
    */
   score: number;
 }
@@ -315,6 +349,33 @@ export type TextToImageOutput = Blob;
  * Recommended model: stabilityai/stable-diffusion-2
  */
 export function textToImage(args: TextToImageArgs, options?: Options): Promise<TextToImageOutput>;
+export type ZeroShotImageClassificationArgs = BaseArgs & {
+  inputs: {
+    /**
+     * Binary image data
+     */
+    image: Blob | ArrayBuffer;
+  };
+  parameters: {
+    /**
+     * A list of strings that are potential classes for inputs. (max 10)
+     */
+    candidate_labels: string[];
+  };
+};
+export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputValue[];
+export interface ZeroShotImageClassificationOutputValue {
+  label: string;
+  score: number;
+}
+/**
+ * Classify an image to specified classes.
+ * Recommended model: openai/clip-vit-large-patch14-336
+ */
+export function zeroShotImageClassification(
+  args: ZeroShotImageClassificationArgs,
+  options?: Options
+): Promise<ZeroShotImageClassificationOutput>;
 export type DocumentQuestionAnsweringArgs = BaseArgs & {
   inputs: {
     /**
@@ -931,6 +992,11 @@ export class HfInference {
     args: Omit<AudioClassificationArgs, 'accessToken'>,
     options?: Options
   ): Promise<AudioClassificationReturn>;
+  /**
+   * This task reads some audio input and outputs one or multiple audio files.
+   * Example model: speechbrain/sepformer-wham does audio source separation.
+   */
+  audioToAudio(args: Omit<AudioToAudioArgs, 'accessToken'>, options?: Options): Promise<AudioToAudioReturn>;
   /**
    * This task reads some audio input and outputs the said words within the audio files.
    * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -952,6 +1018,8 @@ export class HfInference {
     options?: Options & {
       /** For internal HF use, which is why it's not exposed in {@link Options} */
       includeCredentials?: boolean;
+      /** When a model can be used for multiple tasks, and we want to run a non-default task */
+      task?: string | InferenceTask;
     }
   ): Promise<T>;
   /**
@@ -962,6 +1030,8 @@ export class HfInference {
     options?: Options & {
       /** For internal HF use, which is why it's not exposed in {@link Options} */
       includeCredentials?: boolean;
+      /** When a model can be used for multiple tasks, and we want to run a non-default task */
+      task?: string | InferenceTask;
     }
   ): AsyncGenerator<T>;
   /**
@@ -999,6 +1069,14 @@ export class HfInference {
    * Recommended model: stabilityai/stable-diffusion-2
    */
   textToImage(args: Omit<TextToImageArgs, 'accessToken'>, options?: Options): Promise<TextToImageOutput>;
+  /**
+   * Classify an image to specified classes.
+   * Recommended model: openai/clip-vit-large-patch14-336
+   */
+  zeroShotImageClassification(
+    args: Omit<ZeroShotImageClassificationArgs, 'accessToken'>,
+    options?: Options
+  ): Promise<ZeroShotImageClassificationOutput>;
   /**
    * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
    */
@@ -1119,6 +1197,11 @@ export class HfInferenceEndpoint {
     args: Omit<AudioClassificationArgs, 'accessToken' | 'model'>,
     options?: Options
   ): Promise<AudioClassificationReturn>;
+  /**
+   * This task reads some audio input and outputs one or multiple audio files.
+   * Example model: speechbrain/sepformer-wham does audio source separation.
+   */
+  audioToAudio(args: Omit<AudioToAudioArgs, 'accessToken' | 'model'>, options?: Options): Promise<AudioToAudioReturn>;
   /**
    * This task reads some audio input and outputs the said words within the audio files.
    * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -1140,6 +1223,8 @@ export class HfInferenceEndpoint {
     options?: Options & {
       /** For internal HF use, which is why it's not exposed in {@link Options} */
       includeCredentials?: boolean;
+      /** When a model can be used for multiple tasks, and we want to run a non-default task */
+      task?: string | InferenceTask;
     }
   ): Promise<T>;
   /**
@@ -1150,6 +1235,8 @@ export class HfInferenceEndpoint {
     options?: Options & {
       /** For internal HF use, which is why it's not exposed in {@link Options} */
       includeCredentials?: boolean;
+      /** When a model can be used for multiple tasks, and we want to run a non-default task */
+      task?: string | InferenceTask;
     }
   ): AsyncGenerator<T>;
   /**
@@ -1187,6 +1274,14 @@ export class HfInferenceEndpoint {
    * Recommended model: stabilityai/stable-diffusion-2
    */
   textToImage(args: Omit<TextToImageArgs, 'accessToken' | 'model'>, options?: Options): Promise<TextToImageOutput>;
+  /**
+   * Classify an image to specified classes.
+   * Recommended model: openai/clip-vit-large-patch14-336
+   */
+  zeroShotImageClassification(
+    args: Omit<ZeroShotImageClassificationArgs, 'accessToken' | 'model'>,
+    options?: Options
+  ): Promise<ZeroShotImageClassificationOutput>;
   /**
    * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
    */
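The task option added to request and streamingRequest above selects a non-default pipeline for models that can serve several tasks; featureExtraction and sentenceSimilarity use it internally (see the src diffs further down). A hedged sketch of calling it directly — the model name is only an example of a checkpoint whose default pipeline tag is sentence-similarity:

const embedding = await hf.request({
  model: 'sentence-transformers/paraphrase-MiniLM-L6-v2',
  inputs: 'That is a happy person'
}, {
  task: 'feature-extraction'
})
// routed to https://api-inference.huggingface.co/pipeline/feature-extraction/<model>
// instead of the default https://api-inference.huggingface.co/models/<model>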
package/dist/index.js CHANGED
@@ -25,6 +25,7 @@ __export(src_exports, {
   HfInferenceEndpoint: () => HfInferenceEndpoint,
   InferenceOutputError: () => InferenceOutputError,
   audioClassification: () => audioClassification,
+  audioToAudio: () => audioToAudio,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
   documentQuestionAnswering: () => documentQuestionAnswering,
@@ -51,7 +52,8 @@ __export(src_exports, {
   tokenClassification: () => tokenClassification,
   translation: () => translation,
   visualQuestionAnswering: () => visualQuestionAnswering,
-  zeroShotClassification: () => zeroShotClassification
+  zeroShotClassification: () => zeroShotClassification,
+  zeroShotImageClassification: () => zeroShotImageClassification
 });
 module.exports = __toCommonJS(src_exports);
 
@@ -59,6 +61,7 @@ module.exports = __toCommonJS(src_exports);
 var tasks_exports = {};
 __export(tasks_exports, {
   audioClassification: () => audioClassification,
+  audioToAudio: () => audioToAudio,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
   documentQuestionAnswering: () => documentQuestionAnswering,
@@ -85,13 +88,20 @@ __export(tasks_exports, {
   tokenClassification: () => tokenClassification,
   translation: () => translation,
   visualQuestionAnswering: () => visualQuestionAnswering,
-  zeroShotClassification: () => zeroShotClassification
+  zeroShotClassification: () => zeroShotClassification,
+  zeroShotImageClassification: () => zeroShotImageClassification
 });
 
+// src/lib/isUrl.ts
+function isUrl(modelOrUrl) {
+  return /^http(s?):/.test(modelOrUrl) || modelOrUrl.startsWith("/");
+}
+
 // src/lib/makeRequestOptions.ts
-var HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co/models/";
+var HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co";
 function makeRequestOptions(args, options) {
   const { model, accessToken, ...otherArgs } = args;
+  const { task, includeCredentials, ...otherOptions } = options ?? {};
   const headers = {};
   if (accessToken) {
     headers["Authorization"] = `Bearer ${accessToken}`;
@@ -110,15 +120,23 @@ function makeRequestOptions(args, options) {
       headers["X-Load-Model"] = "0";
     }
   }
-  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = (() => {
+    if (isUrl(model)) {
+      return model;
+    }
+    if (task) {
+      return `${HF_INFERENCE_API_BASE_URL}/pipeline/${task}/${model}`;
+    }
+    return `${HF_INFERENCE_API_BASE_URL}/models/${model}`;
+  })();
   const info = {
     headers,
     method: "POST",
     body: binary ? args.data : JSON.stringify({
       ...otherArgs,
-      options
+      options: options && otherOptions
     }),
-    credentials: options?.includeCredentials ? "include" : "same-origin"
+    credentials: includeCredentials ? "include" : "same-origin"
   };
   return { url, info };
 }
@@ -350,6 +368,18 @@ async function textToSpeech(args, options) {
   return res;
 }
 
+// src/tasks/audio/audioToAudio.ts
+async function audioToAudio(args, options) {
+  const res = await request(args, options);
+  const isValidOutput = Array.isArray(res) && res.every(
+    (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
+  );
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
+  }
+  return res;
+}
+
 // src/tasks/cv/imageClassification.ts
 async function imageClassification(args, options) {
   const res = await request(args, options);
@@ -445,6 +475,26 @@ async function imageToImage(args, options) {
   return res;
 }
 
+// src/tasks/cv/zeroShotImageClassification.ts
+async function zeroShotImageClassification(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
+    }
+  };
+  const res = await request(reqArgs, options);
+  const isValidOutput = Array.isArray(res) && res.every((x) => typeof x.label === "string" && typeof x.score === "number");
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, score: number}>");
+  }
+  return res;
+}
+
 // src/tasks/nlp/conversational.ts
 async function conversational(args, options) {
   const res = await request(args, options);
@@ -457,9 +507,47 @@ async function conversational(args, options) {
   return res;
 }
 
+// src/lib/getDefaultTask.ts
+var taskCache = /* @__PURE__ */ new Map();
+var CACHE_DURATION = 10 * 60 * 1e3;
+var MAX_CACHE_ITEMS = 1e3;
+var HF_HUB_URL = "https://huggingface.co";
+async function getDefaultTask(model, accessToken) {
+  if (isUrl(model)) {
+    return null;
+  }
+  const key = `${model}:${accessToken}`;
+  let cachedTask = taskCache.get(key);
+  if (cachedTask && cachedTask.date < new Date(Date.now() - CACHE_DURATION)) {
+    taskCache.delete(key);
+    cachedTask = void 0;
+  }
+  if (cachedTask === void 0) {
+    const modelTask = await fetch(`${HF_HUB_URL}/api/models/${model}?expand[]=pipeline_tag`, {
+      headers: accessToken ? { Authorization: `Bearer ${accessToken}` } : {}
+    }).then((resp) => resp.json()).then((json) => json.pipeline_tag).catch(() => null);
+    if (!modelTask) {
+      return null;
+    }
+    cachedTask = { task: modelTask, date: /* @__PURE__ */ new Date() };
+    taskCache.set(key, { task: modelTask, date: /* @__PURE__ */ new Date() });
+    if (taskCache.size > MAX_CACHE_ITEMS) {
+      taskCache.delete(taskCache.keys().next().value);
+    }
+  }
+  return cachedTask.task;
+}
+
 // src/tasks/nlp/featureExtraction.ts
 async function featureExtraction(args, options) {
-  const res = await request(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request(
+    args,
+    defaultTask === "sentence-similarity" ? {
+      ...options,
+      task: "feature-extraction"
+    } : options
+  );
   let isValidOutput = true;
   const isNumArrayRec = (arr, maxDepth, curDepth = 0) => {
     if (curDepth > maxDepth)
@@ -503,7 +591,14 @@ async function questionAnswering(args, options) {
 
 // src/tasks/nlp/sentenceSimilarity.ts
 async function sentenceSimilarity(args, options) {
-  const res = await request(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request(
+    args,
+    defaultTask === "feature-extraction" ? {
+      ...options,
+      task: "sentence-similarity"
+    } : options
+  );
   const isValidOutput = Array.isArray(res) && res.every((x) => typeof x === "number");
   if (!isValidOutput) {
     throw new InferenceOutputError("Expected number[]");
@@ -715,6 +810,7 @@ var HfInferenceEndpoint = class {
   HfInferenceEndpoint,
   InferenceOutputError,
   audioClassification,
+  audioToAudio,
   automaticSpeechRecognition,
   conversational,
   documentQuestionAnswering,
@@ -741,5 +837,6 @@ var HfInferenceEndpoint = class {
   tokenClassification,
   translation,
   visualQuestionAnswering,
-  zeroShotClassification
+  zeroShotClassification,
+  zeroShotImageClassification
 });
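Two behavioural details of the rebuilt makeRequestOptions are easy to miss: task and includeCredentials are now destructured out of options, so they no longer leak into the JSON body, and task only changes the URL. A rough sketch of what the helper produces for a non-binary request — the model, token and option values are illustrative:

const { url, info } = makeRequestOptions(
  { accessToken: 'hf_...', model: 'gpt2', inputs: 'Hello' },   // 'hf_...' is a placeholder token
  { wait_for_model: true, task: 'text-generation' }
)
// url       -> 'https://api-inference.huggingface.co/pipeline/text-generation/gpt2'
// info.body -> '{"inputs":"Hello","options":{"wait_for_model":true}}'
// without a task, the URL falls back to 'https://api-inference.huggingface.co/models/gpt2'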
package/dist/index.mjs CHANGED
@@ -9,6 +9,7 @@ var __export = (target, all) => {
 var tasks_exports = {};
 __export(tasks_exports, {
   audioClassification: () => audioClassification,
+  audioToAudio: () => audioToAudio,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
   documentQuestionAnswering: () => documentQuestionAnswering,
@@ -35,13 +36,20 @@ __export(tasks_exports, {
   tokenClassification: () => tokenClassification,
   translation: () => translation,
   visualQuestionAnswering: () => visualQuestionAnswering,
-  zeroShotClassification: () => zeroShotClassification
+  zeroShotClassification: () => zeroShotClassification,
+  zeroShotImageClassification: () => zeroShotImageClassification
 });
 
+// src/lib/isUrl.ts
+function isUrl(modelOrUrl) {
+  return /^http(s?):/.test(modelOrUrl) || modelOrUrl.startsWith("/");
+}
+
 // src/lib/makeRequestOptions.ts
-var HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co/models/";
+var HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co";
 function makeRequestOptions(args, options) {
   const { model, accessToken, ...otherArgs } = args;
+  const { task, includeCredentials, ...otherOptions } = options ?? {};
   const headers = {};
   if (accessToken) {
     headers["Authorization"] = `Bearer ${accessToken}`;
@@ -60,15 +68,23 @@ function makeRequestOptions(args, options) {
       headers["X-Load-Model"] = "0";
     }
   }
-  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = (() => {
+    if (isUrl(model)) {
+      return model;
+    }
+    if (task) {
+      return `${HF_INFERENCE_API_BASE_URL}/pipeline/${task}/${model}`;
+    }
+    return `${HF_INFERENCE_API_BASE_URL}/models/${model}`;
+  })();
   const info = {
     headers,
     method: "POST",
     body: binary ? args.data : JSON.stringify({
       ...otherArgs,
-      options
+      options: options && otherOptions
    }),
-    credentials: options?.includeCredentials ? "include" : "same-origin"
+    credentials: includeCredentials ? "include" : "same-origin"
   };
   return { url, info };
 }
@@ -300,6 +316,18 @@ async function textToSpeech(args, options) {
   return res;
 }
 
+// src/tasks/audio/audioToAudio.ts
+async function audioToAudio(args, options) {
+  const res = await request(args, options);
+  const isValidOutput = Array.isArray(res) && res.every(
+    (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
+  );
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
+  }
+  return res;
+}
+
 // src/tasks/cv/imageClassification.ts
 async function imageClassification(args, options) {
   const res = await request(args, options);
@@ -395,6 +423,26 @@ async function imageToImage(args, options) {
   return res;
 }
 
+// src/tasks/cv/zeroShotImageClassification.ts
+async function zeroShotImageClassification(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
+    }
+  };
+  const res = await request(reqArgs, options);
+  const isValidOutput = Array.isArray(res) && res.every((x) => typeof x.label === "string" && typeof x.score === "number");
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, score: number}>");
+  }
+  return res;
+}
+
 // src/tasks/nlp/conversational.ts
 async function conversational(args, options) {
   const res = await request(args, options);
@@ -407,9 +455,47 @@ async function conversational(args, options) {
   return res;
 }
 
+// src/lib/getDefaultTask.ts
+var taskCache = /* @__PURE__ */ new Map();
+var CACHE_DURATION = 10 * 60 * 1e3;
+var MAX_CACHE_ITEMS = 1e3;
+var HF_HUB_URL = "https://huggingface.co";
+async function getDefaultTask(model, accessToken) {
+  if (isUrl(model)) {
+    return null;
+  }
+  const key = `${model}:${accessToken}`;
+  let cachedTask = taskCache.get(key);
+  if (cachedTask && cachedTask.date < new Date(Date.now() - CACHE_DURATION)) {
+    taskCache.delete(key);
+    cachedTask = void 0;
+  }
+  if (cachedTask === void 0) {
+    const modelTask = await fetch(`${HF_HUB_URL}/api/models/${model}?expand[]=pipeline_tag`, {
+      headers: accessToken ? { Authorization: `Bearer ${accessToken}` } : {}
+    }).then((resp) => resp.json()).then((json) => json.pipeline_tag).catch(() => null);
+    if (!modelTask) {
+      return null;
+    }
+    cachedTask = { task: modelTask, date: /* @__PURE__ */ new Date() };
+    taskCache.set(key, { task: modelTask, date: /* @__PURE__ */ new Date() });
+    if (taskCache.size > MAX_CACHE_ITEMS) {
+      taskCache.delete(taskCache.keys().next().value);
+    }
+  }
+  return cachedTask.task;
+}
+
 // src/tasks/nlp/featureExtraction.ts
 async function featureExtraction(args, options) {
-  const res = await request(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request(
+    args,
+    defaultTask === "sentence-similarity" ? {
+      ...options,
+      task: "feature-extraction"
+    } : options
+  );
   let isValidOutput = true;
   const isNumArrayRec = (arr, maxDepth, curDepth = 0) => {
     if (curDepth > maxDepth)
@@ -453,7 +539,14 @@ async function questionAnswering(args, options) {
 
 // src/tasks/nlp/sentenceSimilarity.ts
 async function sentenceSimilarity(args, options) {
-  const res = await request(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request(
+    args,
+    defaultTask === "feature-extraction" ? {
+      ...options,
+      task: "sentence-similarity"
+    } : options
+  );
   const isValidOutput = Array.isArray(res) && res.every((x) => typeof x === "number");
   if (!isValidOutput) {
     throw new InferenceOutputError("Expected number[]");
@@ -664,6 +757,7 @@ export {
   HfInferenceEndpoint,
   InferenceOutputError,
   audioClassification,
+  audioToAudio,
   automaticSpeechRecognition,
   conversational,
   documentQuestionAnswering,
@@ -690,5 +784,6 @@ export {
   tokenClassification,
   translation,
   visualQuestionAnswering,
-  zeroShotClassification
+  zeroShotClassification,
+  zeroShotImageClassification
 };
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/inference",
-  "version": "2.4.0",
+  "version": "2.5.1",
   "packageManager": "pnpm@8.3.1",
   "license": "MIT",
   "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
package/src/lib/getDefaultTask.ts ADDED
@@ -0,0 +1,53 @@
+import { isUrl } from "./isUrl";
+
+/**
+ * We want to make calls to the huggingface hub the least possible, eg if
+ * someone is calling the inference API 1000 times per second, we don't want
+ * to make 1000 calls to the hub to get the task name.
+ */
+const taskCache = new Map<string, { task: string; date: Date }>();
+const CACHE_DURATION = 10 * 60 * 1000;
+const MAX_CACHE_ITEMS = 1000;
+const HF_HUB_URL = "https://huggingface.co";
+
+/**
+ * Get the default task. Use a LRU cache of 1000 items with 10 minutes expiration
+ * to avoid making too many calls to the HF hub.
+ *
+ * @returns The default task for the model, or `null` if it was impossible to get it
+ */
+export async function getDefaultTask(model: string, accessToken: string | undefined): Promise<string | null> {
+  if (isUrl(model)) {
+    return null;
+  }
+
+  const key = `${model}:${accessToken}`;
+  let cachedTask = taskCache.get(key);
+
+  if (cachedTask && cachedTask.date < new Date(Date.now() - CACHE_DURATION)) {
+    taskCache.delete(key);
+    cachedTask = undefined;
+  }
+
+  if (cachedTask === undefined) {
+    const modelTask = await fetch(`${HF_HUB_URL}/api/models/${model}?expand[]=pipeline_tag`, {
+      headers: accessToken ? { Authorization: `Bearer ${accessToken}` } : {},
+    })
+      .then((resp) => resp.json())
+      .then((json) => json.pipeline_tag)
+      .catch(() => null);
+
+    if (!modelTask) {
+      return null;
+    }
+
+    cachedTask = { task: modelTask, date: new Date() };
+    taskCache.set(key, { task: modelTask, date: new Date() });
+
+    if (taskCache.size > MAX_CACHE_ITEMS) {
+      taskCache.delete(taskCache.keys().next().value);
+    }
+  }
+
+  return cachedTask.task;
+}
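A short sketch of the caching behaviour implemented above — the model name and the returned pipeline tag are illustrative; inside the library this helper is only called by featureExtraction and sentenceSimilarity:

const task = await getDefaultTask('sentence-transformers/all-MiniLM-L6-v2', undefined)
// first call: one fetch to https://huggingface.co/api/models/<model>?expand[]=pipeline_tag, e.g. 'sentence-similarity'

const again = await getDefaultTask('sentence-transformers/all-MiniLM-L6-v2', undefined)
// within 10 minutes: answered from the in-memory cache, no extra Hub request

const fromUrl = await getDefaultTask('https://my-endpoint.example/models/foo', undefined)
// URLs are never looked up: resolves to null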
package/src/lib/isUrl.ts ADDED
@@ -0,0 +1,3 @@
+export function isUrl(modelOrUrl: string): boolean {
+  return /^http(s?):/.test(modelOrUrl) || modelOrUrl.startsWith("/");
+}
package/src/lib/makeRequestOptions.ts CHANGED
@@ -1,6 +1,7 @@
-import type { Options, RequestArgs } from "../types";
+import type { InferenceTask, Options, RequestArgs } from "../types";
+import { isUrl } from "./isUrl";
 
-const HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co/models/";
+const HF_INFERENCE_API_BASE_URL = "https://api-inference.huggingface.co";
 
 /**
  * Helper that prepares request arguments
@@ -13,9 +14,12 @@ export function makeRequestOptions(
   options?: Options & {
     /** For internal HF use, which is why it's not exposed in {@link Options} */
     includeCredentials?: boolean;
+    /** When a model can be used for multiple tasks, and we want to run a non-default task */
+    task?: string | InferenceTask;
   }
 ): { url: string; info: RequestInit } {
   const { model, accessToken, ...otherArgs } = args;
+  const { task, includeCredentials, ...otherOptions } = options ?? {};
 
   const headers: Record<string, string> = {};
   if (accessToken) {
@@ -38,7 +42,18 @@ export function makeRequestOptions(
     }
   }
 
-  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = (() => {
+    if (isUrl(model)) {
+      return model;
+    }
+
+    if (task) {
+      return `${HF_INFERENCE_API_BASE_URL}/pipeline/${task}/${model}`;
+    }
+
+    return `${HF_INFERENCE_API_BASE_URL}/models/${model}`;
+  })();
+
   const info: RequestInit = {
     headers,
     method: "POST",
@@ -46,9 +61,9 @@
       ? args.data
       : JSON.stringify({
           ...otherArgs,
-          options,
+          options: options && otherOptions,
         }),
-    credentials: options?.includeCredentials ? "include" : "same-origin",
+    credentials: includeCredentials ? "include" : "same-origin",
   };
 
   return { url, info };
package/src/tasks/audio/audioToAudio.ts ADDED
@@ -0,0 +1,46 @@
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { BaseArgs, Options } from "../../types";
+import { request } from "../custom/request";
+
+export type AudioToAudioArgs = BaseArgs & {
+  /**
+   * Binary audio data
+   */
+  data: Blob | ArrayBuffer;
+};
+
+export interface AudioToAudioOutputValue {
+  /**
+   * The label for the audio output (model specific)
+   */
+  label: string;
+
+  /**
+   * Base64 encoded audio output.
+   */
+  blob: string;
+
+  /**
+   * Content-type for blob, e.g. audio/flac
+   */
+  "content-type": string;
+}
+
+export type AudioToAudioReturn = AudioToAudioOutputValue[];
+
+/**
+ * This task reads some audio input and outputs one or multiple audio files.
+ * Example model: speechbrain/sepformer-wham does audio source separation.
+ */
+export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioReturn> {
+  const res = await request<AudioToAudioReturn>(args, options);
+  const isValidOutput =
+    Array.isArray(res) &&
+    res.every(
+      (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
+    );
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
+  }
+  return res;
+}
package/src/tasks/custom/request.ts CHANGED
@@ -1,4 +1,4 @@
-import type { Options, RequestArgs } from "../../types";
+import type { InferenceTask, Options, RequestArgs } from "../../types";
 import { makeRequestOptions } from "../../lib/makeRequestOptions";
 
 /**
@@ -9,6 +9,8 @@ export async function request<T>(
   options?: Options & {
     /** For internal HF use, which is why it's not exposed in {@link Options} */
     includeCredentials?: boolean;
+    /** When a model can be used for multiple tasks, and we want to run a non-default task */
+    task?: string | InferenceTask;
   }
 ): Promise<T> {
   const { url, info } = makeRequestOptions(args, options);
package/src/tasks/custom/streamingRequest.ts CHANGED
@@ -1,4 +1,4 @@
-import type { Options, RequestArgs } from "../../types";
+import type { InferenceTask, Options, RequestArgs } from "../../types";
 import { makeRequestOptions } from "../../lib/makeRequestOptions";
 import type { EventSourceMessage } from "../../vendor/fetch-event-source/parse";
 import { getLines, getMessages } from "../../vendor/fetch-event-source/parse";
@@ -11,6 +11,8 @@ export async function* streamingRequest<T>(
   options?: Options & {
     /** For internal HF use, which is why it's not exposed in {@link Options} */
     includeCredentials?: boolean;
+    /** When a model can be used for multiple tasks, and we want to run a non-default task */
+    task?: string | InferenceTask;
   }
 ): AsyncGenerator<T> {
   const { url, info } = makeRequestOptions({ ...args, stream: true }, options);
package/src/tasks/cv/imageClassification.ts CHANGED
@@ -11,11 +11,11 @@ export type ImageClassificationArgs = BaseArgs & {
 
 export interface ImageClassificationOutputValue {
   /**
-   * A float that represents how likely it is that the image file belongs to this class.
+   * The label for the class (model specific)
    */
   label: string;
   /**
-   * The label for the class (model specific)
+   * A float that represents how likely it is that the image file belongs to this class.
    */
   score: number;
 }
package/src/tasks/cv/zeroShotImageClassification.ts ADDED
@@ -0,0 +1,55 @@
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { BaseArgs, Options } from "../../types";
+import { request } from "../custom/request";
+import type { RequestArgs } from "../../types";
+import { base64FromBytes } from "../../../../shared";
+
+export type ZeroShotImageClassificationArgs = BaseArgs & {
+  inputs: {
+    /**
+     * Binary image data
+     */
+    image: Blob | ArrayBuffer;
+  };
+  parameters: {
+    /**
+     * A list of strings that are potential classes for inputs. (max 10)
+     */
+    candidate_labels: string[];
+  };
+};
+
+export interface ZeroShotImageClassificationOutputValue {
+  label: string;
+  score: number;
+}
+
+export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputValue[];
+
+/**
+ * Classify an image to specified classes.
+ * Recommended model: openai/clip-vit-large-patch14-336
+ */
+export async function zeroShotImageClassification(
+  args: ZeroShotImageClassificationArgs,
+  options?: Options
+): Promise<ZeroShotImageClassificationOutput> {
+  const reqArgs: RequestArgs = {
+    ...args,
+    inputs: {
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      ),
+    },
+  } as RequestArgs;
+
+  const res = await request<ZeroShotImageClassificationOutput>(reqArgs, options);
+  const isValidOutput =
+    Array.isArray(res) && res.every((x) => typeof x.label === "string" && typeof x.score === "number");
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{label: string, score: number}>");
+  }
+  return res;
+}
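For completeness, a hedged call sketch against this implementation, exercising the ArrayBuffer branch of the conversion above (a Blob goes through arrayBuffer() instead); the image URL, token and scores are placeholders:

import { zeroShotImageClassification } from '@huggingface/inference'

const image = await (await fetch('https://placekitten.com/300/300')).arrayBuffer()

const result = await zeroShotImageClassification({
  accessToken: 'hf_...', // optional placeholder token, avoids aggressive rate limiting
  model: 'openai/clip-vit-large-patch14-336',
  inputs: { image },
  parameters: { candidate_labels: ['cat', 'dog'] }
})
// e.g. [{ label: 'cat', score: 0.98 }, { label: 'dog', score: 0.02 }]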
@@ -6,6 +6,7 @@ export * from "./custom/streamingRequest";
6
6
  export * from "./audio/audioClassification";
7
7
  export * from "./audio/automaticSpeechRecognition";
8
8
  export * from "./audio/textToSpeech";
9
+ export * from "./audio/audioToAudio";
9
10
 
10
11
  // Computer Vision tasks
11
12
  export * from "./cv/imageClassification";
@@ -14,6 +15,7 @@ export * from "./cv/imageToText";
14
15
  export * from "./cv/objectDetection";
15
16
  export * from "./cv/textToImage";
16
17
  export * from "./cv/imageToImage";
18
+ export * from "./cv/zeroShotImageClassification";
17
19
 
18
20
  // Natural Language Processing tasks
19
21
  export * from "./nlp/conversational";
package/src/tasks/nlp/featureExtraction.ts CHANGED
@@ -1,4 +1,5 @@
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import { getDefaultTask } from "../../lib/getDefaultTask";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
@@ -24,7 +25,16 @@ export async function featureExtraction(
   args: FeatureExtractionArgs,
   options?: Options
 ): Promise<FeatureExtractionOutput> {
-  const res = await request<FeatureExtractionOutput>(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request<FeatureExtractionOutput>(
+    args,
+    defaultTask === "sentence-similarity"
+      ? {
+          ...options,
+          task: "feature-extraction",
+        }
+      : options
+  );
   let isValidOutput = true;
 
   const isNumArrayRec = (arr: unknown[], maxDepth: number, curDepth = 0): boolean => {
package/src/tasks/nlp/sentenceSimilarity.ts CHANGED
@@ -1,4 +1,5 @@
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import { getDefaultTask } from "../../lib/getDefaultTask";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
@@ -24,7 +25,16 @@ export async function sentenceSimilarity(
   args: SentenceSimilarityArgs,
   options?: Options
 ): Promise<SentenceSimilarityOutput> {
-  const res = await request<SentenceSimilarityOutput>(args, options);
+  const defaultTask = await getDefaultTask(args.model, args.accessToken);
+  const res = await request<SentenceSimilarityOutput>(
+    args,
+    defaultTask === "feature-extraction"
+      ? {
+          ...options,
+          task: "sentence-similarity",
+        }
+      : options
+  );
 
   const isValidOutput = Array.isArray(res) && res.every((x) => typeof x === "number");
   if (!isValidOutput) {
package/src/types.ts CHANGED
@@ -26,6 +26,8 @@ export interface Options {
   fetch?: typeof fetch;
 }
 
+export type InferenceTask = "text-classification" | "feature-extraction" | "sentence-similarity";
+
 export interface BaseArgs {
   /**
    * The access token to use. Without it, you'll get rate-limited quickly.