@huggingface/inference 2.6.5 → 2.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/dist/index.cjs +21 -16
- package/dist/index.d.ts +196 -1
- package/dist/index.js +21 -16
- package/package.json +2 -2
- package/src/HfInference.ts +3 -2
- package/src/lib/makeRequestOptions.ts +25 -22
- package/src/tasks/nlp/textGeneration.ts +203 -1
- package/src/tasks/nlp/textGenerationStream.ts +1 -2
- package/src/types.ts +1 -1
package/README.md
CHANGED
@@ -506,6 +506,21 @@ const gpt2 = hf.endpoint('https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/
 const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the universe is'});
 ```
 
+By default, all calls to the inference endpoint will wait until the model is
+loaded. When [scaling to
+0](https://huggingface.co/docs/inference-endpoints/en/autoscaling#scaling-to-0)
+is enabled on the endpoint, this can result in non-trivial waiting time. If
+you'd rather disable this behavior and handle the endpoint's returned 500 HTTP
+errors yourself, you can do so like so:
+
+```typescript
+const gpt2 = hf.endpoint('https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/gpt2');
+const { generated_text } = await gpt2.textGeneration(
+  {inputs: 'The answer to the universe is'},
+  {retry_on_error: false},
+);
+```
+
 ## Running tests
 
 ```console
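The README addition above already shows how to turn retries off; for completeness, here is a minimal sketch of handling the endpoint's 500 responses yourself with `retry_on_error: false`. The access token and endpoint URL are placeholders, and the try/catch shape is an assumption rather than part of the diff:

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const gpt2 = hf.endpoint("https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/gpt2");

try {
  const { generated_text } = await gpt2.textGeneration(
    { inputs: "The answer to the universe is" },
    { retry_on_error: false },
  );
  console.log(generated_text);
} catch (error) {
  // With retry_on_error disabled, a scaled-to-zero endpoint's 500 response
  // surfaces here instead of the client waiting for the model to load.
  console.error("Endpoint not ready:", error);
}
```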
package/dist/index.cjs
CHANGED
@@ -132,7 +132,15 @@ var tasks = null;
 async function makeRequestOptions(args, options) {
   const { accessToken, model: _model, ...otherArgs } = args;
   let { model } = args;
-  const {
+  const {
+    forceTask: task,
+    includeCredentials,
+    taskHint,
+    wait_for_model,
+    use_cache,
+    dont_load_model,
+    ...otherOptions
+  } = options ?? {};
   const headers = {};
   if (accessToken) {
     headers["Authorization"] = `Bearer ${accessToken}`;
@@ -155,16 +163,15 @@ async function makeRequestOptions(args, options) {
   const binary = "data" in args && !!args.data;
   if (!binary) {
     headers["Content-Type"] = "application/json";
-  }
-
-
-
-
-
-
-
-
-  }
+  }
+  if (wait_for_model) {
+    headers["X-Wait-For-Model"] = "true";
+  }
+  if (use_cache === false) {
+    headers["X-Use-Cache"] = "false";
+  }
+  if (dont_load_model) {
+    headers["X-Load-Model"] = "0";
   }
   const url = (() => {
     if (isUrl(model)) {
@@ -178,10 +185,8 @@ async function makeRequestOptions(args, options) {
   let credentials;
   if (typeof includeCredentials === "string") {
     credentials = includeCredentials;
-  } else if (
-    credentials =
-  } else if (includeCredentials === void 0) {
-    credentials = "same-origin";
+  } else if (includeCredentials === true) {
+    credentials = "include";
   }
   const info = {
     headers,
@@ -190,7 +195,7 @@ async function makeRequestOptions(args, options) {
       ...otherArgs,
       options: options && otherOptions
     }),
-    credentials,
+    ...credentials && { credentials },
     signal: options?.signal
   };
   return { url, info };
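The `dist/index.cjs` change above wires three request options through to Inference API headers: `wait_for_model` → `X-Wait-For-Model: true`, `use_cache: false` → `X-Use-Cache: false`, and `dont_load_model` → `X-Load-Model: 0`. A short usage sketch of passing those options from the public API (model id and token are placeholders):

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token

// The second argument is the Options object destructured in the diff above;
// wait_for_model and use_cache end up as X-Wait-For-Model / X-Use-Cache headers.
const output = await hf.textGeneration(
  { model: "gpt2", inputs: "The answer to the universe is" },
  { wait_for_model: true, use_cache: false },
);

console.log(output.generated_text);
```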
package/dist/index.d.ts
CHANGED
@@ -31,7 +31,7 @@ export interface Options {
   signal?: AbortSignal;
 
   /**
-   *
+   * Credentials to use for the request. If this is a string, it will be passed straight on. If it's a boolean, true will be "include" and false will not send credentials at all (which defaults to "same-origin" inside browsers).
    */
   includeCredentials?: string | boolean;
 }
@@ -702,6 +702,201 @@ export function textClassification(
   args: TextClassificationArgs,
   options?: Options
 ): Promise<TextClassificationOutput>;
+/**
+ * The reason why the generation was stopped.
+ *
+ * length: The generated sequence reached the maximum allowed length
+ *
+ * eos_token: The model generated an end-of-sentence (EOS) token
+ *
+ * stop_sequence: One of the sequence in stop_sequences was generated
+ */
+export type TextGenerationFinishReason = "length" | "eos_token" | "stop_sequence";
+/**
+ * Inputs for Text Generation inference
+ */
+export interface TextGenerationInput {
+  /**
+   * The text to initialize generation with
+   */
+  inputs: string;
+  /**
+   * Additional inference parameters
+   */
+  parameters?: TextGenerationParameters;
+  /**
+   * Whether to stream output tokens
+   */
+  stream?: boolean;
+  [property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text Generation
+ */
+export interface TextGenerationParameters {
+  /**
+   * The number of sampling queries to run. Only the best one (in terms of total logprob) will
+   * be returned.
+   */
+  best_of?: number;
+  /**
+   * Whether or not to output decoder input details
+   */
+  decoder_input_details?: boolean;
+  /**
+   * Whether or not to output details
+   */
+  details?: boolean;
+  /**
+   * Whether to use logits sampling instead of greedy decoding when generating new tokens.
+   */
+  do_sample?: boolean;
+  /**
+   * The maximum number of tokens to generate.
+   */
+  max_new_tokens?: number;
+  /**
+   * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+   * paper](https://hf.co/papers/1909.05858) for more details.
+   */
+  repetition_penalty?: number;
+  /**
+   * Whether to prepend the prompt to the generated text.
+   */
+  return_full_text?: boolean;
+  /**
+   * The random sampling seed.
+   */
+  seed?: number;
+  /**
+   * Stop generating tokens if a member of `stop_sequences` is generated.
+   */
+  stop_sequences?: string[];
+  /**
+   * The value used to modulate the logits distribution.
+   */
+  temperature?: number;
+  /**
+   * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+   */
+  top_k?: number;
+  /**
+   * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+   * up to `top_p` or higher are kept for generation.
+   */
+  top_p?: number;
+  /**
+   * Truncate input tokens to the given size.
+   */
+  truncate?: number;
+  /**
+   * Typical Decoding mass. See [Typical Decoding for Natural Language
+   * Generation](https://hf.co/papers/2202.00666) for more information
+   */
+  typical_p?: number;
+  /**
+   * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+   */
+  watermark?: boolean;
+  [property: string]: unknown;
+}
+/**
+ * Outputs for Text Generation inference
+ */
+export interface TextGenerationOutput {
+  /**
+   * When enabled, details about the generation
+   */
+  details?: TextGenerationOutputDetails;
+  /**
+   * The generated text
+   */
+  generated_text: string;
+  [property: string]: unknown;
+}
+/**
+ * When enabled, details about the generation
+ */
+export interface TextGenerationOutputDetails {
+  /**
+   * Details about additional sequences when best_of is provided
+   */
+  best_of_sequences?: TextGenerationOutputSequenceDetails[];
+  /**
+   * The reason why the generation was stopped.
+   */
+  finish_reason: TextGenerationFinishReason;
+  /**
+   * The number of generated tokens
+   */
+  generated_tokens: number;
+  prefill: TextGenerationPrefillToken[];
+  /**
+   * The random seed used for generation
+   */
+  seed?: number;
+  /**
+   * The generated tokens and associated details
+   */
+  tokens: TextGenerationOutputToken[];
+  /**
+   * Most likely tokens
+   */
+  top_tokens?: Array<TextGenerationOutputToken[]>;
+  [property: string]: unknown;
+}
+export interface TextGenerationOutputSequenceDetails {
+  finish_reason: TextGenerationFinishReason;
+  /**
+   * The generated text
+   */
+  generated_text: string;
+  /**
+   * The number of generated tokens
+   */
+  generated_tokens: number;
+  prefill: TextGenerationPrefillToken[];
+  /**
+   * The random seed used for generation
+   */
+  seed?: number;
+  /**
+   * The generated tokens and associated details
+   */
+  tokens: TextGenerationOutputToken[];
+  /**
+   * Most likely tokens
+   */
+  top_tokens?: Array<TextGenerationOutputToken[]>;
+  [property: string]: unknown;
+}
+export interface TextGenerationPrefillToken {
+  id: number;
+  logprob: number;
+  /**
+   * The text associated with that token
+   */
+  text: string;
+  [property: string]: unknown;
+}
+/**
+ * Generated token.
+ */
+export interface TextGenerationOutputToken {
+  id: number;
+  logprob?: number;
+  /**
+   * Whether or not that token is a special one
+   */
+  special: boolean;
+  /**
+   * The text associated with that token
+   */
+  text: string;
+  [property: string]: unknown;
+}
 /**
  * Use to continue text from a prompt. This is a very generic task. Recommended model: gpt2 (it’s a simple model, but fun to play with).
  */
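A sketch of how the types added to `dist/index.d.ts` above are meant to be consumed; the model id, prompt, and parameter values are illustrative, and passing `details: true` to populate `output.details` follows the declarations rather than anything stated elsewhere in the diff:

```typescript
import { HfInference, type TextGenerationOutput } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token

const output: TextGenerationOutput = await hf.textGeneration({
  model: "gpt2",
  inputs: "The answer to the universe is",
  parameters: {
    max_new_tokens: 20,
    temperature: 0.7,
    details: true, // per TextGenerationParameters, enables output.details
  },
});

console.log(output.generated_text);
// TextGenerationFinishReason is "length" | "eos_token" | "stop_sequence"
console.log(output.details?.finish_reason, output.details?.generated_tokens);
```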
package/dist/index.js
CHANGED
@@ -81,7 +81,15 @@ var tasks = null;
 async function makeRequestOptions(args, options) {
   const { accessToken, model: _model, ...otherArgs } = args;
   let { model } = args;
-  const {
+  const {
+    forceTask: task,
+    includeCredentials,
+    taskHint,
+    wait_for_model,
+    use_cache,
+    dont_load_model,
+    ...otherOptions
+  } = options ?? {};
   const headers = {};
   if (accessToken) {
     headers["Authorization"] = `Bearer ${accessToken}`;
@@ -104,16 +112,15 @@ async function makeRequestOptions(args, options) {
   const binary = "data" in args && !!args.data;
   if (!binary) {
     headers["Content-Type"] = "application/json";
-  }
-
-
-
-
-
-
-
-
-  }
+  }
+  if (wait_for_model) {
+    headers["X-Wait-For-Model"] = "true";
+  }
+  if (use_cache === false) {
+    headers["X-Use-Cache"] = "false";
+  }
+  if (dont_load_model) {
+    headers["X-Load-Model"] = "0";
   }
   const url = (() => {
     if (isUrl(model)) {
@@ -127,10 +134,8 @@ async function makeRequestOptions(args, options) {
   let credentials;
   if (typeof includeCredentials === "string") {
     credentials = includeCredentials;
-  } else if (
-    credentials =
-  } else if (includeCredentials === void 0) {
-    credentials = "same-origin";
+  } else if (includeCredentials === true) {
+    credentials = "include";
   }
   const info = {
     headers,
@@ -139,7 +144,7 @@ async function makeRequestOptions(args, options) {
       ...otherArgs,
       options: options && otherOptions
     }),
-    credentials,
+    ...credentials && { credentials },
     signal: options?.signal
   };
   return { url, info };
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/inference",
-  "version": "2.6.5",
+  "version": "2.6.7",
   "packageManager": "pnpm@8.10.5",
   "license": "MIT",
   "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
@@ -40,7 +40,7 @@
   "type": "module",
   "devDependencies": {
     "@types/node": "18.13.0",
-    "@huggingface/tasks": "^0.
+    "@huggingface/tasks": "^0.8.0"
   },
   "resolutions": {},
   "scripts": {
package/src/HfInference.ts
CHANGED
@@ -2,6 +2,9 @@ import * as tasks from "./tasks";
 import type { Options, RequestArgs } from "./types";
 import type { DistributiveOmit } from "./utils/distributive-omit";
 
+/* eslint-disable @typescript-eslint/no-empty-interface */
+/* eslint-disable @typescript-eslint/no-unsafe-declaration-merging */
+
 type Task = typeof tasks;
 
 type TaskWithNoAccessToken = {
@@ -60,8 +63,6 @@ export class HfInferenceEndpoint {
   }
 }
 
-// eslint-disable-next-line @typescript-eslint/no-empty-interface
 export interface HfInference extends TaskWithNoAccessToken {}
 
-// eslint-disable-next-line @typescript-eslint/no-empty-interface
 export interface HfInferenceEndpoint extends TaskWithNoAccessTokenNoModel {}
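For context, the eslint directives hoisted to the top of `HfInference.ts` cover the declaration-merging pattern the class relies on: the empty `interface HfInference extends TaskWithNoAccessToken {}` merges every task function onto the class as a method. A rough sketch of the resulting call sites (token, model, and URL are placeholders):

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token

// textGeneration() is not defined on the class body; it comes from the
// merged `interface HfInference extends TaskWithNoAccessToken {}`.
await hf.textGeneration({ model: "gpt2", inputs: "Hello" });

// HfInferenceEndpoint merges TaskWithNoAccessTokenNoModel instead, so the
// model argument is dropped because the endpoint URL already selects it.
const endpoint = hf.endpoint("https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/gpt2");
await endpoint.textGeneration({ inputs: "Hello" });
```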
package/src/lib/makeRequestOptions.ts
CHANGED
@@ -27,7 +27,15 @@ export async function makeRequestOptions(
   // eslint-disable-next-line @typescript-eslint/no-unused-vars
   const { accessToken, model: _model, ...otherArgs } = args;
   let { model } = args;
-  const {
+  const {
+    forceTask: task,
+    includeCredentials,
+    taskHint,
+    wait_for_model,
+    use_cache,
+    dont_load_model,
+    ...otherOptions
+  } = options ?? {};
 
   const headers: Record<string, string> = {};
   if (accessToken) {
@@ -57,16 +65,16 @@ export async function makeRequestOptions(
 
   if (!binary) {
     headers["Content-Type"] = "application/json";
-  }
-
-
-
-
-
-
-
-
-
+  }
+
+  if (wait_for_model) {
+    headers["X-Wait-For-Model"] = "true";
+  }
+  if (use_cache === false) {
+    headers["X-Use-Cache"] = "false";
+  }
+  if (dont_load_model) {
+    headers["X-Load-Model"] = "0";
   }
 
   const url = (() => {
@@ -81,19 +89,14 @@ export async function makeRequestOptions(
     return `${HF_INFERENCE_API_BASE_URL}/models/${model}`;
   })();
 
-
-
-
-  // So in order to make this backwards compatible, if it's undefined we go to "same-origin" (default behaviour before).
-  // If it's a boolean and set to true then set to "include". If false, don't define credentials at all (useful for edge runtimes)
-  // Then finally, if it's a string, use it as-is.
+  /**
+   * For edge runtimes, leave 'credentials' undefined, otherwise cloudflare workers will error
+   */
   let credentials: RequestCredentials | undefined;
   if (typeof includeCredentials === "string") {
     credentials = includeCredentials as RequestCredentials;
-  } else if (
-    credentials =
-  } else if (includeCredentials === undefined) {
-    credentials = "same-origin";
+  } else if (includeCredentials === true) {
+    credentials = "include";
   }
 
   const info: RequestInit = {
@@ -105,7 +108,7 @@ export async function makeRequestOptions(
       ...otherArgs,
       options: options && otherOptions,
     }),
-    credentials,
+    ...(credentials && { credentials }),
     signal: options?.signal,
   };
 
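The credentials branch rewritten above changes what each `includeCredentials` value produces in the outgoing `fetch` call; here is a hedged sketch of the three cases from the caller's side (token and model are placeholders):

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const args = { model: "gpt2", inputs: "Hello" };

// A string is forwarded to fetch() unchanged as RequestCredentials.
await hf.textGeneration(args, { includeCredentials: "same-origin" });

// `true` becomes credentials: "include".
await hf.textGeneration(args, { includeCredentials: true });

// false (or leaving it undefined) now omits `credentials` entirely, which is
// what keeps requests working on edge runtimes such as Cloudflare Workers.
await hf.textGeneration(args, { includeCredentials: false });
```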
package/src/tasks/nlp/textGeneration.ts
CHANGED
@@ -1,8 +1,210 @@
-import type { TextGenerationInput, TextGenerationOutput } from "@huggingface/tasks/src/tasks/text-generation/inference";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
+/**
+ * Inputs for Text Generation inference
+ */
+export interface TextGenerationInput {
+  /**
+   * The text to initialize generation with
+   */
+  inputs: string;
+  /**
+   * Additional inference parameters
+   */
+  parameters?: TextGenerationParameters;
+  /**
+   * Whether to stream output tokens
+   */
+  stream?: boolean;
+  [property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text Generation
+ */
+export interface TextGenerationParameters {
+  /**
+   * The number of sampling queries to run. Only the best one (in terms of total logprob) will
+   * be returned.
+   */
+  best_of?: number;
+  /**
+   * Whether or not to output decoder input details
+   */
+  decoder_input_details?: boolean;
+  /**
+   * Whether or not to output details
+   */
+  details?: boolean;
+  /**
+   * Whether to use logits sampling instead of greedy decoding when generating new tokens.
+   */
+  do_sample?: boolean;
+  /**
+   * The maximum number of tokens to generate.
+   */
+  max_new_tokens?: number;
+  /**
+   * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+   * paper](https://hf.co/papers/1909.05858) for more details.
+   */
+  repetition_penalty?: number;
+  /**
+   * Whether to prepend the prompt to the generated text.
+   */
+  return_full_text?: boolean;
+  /**
+   * The random sampling seed.
+   */
+  seed?: number;
+  /**
+   * Stop generating tokens if a member of `stop_sequences` is generated.
+   */
+  stop_sequences?: string[];
+  /**
+   * The value used to modulate the logits distribution.
+   */
+  temperature?: number;
+  /**
+   * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+   */
+  top_k?: number;
+  /**
+   * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+   * up to `top_p` or higher are kept for generation.
+   */
+  top_p?: number;
+  /**
+   * Truncate input tokens to the given size.
+   */
+  truncate?: number;
+  /**
+   * Typical Decoding mass. See [Typical Decoding for Natural Language
+   * Generation](https://hf.co/papers/2202.00666) for more information
+   */
+  typical_p?: number;
+  /**
+   * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+   */
+  watermark?: boolean;
+  [property: string]: unknown;
+}
+
+/**
+ * Outputs for Text Generation inference
+ */
+export interface TextGenerationOutput {
+  /**
+   * When enabled, details about the generation
+   */
+  details?: TextGenerationOutputDetails;
+  /**
+   * The generated text
+   */
+  generated_text: string;
+  [property: string]: unknown;
+}
+
+/**
+ * When enabled, details about the generation
+ */
+export interface TextGenerationOutputDetails {
+  /**
+   * Details about additional sequences when best_of is provided
+   */
+  best_of_sequences?: TextGenerationOutputSequenceDetails[];
+  /**
+   * The reason why the generation was stopped.
+   */
+  finish_reason: TextGenerationFinishReason;
+  /**
+   * The number of generated tokens
+   */
+  generated_tokens: number;
+  prefill: TextGenerationPrefillToken[];
+  /**
+   * The random seed used for generation
+   */
+  seed?: number;
+  /**
+   * The generated tokens and associated details
+   */
+  tokens: TextGenerationOutputToken[];
+  /**
+   * Most likely tokens
+   */
+  top_tokens?: Array<TextGenerationOutputToken[]>;
+  [property: string]: unknown;
+}
+
+export interface TextGenerationOutputSequenceDetails {
+  finish_reason: TextGenerationFinishReason;
+  /**
+   * The generated text
+   */
+  generated_text: string;
+  /**
+   * The number of generated tokens
+   */
+  generated_tokens: number;
+  prefill: TextGenerationPrefillToken[];
+  /**
+   * The random seed used for generation
+   */
+  seed?: number;
+  /**
+   * The generated tokens and associated details
+   */
+  tokens: TextGenerationOutputToken[];
+  /**
+   * Most likely tokens
+   */
+  top_tokens?: Array<TextGenerationOutputToken[]>;
+  [property: string]: unknown;
+}
+
+export interface TextGenerationPrefillToken {
+  id: number;
+  logprob: number;
+  /**
+   * The text associated with that token
+   */
+  text: string;
+  [property: string]: unknown;
+}
+
+/**
+ * Generated token.
+ */
+export interface TextGenerationOutputToken {
+  id: number;
+  logprob?: number;
+  /**
+   * Whether or not that token is a special one
+   */
+  special: boolean;
+  /**
+   * The text associated with that token
+   */
+  text: string;
+  [property: string]: unknown;
+}
+
+/**
+ * The reason why the generation was stopped.
+ *
+ * length: The generated sequence reached the maximum allowed length
+ *
+ * eos_token: The model generated an end-of-sentence (EOS) token
+ *
+ * stop_sequence: One of the sequence in stop_sequences was generated
+ */
+export type TextGenerationFinishReason = "length" | "eos_token" | "stop_sequence";
+
 /**
  * Use to continue text from a prompt. This is a very generic task. Recommended model: gpt2 (it’s a simple model, but fun to play with).
  */
package/src/tasks/nlp/textGenerationStream.ts
CHANGED
@@ -1,7 +1,6 @@
 import type { BaseArgs, Options } from "../../types";
 import { streamingRequest } from "../custom/streamingRequest";
-
-import type { TextGenerationInput } from "@huggingface/tasks/src/tasks/text-generation/inference";
+import type { TextGenerationInput } from "./textGeneration";
 
 export interface TextGenerationStreamToken {
   /** Token ID from the model tokenizer */
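Since `textGenerationStream` now pulls `TextGenerationInput` from the local `textGeneration` module, here is a brief sketch of the streaming call it types; the model, prompt, and parameter values are placeholders, and the `chunk.token.text` access assumes the package's streaming output shape:

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token

// textGenerationStream() is an async generator that yields one chunk per token.
for await (const chunk of hf.textGenerationStream({
  model: "gpt2",
  inputs: "The answer to the universe is",
  parameters: { max_new_tokens: 20 },
})) {
  process.stdout.write(chunk.token.text);
}
```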
package/src/types.ts
CHANGED
@@ -32,7 +32,7 @@ export interface Options {
   signal?: AbortSignal;
 
   /**
-   *
+   * Credentials to use for the request. If this is a string, it will be passed straight on. If it's a boolean, true will be "include" and false will not send credentials at all (which defaults to "same-origin" inside browsers).
    */
   includeCredentials?: string | boolean;
 }