npm - @huggingface/tasks - Versions diffs - 0.13.17 → 0.14.0 - Mend

@huggingface/tasks 0.13.17 → 0.14.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (155)

package/src/tasks/image-to-image/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Image To Image inference
  */
@@ -12,14 +11,13 @@ export interface ImageToImageInput {
 	 * The input image data as a base64-encoded string. If no `parameters` are provided, you can
 	 * also provide the image data as a raw bytes payload.
 	 */
-	inputs: string;
+	inputs: Blob;
 	/**
 	 * Additional inference parameters for Image To Image
 	 */
 	parameters?: ImageToImageParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Image To Image
  */
@@ -30,9 +28,9 @@ export interface ImageToImageParameters {
 	 */
 	guidance_scale?: number;
 	/**
-	 * One or several prompt to guide what NOT to include in image generation.
+	 * One prompt to guide what NOT to include in image generation.
 	 */
-	negative_prompt?: string[];
+	negative_prompt?: string;
 	/**
 	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
 	 * a higher quality image at the expense of slower inference.
@@ -44,7 +42,6 @@ export interface ImageToImageParameters {
 	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 /**
  * The size in pixel of the output image.
  */
@@ -53,7 +50,6 @@ export interface TargetSize {
 	width: number;
 	[property: string]: unknown;
 }
 /**
  * Outputs of inference for the Image To Image task
  */

package/src/tasks/image-to-image/spec/input.json CHANGED Viewed

@@ -7,7 +7,8 @@
 	"properties": {
 		"inputs": {
 			"type": "string",
-			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
+			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
+			"comment": "type=binary"
 		},
 		"parameters": {
 			"description": "Additional inference parameters for Image To Image",
@@ -24,11 +25,8 @@
 					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
 				},
 				"negative_prompt": {
-					"type": "array",
-					"items": {
-						"type": "string"
-					},
-					"description": "One or several prompt to guide what NOT to include in image generation."
+					"type": "string",
+					"description": "One prompt to guide what NOT to include in image generation."
 				},
 				"num_inference_steps": {
 					"type": "integer",

package/src/tasks/image-to-text/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Image To Text inference
  */
@@ -11,14 +10,13 @@ export interface ImageToTextInput {
 	/**
 	 * The input image data
 	 */
-	inputs: unknown;
+	inputs: Blob;
 	/**
 	 * Additional inference parameters for Image To Text
 	 */
 	parameters?: ImageToTextParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Image To Text
  */
@@ -33,7 +31,6 @@ export interface ImageToTextParameters {
 	max_new_tokens?: number;
 	[property: string]: unknown;
 }
 /**
  * Parametrization of the text generation process
  */
@@ -120,12 +117,10 @@ export interface GenerationParameters {
 	use_cache?: boolean;
 	[property: string]: unknown;
 }
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
 /**
  * Outputs of inference for the Image To Text task
  */

package/src/tasks/image-to-text/spec/input.json CHANGED Viewed

@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input image data"
+			"description": "The input image data",
+			"comment": "type=binary"
 		},
 		"parameters": {
 			"description": "Additional inference parameters for Image To Text",

package/src/tasks/index.ts CHANGED Viewed

@@ -73,6 +73,7 @@ export type * from "./sentence-similarity/inference.js";
 export type * from "./summarization/inference.js";
 export type * from "./table-question-answering/inference.js";
 export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference.js";
+export type { TextToVideoParameters, TextToVideoOutput, TextToVideoInput } from "./text-to-video/inference.js";
 export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference.js";
 export type * from "./token-classification/inference.js";
 export type { TranslationInput, TranslationOutput } from "./translation/inference.js";

package/src/tasks/object-detection/inference.ts CHANGED Viewed

@@ -11,7 +11,7 @@ export interface ObjectDetectionInput {
 	 * The input image data as a base64-encoded string. If no `parameters` are provided, you can
 	 * also provide the image data as a raw bytes payload.
 	 */
-	inputs: string;
+	inputs: Blob;
 	/**
 	 * Additional inference parameters for Object Detection
 	 */

package/src/tasks/object-detection/spec/input.json CHANGED Viewed

@@ -7,7 +7,8 @@
 	"properties": {
 		"inputs": {
 			"type": "string",
-			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
+			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
+			"comment": "type=binary"
 		},
 		"parameters": {
 			"description": "Additional inference parameters for Object Detection",

package/src/tasks/sentence-similarity/inference.ts CHANGED Viewed

@@ -3,9 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
 export type SentenceSimilarityOutput = number[];
 /**
  * Inputs for Sentence similarity inference
  */
@@ -14,10 +12,11 @@ export interface SentenceSimilarityInput {
 	/**
 	 * Additional inference parameters for Sentence Similarity
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
 export interface SentenceSimilarityInputData {
 	/**
 	 * A list of strings which will be compared against the source_sentence.

package/src/tasks/summarization/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Summarization inference
  */
@@ -18,7 +17,6 @@ export interface SummarizationInput {
 	parameters?: SummarizationParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for summarization.
  */
@@ -30,19 +28,19 @@ export interface SummarizationParameters {
 	/**
 	 * Additional parametrization of the text generation algorithm.
 	 */
-	generate_parameters?: { [key: string]: unknown };
+	generate_parameters?: {
+		[key: string]: unknown;
+	};
 	/**
 	 * The truncation strategy to use.
 	 */
 	truncation?: SummarizationTruncationStrategy;
 	[property: string]: unknown;
 }
 /**
  * The truncation strategy to use.
  */
 export type SummarizationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 /**
  * Outputs of inference for the Summarization task
  */

package/src/tasks/text-generation/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Text Generation Input.
  *
@@ -17,7 +16,6 @@ export interface TextGenerationInput {
 	stream?: boolean;
 	[property: string]: unknown;
 }
 export interface TextGenerationInputGenerateParameters {
 	/**
 	 * Lora adapter id
@@ -100,7 +98,6 @@ export interface TextGenerationInputGenerateParameters {
 	watermark?: boolean;
 	[property: string]: unknown;
 }
 export interface TextGenerationInputGrammarType {
 	type: Type;
 	/**
@@ -112,9 +109,7 @@ export interface TextGenerationInputGrammarType {
 	value: unknown;
 	[property: string]: unknown;
 }
 export type Type = "json" | "regex";
 /**
  * Text Generation Output.
  *
@@ -127,7 +122,6 @@ export interface TextGenerationOutput {
 	generated_text: string;
 	[property: string]: unknown;
 }
 export interface TextGenerationOutputDetails {
 	best_of_sequences?: TextGenerationOutputBestOfSequence[];
 	finish_reason: TextGenerationOutputFinishReason;
@@ -138,7 +132,6 @@ export interface TextGenerationOutputDetails {
 	top_tokens?: Array<TextGenerationOutputToken[]>;
 	[property: string]: unknown;
 }
 export interface TextGenerationOutputBestOfSequence {
 	finish_reason: TextGenerationOutputFinishReason;
 	generated_text: string;
@@ -149,16 +142,13 @@ export interface TextGenerationOutputBestOfSequence {
 	top_tokens?: Array<TextGenerationOutputToken[]>;
 	[property: string]: unknown;
 }
 export type TextGenerationOutputFinishReason = "length" | "eos_token" | "stop_sequence";
 export interface TextGenerationOutputPrefillToken {
 	id: number;
 	logprob: number;
 	text: string;
 	[property: string]: unknown;
 }
 export interface TextGenerationOutputToken {
 	id: number;
 	logprob: number;
@@ -166,7 +156,6 @@ export interface TextGenerationOutputToken {
 	text: string;
 	[property: string]: unknown;
 }
 /**
  * Text Generation Stream Output.
  *
@@ -182,7 +171,6 @@ export interface TextGenerationStreamOutput {
 	top_tokens?: TextGenerationStreamOutputToken[];
 	[property: string]: unknown;
 }
 export interface TextGenerationStreamOutputStreamDetails {
 	finish_reason: TextGenerationOutputFinishReason;
 	generated_tokens: number;
@@ -190,7 +178,6 @@ export interface TextGenerationStreamOutputStreamDetails {
 	seed?: number;
 	[property: string]: unknown;
 }
 export interface TextGenerationStreamOutputToken {
 	id: number;
 	logprob: number;

package/src/tasks/text-to-audio/inference.ts CHANGED Viewed

@@ -1,9 +1,22 @@
+/**
+ * Outputs of inference for the Text To Audio task
+ */
+export interface TextToAudioOutput {
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: Blob;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	sampling_rate: number;
+	[property: string]: unknown;
+}
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text To Audio inference
  */
@@ -18,7 +31,6 @@ export interface TextToAudioInput {
 	parameters?: TextToAudioParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text To Audio
  */
@@ -29,7 +41,6 @@ export interface TextToAudioParameters {
 	generation_parameters?: GenerationParameters;
 	[property: string]: unknown;
 }
 /**
  * Parametrization of the text generation process
  */
@@ -116,24 +127,7 @@ export interface GenerationParameters {
 	use_cache?: boolean;
 	[property: string]: unknown;
 }
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-/**
- * Outputs of inference for the Text To Audio task
- */
-export interface TextToAudioOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	samplingRate: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	sampling_rate?: number;
-	[property: string]: unknown;
-}

package/src/tasks/text-to-audio/spec/output.json CHANGED Viewed

@@ -6,12 +6,13 @@
 	"type": "object",
 	"properties": {
 		"audio": {
-			"description": "The generated audio waveform."
+			"description": "The generated audio waveform.",
+			"comment": "type=binary"
 		},
 		"sampling_rate": {
 			"type": "number",
 			"description": "The sampling rate of the generated audio waveform."
 		}
 	},
-	"required": ["audio", "samplingRate"]
+	"required": ["audio", "sampling_rate"]
 }

package/src/tasks/text-to-image/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text To Image inference
  */
@@ -18,7 +17,6 @@ export interface TextToImageInput {
 	parameters?: TextToImageParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text To Image
  */
@@ -29,9 +27,9 @@ export interface TextToImageParameters {
 	 */
 	guidance_scale?: number;
 	/**
-	 * One or several prompt to guide what NOT to include in image generation.
+	 * One prompt to guide what NOT to include in image generation.
 	 */
-	negative_prompt?: string[];
+	negative_prompt?: string;
 	/**
 	 * The number of denoising steps. More denoising steps usually lead to a higher quality
 	 * image at the expense of slower inference.
@@ -51,7 +49,6 @@ export interface TextToImageParameters {
 	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 /**
  * The size in pixel of the output image
  */
@@ -60,7 +57,6 @@ export interface TargetSize {
 	width: number;
 	[property: string]: unknown;
 }
 /**
  * Outputs of inference for the Text To Image task
  */

package/src/tasks/text-to-image/spec/input.json CHANGED Viewed

@@ -24,11 +24,8 @@
 					"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
 				},
 				"negative_prompt": {
-					"type": "array",
-					"items": {
-						"type": "string"
-					},
-					"description": "One or several prompt to guide what NOT to include in image generation."
+					"type": "string",
+					"description": "One prompt to guide what NOT to include in image generation."
 				},
 				"num_inference_steps": {
 					"type": "integer",

package/src/tasks/text-to-speech/inference.ts CHANGED Viewed

@@ -1,9 +1,22 @@
+/**
+ * Outputs of inference for the Text To Speech task
+ */
+export interface TextToSpeechOutput {
+	/**
+	 * The generated audio
+	 */
+	audio: Blob;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	sampling_rate?: number;
+	[property: string]: unknown;
+}
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text To Speech inference
  */
@@ -18,7 +31,6 @@ export interface TextToSpeechInput {
 	parameters?: TextToSpeechParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text To Speech
  */
@@ -29,7 +41,6 @@ export interface TextToSpeechParameters {
 	generation_parameters?: GenerationParameters;
 	[property: string]: unknown;
 }
 /**
  * Parametrization of the text generation process
  */
@@ -116,26 +127,7 @@ export interface GenerationParameters {
 	use_cache?: boolean;
 	[property: string]: unknown;
 }
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-/**
- * Outputs for Text to Speech inference
- *
- * Outputs of inference for the Text To Audio task
- */
-export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	samplingRate: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	sampling_rate?: number;
-	[property: string]: unknown;
-}

package/src/tasks/text-to-speech/spec/output.json CHANGED Viewed

@@ -1,7 +1,18 @@
 {
-	"$ref": "/inference/schemas/text-to-audio/output.json",
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Speech task",
 	"title": "TextToSpeechOutput",
-	"description": "Outputs for Text to Speech inference"
+	"type": "object",
+	"properties": {
+		"audio": {
+			"description": "The generated audio",
+			"comment": "type=binary"
+		},
+		"sampling_rate": {
+			"type": "number",
+			"description": "The sampling rate of the generated audio waveform."
+		}
+	},
+	"required": ["audio"]
 }

package/src/tasks/text-to-video/inference.ts ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Text To Video inference
+ */
+export interface TextToVideoInput {
+	/**
+	 * The input text data (sometimes called "prompt")
+	 */
+	inputs: string;
+	/**
+	 * Additional inference parameters for Text To Video
+	 */
+	parameters?: TextToVideoParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters for Text To Video
+ */
+export interface TextToVideoParameters {
+	/**
+	 * A higher guidance scale value encourages the model to generate images closely linked to
+	 * the text prompt, but values too high may cause saturation and other artifacts.
+	 */
+	guidance_scale?: number;
+	/**
+	 * One or several prompt to guide what NOT to include in image generation.
+	 */
+	negative_prompt?: string[];
+	/**
+	 * The num_frames parameter determines how many video frames are generated.
+	 */
+	num_frames?: number;
+	/**
+	 * The number of denoising steps. More denoising steps usually lead to a higher quality
+	 * image at the expense of slower inference.
+	 */
+	num_inference_steps?: number;
+	/**
+	 * Seed for the random number generator.
+	 */
+	seed?: number;
+	[property: string]: unknown;
+}
+/**
+ * Outputs of inference for the Text To Video task
+ */
+export interface TextToVideoOutput {
+	/**
+	 * The generated video returned as raw bytes in the payload.
+	 */
+	video: unknown;
+	[property: string]: unknown;
+}

package/src/tasks/text-to-video/spec/input.json ADDED Viewed

@@ -0,0 +1,49 @@
+{
+	"$id": "/inference/schemas/text-to-video/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Video inference",
+	"title": "TextToVideoInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input text data (sometimes called \"prompt\")",
+			"type": "string"
+		},
+		"parameters": {
+			"description": "Additional inference parameters for Text To Video",
+			"$ref": "#/$defs/TextToVideoParameters"
+		}
+	},
+	"$defs": {
+		"TextToVideoParameters": {
+			"title": "TextToVideoParameters",
+			"type": "object",
+			"properties": {
+				"num_frames": {
+					"type": "number",
+					"description": "The num_frames parameter determines how many video frames are generated."
+				},
+				"guidance_scale": {
+					"type": "number",
+					"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
+				},
+				"negative_prompt": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "One or several prompt to guide what NOT to include in image generation."
+				},
+				"num_inference_steps": {
+					"type": "integer",
+					"description": "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"seed": {
+					"type": "integer",
+					"description": "Seed for the random number generator."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/text-to-video/spec/output.json ADDED Viewed

@@ -0,0 +1,13 @@
+{
+	"$id": "/inference/schemas/text-to-video/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Video task",
+	"title": "TextToVideoOutput",
+	"type": "object",
+	"properties": {
+		"video": {
+			"description": "The generated video returned as raw bytes in the payload."
+		}
+	},
+	"required": ["video"]
+}