npm - @huggingface/tasks - Versions diffs - 0.13.16 → 0.14.0 - Mend

@huggingface/tasks 0.13.16 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165)

package/src/tasks/text-to-audio/spec/output.json CHANGED Viewed

@@ -6,12 +6,13 @@
 	"type": "object",
 	"properties": {
 		"audio": {
-			"description": "The generated audio waveform."
+			"description": "The generated audio waveform.",
+			"comment": "type=binary"
 		},
 		"sampling_rate": {
 			"type": "number",
 			"description": "The sampling rate of the generated audio waveform."
 		}
 	},
-	"required": ["audio", "samplingRate"]
+	"required": ["audio", "sampling_rate"]
 }

package/src/tasks/text-to-image/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text To Image inference
  */
@@ -18,7 +17,6 @@ export interface TextToImageInput {
 	parameters?: TextToImageParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text To Image
  */
@@ -29,9 +27,9 @@ export interface TextToImageParameters {
 	 */
 	guidance_scale?: number;
 	/**
-	 * One or several prompt to guide what NOT to include in image generation.
+	 * One prompt to guide what NOT to include in image generation.
 	 */
-	negative_prompt?: string[];
+	negative_prompt?: string;
 	/**
 	 * The number of denoising steps. More denoising steps usually lead to a higher quality
 	 * image at the expense of slower inference.
@@ -51,7 +49,6 @@ export interface TextToImageParameters {
 	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 /**
  * The size in pixel of the output image
  */
@@ -60,7 +57,6 @@ export interface TargetSize {
 	width: number;
 	[property: string]: unknown;
 }
 /**
  * Outputs of inference for the Text To Image task
  */

package/src/tasks/text-to-image/spec/input.json CHANGED Viewed

@@ -24,11 +24,8 @@
 					"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
 				},
 				"negative_prompt": {
-					"type": "array",
-					"items": {
-						"type": "string"
-					},
-					"description": "One or several prompt to guide what NOT to include in image generation."
+					"type": "string",
+					"description": "One prompt to guide what NOT to include in image generation."
 				},
 				"num_inference_steps": {
 					"type": "integer",

package/src/tasks/text-to-speech/inference.ts CHANGED Viewed

@@ -1,9 +1,22 @@
+/**
+ * Outputs of inference for the Text To Speech task
+ */
+export interface TextToSpeechOutput {
+	/**
+	 * The generated audio
+	 */
+	audio: Blob;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	sampling_rate?: number;
+	[property: string]: unknown;
+}
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text To Speech inference
  */
@@ -18,7 +31,6 @@ export interface TextToSpeechInput {
 	parameters?: TextToSpeechParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text To Speech
  */
@@ -29,7 +41,6 @@ export interface TextToSpeechParameters {
 	generation_parameters?: GenerationParameters;
 	[property: string]: unknown;
 }
 /**
  * Parametrization of the text generation process
  */
@@ -116,26 +127,7 @@ export interface GenerationParameters {
 	use_cache?: boolean;
 	[property: string]: unknown;
 }
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-/**
- * Outputs for Text to Speech inference
- *
- * Outputs of inference for the Text To Audio task
- */
-export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	samplingRate: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	sampling_rate?: number;
-	[property: string]: unknown;
-}

package/src/tasks/text-to-speech/spec/output.json CHANGED Viewed

@@ -1,7 +1,18 @@
 {
-	"$ref": "/inference/schemas/text-to-audio/output.json",
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Speech task",
 	"title": "TextToSpeechOutput",
-	"description": "Outputs for Text to Speech inference"
+	"type": "object",
+	"properties": {
+		"audio": {
+			"description": "The generated audio",
+			"comment": "type=binary"
+		},
+		"sampling_rate": {
+			"type": "number",
+			"description": "The sampling rate of the generated audio waveform."
+		}
+	},
+	"required": ["audio"]
 }

package/src/tasks/text-to-video/inference.ts ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Text To Video inference
+ */
+export interface TextToVideoInput {
+	/**
+	 * The input text data (sometimes called "prompt")
+	 */
+	inputs: string;
+	/**
+	 * Additional inference parameters for Text To Video
+	 */
+	parameters?: TextToVideoParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters for Text To Video
+ */
+export interface TextToVideoParameters {
+	/**
+	 * A higher guidance scale value encourages the model to generate images closely linked to
+	 * the text prompt, but values too high may cause saturation and other artifacts.
+	 */
+	guidance_scale?: number;
+	/**
+	 * One or several prompt to guide what NOT to include in image generation.
+	 */
+	negative_prompt?: string[];
+	/**
+	 * The num_frames parameter determines how many video frames are generated.
+	 */
+	num_frames?: number;
+	/**
+	 * The number of denoising steps. More denoising steps usually lead to a higher quality
+	 * image at the expense of slower inference.
+	 */
+	num_inference_steps?: number;
+	/**
+	 * Seed for the random number generator.
+	 */
+	seed?: number;
+	[property: string]: unknown;
+}
+/**
+ * Outputs of inference for the Text To Video task
+ */
+export interface TextToVideoOutput {
+	/**
+	 * The generated video returned as raw bytes in the payload.
+	 */
+	video: unknown;
+	[property: string]: unknown;
+}

package/src/tasks/text-to-video/spec/input.json ADDED Viewed

@@ -0,0 +1,49 @@
+{
+	"$id": "/inference/schemas/text-to-video/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Video inference",
+	"title": "TextToVideoInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input text data (sometimes called \"prompt\")",
+			"type": "string"
+		},
+		"parameters": {
+			"description": "Additional inference parameters for Text To Video",
+			"$ref": "#/$defs/TextToVideoParameters"
+		}
+	},
+	"$defs": {
+		"TextToVideoParameters": {
+			"title": "TextToVideoParameters",
+			"type": "object",
+			"properties": {
+				"num_frames": {
+					"type": "number",
+					"description": "The num_frames parameter determines how many video frames are generated."
+				},
+				"guidance_scale": {
+					"type": "number",
+					"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
+				},
+				"negative_prompt": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "One or several prompt to guide what NOT to include in image generation."
+				},
+				"num_inference_steps": {
+					"type": "integer",
+					"description": "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"seed": {
+					"type": "integer",
+					"description": "Seed for the random number generator."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}

package/src/tasks/text-to-video/spec/output.json ADDED Viewed

@@ -0,0 +1,13 @@
+{
+	"$id": "/inference/schemas/text-to-video/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Video task",
+	"title": "TextToVideoOutput",
+	"type": "object",
+	"properties": {
+		"video": {
+			"description": "The generated video returned as raw bytes in the payload."
+		}
+	},
+	"required": ["video"]
+}

package/src/tasks/text2text-generation/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Text2text Generation inference
  */
@@ -18,7 +17,6 @@ export interface Text2TextGenerationInput {
 	parameters?: Text2TextGenerationParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Text2text Generation
  */
@@ -30,16 +28,16 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generate_parameters?: { [key: string]: unknown };
+	generate_parameters?: {
+		[key: string]: unknown;
+	};
 	/**
 	 * The truncation strategy to use
 	 */
 	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 /**
  * Outputs of inference for the Text2text Generation task
  */

package/src/tasks/translation/inference.ts CHANGED Viewed

@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
 /**
  * Inputs for Translation inference
  */
@@ -18,7 +17,6 @@ export interface TranslationInput {
 	parameters?: TranslationParameters;
 	[property: string]: unknown;
 }
 /**
  * Additional inference parameters for Translation
  */
@@ -30,7 +28,9 @@ export interface TranslationParameters {
 	/**
 	 * Additional parametrization of the text generation algorithm.
 	 */
-	generate_parameters?: { [key: string]: unknown };
+	generate_parameters?: {
+		[key: string]: unknown;
+	};
 	/**
 	 * The source language of the text. Required for models that can translate from multiple
 	 * languages.
@@ -47,12 +47,10 @@ export interface TranslationParameters {
 	truncation?: TranslationTruncationStrategy;
 	[property: string]: unknown;
 }
 /**
  * The truncation strategy to use.
  */
 export type TranslationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 /**
  * Outputs of inference for the Translation task
  */

package/src/tasks/visual-question-answering/inference.ts CHANGED Viewed

@@ -28,7 +28,7 @@ export interface VisualQuestionAnsweringInputData {
 	/**
 	 * The question to answer based on the image.
 	 */
-	question: unknown;
+	question: string;
 	[property: string]: unknown;
 }
 /**

package/src/tasks/visual-question-answering/spec/input.json CHANGED Viewed

@@ -11,10 +11,12 @@
 			"title": "VisualQuestionAnsweringInputData",
 			"properties": {
 				"image": {
-					"description": "The image."
+					"description": "The image.",
+					"comment": "type=binary"
 				},
 				"question": {
-					"description": "The question to answer based on the image."
+					"description": "The question to answer based on the image.",
+					"type": "string"
 				}
 			},
 			"required": ["question", "image"]

package/src/tasks/zero-shot-image-classification/inference.ts CHANGED Viewed

@@ -10,7 +10,7 @@ export interface ZeroShotImageClassificationInput {
 	/**
 	 * The input image data to classify as a base64-encoded string.
 	 */
-	inputs: string;
+	inputs: Blob;
 	/**
 	 * Additional inference parameters for Zero Shot Image Classification
 	 */

package/src/tasks/zero-shot-image-classification/spec/input.json CHANGED Viewed

@@ -7,7 +7,8 @@
 	"properties": {
 		"inputs": {
 			"type": "string",
-			"description": "The input image data to classify as a base64-encoded string."
+			"description": "The input image data to classify as a base64-encoded string.",
+			"comment": "type=binary"
 		},
 		"parameters": {
 			"description": "Additional inference parameters for Zero Shot Image Classification",

package/src/tasks/zero-shot-object-detection/inference.ts CHANGED Viewed

@@ -10,7 +10,7 @@ export interface ZeroShotObjectDetectionInput {
 	/**
 	 * The input image data as a base64-encoded string.
 	 */
-	inputs: string;
+	inputs: Blob;
 	/**
 	 * Additional inference parameters for Zero Shot Object Detection
 	 */

package/src/tasks/zero-shot-object-detection/spec/input.json CHANGED Viewed

@@ -7,7 +7,8 @@
 	"properties": {
 		"inputs": {
 			"description": "The input image data as a base64-encoded string.",
-			"type": "string"
+			"type": "string",
+			"comment": "type=binary"
 		},
 		"parameters": {
 			"description": "Additional inference parameters for Zero Shot Object Detection",