@huggingface/tasks 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -549,7 +549,7 @@ interface ModelData {
  base_model_name?: string;
  task_type?: string;
  };
- tokenizer?: TokenizerConfig;
+ tokenizer_config?: TokenizerConfig;
  };
  /**
  * all the model tags
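Downstream code that still reads the old field can bridge the rename with a tiny helper. A minimal sketch, assuming TokenizerConfig is re-exported from the package root; the helper itself is illustrative, not part of the package:

import type { TokenizerConfig } from "@huggingface/tasks";

// Illustrative: prefer the renamed field, fall back to the pre-rename one.
function getTokenizerConfig(config: {
  tokenizer?: TokenizerConfig;
  tokenizer_config?: TokenizerConfig;
}): TokenizerConfig | undefined {
  return config.tokenizer_config ?? config.tokenizer;
}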
@@ -575,7 +575,7 @@ interface ModelData {
  */
  widgetData?: WidgetExample[] | undefined;
  /**
- * Parameters that will be used by the widget when calling Inference Endpoints (serverless)
+ * Parameters that will be used by the widget when calling Inference API (serverless)
  * https://huggingface.co/docs/api-inference/detailed_parameters
  *
  * can be set in the model card metadata (under `inference/parameters`)
@@ -732,6 +732,13 @@ declare const MODEL_LIBRARIES_UI_ELEMENTS: {
  };
  };
  };
+ audiocraft: {
+ prettyLabel: string;
+ repoName: string;
+ repoUrl: string;
+ snippets: (model: ModelData) => string[];
+ filter: false;
+ };
  bertopic: {
  prettyLabel: string;
  repoName: string;
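The new audiocraft entry follows the same shape as the other library descriptors: a display label, repo metadata, and a snippets function that maps a ModelData to copy-pasteable code strings. A hedged usage sketch, assuming MODEL_LIBRARIES_UI_ELEMENTS and ModelData are exported at runtime as declared above:

import { MODEL_LIBRARIES_UI_ELEMENTS, type ModelData } from "@huggingface/tasks";

// Look up the new descriptor and render its code snippets for a given model;
// the snippet contents come from the package, not from this example.
function audiocraftSnippets(model: ModelData): string[] {
  return MODEL_LIBRARIES_UI_ELEMENTS.audiocraft.snippets(model);
}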
@@ -1051,8 +1058,8 @@ declare const MODEL_LIBRARIES_UI_ELEMENTS: {
  };
  };
  type ModelLibraryKey = keyof typeof MODEL_LIBRARIES_UI_ELEMENTS;
- declare const ALL_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
- declare const ALL_DISPLAY_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
+ declare const ALL_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "audiocraft" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
+ declare const ALL_DISPLAY_MODEL_LIBRARY_KEYS: ("sklearn" | "adapter-transformers" | "allennlp" | "asteroid" | "audiocraft" | "bertopic" | "diffusers" | "doctr" | "espnet" | "fairseq" | "fastai" | "fasttext" | "flair" | "keras" | "k2" | "mindspore" | "ml-agents" | "mlx" | "nemo" | "open_clip" | "paddlenlp" | "peft" | "pyannote-audio" | "pythae" | "sample-factory" | "sentence-transformers" | "setfit" | "spacy" | "span-marker" | "speechbrain" | "stable-baselines3" | "stanza" | "tensorflowtts" | "timm" | "transformers" | "transformers.js" | "unity-sentis")[];
 
  /**
  * Mapping from library name (excluding Transformers) to its supported tasks.
@@ -1066,6 +1073,1987 @@ declare const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelL
  type PerLanguageMapping = Map<WidgetType, string[] | WidgetExample[]>;
  declare const MAPPING_DEFAULT_WIDGET: Map<string, PerLanguageMapping>;
 
1076
+ /**
1077
+ * Inference code generated from the JSON schema spec in ./spec
1078
+ *
1079
+ * Using src/scripts/inference-codegen
1080
+ */
1081
+ /**
1082
+ * Inputs for Audio Classification inference
1083
+ */
1084
+ interface AudioClassificationInput {
1085
+ /**
1086
+ * The input audio data
1087
+ */
1088
+ inputs: unknown;
1089
+ /**
1090
+ * Additional inference parameters
1091
+ */
1092
+ parameters?: AudioClassificationParameters;
1093
+ [property: string]: unknown;
1094
+ }
1095
+ /**
1096
+ * Additional inference parameters
1097
+ *
1098
+ * Additional inference parameters for Audio Classification
1099
+ */
1100
+ interface AudioClassificationParameters {
1101
+ function_to_apply?: ClassificationOutputTransform$3;
1102
+ /**
1103
+ * When specified, limits the output to the top K most probable classes.
1104
+ */
1105
+ top_k?: number;
1106
+ [property: string]: unknown;
1107
+ }
1108
+ /**
1109
+ * The function to apply to the model outputs in order to retrieve the scores.
1110
+ */
1111
+ type ClassificationOutputTransform$3 = "sigmoid" | "softmax" | "none";
1112
+ type AudioClassificationOutput = AudioClassificationOutputElement[];
1113
+ /**
1114
+ * Outputs for Audio Classification inference
1115
+ */
1116
+ interface AudioClassificationOutputElement {
1117
+ /**
1118
+ * The predicted class label.
1119
+ */
1120
+ label: string;
1121
+ /**
1122
+ * The corresponding probability.
1123
+ */
1124
+ score: number;
1125
+ [property: string]: unknown;
1126
+ }
1127
+
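As a usage sketch, these declarations let a caller type-check the request payload and rank the response. The import path assumes the types are re-exported from the package root:

import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks";

const request: AudioClassificationInput = {
  inputs: "<binary audio or base64 string>", // typed as unknown in the schema
  parameters: { top_k: 3, function_to_apply: "softmax" },
};

// Return the most probable label from a typed response.
function bestLabel(output: AudioClassificationOutput): string | undefined {
  return [...output].sort((a, b) => b.score - a.score)[0]?.label;
}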
1128
+ /**
1129
+ * Inference code generated from the JSON schema spec in ./spec
1130
+ *
1131
+ * Using src/scripts/inference-codegen
1132
+ */
1133
+ /**
1134
+ * Inputs for Automatic Speech Recognition inference
1135
+ */
1136
+ interface AutomaticSpeechRecognitionInput {
1137
+ /**
1138
+ * The input audio data
1139
+ */
1140
+ inputs: unknown;
1141
+ /**
1142
+ * Additional inference parameters
1143
+ */
1144
+ parameters?: AutomaticSpeechRecognitionParameters;
1145
+ [property: string]: unknown;
1146
+ }
1147
+ /**
1148
+ * Additional inference parameters
1149
+ *
1150
+ * Additional inference parameters for Automatic Speech Recognition
1151
+ */
1152
+ interface AutomaticSpeechRecognitionParameters {
1153
+ /**
1154
+ * Parametrization of the text generation process
1155
+ */
1156
+ generate?: GenerationParameters$2;
1157
+ /**
1158
+ * Whether to output corresponding timestamps with the generated text
1159
+ */
1160
+ return_timestamps?: boolean;
1161
+ [property: string]: unknown;
1162
+ }
1163
+ /**
1164
+ * Parametrization of the text generation process
1165
+ *
1166
+ * Ad-hoc parametrization of the text generation process
1167
+ */
1168
+ interface GenerationParameters$2 {
1169
+ /**
1170
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
1171
+ */
1172
+ do_sample?: boolean;
1173
+ /**
1174
+ * Controls the stopping condition for beam-based methods.
1175
+ */
1176
+ early_stopping?: EarlyStoppingUnion$2;
1177
+ /**
1178
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
1179
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
1180
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
1181
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
1182
+ */
1183
+ epsilon_cutoff?: number;
1184
+ /**
1185
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
1186
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
1187
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
1188
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
1189
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
1190
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
1191
+ * for more details.
1192
+ */
1193
+ eta_cutoff?: number;
1194
+ /**
1195
+ * The maximum length (in tokens) of the generated text, including the input.
1196
+ */
1197
+ max_length?: number;
1198
+ /**
1199
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
1200
+ */
1201
+ max_new_tokens?: number;
1202
+ /**
1203
+ * The minimum length (in tokens) of the generated text, including the input.
1204
+ */
1205
+ min_length?: number;
1206
+ /**
1207
+ * The minimum number of tokens to generate. Takes precedence over minLength.
1208
+ */
1209
+ min_new_tokens?: number;
1210
+ /**
1211
+ * Number of groups to divide num_beams into in order to ensure diversity among different
1212
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
1213
+ */
1214
+ num_beam_groups?: number;
1215
+ /**
1216
+ * Number of beams to use for beam search.
1217
+ */
1218
+ num_beams?: number;
1219
+ /**
1220
+ * The value balances the model confidence and the degeneration penalty in contrastive
1221
+ * search decoding.
1222
+ */
1223
+ penalty_alpha?: number;
1224
+ /**
1225
+ * The value used to modulate the next token probabilities.
1226
+ */
1227
+ temperature?: number;
1228
+ /**
1229
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
1230
+ */
1231
+ top_k?: number;
1232
+ /**
1233
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
1234
+ * that add up to top_p or higher are kept for generation.
1235
+ */
1236
+ top_p?: number;
1237
+ /**
1238
+ * Local typicality measures how similar the conditional probability of predicting a target
1239
+ * token next is to the expected conditional probability of predicting a random token next,
1240
+ * given the partial text already generated. If set to float < 1, the smallest set of the
1241
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
1242
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
1243
+ */
1244
+ typical_p?: number;
1245
+ /**
1246
+ * Whether the model should use the past last key/values attentions to speed up decoding
1247
+ */
1248
+ use_cache?: boolean;
1249
+ [property: string]: unknown;
1250
+ }
1251
+ /**
1252
+ * Controls the stopping condition for beam-based methods.
1253
+ */
1254
+ type EarlyStoppingUnion$2 = boolean | "never";
1255
+ /**
1256
+ * Outputs of inference for the Automatic Speech Recognition task
1257
+ */
1258
+ interface AutomaticSpeechRecognitionOutput {
1259
+ /**
1260
+ * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
1261
+ * the model.
1262
+ */
1263
+ chunks?: AutomaticSpeechRecognitionOutputChunk[];
1264
+ /**
1265
+ * The recognized text.
1266
+ */
1267
+ text: string;
1268
+ [property: string]: unknown;
1269
+ }
1270
+ interface AutomaticSpeechRecognitionOutputChunk {
1271
+ /**
1272
+ * A chunk of text identified by the model
1273
+ */
1274
+ text: string;
1275
+ /**
1276
+ * The start and end timestamps corresponding with the text
1277
+ */
1278
+ timestamps: number[];
1279
+ [property: string]: unknown;
1280
+ }
1281
+
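A small consumer sketch for the output shape: chunks is only populated when return_timestamps was requested (import path assumed):

import type { AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";

// Render "[start - end] text" lines from timestamped chunks, or fall back to the flat text.
function formatTranscript(output: AutomaticSpeechRecognitionOutput): string {
  if (!output.chunks) return output.text;
  return output.chunks
    .map((c) => `[${c.timestamps[0]}s - ${c.timestamps[1]}s] ${c.text}`)
    .join("\n");
}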
1282
+ /**
1283
+ * Inference code generated from the JSON schema spec in ./spec
1284
+ *
1285
+ * Using src/scripts/inference-codegen
1286
+ */
1287
+ /**
1288
+ * Inputs for Document Question Answering inference
1289
+ */
1290
+ interface DocumentQuestionAnsweringInput {
1291
+ /**
1292
+ * One (document, question) pair to answer
1293
+ */
1294
+ inputs: DocumentQuestionAnsweringInputData;
1295
+ /**
1296
+ * Additional inference parameters
1297
+ */
1298
+ parameters?: DocumentQuestionAnsweringParameters;
1299
+ [property: string]: unknown;
1300
+ }
1301
+ /**
1302
+ * One (document, question) pair to answer
1303
+ */
1304
+ interface DocumentQuestionAnsweringInputData {
1305
+ /**
1306
+ * The image on which the question is asked
1307
+ */
1308
+ image: unknown;
1309
+ /**
1310
+ * A question to ask of the document
1311
+ */
1312
+ question: string;
1313
+ [property: string]: unknown;
1314
+ }
1315
+ /**
1316
+ * Additional inference parameters
1317
+ *
1318
+ * Additional inference parameters for Document Question Answering
1319
+ */
1320
+ interface DocumentQuestionAnsweringParameters {
1321
+ /**
1322
+ * If the words in the document are too long to fit with the question for the model, it will
1323
+ * be split in several chunks with some overlap. This argument controls the size of that
1324
+ * overlap.
1325
+ */
1326
+ doc_stride?: number;
1327
+ /**
1328
+ * Whether to accept impossible as an answer
1329
+ */
1330
+ handle_impossible_answer?: boolean;
1331
+ /**
1332
+ * Language to use while running OCR. Defaults to English.
1333
+ */
1334
+ lang?: string;
1335
+ /**
1336
+ * The maximum length of predicted answers (e.g., only answers with a shorter length are
1337
+ * considered).
1338
+ */
1339
+ max_answer_len?: number;
1340
+ /**
1341
+ * The maximum length of the question after tokenization. It will be truncated if needed.
1342
+ */
1343
+ max_question_len?: number;
1344
+ /**
1345
+ * The maximum length of the total sentence (context + question) in tokens of each chunk
1346
+ * passed to the model. The context will be split in several chunks (using doc_stride as
1347
+ * overlap) if needed.
1348
+ */
1349
+ max_seq_len?: number;
1350
+ /**
1351
+ * The number of answers to return (will be chosen by order of likelihood). Can return less
1352
+ * than top_k answers if there are not enough options available within the context.
1353
+ */
1354
+ top_k?: number;
1355
+ /**
1356
+ * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
1357
+ * skip the OCR step and use the provided bounding boxes instead.
1358
+ */
1359
+ word_boxes?: WordBox[];
1360
+ [property: string]: unknown;
1361
+ }
1362
+ type WordBox = number[] | string;
1363
+ type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[];
1364
+ /**
1365
+ * Outputs of inference for the Document Question Answering task
1366
+ */
1367
+ interface DocumentQuestionAnsweringOutputElement {
1368
+ /**
1369
+ * The answer to the question.
1370
+ */
1371
+ answer: string;
1372
+ /**
1373
+ * The end word index of the answer (in the OCR’d version of the input or provided word
1374
+ * boxes).
1375
+ */
1376
+ end: number;
1377
+ /**
1378
+ * The probability associated with the answer.
1379
+ */
1380
+ score: number;
1381
+ /**
1382
+ * The start word index of the answer (in the OCR’d version of the input or provided word
1383
+ * boxes).
1384
+ */
1385
+ start: number;
1386
+ /**
1387
+ * The index of each word/box pair that is in the answer
1388
+ */
1389
+ words: number[];
1390
+ [property: string]: unknown;
1391
+ }
1392
+
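A request sketch; word_boxes mixes plain words and normalized number[] boxes per the WordBox union, and providing it skips the OCR step. Values are illustrative:

import type { DocumentQuestionAnsweringInput } from "@huggingface/tasks";

const request: DocumentQuestionAnsweringInput = {
  inputs: { image: "<image bytes or URL>", question: "What is the invoice total?" },
  parameters: {
    lang: "en",
    top_k: 1,
    word_boxes: ["total", [612, 340, 690, 362]], // boxes are normalized to 0-1000
  },
};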
1393
+ /**
1394
+ * Inference code generated from the JSON schema spec in ./spec
1395
+ *
1396
+ * Using src/scripts/inference-codegen
1397
+ */
1398
+ type FeatureExtractionOutput = unknown[];
1399
+ /**
1400
+ * Inputs for Text Embedding inference
1401
+ */
1402
+ interface FeatureExtractionInput {
1403
+ /**
1404
+ * The text to get the embeddings of
1405
+ */
1406
+ inputs: string;
1407
+ /**
1408
+ * Additional inference parameters
1409
+ */
1410
+ parameters?: {
1411
+ [key: string]: unknown;
1412
+ };
1413
+ [property: string]: unknown;
1414
+ }
1415
+
1416
+ /**
1417
+ * Inference code generated from the JSON schema spec in ./spec
1418
+ *
1419
+ * Using src/scripts/inference-codegen
1420
+ */
1421
+ /**
1422
+ * Inputs for Fill Mask inference
1423
+ */
1424
+ interface FillMaskInput {
1425
+ /**
1426
+ * The text with masked tokens
1427
+ */
1428
+ inputs: string;
1429
+ /**
1430
+ * Additional inference parameters
1431
+ */
1432
+ parameters?: FillMaskParameters;
1433
+ [property: string]: unknown;
1434
+ }
1435
+ /**
1436
+ * Additional inference parameters
1437
+ *
1438
+ * Additional inference parameters for Fill Mask
1439
+ */
1440
+ interface FillMaskParameters {
1441
+ /**
1442
+ * When passed, the model will limit the scores to the passed targets instead of looking up
1443
+ * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
1444
+ * tokenized and the first resulting token will be used (with a warning, and that might be
1445
+ * slower).
1446
+ */
1447
+ targets?: string[];
1448
+ /**
1449
+ * When passed, overrides the number of predictions to return.
1450
+ */
1451
+ top_k?: number;
1452
+ [property: string]: unknown;
1453
+ }
1454
+ type FillMaskOutput = FillMaskOutputElement[];
1455
+ /**
1456
+ * Outputs of inference for the Fill Mask task
1457
+ */
1458
+ interface FillMaskOutputElement {
1459
+ /**
1460
+ * The corresponding probability
1461
+ */
1462
+ score: number;
1463
+ /**
1464
+ * The corresponding input with the mask token prediction.
1465
+ */
1466
+ sequence: string;
1467
+ /**
1468
+ * The predicted token id (to replace the masked one).
1469
+ */
1470
+ token: number;
1471
+ tokenStr: unknown;
1472
+ /**
1473
+ * The predicted token (to replace the masked one).
1474
+ */
1475
+ token_str?: string;
1476
+ [property: string]: unknown;
1477
+ }
1478
+
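A sketch of building a fill-mask request and reading back the best filled sequence; the mask token depends on the chosen model and is an assumption here:

import type { FillMaskInput, FillMaskOutput } from "@huggingface/tasks";

const request: FillMaskInput = {
  inputs: "The capital of France is [MASK].",
  parameters: { top_k: 5, targets: ["paris", "london"] },
};

// Each element pairs a candidate token with its score and the fully filled sequence.
function topSequence(output: FillMaskOutput): string | undefined {
  return output[0]?.sequence;
}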
1479
+ /**
1480
+ * Inference code generated from the JSON schema spec in ./spec
1481
+ *
1482
+ * Using src/scripts/inference-codegen
1483
+ */
1484
+ /**
1485
+ * Inputs for Image Classification inference
1486
+ */
1487
+ interface ImageClassificationInput {
1488
+ /**
1489
+ * The input image data
1490
+ */
1491
+ inputs: unknown;
1492
+ /**
1493
+ * Additional inference parameters
1494
+ */
1495
+ parameters?: ImageClassificationParameters;
1496
+ [property: string]: unknown;
1497
+ }
1498
+ /**
1499
+ * Additional inference parameters
1500
+ *
1501
+ * Additional inference parameters for Image Classification
1502
+ */
1503
+ interface ImageClassificationParameters {
1504
+ function_to_apply?: ClassificationOutputTransform$2;
1505
+ /**
1506
+ * When specified, limits the output to the top K most probable classes.
1507
+ */
1508
+ top_k?: number;
1509
+ [property: string]: unknown;
1510
+ }
1511
+ /**
1512
+ * The function to apply to the model outputs in order to retrieve the scores.
1513
+ */
1514
+ type ClassificationOutputTransform$2 = "sigmoid" | "softmax" | "none";
1515
+ type ImageClassificationOutput = ImageClassificationOutputElement[];
1516
+ /**
1517
+ * Outputs of inference for the Image Classification task
1518
+ */
1519
+ interface ImageClassificationOutputElement {
1520
+ /**
1521
+ * The predicted class label.
1522
+ */
1523
+ label: string;
1524
+ /**
1525
+ * The corresponding probability.
1526
+ */
1527
+ score: number;
1528
+ [property: string]: unknown;
1529
+ }
1530
+
1531
+ /**
1532
+ * Inference code generated from the JSON schema spec in ./spec
1533
+ *
1534
+ * Using src/scripts/inference-codegen
1535
+ */
1536
+ /**
1537
+ * Inputs for Image To Image inference
1538
+ */
1539
+ interface ImageToImageInput {
1540
+ /**
1541
+ * The input image data
1542
+ */
1543
+ inputs: unknown;
1544
+ /**
1545
+ * Additional inference parameters
1546
+ */
1547
+ parameters?: ImageToImageParameters;
1548
+ [property: string]: unknown;
1549
+ }
1550
+ /**
1551
+ * Additional inference parameters
1552
+ *
1553
+ * Additional inference parameters for Image To Image
1554
+ */
1555
+ interface ImageToImageParameters {
1556
+ /**
1557
+ * For diffusion models. A higher guidance scale value encourages the model to generate
1558
+ * images closely linked to the text prompt at the expense of lower image quality.
1559
+ */
1560
+ guidance_scale?: number;
1561
+ /**
1562
+ * One or several prompts to guide what NOT to include in image generation.
1563
+ */
1564
+ negative_prompt?: string[];
1565
+ /**
1566
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
1567
+ * a higher quality image at the expense of slower inference.
1568
+ */
1569
+ num_inference_steps?: number;
1570
+ /**
1571
+ * The size in pixels of the output image
1572
+ */
1573
+ target_size?: TargetSize$1;
1574
+ [property: string]: unknown;
1575
+ }
1576
+ /**
1577
+ * The size in pixels of the output image
1578
+ */
1579
+ interface TargetSize$1 {
1580
+ height: number;
1581
+ width: number;
1582
+ [property: string]: unknown;
1583
+ }
1584
+ /**
1585
+ * Outputs of inference for the Image To Image task
1586
+ */
1587
+ interface ImageToImageOutput {
1588
+ /**
1589
+ * The output image
1590
+ */
1591
+ image?: unknown;
1592
+ [property: string]: unknown;
1593
+ }
1594
+
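A parameters-only sketch with typical diffusion-style settings; every field is optional in the schema and the numbers are illustrative:

import type { ImageToImageParameters } from "@huggingface/tasks";

const parameters: ImageToImageParameters = {
  guidance_scale: 7.5,
  num_inference_steps: 30,
  negative_prompt: ["blurry", "low resolution"],
  target_size: { width: 768, height: 768 },
};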
1595
+ /**
1596
+ * Inference code generated from the JSON schema spec in ./spec
1597
+ *
1598
+ * Using src/scripts/inference-codegen
1599
+ */
1600
+ /**
1601
+ * Inputs for Image To Text inference
1602
+ */
1603
+ interface ImageToTextInput {
1604
+ /**
1605
+ * The input image data
1606
+ */
1607
+ inputs: unknown;
1608
+ /**
1609
+ * Additional inference parameters
1610
+ */
1611
+ parameters?: ImageToTextParameters;
1612
+ [property: string]: unknown;
1613
+ }
1614
+ /**
1615
+ * Additional inference parameters
1616
+ *
1617
+ * Additional inference parameters for Image To Text
1618
+ */
1619
+ interface ImageToTextParameters {
1620
+ /**
1621
+ * Parametrization of the text generation process
1622
+ */
1623
+ generate?: GenerationParameters$1;
1624
+ /**
1625
+ * The maximum number of tokens to generate.
1626
+ */
1627
+ max_new_tokens?: number;
1628
+ [property: string]: unknown;
1629
+ }
1630
+ /**
1631
+ * Parametrization of the text generation process
1632
+ *
1633
+ * Ad-hoc parametrization of the text generation process
1634
+ */
1635
+ interface GenerationParameters$1 {
1636
+ /**
1637
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
1638
+ */
1639
+ do_sample?: boolean;
1640
+ /**
1641
+ * Controls the stopping condition for beam-based methods.
1642
+ */
1643
+ early_stopping?: EarlyStoppingUnion$1;
1644
+ /**
1645
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
1646
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
1647
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
1648
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
1649
+ */
1650
+ epsilon_cutoff?: number;
1651
+ /**
1652
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
1653
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
1654
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
1655
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
1656
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
1657
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
1658
+ * for more details.
1659
+ */
1660
+ eta_cutoff?: number;
1661
+ /**
1662
+ * The maximum length (in tokens) of the generated text, including the input.
1663
+ */
1664
+ max_length?: number;
1665
+ /**
1666
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
1667
+ */
1668
+ max_new_tokens?: number;
1669
+ /**
1670
+ * The minimum length (in tokens) of the generated text, including the input.
1671
+ */
1672
+ min_length?: number;
1673
+ /**
1674
+ * The minimum number of tokens to generate. Takes precedence over minLength.
1675
+ */
1676
+ min_new_tokens?: number;
1677
+ /**
1678
+ * Number of groups to divide num_beams into in order to ensure diversity among different
1679
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
1680
+ */
1681
+ num_beam_groups?: number;
1682
+ /**
1683
+ * Number of beams to use for beam search.
1684
+ */
1685
+ num_beams?: number;
1686
+ /**
1687
+ * The value balances the model confidence and the degeneration penalty in contrastive
1688
+ * search decoding.
1689
+ */
1690
+ penalty_alpha?: number;
1691
+ /**
1692
+ * The value used to modulate the next token probabilities.
1693
+ */
1694
+ temperature?: number;
1695
+ /**
1696
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
1697
+ */
1698
+ top_k?: number;
1699
+ /**
1700
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
1701
+ * that add up to top_p or higher are kept for generation.
1702
+ */
1703
+ top_p?: number;
1704
+ /**
1705
+ * Local typicality measures how similar the conditional probability of predicting a target
1706
+ * token next is to the expected conditional probability of predicting a random token next,
1707
+ * given the partial text already generated. If set to float < 1, the smallest set of the
1708
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
1709
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
1710
+ */
1711
+ typical_p?: number;
1712
+ /**
1713
+ * Whether the model should use the past last key/values attentions to speed up decoding
1714
+ */
1715
+ use_cache?: boolean;
1716
+ [property: string]: unknown;
1717
+ }
1718
+ /**
1719
+ * Controls the stopping condition for beam-based methods.
1720
+ */
1721
+ type EarlyStoppingUnion$1 = boolean | "never";
1722
+ /**
1723
+ * Outputs of inference for the Image To Text task
1724
+ */
1725
+ interface ImageToTextOutput {
1726
+ generatedText: unknown;
1727
+ /**
1728
+ * The generated text.
1729
+ */
1730
+ generated_text?: string;
1731
+ [property: string]: unknown;
1732
+ }
1733
+
1734
+ /**
1735
+ * Inference code generated from the JSON schema spec in ./spec
1736
+ *
1737
+ * Using src/scripts/inference-codegen
1738
+ */
1739
+ /**
1740
+ * Inputs for Image Segmentation inference
1741
+ */
1742
+ interface ImageSegmentationInput {
1743
+ /**
1744
+ * The input image data
1745
+ */
1746
+ inputs: unknown;
1747
+ /**
1748
+ * Additional inference parameters
1749
+ */
1750
+ parameters?: ImageSegmentationParameters;
1751
+ [property: string]: unknown;
1752
+ }
1753
+ /**
1754
+ * Additional inference parameters
1755
+ *
1756
+ * Additional inference parameters for Image Segmentation
1757
+ */
1758
+ interface ImageSegmentationParameters {
1759
+ /**
1760
+ * Threshold to use when turning the predicted masks into binary values.
1761
+ */
1762
+ mask_threshold?: number;
1763
+ /**
1764
+ * Mask overlap threshold to eliminate small, disconnected segments.
1765
+ */
1766
+ overlap_mask_area_threshold?: number;
1767
+ /**
1768
+ * Segmentation task to be performed, depending on model capabilities.
1769
+ */
1770
+ subtask?: ImageSegmentationSubtask;
1771
+ /**
1772
+ * Probability threshold to filter out predicted masks.
1773
+ */
1774
+ threshold?: number;
1775
+ [property: string]: unknown;
1776
+ }
1777
+ type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
1778
+ type ImageSegmentationOutput = ImageSegmentationOutputElement[];
1779
+ /**
1780
+ * Outputs of inference for the Image Segmentation task
1781
+ *
1782
+ * A predicted mask / segment
1783
+ */
1784
+ interface ImageSegmentationOutputElement {
1785
+ /**
1786
+ * The label of the predicted segment
1787
+ */
1788
+ label: string;
1789
+ /**
1790
+ * The corresponding mask as a black-and-white image
1791
+ */
1792
+ mask: unknown;
1793
+ /**
1794
+ * The score or confidence degree the model has
1795
+ */
1796
+ score?: number;
1797
+ [property: string]: unknown;
1798
+ }
1799
+
1800
+ /**
1801
+ * Inference code generated from the JSON schema spec in ./spec
1802
+ *
1803
+ * Using src/scripts/inference-codegen
1804
+ */
1805
+ /**
1806
+ * Inputs for Object Detection inference
1807
+ */
1808
+ interface ObjectDetectionInput {
1809
+ /**
1810
+ * The input image data
1811
+ */
1812
+ inputs: unknown;
1813
+ /**
1814
+ * Additional inference parameters
1815
+ */
1816
+ parameters?: ObjectDetectionParameters;
1817
+ [property: string]: unknown;
1818
+ }
1819
+ /**
1820
+ * Additional inference parameters
1821
+ *
1822
+ * Additional inference parameters for Object Detection
1823
+ */
1824
+ interface ObjectDetectionParameters {
1825
+ /**
1826
+ * The probability necessary to make a prediction.
1827
+ */
1828
+ threshold?: number;
1829
+ [property: string]: unknown;
1830
+ }
1831
+ /**
1832
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
1833
+ * image.
1834
+ */
1835
+ interface BoundingBox$1 {
1836
+ xmax: number;
1837
+ xmin: number;
1838
+ ymax: number;
1839
+ ymin: number;
1840
+ [property: string]: unknown;
1841
+ }
1842
+ type ObjectDetectionOutput = ObjectDetectionOutputElement[];
1843
+ /**
1844
+ * Outputs of inference for the Object Detection task
1845
+ */
1846
+ interface ObjectDetectionOutputElement {
1847
+ /**
1848
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
1849
+ * image.
1850
+ */
1851
+ box: BoundingBox$1;
1852
+ /**
1853
+ * The predicted label for the bounding box
1854
+ */
1855
+ label: string;
1856
+ /**
1857
+ * The associated score / probability
1858
+ */
1859
+ score: number;
1860
+ [property: string]: unknown;
1861
+ }
1862
+
1863
+ /**
1864
+ * Inference code generated from the JSON schema spec in ./spec
1865
+ *
1866
+ * Using src/scripts/inference-codegen
1867
+ */
1868
+ /**
1869
+ * Inputs for Depth Estimation inference
1870
+ */
1871
+ interface DepthEstimationInput {
1872
+ /**
1873
+ * The input image data
1874
+ */
1875
+ inputs: unknown;
1876
+ /**
1877
+ * Additional inference parameters
1878
+ */
1879
+ parameters?: {
1880
+ [key: string]: unknown;
1881
+ };
1882
+ [property: string]: unknown;
1883
+ }
1884
+ /**
1885
+ * Outputs of inference for the Depth Estimation task
1886
+ */
1887
+ interface DepthEstimationOutput {
1888
+ /**
1889
+ * The predicted depth as an image
1890
+ */
1891
+ depth?: unknown;
1892
+ /**
1893
+ * The predicted depth as a tensor
1894
+ */
1895
+ predicted_depth?: unknown;
1896
+ [property: string]: unknown;
1897
+ }
1898
+
1899
+ /**
1900
+ * Inference code generated from the JSON schema spec in ./spec
1901
+ *
1902
+ * Using src/scripts/inference-codegen
1903
+ */
1904
+ /**
1905
+ * Inputs for Question Answering inference
1906
+ */
1907
+ interface QuestionAnsweringInput {
1908
+ /**
1909
+ * One (context, question) pair to answer
1910
+ */
1911
+ inputs: QuestionAnsweringInputData;
1912
+ /**
1913
+ * Additional inference parameters
1914
+ */
1915
+ parameters?: QuestionAnsweringParameters;
1916
+ [property: string]: unknown;
1917
+ }
1918
+ /**
1919
+ * One (context, question) pair to answer
1920
+ */
1921
+ interface QuestionAnsweringInputData {
1922
+ /**
1923
+ * The context to be used for answering the question
1924
+ */
1925
+ context: string;
1926
+ /**
1927
+ * The question to be answered
1928
+ */
1929
+ question: string;
1930
+ [property: string]: unknown;
1931
+ }
1932
+ /**
1933
+ * Additional inference parameters
1934
+ *
1935
+ * Additional inference parameters for Question Answering
1936
+ */
1937
+ interface QuestionAnsweringParameters {
1938
+ /**
1939
+ * Attempts to align the answer to real words. Improves quality on space separated
1940
+ * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
1941
+ */
1942
+ align_to_words?: boolean;
1943
+ /**
1944
+ * If the context is too long to fit with the question for the model, it will be split in
1945
+ * several chunks with some overlap. This argument controls the size of that overlap.
1946
+ */
1947
+ doc_stride?: number;
1948
+ /**
1949
+ * Whether to accept impossible as an answer.
1950
+ */
1951
+ handle_impossible_answer?: boolean;
1952
+ /**
1953
+ * The maximum length of predicted answers (e.g., only answers with a shorter length are
1954
+ * considered).
1955
+ */
1956
+ max_answer_len?: number;
1957
+ /**
1958
+ * The maximum length of the question after tokenization. It will be truncated if needed.
1959
+ */
1960
+ max_question_len?: number;
1961
+ /**
1962
+ * The maximum length of the total sentence (context + question) in tokens of each chunk
1963
+ * passed to the model. The context will be split in several chunks (using docStride as
1964
+ * overlap) if needed.
1965
+ */
1966
+ max_seq_len?: number;
1967
+ /**
1968
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
1969
+ * return less than topk answers if there are not enough options available within the
1970
+ * context.
1971
+ */
1972
+ top_k?: number;
1973
+ [property: string]: unknown;
1974
+ }
1975
+ type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
1976
+ /**
1977
+ * Outputs of inference for the Question Answering task
1978
+ */
1979
+ interface QuestionAnsweringOutputElement {
1980
+ /**
1981
+ * The answer to the question.
1982
+ */
1983
+ answer: string;
1984
+ /**
1985
+ * The character position in the input where the answer ends.
1986
+ */
1987
+ end: number;
1988
+ /**
1989
+ * The probability associated with the answer.
1990
+ */
1991
+ score: number;
1992
+ /**
1993
+ * The character position in the input where the answer begins.
1994
+ */
1995
+ start: number;
1996
+ [property: string]: unknown;
1997
+ }
1998
+
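A sketch showing how the start/end offsets index into the original context string (import path assumed):

import type { QuestionAnsweringInput, QuestionAnsweringOutput } from "@huggingface/tasks";

const context = "The @huggingface/tasks package describes every task supported on the Hub.";

const request: QuestionAnsweringInput = {
  inputs: { context, question: "What does the package describe?" },
  parameters: { top_k: 1, align_to_words: true },
};

// start/end are character offsets into the context, so the answer can be highlighted in place.
function highlight(answer: QuestionAnsweringOutput[number]): string {
  return context.slice(0, answer.start) + `[${answer.answer}]` + context.slice(answer.end);
}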
1999
+ /**
2000
+ * Inference code generated from the JSON schema spec in ./spec
2001
+ *
2002
+ * Using src/scripts/inference-codegen
2003
+ */
2004
+ type SentenceSimilarityOutput = number[];
2005
+ /**
2006
+ * Inputs for Sentence similarity inference
2007
+ */
2008
+ interface SentenceSimilarityInput {
2009
+ inputs: SentenceSimilarityInputData;
2010
+ /**
2011
+ * Additional inference parameters
2012
+ */
2013
+ parameters?: {
2014
+ [key: string]: unknown;
2015
+ };
2016
+ [property: string]: unknown;
2017
+ }
2018
+ interface SentenceSimilarityInputData {
2019
+ /**
2020
+ * A list of strings which will be compared against the source_sentence.
2021
+ */
2022
+ sentences: string[];
2023
+ /**
2024
+ * The string that you wish to compare the other strings with. This can be a phrase,
2025
+ * sentence, or longer passage, depending on the model being used.
2026
+ */
2027
+ sourceSentence: string;
2028
+ [property: string]: unknown;
2029
+ }
2030
+
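Note that the declared input uses camelCase sourceSentence even though the comment above it refers to source_sentence; the sketch below follows the declaration as written (import path assumed):

import type { SentenceSimilarityInput, SentenceSimilarityOutput } from "@huggingface/tasks";

const candidates = ["That is a happy dog", "That is a very happy person", "Today is a sunny day"];

const request: SentenceSimilarityInput = {
  inputs: { sourceSentence: "That is a happy person", sentences: candidates },
};

// The output is one score per candidate, in the same order as `sentences`.
function mostSimilar(scores: SentenceSimilarityOutput): string {
  return candidates[scores.indexOf(Math.max(...scores))];
}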
2031
+ /**
2032
+ * Inference code generated from the JSON schema spec in ./spec
2033
+ *
2034
+ * Using src/scripts/inference-codegen
2035
+ */
2036
+ /**
2037
+ * Inputs for Summarization inference
2038
+ *
2039
+ * Inputs for Text2text Generation inference
2040
+ */
2041
+ interface SummarizationInput {
2042
+ /**
2043
+ * The input text data
2044
+ */
2045
+ inputs: string;
2046
+ /**
2047
+ * Additional inference parameters
2048
+ */
2049
+ parameters?: Text2TextGenerationParameters$1;
2050
+ [property: string]: unknown;
2051
+ }
2052
+ /**
2053
+ * Additional inference parameters
2054
+ *
2055
+ * Additional inference parameters for Text2text Generation
2056
+ */
2057
+ interface Text2TextGenerationParameters$1 {
2058
+ /**
2059
+ * Whether to clean up the potential extra spaces in the text output.
2060
+ */
2061
+ clean_up_tokenization_spaces?: boolean;
2062
+ /**
2063
+ * Additional parametrization of the text generation algorithm
2064
+ */
2065
+ generate_parameters?: {
2066
+ [key: string]: unknown;
2067
+ };
2068
+ /**
2069
+ * The truncation strategy to use
2070
+ */
2071
+ truncation?: Text2TextGenerationTruncationStrategy$1;
2072
+ [property: string]: unknown;
2073
+ }
2074
+ type Text2TextGenerationTruncationStrategy$1 = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
2075
+ /**
2076
+ * Outputs of inference for the Summarization task
2077
+ */
2078
+ interface SummarizationOutput {
2079
+ /**
2080
+ * The summarized text.
2081
+ */
2082
+ summary_text: string;
2083
+ [property: string]: unknown;
2084
+ }
2085
+
2086
+ /**
2087
+ * Inference code generated from the JSON schema spec in ./spec
2088
+ *
2089
+ * Using src/scripts/inference-codegen
2090
+ */
2091
+ /**
2092
+ * Inputs for Table Question Answering inference
2093
+ */
2094
+ interface TableQuestionAnsweringInput {
2095
+ /**
2096
+ * One (table, question) pair to answer
2097
+ */
2098
+ inputs: TableQuestionAnsweringInputData;
2099
+ /**
2100
+ * Additional inference parameters
2101
+ */
2102
+ parameters?: {
2103
+ [key: string]: unknown;
2104
+ };
2105
+ [property: string]: unknown;
2106
+ }
2107
+ /**
2108
+ * One (table, question) pair to answer
2109
+ */
2110
+ interface TableQuestionAnsweringInputData {
2111
+ /**
2112
+ * The question to be answered about the table
2113
+ */
2114
+ question: string;
2115
+ /**
2116
+ * The table to serve as context for the questions
2117
+ */
2118
+ table: {
2119
+ [key: string]: string[];
2120
+ };
2121
+ [property: string]: unknown;
2122
+ }
2123
+ type TableQuestionAnsweringOutput = TableQuestionAnsweringOutputElement[];
2124
+ /**
2125
+ * Outputs of inference for the Table Question Answering task
2126
+ */
2127
+ interface TableQuestionAnsweringOutputElement {
2128
+ /**
2129
+ * If the model has an aggregator, this returns the aggregator.
2130
+ */
2131
+ aggregator?: string;
2132
+ /**
2133
+ * The answer of the question given the table. If there is an aggregator, the answer will be
2134
+ * preceded by `AGGREGATOR >`.
2135
+ */
2136
+ answer: string;
2137
+ /**
2138
+ * List of strings made up of the answer cell values.
2139
+ */
2140
+ cells: string[];
2141
+ /**
2142
+ * Coordinates of the cells of the answers.
2143
+ */
2144
+ coordinates: Array<number[]>;
2145
+ [property: string]: unknown;
2146
+ }
2147
+
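A request sketch showing the column-oriented table layout: each key is a column header mapped to its cell values (data is illustrative):

import type { TableQuestionAnsweringInput } from "@huggingface/tasks";

const request: TableQuestionAnsweringInput = {
  inputs: {
    table: {
      Repository: ["transformers", "datasets", "tokenizers"],
      Stars: ["120000", "18000", "8000"],
    },
    question: "Which repository has the most stars?",
  },
};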
2148
+ /**
2149
+ * Inference code generated from the JSON schema spec in ./spec
2150
+ *
2151
+ * Using src/scripts/inference-codegen
2152
+ */
2153
+ /**
2154
+ * Inputs for Text To Image inference
2155
+ */
2156
+ interface TextToImageInput {
2157
+ /**
2158
+ * The input text data (sometimes called "prompt")
2159
+ */
2160
+ inputs: string;
2161
+ /**
2162
+ * Additional inference parameters
2163
+ */
2164
+ parameters?: TextToImageParameters;
2165
+ [property: string]: unknown;
2166
+ }
2167
+ /**
2168
+ * Additional inference parameters
2169
+ *
2170
+ * Additional inference parameters for Text To Image
2171
+ */
2172
+ interface TextToImageParameters {
2173
+ /**
2174
+ * For diffusion models. A higher guidance scale value encourages the model to generate
2175
+ * images closely linked to the text prompt at the expense of lower image quality.
2176
+ */
2177
+ guidance_scale?: number;
2178
+ /**
2179
+ * One or several prompts to guide what NOT to include in image generation.
2180
+ */
2181
+ negative_prompt?: string[];
2182
+ /**
2183
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
2184
+ * a higher quality image at the expense of slower inference.
2185
+ */
2186
+ num_inference_steps?: number;
2187
+ /**
2188
+ * For diffusion models. Override the scheduler with a compatible one
2189
+ */
2190
+ scheduler?: string;
2191
+ /**
2192
+ * The size in pixels of the output image
2193
+ */
2194
+ target_size?: TargetSize;
2195
+ [property: string]: unknown;
2196
+ }
2197
+ /**
2198
+ * The size in pixels of the output image
2199
+ */
2200
+ interface TargetSize {
2201
+ height: number;
2202
+ width: number;
2203
+ [property: string]: unknown;
2204
+ }
2205
+ /**
2206
+ * Outputs of inference for the Text To Image task
2207
+ */
2208
+ interface TextToImageOutput {
2209
+ /**
2210
+ * The generated image
2211
+ */
2212
+ image: unknown;
2213
+ [property: string]: unknown;
2214
+ }
2215
+
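A hedged end-to-end sketch against the serverless Inference API documented at the URL referenced earlier in this file; the endpoint path, model id, and binary response handling are assumptions, only the payload type comes from these declarations:

import type { TextToImageInput } from "@huggingface/tasks";

const payload: TextToImageInput = {
  inputs: "An astronaut riding a horse on the moon",
  parameters: { guidance_scale: 7, num_inference_steps: 25, negative_prompt: ["watermark"] },
};

async function generateImage(modelId: string, token: string): Promise<Blob> {
  const res = await fetch(`https://api-inference.huggingface.co/models/${modelId}`, {
    method: "POST",
    headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  return res.blob(); // assumed: the generated image comes back as binary data
}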
2216
+ /**
2217
+ * Inference code generated from the JSON schema spec in ./spec
2218
+ *
2219
+ * Using src/scripts/inference-codegen
2220
+ */
2221
+ /**
2222
+ * Inputs for Text to Speech inference
2223
+ *
2224
+ * Inputs for Text To Audio inference
2225
+ */
2226
+ interface TextToSpeechInput {
2227
+ /**
2228
+ * The input text data
2229
+ */
2230
+ inputs: string;
2231
+ /**
2232
+ * Additional inference parameters
2233
+ */
2234
+ parameters?: TextToAudioParameters;
2235
+ [property: string]: unknown;
2236
+ }
2237
+ /**
2238
+ * Additional inference parameters
2239
+ *
2240
+ * Additional inference parameters for Text To Audio
2241
+ */
2242
+ interface TextToAudioParameters {
2243
+ /**
2244
+ * Parametrization of the text generation process
2245
+ */
2246
+ generate?: GenerationParameters;
2247
+ [property: string]: unknown;
2248
+ }
2249
+ /**
2250
+ * Parametrization of the text generation process
2251
+ *
2252
+ * Ad-hoc parametrization of the text generation process
2253
+ */
2254
+ interface GenerationParameters {
2255
+ /**
2256
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
2257
+ */
2258
+ do_sample?: boolean;
2259
+ /**
2260
+ * Controls the stopping condition for beam-based methods.
2261
+ */
2262
+ early_stopping?: EarlyStoppingUnion;
2263
+ /**
2264
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
2265
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
2266
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
2267
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
2268
+ */
2269
+ epsilon_cutoff?: number;
2270
+ /**
2271
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
2272
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
2273
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
2274
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
2275
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
2276
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
2277
+ * for more details.
2278
+ */
2279
+ eta_cutoff?: number;
2280
+ /**
2281
+ * The maximum length (in tokens) of the generated text, including the input.
2282
+ */
2283
+ max_length?: number;
2284
+ /**
2285
+ * The maximum number of tokens to generate. Takes precedence over maxLength.
2286
+ */
2287
+ max_new_tokens?: number;
2288
+ /**
2289
+ * The minimum length (in tokens) of the generated text, including the input.
2290
+ */
2291
+ min_length?: number;
2292
+ /**
2293
+ * The minimum number of tokens to generate. Takes precedence over minLength.
2294
+ */
2295
+ min_new_tokens?: number;
2296
+ /**
2297
+ * Number of groups to divide num_beams into in order to ensure diversity among different
2298
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
2299
+ */
2300
+ num_beam_groups?: number;
2301
+ /**
2302
+ * Number of beams to use for beam search.
2303
+ */
2304
+ num_beams?: number;
2305
+ /**
2306
+ * The value balances the model confidence and the degeneration penalty in contrastive
2307
+ * search decoding.
2308
+ */
2309
+ penalty_alpha?: number;
2310
+ /**
2311
+ * The value used to modulate the next token probabilities.
2312
+ */
2313
+ temperature?: number;
2314
+ /**
2315
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
2316
+ */
2317
+ top_k?: number;
2318
+ /**
2319
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
2320
+ * that add up to top_p or higher are kept for generation.
2321
+ */
2322
+ top_p?: number;
2323
+ /**
2324
+ * Local typicality measures how similar the conditional probability of predicting a target
2325
+ * token next is to the expected conditional probability of predicting a random token next,
2326
+ * given the partial text already generated. If set to float < 1, the smallest set of the
2327
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
2328
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
2329
+ */
2330
+ typical_p?: number;
2331
+ /**
2332
+ * Whether the model should use the past last key/values attentions to speed up decoding
2333
+ */
2334
+ use_cache?: boolean;
2335
+ [property: string]: unknown;
2336
+ }
2337
+ /**
2338
+ * Controls the stopping condition for beam-based methods.
2339
+ */
2340
+ type EarlyStoppingUnion = boolean | "never";
2341
+ /**
2342
+ * Outputs for Text to Speech inference
2343
+ *
2344
+ * Outputs of inference for the Text To Audio task
2345
+ */
2346
+ interface TextToSpeechOutput {
2347
+ /**
2348
+ * The generated audio waveform.
2349
+ */
2350
+ audio: unknown;
2351
+ samplingRate: unknown;
2352
+ /**
2353
+ * The sampling rate of the generated audio waveform.
2354
+ */
2355
+ sampling_rate?: number;
2356
+ [property: string]: unknown;
2357
+ }
2358
+
2359
+ /**
2360
+ * Inference code generated from the JSON schema spec in ./spec
2361
+ *
2362
+ * Using src/scripts/inference-codegen
2363
+ */
2364
+ /**
2365
+ * Inputs for Token Classification inference
2366
+ */
2367
+ interface TokenClassificationInput {
2368
+ /**
2369
+ * The input text data
2370
+ */
2371
+ inputs: string;
2372
+ /**
2373
+ * Additional inference parameters
2374
+ */
2375
+ parameters?: TokenClassificationParameters;
2376
+ [property: string]: unknown;
2377
+ }
2378
+ /**
2379
+ * Additional inference parameters
2380
+ *
2381
+ * Additional inference parameters for Token Classification
2382
+ */
2383
+ interface TokenClassificationParameters {
2384
+ /**
2385
+ * The strategy used to fuse tokens based on model predictions
2386
+ */
2387
+ aggregation_strategy?: TokenClassificationAggregationStrategy;
2388
+ /**
2389
+ * A list of labels to ignore
2390
+ */
2391
+ ignore_labels?: string[];
2392
+ /**
2393
+ * The number of overlapping tokens between chunks when splitting the input text.
2394
+ */
2395
+ stride?: number;
2396
+ [property: string]: unknown;
2397
+ }
2398
+ /**
2399
+ * Do not aggregate tokens
2400
+ *
2401
+ * Group consecutive tokens with the same label in a single entity.
2402
+ *
2403
+ * Similar to "simple", also preserves word integrity (use the label predicted for the first
2404
+ * token in a word).
2405
+ *
2406
+ * Similar to "simple", also preserves word integrity (uses the label with the highest
2407
+ * score, averaged across the word's tokens).
2408
+ *
2409
+ * Similar to "simple", also preserves word integrity (uses the label with the highest score
2410
+ * across the word's tokens).
2411
+ */
2412
+ type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
2413
+ type TokenClassificationOutput = TokenClassificationOutputElement[];
2414
+ /**
2415
+ * Outputs of inference for the Token Classification task
2416
+ */
2417
+ interface TokenClassificationOutputElement {
2418
+ /**
2419
+ * The character position in the input where this group ends.
2420
+ */
2421
+ end?: number;
2422
+ /**
2423
+ * The predicted label for that group of tokens
2424
+ */
2425
+ entity_group?: string;
2426
+ label: unknown;
2427
+ /**
2428
+ * The associated score / probability
2429
+ */
2430
+ score: number;
2431
+ /**
2432
+ * The character position in the input where this group begins.
2433
+ */
2434
+ start?: number;
2435
+ /**
2436
+ * The corresponding text
2437
+ */
2438
+ word?: string;
2439
+ [property: string]: unknown;
2440
+ }
2441
+
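A sketch of requesting grouped entities and formatting them; entity_group and word are only populated when the aggregation strategy merges tokens:

import type { TokenClassificationParameters, TokenClassificationOutput } from "@huggingface/tasks";

const parameters: TokenClassificationParameters = {
  aggregation_strategy: "simple",
  ignore_labels: ["O"],
};

// Format grouped entity spans as "LABEL: text (score)".
function listEntities(output: TokenClassificationOutput): string[] {
  const lines: string[] = [];
  for (const e of output) {
    if (e.entity_group && e.word) {
      lines.push(`${e.entity_group}: ${e.word} (${e.score.toFixed(2)})`);
    }
  }
  return lines;
}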
2442
+ /**
2443
+ * Inference code generated from the JSON schema spec in ./spec
2444
+ *
2445
+ * Using src/scripts/inference-codegen
2446
+ */
2447
+ /**
2448
+ * Inputs for Translation inference
2449
+ *
2450
+ * Inputs for Text2text Generation inference
2451
+ */
2452
+ interface TranslationInput {
2453
+ /**
2454
+ * The input text data
2455
+ */
2456
+ inputs: string;
2457
+ /**
2458
+ * Additional inference parameters
2459
+ */
2460
+ parameters?: Text2TextGenerationParameters;
2461
+ [property: string]: unknown;
2462
+ }
2463
+ /**
2464
+ * Additional inference parameters
2465
+ *
2466
+ * Additional inference parameters for Text2text Generation
2467
+ */
2468
+ interface Text2TextGenerationParameters {
2469
+ /**
2470
+ * Whether to clean up the potential extra spaces in the text output.
2471
+ */
2472
+ clean_up_tokenization_spaces?: boolean;
2473
+ /**
2474
+ * Additional parametrization of the text generation algorithm
2475
+ */
2476
+ generate_parameters?: {
2477
+ [key: string]: unknown;
2478
+ };
2479
+ /**
2480
+ * The truncation strategy to use
2481
+ */
2482
+ truncation?: Text2TextGenerationTruncationStrategy;
2483
+ [property: string]: unknown;
2484
+ }
2485
+ type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
2486
+ /**
2487
+ * Outputs of inference for the Translation task
2488
+ */
2489
+ interface TranslationOutput {
2490
+ /**
2491
+ * The translated text.
2492
+ */
2493
+ translation_text: string;
2494
+ [property: string]: unknown;
2495
+ }
2496
+
2497
+ /**
2498
+ * Inference code generated from the JSON schema spec in ./spec
2499
+ *
2500
+ * Using src/scripts/inference-codegen
2501
+ */
2502
+ /**
2503
+ * Inputs for Text Classification inference
2504
+ */
2505
+ interface TextClassificationInput {
2506
+ /**
2507
+ * The text to classify
2508
+ */
2509
+ inputs: string;
2510
+ /**
2511
+ * Additional inference parameters
2512
+ */
2513
+ parameters?: TextClassificationParameters;
2514
+ [property: string]: unknown;
2515
+ }
2516
+ /**
2517
+ * Additional inference parameters
2518
+ *
2519
+ * Additional inference parameters for Text Classification
2520
+ */
2521
+ interface TextClassificationParameters {
2522
+ function_to_apply?: ClassificationOutputTransform$1;
2523
+ /**
2524
+ * When specified, limits the output to the top K most probable classes.
2525
+ */
2526
+ top_k?: number;
2527
+ [property: string]: unknown;
2528
+ }
2529
+ /**
2530
+ * The function to apply to the model outputs in order to retrieve the scores.
2531
+ */
2532
+ type ClassificationOutputTransform$1 = "sigmoid" | "softmax" | "none";
2533
+ type TextClassificationOutput = TextClassificationOutputElement[];
2534
+ /**
2535
+ * Outputs of inference for the Text Classification task
2536
+ */
2537
+ interface TextClassificationOutputElement {
2538
+ /**
2539
+ * The predicted class label.
2540
+ */
2541
+ label: string;
2542
+ /**
2543
+ * The corresponding probability.
2544
+ */
2545
+ score: number;
2546
+ [property: string]: unknown;
2547
+ }
2548
+
2549
+ /**
2550
+ * Inference code generated from the JSON schema spec in ./spec
2551
+ *
2552
+ * Using src/scripts/inference-codegen
2553
+ */
2554
+ /**
2555
+ * Inputs for Text Generation inference
2556
+ */
2557
+ interface TextGenerationInput {
2558
+ /**
2559
+ * The text to initialize generation with
2560
+ */
2561
+ inputs: string;
2562
+ /**
2563
+ * Additional inference parameters
2564
+ */
2565
+ parameters?: TextGenerationParameters;
2566
+ [property: string]: unknown;
2567
+ }
2568
+ /**
2569
+ * Additional inference parameters
2570
+ *
2571
+ * Additional inference parameters for Text Generation
2572
+ */
2573
+ interface TextGenerationParameters {
2574
+ /**
2575
+ * The number of sampling queries to run. Only the best one (in terms of total logprob) will
2576
+ * be returned.
2577
+ */
2578
+ best_of?: number;
2579
+ /**
2580
+ * Whether or not to output decoder input details
2581
+ */
2582
+ decoder_input_details?: boolean;
2583
+ /**
2584
+ * Whether or not to output details
2585
+ */
2586
+ details?: boolean;
2587
+ /**
2588
+ * Whether to use logits sampling instead of greedy decoding when generating new tokens.
2589
+ */
2590
+ do_sample?: boolean;
2591
+ /**
2592
+ * The maximum number of tokens to generate.
2593
+ */
2594
+ max_new_tokens?: number;
2595
+ /**
2596
+ * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
2597
+ * paper](https://hf.co/papers/1909.05858) for more details.
2598
+ */
2599
+ repetition_penalty?: number;
2600
+ /**
2601
+ * Whether to prepend the prompt to the generated text.
2602
+ */
2603
+ return_full_text?: boolean;
2604
+ /**
2605
+ * The random sampling seed.
2606
+ */
2607
+ seed?: number;
2608
+ /**
2609
+ * Stop generating tokens if a member of `stop_sequences` is generated.
2610
+ */
2611
+ stop_sequences?: string[];
2612
+ /**
2613
+ * The value used to modulate the logits distribution.
2614
+ */
2615
+ temperature?: number;
2616
+ /**
2617
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
2618
+ */
2619
+ top_k?: number;
2620
+ /**
2621
+ * If set to < 1, only the smallest set of most probable tokens with probabilities that add
2622
+ * up to `top_p` or higher are kept for generation.
2623
+ */
2624
+ top_p?: number;
2625
+ /**
2626
+ * Truncate input tokens to the given size.
2627
+ */
2628
+ truncate?: number;
2629
+ /**
2630
+ * Typical Decoding mass. See [Typical Decoding for Natural Language
2631
+ * Generation](https://hf.co/papers/2202.00666) for more information
2632
+ */
2633
+ typical_p?: number;
2634
+ /**
2635
+ * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
2636
+ */
2637
+ watermark?: boolean;
2638
+ [property: string]: unknown;
2639
+ }
2640
+ /**
2641
+ * Outputs for Text Generation inference
2642
+ */
2643
+ interface TextGenerationOutput {
2644
+ /**
2645
+ * When enabled, details about the generation
2646
+ */
2647
+ details?: TextGenerationOutputDetails;
2648
+ /**
2649
+ * The generated text
2650
+ */
2651
+ generated_text: string;
2652
+ [property: string]: unknown;
2653
+ }
2654
+ /**
2655
+ * When enabled, details about the generation
2656
+ */
2657
+ interface TextGenerationOutputDetails {
2658
+ /**
2659
+ * Details about additional sequences when best_of is provided
2660
+ */
2661
+ best_of_sequences?: TextGenerationSequenceDetails[];
2662
+ /**
2663
+ * The reason why the generation was stopped.
2664
+ */
2665
+ finish_reason: FinishReason;
2666
+ /**
2667
+ * The number of generated tokens
2668
+ */
2669
+ generated_tokens: number;
2670
+ prefill: PrefillToken[];
2671
+ /**
2672
+ * The random seed used for generation
2673
+ */
2674
+ seed?: number;
2675
+ /**
2676
+ * The generated tokens and associated details
2677
+ */
2678
+ tokens: Token[];
2679
+ [property: string]: unknown;
2680
+ }
2681
+ interface TextGenerationSequenceDetails {
2682
+ /**
2683
+ * The reason why the generation was stopped.
2684
+ */
2685
+ finish_reason: FinishReason;
2686
+ /**
2687
+ * The generated text
2688
+ */
2689
+ generated_text: number;
2690
+ /**
2691
+ * The number of generated tokens
2692
+ */
2693
+ generated_tokens: number;
2694
+ prefill: PrefillToken[];
2695
+ /**
2696
+ * The random seed used for generation
2697
+ */
2698
+ seed?: number;
2699
+ /**
2700
+ * The generated tokens and associated details
2701
+ */
2702
+ tokens: Token[];
2703
+ [property: string]: unknown;
2704
+ }
2705
+ /**
2706
+ * The generated sequence reached the maximum allowed length
2707
+ *
2708
+ * The model generated an end-of-sentence (EOS) token
2709
+ *
2710
+ * One of the sequence in stop_sequences was generated
2711
+ */
2712
+ type FinishReason = "length" | "eos_token" | "stop_sequence";
2713
+ interface PrefillToken {
2714
+ id: number;
2715
+ logprob: number;
2716
+ /**
2717
+ * The text associated with that token
2718
+ */
2719
+ text: string;
2720
+ [property: string]: unknown;
2721
+ }
2722
+ interface Token {
2723
+ id: number;
2724
+ logprob: number;
2725
+ /**
2726
+ * Whether or not that token is a special one
2727
+ */
2728
+ special: boolean;
2729
+ /**
2730
+ * The text associated with that token
2731
+ */
2732
+ text: string;
2733
+ [property: string]: unknown;
2734
+ }
2735
+
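A sketch pairing generation parameters with the detailed output; details (and therefore per-token log-probabilities) is only present when it was requested:

import type { TextGenerationInput, TextGenerationOutput } from "@huggingface/tasks";

const request: TextGenerationInput = {
  inputs: "Once upon a time",
  parameters: { max_new_tokens: 64, temperature: 0.7, details: true, stop_sequences: ["\n\n"] },
};

// When details are requested, each generated token carries its log-probability.
function tokenLogprobs(output: TextGenerationOutput): Array<[string, number]> {
  return (output.details?.tokens ?? []).map((t): [string, number] => [t.text, t.logprob]);
}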
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Video Classification inference
+ */
+ interface VideoClassificationInput {
+ /**
+ * The input video data
+ */
+ inputs: unknown;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: VideoClassificationParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Video Classification
+ */
+ interface VideoClassificationParameters {
+ /**
+ * The sampling rate used to select frames from the video.
+ */
+ frame_sampling_rate?: number;
+ function_to_apply?: ClassificationOutputTransform;
+ /**
+ * The number of sampled frames to consider for classification.
+ */
+ num_frames?: number;
+ /**
+ * When specified, limits the output to the top K most probable classes.
+ */
+ top_k?: number;
+ [property: string]: unknown;
+ }
+ /**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+ type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+ type VideoClassificationOutput = VideoClassificationOutputElement[];
+ /**
+ * Outputs of inference for the Video Classification task
+ */
+ interface VideoClassificationOutputElement {
+ /**
+ * The predicted class label.
+ */
+ label: string;
+ /**
+ * The corresponding probability.
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
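A minimal sketch of a request/response pair using the video-classification declarations above (illustrative only; `inputs` is typed as `unknown`, so the URL below is just one plausible payload):

import type { VideoClassificationInput, VideoClassificationOutput } from "@huggingface/tasks";

const request: VideoClassificationInput = {
  inputs: "https://example.com/clip.mp4",
  parameters: { frame_sampling_rate: 4, num_frames: 16, function_to_apply: "softmax", top_k: 3 },
};

// Pick the highest-scoring label from a response array.
function topLabel(output: VideoClassificationOutput): string | undefined {
  return [...output].sort((a, b) => b.score - a.score)[0]?.label;
}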
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Visual Question Answering inference
+ */
+ interface VisualQuestionAnsweringInput {
+ /**
+ * One (image, question) pair to answer
+ */
+ inputs: VisualQuestionAnsweringInputData;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: VisualQuestionAnsweringParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * One (image, question) pair to answer
+ */
+ interface VisualQuestionAnsweringInputData {
+ /**
+ * The image.
+ */
+ image: unknown;
+ /**
+ * The question to answer based on the image.
+ */
+ question: unknown;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Visual Question Answering
+ */
+ interface VisualQuestionAnsweringParameters {
+ /**
+ * The number of answers to return (will be chosen by order of likelihood). Note that we
+ * return fewer than top_k answers if there are not enough options available within the
+ * context.
+ */
+ top_k?: number;
+ [property: string]: unknown;
+ }
+ type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
+ /**
+ * Outputs of inference for the Visual Question Answering task
+ */
+ interface VisualQuestionAnsweringOutputElement {
+ /**
+ * The answer to the question
+ */
+ answer?: string;
+ label: unknown;
+ /**
+ * The associated score / probability
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
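Likewise, a hypothetical visual-question-answering payload built from these declarations (image and question are both `unknown` in the generated type, so any serializable value type-checks):

import type { VisualQuestionAnsweringInput, VisualQuestionAnsweringOutput } from "@huggingface/tasks";

const vqaRequest: VisualQuestionAnsweringInput = {
  inputs: { image: "cats.png", question: "How many cats are visible?" },
  parameters: { top_k: 3 },
};

// `answer` is optional on each output element, hence the possible undefined.
function bestAnswer(output: VisualQuestionAnsweringOutput): string | undefined {
  return output[0]?.answer;
}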
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Zero Shot Classification inference
+ */
+ interface ZeroShotClassificationInput {
+ /**
+ * The input text data, with candidate labels
+ */
+ inputs: ZeroShotClassificationInputData;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: ZeroShotClassificationParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * The input text data, with candidate labels
+ */
+ interface ZeroShotClassificationInputData {
+ /**
+ * The set of possible class labels to classify the text into.
+ */
+ candidateLabels: string[];
+ /**
+ * The text to classify
+ */
+ text: string;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Classification
+ */
+ interface ZeroShotClassificationParameters {
+ /**
+ * The sentence used in conjunction with candidateLabels to attempt the text classification
+ * by replacing the placeholder with the candidate labels.
+ */
+ hypothesis_template?: string;
+ /**
+ * Whether multiple candidate labels can be true. If false, the scores are normalized such
+ * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+ * considered independent and probabilities are normalized for each candidate.
+ */
+ multi_label?: boolean;
+ [property: string]: unknown;
+ }
+ type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
+ /**
+ * Outputs of inference for the Zero Shot Classification task
+ */
+ interface ZeroShotClassificationOutputElement {
+ /**
+ * The predicted class label.
+ */
+ label: string;
+ /**
+ * The corresponding probability.
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
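A small, invented example of the zero-shot classification input/output shapes; note the camelCase `candidateLabels` key next to the snake_case parameter names, exactly as in the generated declarations:

import type { ZeroShotClassificationInput, ZeroShotClassificationOutput } from "@huggingface/tasks";

const zeroShotRequest: ZeroShotClassificationInput = {
  inputs: {
    text: "I just bought tickets for the concert next week.",
    candidateLabels: ["music", "sports", "politics"],
  },
  parameters: { multi_label: false, hypothesis_template: "This text is about {}." },
};

// With multi_label: false the scores sum to 1, so a simple threshold filter is reasonable.
function labelsAbove(output: ZeroShotClassificationOutput, threshold: number): string[] {
  return output.filter((el) => el.score >= threshold).map((el) => el.label);
}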
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Zero Shot Image Classification inference
+ */
+ interface ZeroShotImageClassificationInput {
+ /**
+ * The input image data, with candidate labels
+ */
+ inputs: ZeroShotImageClassificationInputData;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: ZeroShotImageClassificationParameters;
+ [property: string]: unknown;
+ }
+ /**
+ * The input image data, with candidate labels
+ */
+ interface ZeroShotImageClassificationInputData {
+ /**
+ * The candidate labels for this image
+ */
+ candidateLabels: string[];
+ /**
+ * The image data to classify
+ */
+ image: unknown;
+ [property: string]: unknown;
+ }
+ /**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Image Classification
+ */
+ interface ZeroShotImageClassificationParameters {
+ /**
+ * The sentence used in conjunction with candidateLabels to attempt the text classification
+ * by replacing the placeholder with the candidate labels.
+ */
+ hypothesis_template?: string;
+ [property: string]: unknown;
+ }
+ type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
+ /**
+ * Outputs of inference for the Zero Shot Image Classification task
+ */
+ interface ZeroShotImageClassificationOutputElement {
+ /**
+ * The predicted class label.
+ */
+ label: string;
+ /**
+ * The corresponding probability.
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
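And the analogous (hypothetical) zero-shot image classification request, where the hypothesis template is filled with each candidate label:

import type { ZeroShotImageClassificationInput } from "@huggingface/tasks";

const zsImageRequest: ZeroShotImageClassificationInput = {
  inputs: {
    image: "https://example.com/photo.jpg", // `image` is typed as unknown; raw bytes would also fit
    candidateLabels: ["a photo of a dog", "a photo of a cat"],
  },
  parameters: { hypothesis_template: "a photo of a {}" },
};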
+ /**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+ /**
+ * Inputs for Zero Shot Object Detection inference
+ */
+ interface ZeroShotObjectDetectionInput {
+ /**
+ * The input image data, with candidate labels
+ */
+ inputs: ZeroShotObjectDetectionInputData;
+ /**
+ * Additional inference parameters
+ */
+ parameters?: {
+ [key: string]: unknown;
+ };
+ [property: string]: unknown;
+ }
+ /**
+ * The input image data, with candidate labels
+ */
+ interface ZeroShotObjectDetectionInputData {
+ /**
+ * The candidate labels for this image
+ */
+ candidateLabels: string[];
+ /**
+ * The image data to generate bounding boxes from
+ */
+ image: unknown;
+ [property: string]: unknown;
+ }
+ /**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+ interface BoundingBox {
+ xmax: number;
+ xmin: number;
+ ymax: number;
+ ymin: number;
+ [property: string]: unknown;
+ }
+ type ZeroShotObjectDetectionOutput = ZeroShotObjectDetectionOutputElement[];
+ /**
+ * Outputs of inference for the Zero Shot Object Detection task
+ */
+ interface ZeroShotObjectDetectionOutputElement {
+ /**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+ box: BoundingBox;
+ /**
+ * A candidate label
+ */
+ label: string;
+ /**
+ * The associated score / probability
+ */
+ score: number;
+ [property: string]: unknown;
+ }
+
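Finally, a sketch of consuming the zero-shot object-detection output via the `BoundingBox` coordinates (relative to the top-left corner of the input image, per the doc comment above); the helper names are invented:

import type { BoundingBox, ZeroShotObjectDetectionOutput } from "@huggingface/tasks";

function boxArea(box: BoundingBox): number {
  return Math.max(0, box.xmax - box.xmin) * Math.max(0, box.ymax - box.ymin);
}

// Return the detection covering the largest area, if any.
function largestDetection(output: ZeroShotObjectDetectionOutput) {
  return [...output].sort((a, b) => boxArea(b.box) - boxArea(a.box))[0];
}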
  /**
  * Model libraries compatible with each ML task
  */
@@ -1231,4 +3219,4 @@ declare namespace index {
  };
  }
 
- export { ALL_DISPLAY_MODEL_LIBRARY_KEYS, ALL_MODEL_LIBRARY_KEYS, ExampleRepo, InferenceDisplayability, LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS, LibraryUiElement, MAPPING_DEFAULT_WIDGET, MODALITIES, MODALITY_LABELS, MODEL_LIBRARIES_UI_ELEMENTS, Modality, ModelData, ModelLibraryKey, PIPELINE_DATA, PIPELINE_TYPES, PIPELINE_TYPES_SET, PipelineData, PipelineType, SPECIAL_TOKENS_ATTRIBUTES, SUBTASK_TYPES, SpecialTokensMap, TASKS_DATA, TASKS_MODEL_LIBRARIES, TaskData, TaskDataCustom, TaskDemo, TaskDemoEntry, TokenizerConfig, TransformersInfo, WidgetExample, WidgetExampleAssetAndPromptInput, WidgetExampleAssetAndTextInput, WidgetExampleAssetAndZeroShotInput, WidgetExampleAssetInput, WidgetExampleAttribute, WidgetExampleOutput, WidgetExampleOutputAnswerScore, WidgetExampleOutputLabels, WidgetExampleOutputText, WidgetExampleOutputUrl, WidgetExampleSentenceSimilarityInput, WidgetExampleStructuredDataInput, WidgetExampleTableDataInput, WidgetExampleTextAndContextInput, WidgetExampleTextAndTableInput, WidgetExampleTextInput, WidgetExampleZeroShotTextInput, WidgetType, index as snippets };
+ export { ALL_DISPLAY_MODEL_LIBRARY_KEYS, ALL_MODEL_LIBRARY_KEYS, AudioClassificationInput, AudioClassificationOutput, AudioClassificationOutputElement, AudioClassificationParameters, AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput, AutomaticSpeechRecognitionOutputChunk, AutomaticSpeechRecognitionParameters, BoundingBox, ClassificationOutputTransform$1 as ClassificationOutputTransform, DepthEstimationInput, DepthEstimationOutput, DocumentQuestionAnsweringInput, DocumentQuestionAnsweringInputData, DocumentQuestionAnsweringOutput, DocumentQuestionAnsweringOutputElement, DocumentQuestionAnsweringParameters, EarlyStoppingUnion$2 as EarlyStoppingUnion, ExampleRepo, FeatureExtractionInput, FeatureExtractionOutput, FillMaskInput, FillMaskOutput, FillMaskOutputElement, FillMaskParameters, FinishReason, GenerationParameters$2 as GenerationParameters, ImageClassificationInput, ImageClassificationOutput, ImageClassificationOutputElement, ImageClassificationParameters, ImageSegmentationInput, ImageSegmentationOutput, ImageSegmentationOutputElement, ImageSegmentationParameters, ImageSegmentationSubtask, ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToTextInput, ImageToTextOutput, ImageToTextParameters, InferenceDisplayability, LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS, LibraryUiElement, MAPPING_DEFAULT_WIDGET, MODALITIES, MODALITY_LABELS, MODEL_LIBRARIES_UI_ELEMENTS, Modality, ModelData, ModelLibraryKey, ObjectDetectionInput, ObjectDetectionOutput, ObjectDetectionOutputElement, ObjectDetectionParameters, PIPELINE_DATA, PIPELINE_TYPES, PIPELINE_TYPES_SET, PipelineData, PipelineType, PrefillToken, QuestionAnsweringInput, QuestionAnsweringInputData, QuestionAnsweringOutput, QuestionAnsweringOutputElement, QuestionAnsweringParameters, SPECIAL_TOKENS_ATTRIBUTES, SUBTASK_TYPES, SentenceSimilarityInput, SentenceSimilarityInputData, SentenceSimilarityOutput, SpecialTokensMap, SummarizationInput, SummarizationOutput, TASKS_DATA, TASKS_MODEL_LIBRARIES, TableQuestionAnsweringInput, TableQuestionAnsweringInputData, TableQuestionAnsweringOutput, TableQuestionAnsweringOutputElement, TargetSize$1 as TargetSize, TaskData, TaskDataCustom, TaskDemo, TaskDemoEntry, Text2TextGenerationParameters, Text2TextGenerationTruncationStrategy, TextClassificationInput, TextClassificationOutput, TextClassificationOutputElement, TextClassificationParameters, TextGenerationInput, TextGenerationOutput, TextGenerationOutputDetails, TextGenerationParameters, TextGenerationSequenceDetails, TextToAudioParameters, TextToImageInput, TextToImageOutput, TextToImageParameters, TextToSpeechInput, TextToSpeechOutput, Token, TokenClassificationAggregationStrategy, TokenClassificationInput, TokenClassificationOutput, TokenClassificationOutputElement, TokenClassificationParameters, TokenizerConfig, TransformersInfo, TranslationInput, TranslationOutput, VideoClassificationInput, VideoClassificationOutput, VideoClassificationOutputElement, VideoClassificationParameters, VisualQuestionAnsweringInput, VisualQuestionAnsweringInputData, VisualQuestionAnsweringOutput, VisualQuestionAnsweringOutputElement, VisualQuestionAnsweringParameters, WidgetExample, WidgetExampleAssetAndPromptInput, WidgetExampleAssetAndTextInput, WidgetExampleAssetAndZeroShotInput, WidgetExampleAssetInput, WidgetExampleAttribute, WidgetExampleOutput, WidgetExampleOutputAnswerScore, WidgetExampleOutputLabels, WidgetExampleOutputText, WidgetExampleOutputUrl, WidgetExampleSentenceSimilarityInput, WidgetExampleStructuredDataInput, 
WidgetExampleTableDataInput, WidgetExampleTextAndContextInput, WidgetExampleTextAndTableInput, WidgetExampleTextInput, WidgetExampleZeroShotTextInput, WidgetType, WordBox, ZeroShotClassificationInput, ZeroShotClassificationInputData, ZeroShotClassificationOutput, ZeroShotClassificationOutputElement, ZeroShotClassificationParameters, ZeroShotImageClassificationInput, ZeroShotImageClassificationInputData, ZeroShotImageClassificationOutput, ZeroShotImageClassificationOutputElement, ZeroShotImageClassificationParameters, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionInputData, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, index as snippets };
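The net effect of the widened export statement is that the task I/O declarations added in this release are importable from the package root. Since they are types only (erased at compile time), `import type` is the natural way to consume them; the grouping below is illustrative, not exhaustive:

import type {
  TextGenerationOutput,
  VideoClassificationOutput,
  VisualQuestionAnsweringInput,
  ZeroShotObjectDetectionOutputElement,
} from "@huggingface/tasks";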