huggingface-hub 0.26.2__py3-none-any.whl → 0.27.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +49 -23
- huggingface_hub/_commit_scheduler.py +30 -4
- huggingface_hub/_local_folder.py +0 -4
- huggingface_hub/_login.py +38 -54
- huggingface_hub/_snapshot_download.py +6 -3
- huggingface_hub/_tensorboard_logger.py +2 -3
- huggingface_hub/_upload_large_folder.py +1 -1
- huggingface_hub/errors.py +19 -0
- huggingface_hub/fastai_utils.py +3 -2
- huggingface_hub/file_download.py +10 -12
- huggingface_hub/hf_api.py +102 -498
- huggingface_hub/hf_file_system.py +274 -35
- huggingface_hub/hub_mixin.py +5 -25
- huggingface_hub/inference/_client.py +185 -136
- huggingface_hub/inference/_common.py +2 -2
- huggingface_hub/inference/_generated/_async_client.py +186 -137
- huggingface_hub/inference/_generated/types/__init__.py +31 -10
- huggingface_hub/inference/_generated/types/audio_classification.py +3 -5
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +4 -8
- huggingface_hub/inference/_generated/types/chat_completion.py +8 -5
- huggingface_hub/inference/_generated/types/depth_estimation.py +1 -1
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -6
- huggingface_hub/inference/_generated/types/feature_extraction.py +1 -1
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -4
- huggingface_hub/inference/_generated/types/image_classification.py +3 -5
- huggingface_hub/inference/_generated/types/image_segmentation.py +2 -4
- huggingface_hub/inference/_generated/types/image_to_image.py +2 -4
- huggingface_hub/inference/_generated/types/image_to_text.py +4 -8
- huggingface_hub/inference/_generated/types/object_detection.py +2 -4
- huggingface_hub/inference/_generated/types/question_answering.py +2 -4
- huggingface_hub/inference/_generated/types/sentence_similarity.py +1 -1
- huggingface_hub/inference/_generated/types/summarization.py +2 -4
- huggingface_hub/inference/_generated/types/table_question_answering.py +21 -3
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -4
- huggingface_hub/inference/_generated/types/text_classification.py +4 -10
- huggingface_hub/inference/_generated/types/text_to_audio.py +6 -10
- huggingface_hub/inference/_generated/types/text_to_image.py +2 -4
- huggingface_hub/inference/_generated/types/text_to_speech.py +6 -10
- huggingface_hub/inference/_generated/types/token_classification.py +11 -12
- huggingface_hub/inference/_generated/types/translation.py +2 -4
- huggingface_hub/inference/_generated/types/video_classification.py +3 -4
- huggingface_hub/inference/_generated/types/visual_question_answering.py +2 -5
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +8 -18
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +9 -19
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +7 -9
- huggingface_hub/keras_mixin.py +3 -2
- huggingface_hub/lfs.py +2 -5
- huggingface_hub/repocard_data.py +4 -4
- huggingface_hub/serialization/__init__.py +2 -0
- huggingface_hub/serialization/_dduf.py +387 -0
- huggingface_hub/serialization/_torch.py +407 -25
- huggingface_hub/utils/_cache_manager.py +1 -1
- huggingface_hub/utils/_datetime.py +14 -9
- huggingface_hub/utils/_headers.py +9 -25
- huggingface_hub/utils/tqdm.py +15 -0
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/METADATA +8 -3
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/RECORD +61 -61
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/WHEEL +1 -1
- huggingface_hub/_multi_commits.py +0 -306
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.26.2.dist-info → huggingface_hub-0.27.0rc0.dist-info}/top_level.txt +0 -0
huggingface_hub/inference/_generated/_async_client.py

@@ -56,17 +56,24 @@ from huggingface_hub.inference._generated.types import (
     AutomaticSpeechRecognitionOutput,
     ChatCompletionInputGrammarType,
     ChatCompletionInputStreamOptions,
-
+    ChatCompletionInputTool,
+    ChatCompletionInputToolChoiceClass,
+    ChatCompletionInputToolChoiceEnum,
     ChatCompletionOutput,
     ChatCompletionStreamOutput,
     DocumentQuestionAnsweringOutputElement,
     FillMaskOutputElement,
     ImageClassificationOutputElement,
+    ImageClassificationOutputTransform,
     ImageSegmentationOutputElement,
+    ImageSegmentationSubtask,
+    ImageToImageTargetSize,
     ImageToTextOutput,
     ObjectDetectionOutputElement,
+    Padding,
     QuestionAnsweringOutputElement,
     SummarizationOutput,
+    SummarizationTruncationStrategy,
     TableQuestionAnsweringOutputElement,
     TextClassificationOutputElement,
     TextClassificationOutputTransform,
@@ -75,9 +82,10 @@ from huggingface_hub.inference._generated.types import (
     TextGenerationStreamOutput,
     TextToImageTargetSize,
     TextToSpeechEarlyStoppingEnum,
+    TokenClassificationAggregationStrategy,
     TokenClassificationOutputElement,
-    ToolElement,
     TranslationOutput,
+    TranslationTruncationStrategy,
     VisualQuestionAnsweringOutputElement,
     ZeroShotClassificationOutputElement,
     ZeroShotImageClassificationOutputElement,
@@ -170,7 +178,9 @@ class AsyncInferenceClient:
 
         self.model: Optional[str] = model
         self.token: Union[str, bool, None] = token if token is not None else api_key
-        self.headers = CaseInsensitiveDict(
+        self.headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
+            build_hf_headers(token=self.token)  # 'authorization' + 'user-agent'
+        )
         if headers is not None:
             self.headers.update(headers)
         self.cookies = cookies
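The constructor change above makes the default header set explicit: `build_hf_headers` supplies the 'authorization' and 'user-agent' entries first, then any caller-supplied `headers` are merged on top. A minimal sketch of the resulting behavior (token and header values are placeholders, not from this diff):

```py
from huggingface_hub import AsyncInferenceClient

# Defaults from build_hf_headers() come first; the `headers` argument
# then overrides or extends them. Lookups are case-insensitive.
client = AsyncInferenceClient(
    token="hf_xxx",  # placeholder token
    headers={"X-Custom-Header": "my-value"},
)
print(client.headers["x-custom-header"])  # -> "my-value"
```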
@@ -317,7 +327,7 @@ class AsyncInferenceClient:
                 logger.info(f"Waiting for model to be loaded on the server: {error}")
                 if "X-wait-for-model" not in headers and url.startswith(INFERENCE_ENDPOINT):
                     headers["X-wait-for-model"] = "1"
-
+                await asyncio.sleep(1)
                 if timeout is not None:
                     timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
                 continue
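The removed line (content lost in extraction) was presumably a blocking sleep; `await asyncio.sleep(1)` yields to the event loop instead, so other coroutines keep running while the client waits for the model to load. A toy illustration of the difference (not from the package):

```py
import asyncio

async def heartbeat() -> None:
    for _ in range(3):
        print("event loop still responsive")
        await asyncio.sleep(0.5)

async def wait_for_model() -> None:
    # A blocking time.sleep(1) here would stall heartbeat();
    # asyncio.sleep suspends only this coroutine.
    await asyncio.sleep(1)
    print("retrying request")

async def main() -> None:
    await asyncio.gather(heartbeat(), wait_for_model())

asyncio.run(main())
```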
@@ -374,7 +384,7 @@ class AsyncInferenceClient:
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
-                The function to apply to the
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -502,9 +512,9 @@ class AsyncInferenceClient:
         stop: Optional[List[str]] = None,
         stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> ChatCompletionOutput: ...
@@ -527,9 +537,9 @@ class AsyncInferenceClient:
         stop: Optional[List[str]] = None,
         stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> AsyncIterable[ChatCompletionStreamOutput]: ...
@@ -552,9 +562,9 @@ class AsyncInferenceClient:
         stop: Optional[List[str]] = None,
         stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ...
@@ -577,9 +587,9 @@ class AsyncInferenceClient:
         stop: Optional[List[str]] = None,
         stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
@@ -618,7 +628,7 @@ class AsyncInferenceClient:
                 Whether to return log probabilities of the output tokens or not. If true, returns the log
                 probabilities of each output token returned in the content of message.
             max_tokens (`int`, *optional*):
-                Maximum number of tokens allowed in the response. Defaults to
+                Maximum number of tokens allowed in the response. Defaults to 100.
             n (`int`, *optional*):
                 UNUSED.
             presence_penalty (`float`, *optional*):
@@ -645,11 +655,11 @@ class AsyncInferenceClient:
             top_p (`float`, *optional*):
                 Fraction of the most likely next words to sample from.
                 Must be between 0 and 1. Defaults to 1.0.
-            tool_choice ([`
+            tool_choice ([`ChatCompletionInputToolChoiceClass`] or [`ChatCompletionInputToolChoiceEnum`], *optional*):
                 The tool to use for the completion. Defaults to "auto".
             tool_prompt (`str`, *optional*):
                 A prompt to be appended before the tools.
-            tools (List of [`
+            tools (List of [`ChatCompletionInputTool`], *optional*):
                 A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
                 provide a list of functions the model may generate JSON inputs for.
 
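A minimal sketch of the renamed tool parameters in use; the model and tool schema here are illustrative, not part of this diff. Plain dicts matching the `ChatCompletionInputTool` shape are accepted, and `tool_choice` takes either an enum string such as "auto" or a `ChatCompletionInputToolChoiceClass` instance:

```py
import asyncio
from huggingface_hub import AsyncInferenceClient

# Hypothetical OpenAI-style function definition.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

async def main() -> None:
    client = AsyncInferenceClient()
    response = await client.chat_completion(
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=TOOLS,
        tool_choice="auto",  # a ChatCompletionInputToolChoiceEnum value
    )
    print(response.choices[0].message)

asyncio.run(main())
```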
@@ -981,28 +991,25 @@ class AsyncInferenceClient:
                 a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
                 Defaults to None.
             doc_stride (`int`, *optional*):
-                If the words in the document are too long to fit with the question for the model, it will
-
-                overlap.
+                If the words in the document are too long to fit with the question for the model, it will be split in
+                several chunks with some overlap. This argument controls the size of that overlap.
             handle_impossible_answer (`bool`, *optional*):
-                Whether to accept impossible as an answer
+                Whether to accept impossible as an answer
             lang (`str`, *optional*):
-                Language to use while running OCR.
+                Language to use while running OCR. Defaults to english.
             max_answer_len (`int`, *optional*):
-                The maximum length of predicted answers (e.g., only answers with a shorter length are
-                considered).
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
             max_question_len (`int`, *optional*):
                 The maximum length of the question after tokenization. It will be truncated if needed.
             max_seq_len (`int`, *optional*):
-                The maximum length of the total sentence (context + question) in tokens of each chunk
-
-                overlap) if needed.
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
             top_k (`int`, *optional*):
-                The number of answers to return (will be chosen by order of likelihood). Can return less
-
-            word_boxes (`List[Union[List[float], str
-                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-
+                The number of answers to return (will be chosen by order of likelihood). Can return less than top_k
+                answers if there are not enough options available within the context.
+            word_boxes (`List[Union[List[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
+                step and use the provided bounding boxes instead.
         Returns:
             `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
 
@@ -1019,7 +1026,7 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
         ```
         """
         inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
@@ -1121,11 +1128,10 @@ class AsyncInferenceClient:
             model (`str`, *optional*):
                 The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
-            targets (`List[str
-                When passed, the model will limit the scores to the passed targets instead of looking up
-
-
-                slower).
+            targets (`List[str]`, *optional*):
+                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+                vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
+                resulting token will be used (with a warning, and that might be slower).
             top_k (`int`, *optional*):
                 When passed, overrides the number of predictions to return.
         Returns:
@@ -1160,7 +1166,7 @@ class AsyncInferenceClient:
         image: ContentT,
         *,
         model: Optional[str] = None,
-        function_to_apply: Optional[
+        function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
         top_k: Optional[int] = None,
     ) -> List[ImageClassificationOutputElement]:
         """
@@ -1172,8 +1178,8 @@ class AsyncInferenceClient:
             model (`str`, *optional*):
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
-            function_to_apply (`
-                The function to apply to the
+            function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
         Returns:
@@ -1206,7 +1212,7 @@ class AsyncInferenceClient:
         model: Optional[str] = None,
         mask_threshold: Optional[float] = None,
         overlap_mask_area_threshold: Optional[float] = None,
-        subtask: Optional[
+        subtask: Optional["ImageSegmentationSubtask"] = None,
         threshold: Optional[float] = None,
     ) -> List[ImageSegmentationOutputElement]:
         """
@@ -1228,7 +1234,7 @@ class AsyncInferenceClient:
                 Threshold to use when turning the predicted masks into binary values.
             overlap_mask_area_threshold (`float`, *optional*):
                 Mask overlap threshold to eliminate small, disconnected segments.
-            subtask (`
+            subtask (`"ImageSegmentationSubtask"`, *optional*):
                 Segmentation task to be performed, depending on model capabilities.
             threshold (`float`, *optional*):
                 Probability threshold to filter out predicted masks.
@@ -1268,12 +1274,11 @@ class AsyncInferenceClient:
         image: ContentT,
         prompt: Optional[str] = None,
         *,
-        negative_prompt: Optional[str] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
+        negative_prompt: Optional[List[str]] = None,
         num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
+        target_size: Optional[ImageToImageTargetSize] = None,
         **kwargs,
     ) -> "Image":
         """
@@ -1290,21 +1295,19 @@ class AsyncInferenceClient:
                 The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
-            negative_prompt (`str`, *optional*):
-
-            height (`int`, *optional*):
-                The height in pixels of the generated image.
-            width (`int`, *optional*):
-                The width in pixels of the generated image.
+            negative_prompt (`List[str]`, *optional*):
+                One or several prompt to guide what NOT to include in image generation.
             num_inference_steps (`int`, *optional*):
-                The number of denoising steps. More denoising steps usually lead to a higher
-                expense of slower inference.
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
             guidance_scale (`float`, *optional*):
-
-
+                For diffusion models. A higher guidance scale value encourages the model to generate images closely
+                linked to the text prompt at the expense of lower image quality.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            target_size (`ImageToImageTargetSize`, *optional*):
+                The size in pixel of the output image.
 
         Returns:
             `Image`: The translated image.
@@ -1327,8 +1330,7 @@ class AsyncInferenceClient:
         parameters = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
-            "height": height,
-            "width": width,
+            "target_size": target_size,
             "num_inference_steps": num_inference_steps,
             "guidance_scale": guidance_scale,
             **kwargs,
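With `height`/`width` folded into `target_size`, an image-to-image call after this change would look roughly like the sketch below. The input file and prompts are placeholders; `ImageToImageTargetSize` is the generated type added to the imports above:

```py
import asyncio
from huggingface_hub import AsyncInferenceClient
from huggingface_hub.inference._generated.types import ImageToImageTargetSize

async def main() -> None:
    client = AsyncInferenceClient()
    image = await client.image_to_image(
        "cat.png",  # placeholder input image
        prompt="Turn the cat into a tiger.",
        # Replaces the removed top-level height=/width= arguments.
        target_size=ImageToImageTargetSize(height=512, width=512),
        negative_prompt=["blurry"],  # now a list of strings
    )
    image.save("tiger.png")

asyncio.run(main())
```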
@@ -1537,26 +1539,24 @@ class AsyncInferenceClient:
                 The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint.
             align_to_words (`bool`, *optional*):
-                Attempts to align the answer to real words. Improves quality on space separated
-
+                Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt
+                on non-space-separated languages (like Japanese or Chinese)
             doc_stride (`int`, *optional*):
-                If the context is too long to fit with the question for the model, it will be split in
-
+                If the context is too long to fit with the question for the model, it will be split in several chunks
+                with some overlap. This argument controls the size of that overlap.
             handle_impossible_answer (`bool`, *optional*):
                 Whether to accept impossible as an answer.
             max_answer_len (`int`, *optional*):
-                The maximum length of predicted answers (e.g., only answers with a shorter length are
-                considered).
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
             max_question_len (`int`, *optional*):
                 The maximum length of the question after tokenization. It will be truncated if needed.
             max_seq_len (`int`, *optional*):
-                The maximum length of the total sentence (context + question) in tokens of each chunk
-
-                overlap) if needed.
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using docStride as overlap) if needed.
             top_k (`int`, *optional*):
-                The number of answers to return (will be chosen by order of likelihood). Note that we
-
-
+                The number of answers to return (will be chosen by order of likelihood). Note that we return less than
+                topk answers if there are not enough options available within the context.
+
         Returns:
             Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
             When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
@@ -1660,7 +1660,7 @@ class AsyncInferenceClient:
         model: Optional[str] = None,
         clean_up_tokenization_spaces: Optional[bool] = None,
         generate_parameters: Optional[Dict[str, Any]] = None,
-        truncation: Optional[
+        truncation: Optional["SummarizationTruncationStrategy"] = None,
     ) -> SummarizationOutput:
         """
         Generate a summary of a given text using a specified model.
@@ -1678,7 +1678,7 @@ class AsyncInferenceClient:
                 Whether to clean up the potential extra spaces in the text output.
             generate_parameters (`Dict[str, Any]`, *optional*):
                 Additional parametrization of the text generation algorithm.
-            truncation (`
+            truncation (`"SummarizationTruncationStrategy"`, *optional*):
                 The truncation strategy to use.
         Returns:
             [`SummarizationOutput`]: The generated summary text.
@@ -1714,7 +1714,9 @@ class AsyncInferenceClient:
         query: str,
         *,
         model: Optional[str] = None,
-
+        padding: Optional["Padding"] = None,
+        sequential: Optional[bool] = None,
+        truncation: Optional[bool] = None,
     ) -> TableQuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from information given in a table.
@@ -1728,8 +1730,14 @@ class AsyncInferenceClient:
             model (`str`):
                 The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
                 Hub or a URL to a deployed Inference Endpoint.
-
-
+            padding (`"Padding"`, *optional*):
+                Activates and controls padding.
+            sequential (`bool`, *optional*):
+                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
+                inference to be done sequentially to extract relations within sequences, given their conversational
+                nature.
+            truncation (`bool`, *optional*):
+                Activates and controls truncation.
 
         Returns:
             [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1751,6 +1759,11 @@ class AsyncInferenceClient:
         TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
         ```
         """
+        parameters = {
+            "padding": padding,
+            "sequential": sequential,
+            "truncation": truncation,
+        }
         inputs = {
             "query": query,
             "table": table,
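A sketch of the three new knobs in use; the table contents are illustrative. Assuming `Padding` follows the usual tokenizer options ("do_not_pad", "longest", "max_length"), and noting that `sequential=True` matters for conversational models like SQA:

```py
import asyncio
from huggingface_hub import AsyncInferenceClient

async def main() -> None:
    client = AsyncInferenceClient()
    table = {
        "Repository": ["Transformers", "Datasets", "Tokenizers"],
        "Stars": ["36542", "4512", "3934"],
    }
    answer = await client.table_question_answering(
        table,
        query="How many stars does the transformers repository have?",
        sequential=True,       # SQA-style models need sequential inference
        padding="max_length",  # a `Padding` enum value
        truncation=True,
    )
    print(answer.cells)

asyncio.run(main())
```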
@@ -1875,7 +1888,7 @@ class AsyncInferenceClient:
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
-                The function to apply to the
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -2136,7 +2149,7 @@ class AsyncInferenceClient:
             grammar ([`TextGenerationInputGrammarType`], *optional*):
                 Grammar constraints. Can be either a JSONSchema or a regex.
             max_new_tokens (`int`, *optional*):
-                Maximum number of generated tokens
+                Maximum number of generated tokens. Defaults to 100.
             repetition_penalty (`float`, *optional*):
                 The parameter for repetition penalty. 1.0 means no penalty. See [this
                 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
@@ -2411,10 +2424,10 @@ class AsyncInferenceClient:
         self,
         prompt: str,
         *,
-        negative_prompt: Optional[str] = None,
+        negative_prompt: Optional[List[str]] = None,
         height: Optional[float] = None,
         width: Optional[float] = None,
-        num_inference_steps: Optional[
+        num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
         scheduler: Optional[str] = None,
@@ -2434,8 +2447,8 @@ class AsyncInferenceClient:
         Args:
             prompt (`str`):
                 The prompt to generate an image from.
-            negative_prompt (`str`, *optional*):
-
+            negative_prompt (`List[str]`, *optional*):
+                One or several prompt to guide what NOT to include in image generation.
             height (`float`, *optional*):
                 The height in pixels of the image to generate.
             width (`float`, *optional*):
@@ -2444,8 +2457,8 @@ class AsyncInferenceClient:
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, *optional*):
-
-
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                prompt, but values too high may cause saturation and other artifacts.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
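Since `negative_prompt` is now a list of strings rather than a single string, a text-to-image call after this change would look like the sketch below (prompts are illustrative):

```py
import asyncio
from huggingface_hub import AsyncInferenceClient

async def main() -> None:
    client = AsyncInferenceClient()
    image = await client.text_to_image(
        "An astronaut riding a horse on the moon.",
        negative_prompt=["blurry", "low resolution"],  # was a single str before
        num_inference_steps=25,
    )
    image.save("astronaut.png")

asyncio.run(main())
```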
@@ -2533,44 +2546,42 @@ class AsyncInferenceClient:
                 Defaults to None.
             do_sample (`bool`, *optional*):
                 Whether to use sampling instead of greedy decoding when generating new tokens.
-            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"`, *optional*):
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
                 Controls the stopping condition for beam-based methods.
             epsilon_cutoff (`float`, *optional*):
-                If set to float strictly between 0 and 1, only tokens with a conditional probability
-
-
-
+                If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
+                epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on
+                the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
             eta_cutoff (`float`, *optional*):
-                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
-
-
-
-
-
-                for more details.
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly
+                between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff)
+                * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token
+                probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3,
+                depending on the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
             max_length (`int`, *optional*):
                 The maximum length (in tokens) of the generated text, including the input.
             max_new_tokens (`int`, *optional*):
-                The maximum number of tokens to generate. Takes precedence over
+                The maximum number of tokens to generate. Takes precedence over max_length.
             min_length (`int`, *optional*):
                 The minimum length (in tokens) of the generated text, including the input.
             min_new_tokens (`int`, *optional*):
-                The minimum number of tokens to generate. Takes precedence over
+                The minimum number of tokens to generate. Takes precedence over min_length.
             num_beam_groups (`int`, *optional*):
-                Number of groups to divide num_beams into in order to ensure diversity among different
-
+                Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
+                See [this paper](https://hf.co/papers/1610.02424) for more details.
             num_beams (`int`, *optional*):
                 Number of beams to use for beam search.
             penalty_alpha (`float`, *optional*):
-                The value balances the model confidence and the degeneration penalty in contrastive
-                search decoding.
+                The value balances the model confidence and the degeneration penalty in contrastive search decoding.
             temperature (`float`, *optional*):
                 The value used to modulate the next token probabilities.
             top_k (`int`, *optional*):
                 The number of highest probability vocabulary tokens to keep for top-k-filtering.
             top_p (`float`, *optional*):
-                If set to float < 1, only the smallest set of most probable tokens with probabilities
-
+                If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+                top_p or higher are kept for generation.
             typical_p (`float`, *optional*):
                 Local typicality measures how similar the conditional probability of predicting a target token next is
                 to the expected conditional probability of predicting a random token next, given the partial text
@@ -2627,7 +2638,7 @@ class AsyncInferenceClient:
         text: str,
         *,
         model: Optional[str] = None,
-        aggregation_strategy: Optional[
+        aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
         ignore_labels: Optional[List[str]] = None,
         stride: Optional[int] = None,
     ) -> List[TokenClassificationOutputElement]:
@@ -2642,10 +2653,10 @@ class AsyncInferenceClient:
                 The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
                 Defaults to None.
-            aggregation_strategy (`
-                The strategy used to fuse tokens based on model predictions
-            ignore_labels (`List[str
-                A list of labels to ignore
+            aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
+                The strategy used to fuse tokens based on model predictions
+            ignore_labels (`List[str]`, *optional*):
+                A list of labels to ignore
             stride (`int`, *optional*):
                 The number of overlapping tokens between chunks when splitting the input text.
 
@@ -2704,7 +2715,7 @@ class AsyncInferenceClient:
         src_lang: Optional[str] = None,
         tgt_lang: Optional[str] = None,
         clean_up_tokenization_spaces: Optional[bool] = None,
-        truncation: Optional[
+        truncation: Optional["TranslationTruncationStrategy"] = None,
         generate_parameters: Optional[Dict[str, Any]] = None,
     ) -> TranslationOutput:
         """
@@ -2728,7 +2739,7 @@ class AsyncInferenceClient:
                 Target language to translate to. Required for models that can translate to multiple languages.
             clean_up_tokenization_spaces (`bool`, *optional*):
                 Whether to clean up the potential extra spaces in the text output.
-            truncation (`
+            truncation (`"TranslationTruncationStrategy"`, *optional*):
                 The truncation strategy to use.
             generate_parameters (`Dict[str, Any]`, *optional*):
                 Additional parametrization of the text generation algorithm.
@@ -2752,13 +2763,13 @@ class AsyncInferenceClient:
         >>> await client.translation("My name is Wolfgang and I live in Berlin")
         'Mein Name ist Wolfgang und ich lebe in Berlin.'
         >>> await client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
-        TranslationOutput(translation_text='Je m
+        TranslationOutput(translation_text='Je m'appelle Wolfgang et je vis à Berlin.')
         ```
 
         Specifying languages:
         ```py
         >>> client.translation("My name is Sarah Jessica Parker but you can call me Jessica", model="facebook/mbart-large-50-many-to-many-mmt", src_lang="en_XX", tgt_lang="fr_XX")
-        "Mon nom est Sarah Jessica Parker mais vous pouvez m
+        "Mon nom est Sarah Jessica Parker mais vous pouvez m'appeler Jessica"
         ```
         """
         # Throw error if only one of `src_lang` and `tgt_lang` was given
@@ -2799,9 +2810,8 @@ class AsyncInferenceClient:
                 a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
                 Defaults to None.
             top_k (`int`, *optional*):
-                The number of answers to return (will be chosen by order of likelihood). Note that we
-
-                context.
+                The number of answers to return (will be chosen by order of likelihood). Note that we return less than
+                topk answers if there are not enough options available within the context.
         Returns:
             `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
 
@@ -2832,14 +2842,22 @@ class AsyncInferenceClient:
         response = await self.post(json=payload, model=model, task="visual-question-answering")
         return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
 
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels`has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
     async def zero_shot_classification(
         self,
         text: str,
-
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
         *,
-        multi_label: bool = False,
+        multi_label: Optional[bool] = False,
         hypothesis_template: Optional[str] = None,
         model: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotClassificationOutputElement]:
         """
         Provide as input a text and a set of candidate labels to classify the input text.
@@ -2847,20 +2865,22 @@ class AsyncInferenceClient:
         Args:
             text (`str`):
                 The input text to classify.
-
-
-
-
-
+            candidate_labels (`List[str]`):
+                The set of possible class labels to classify the text into.
+            labels (`List[str]`, *optional*):
+                (deprecated) List of strings. Each string is the verbalization of a possible label for the input text.
+            multi_label (`bool`, *optional*):
+                Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of
+                the label likelihoods for each sequence is 1. If true, the labels are considered independent and
+                probabilities are normalized for each candidate.
             hypothesis_template (`str`, *optional*):
-
-
-                For example, with hypothesis_template="This text is about {}." and labels=["economics", "politics"], the system internally creates the two hypotheses "This text is about economics." and "This text is about politics.".
-                The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
 
+
         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2918,9 +2938,17 @@ class AsyncInferenceClient:
         ]
         ```
         """
-
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
         parameters = {
-            "candidate_labels":
+            "candidate_labels": candidate_labels,
             "multi_label": multi_label,
             "hypothesis_template": hypothesis_template,
         }
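For callers, the shim above means `labels=` keeps working (with a deprecation warning) until 0.30.0, but `labels` and `candidate_labels` cannot be mixed. A migration sketch with illustrative inputs; the same rename applies to `zero_shot_image_classification` below:

```py
import asyncio
from huggingface_hub import AsyncInferenceClient

async def main() -> None:
    client = AsyncInferenceClient()
    text = "The stock market dropped after the rate announcement."

    # Before (warns, removed in huggingface_hub>=0.30.0):
    # await client.zero_shot_classification(text, labels=["economics", "politics"])

    # After:
    results = await client.zero_shot_classification(
        text,
        candidate_labels=["economics", "politics", "sports"],
        multi_label=False,
    )
    print(results)

asyncio.run(main())
```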
@@ -2936,13 +2964,21 @@ class AsyncInferenceClient:
             for label, score in zip(output["labels"], output["scores"])
         ]
 
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels`has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
     async def zero_shot_image_classification(
         self,
         image: ContentT,
-
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.
@@ -2950,14 +2986,17 @@ class AsyncInferenceClient:
         Args:
             image (`Union[str, Path, bytes, BinaryIO]`):
                 The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
-
-
+            candidate_labels (`List[str]`):
+                The candidate labels for this image
+            labels (`List[str]`, *optional*):
+                (deprecated) List of string possible labels. There must be at least 2 labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with `
-                placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
+
         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2980,13 +3019,23 @@ class AsyncInferenceClient:
         [ZeroShotImageClassificationOutputElement(label='dog', score=0.956),...]
         ```
         """
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
         # Raise ValueError if input is less than 2 labels
-        if len(
+        if len(candidate_labels) < 2:
             raise ValueError("You must specify at least 2 classes to compare.")
-
-
-
-
+        parameters = {
+            "candidate_labels": candidate_labels,
+            "hypothesis_template": hypothesis_template,
+        }
+        payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
         response = await self.post(
             **payload,
             model=model,