huggingface-hub 0.25.2__py3-none-any.whl → 0.26.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of huggingface-hub might be problematic.
Files changed (45)
  1. huggingface_hub/__init__.py +45 -11
  2. huggingface_hub/_login.py +172 -33
  3. huggingface_hub/commands/user.py +125 -9
  4. huggingface_hub/constants.py +1 -1
  5. huggingface_hub/errors.py +6 -9
  6. huggingface_hub/file_download.py +2 -372
  7. huggingface_hub/hf_api.py +170 -13
  8. huggingface_hub/hf_file_system.py +3 -3
  9. huggingface_hub/hub_mixin.py +2 -1
  10. huggingface_hub/inference/_client.py +500 -145
  11. huggingface_hub/inference/_common.py +42 -4
  12. huggingface_hub/inference/_generated/_async_client.py +499 -144
  13. huggingface_hub/inference/_generated/types/__init__.py +37 -7
  14. huggingface_hub/inference/_generated/types/audio_classification.py +8 -5
  15. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +9 -7
  16. huggingface_hub/inference/_generated/types/chat_completion.py +23 -4
  17. huggingface_hub/inference/_generated/types/image_classification.py +8 -5
  18. huggingface_hub/inference/_generated/types/image_segmentation.py +9 -7
  19. huggingface_hub/inference/_generated/types/image_to_image.py +7 -5
  20. huggingface_hub/inference/_generated/types/image_to_text.py +4 -4
  21. huggingface_hub/inference/_generated/types/object_detection.py +11 -5
  22. huggingface_hub/inference/_generated/types/summarization.py +11 -13
  23. huggingface_hub/inference/_generated/types/text_classification.py +10 -5
  24. huggingface_hub/inference/_generated/types/text_generation.py +1 -0
  25. huggingface_hub/inference/_generated/types/text_to_audio.py +2 -2
  26. huggingface_hub/inference/_generated/types/text_to_image.py +9 -7
  27. huggingface_hub/inference/_generated/types/text_to_speech.py +107 -0
  28. huggingface_hub/inference/_generated/types/translation.py +17 -11
  29. huggingface_hub/inference/_generated/types/video_classification.py +2 -2
  30. huggingface_hub/repocard.py +2 -1
  31. huggingface_hub/repocard_data.py +10 -2
  32. huggingface_hub/serialization/_torch.py +7 -4
  33. huggingface_hub/utils/__init__.py +4 -20
  34. huggingface_hub/utils/{_token.py → _auth.py} +86 -3
  35. huggingface_hub/utils/_headers.py +1 -1
  36. huggingface_hub/utils/_hf_folder.py +1 -1
  37. huggingface_hub/utils/_http.py +10 -4
  38. huggingface_hub/utils/_runtime.py +1 -10
  39. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/METADATA +12 -12
  40. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/RECORD +44 -44
  41. huggingface_hub/inference/_templating.py +0 -102
  42. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/LICENSE +0 -0
  43. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/WHEEL +0 -0
  44. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/entry_points.txt +0 -0
  45. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/top_level.txt +0 -0
@@ -24,18 +24,7 @@ import logging
  import re
  import time
  import warnings
- from typing import (
- TYPE_CHECKING,
- Any,
- AsyncIterable,
- Dict,
- List,
- Literal,
- Optional,
- Set,
- Union,
- overload,
- )
+ from typing import TYPE_CHECKING, Any, AsyncIterable, Dict, List, Literal, Optional, Set, Union, overload

  from requests.structures import CaseInsensitiveDict

@@ -56,16 +45,18 @@ from huggingface_hub.inference._common import (
  _get_unsupported_text_generation_kwargs,
  _import_numpy,
  _open_as_binary,
+ _prepare_payload,
  _set_unsupported_text_generation_kwargs,
  raise_text_generation_error,
  )
  from huggingface_hub.inference._generated.types import (
  AudioClassificationOutputElement,
+ AudioClassificationOutputTransform,
  AudioToAudioOutputElement,
  AutomaticSpeechRecognitionOutput,
  ChatCompletionInputGrammarType,
- ChatCompletionInputTool,
- ChatCompletionInputToolTypeClass,
+ ChatCompletionInputStreamOptions,
+ ChatCompletionInputToolType,
  ChatCompletionOutput,
  ChatCompletionStreamOutput,
  DocumentQuestionAnsweringOutputElement,
@@ -78,19 +69,21 @@ from huggingface_hub.inference._generated.types import (
  SummarizationOutput,
  TableQuestionAnsweringOutputElement,
  TextClassificationOutputElement,
+ TextClassificationOutputTransform,
  TextGenerationInputGrammarType,
  TextGenerationOutput,
  TextGenerationStreamOutput,
+ TextToImageTargetSize,
+ TextToSpeechEarlyStoppingEnum,
  TokenClassificationOutputElement,
+ ToolElement,
  TranslationOutput,
  VisualQuestionAnsweringOutputElement,
  ZeroShotClassificationOutputElement,
  ZeroShotImageClassificationOutputElement,
  )
- from huggingface_hub.utils import (
- build_hf_headers,
- )
- from huggingface_hub.utils._deprecation import _deprecate_positional_args
+ from huggingface_hub.utils import build_hf_headers
+ from huggingface_hub.utils._deprecation import _deprecate_arguments

  from .._common import _async_yield_from, _import_aiohttp

@@ -147,7 +140,6 @@ class AsyncInferenceClient:
  follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
  """

- @_deprecate_positional_args(version="0.26")
  def __init__(
  self,
  model: Optional[str] = None,
@@ -365,6 +357,8 @@ class AsyncInferenceClient:
  audio: ContentT,
  *,
  model: Optional[str] = None,
+ top_k: Optional[int] = None,
+ function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
  ) -> List[AudioClassificationOutputElement]:
  """
  Perform audio classification on the provided audio content.
@@ -377,6 +371,10 @@ class AsyncInferenceClient:
  The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
  or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
  audio classification will be used.
+ top_k (`int`, *optional*):
+ When specified, limits the output to the top K most probable classes.
+ function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
+ The function to apply to the output.

  Returns:
  `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -400,7 +398,9 @@ class AsyncInferenceClient:
  ]
  ```
  """
- response = await self.post(data=audio, model=model, task="audio-classification")
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
+ payload = _prepare_payload(audio, parameters=parameters, expect_binary=True)
+ response = await self.post(**payload, model=model, task="audio-classification")
  return AudioClassificationOutputElement.parse_obj_as_list(response)

  async def audio_to_audio(
@@ -487,7 +487,7 @@ class AsyncInferenceClient:
  @overload
  async def chat_completion( # type: ignore
  self,
- messages: List[Dict[str, str]],
+ messages: List[Dict],
  *,
  model: Optional[str] = None,
  stream: Literal[False] = False,
@@ -500,10 +500,11 @@ class AsyncInferenceClient:
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[List[ToolElement]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
  ) -> ChatCompletionOutput: ...
@@ -511,7 +512,7 @@ class AsyncInferenceClient:
  @overload
  async def chat_completion( # type: ignore
  self,
- messages: List[Dict[str, str]],
+ messages: List[Dict],
  *,
  model: Optional[str] = None,
  stream: Literal[True] = True,
@@ -524,10 +525,11 @@ class AsyncInferenceClient:
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[List[ToolElement]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
  ) -> AsyncIterable[ChatCompletionStreamOutput]: ...
@@ -535,7 +537,7 @@ class AsyncInferenceClient:
  @overload
  async def chat_completion(
  self,
- messages: List[Dict[str, str]],
+ messages: List[Dict],
  *,
  model: Optional[str] = None,
  stream: bool = False,
@@ -548,17 +550,18 @@ class AsyncInferenceClient:
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[List[ToolElement]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
  ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ...

  async def chat_completion(
  self,
- messages: List[Dict[str, str]],
+ messages: List[Dict],
  *,
  model: Optional[str] = None,
  stream: bool = False,
@@ -572,10 +575,11 @@ class AsyncInferenceClient:
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[List[ToolElement]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
  ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
@@ -592,7 +596,7 @@ class AsyncInferenceClient:
  </Tip>

  Args:
- messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]):
+ messages (List of [`ChatCompletionInputMessage`]):
  Conversation history consisting of roles and content pairs.
  model (`str`, *optional*):
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
@@ -629,6 +633,8 @@ class AsyncInferenceClient:
  Defaults to None.
  stream (`bool`, *optional*):
  Enable realtime streaming of responses. Defaults to False.
+ stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
+ Options for streaming completions.
  temperature (`float`, *optional*):
  Controls randomness of the generations. Lower values ensure
  less random completions. Range: [0, 2]. Defaults to 1.0.
@@ -639,11 +645,11 @@ class AsyncInferenceClient:
  top_p (`float`, *optional*):
  Fraction of the most likely next words to sample from.
  Must be between 0 and 1. Defaults to 1.0.
- tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
+ tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*):
  The tool to use for the completion. Defaults to "auto".
  tool_prompt (`str`, *optional*):
  A prompt to be appended before the tools.
- tools (List of [`ChatCompletionInputTool`], *optional*):
+ tools (List of [`ToolElement`], *optional*):
  A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
  provide a list of functions the model may generate JSON inputs for.

@@ -694,7 +700,7 @@ class AsyncInferenceClient:
  )
  ```

- Example (stream=True):
+ Example using streaming:
  ```py
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
@@ -734,6 +740,41 @@ class AsyncInferenceClient:
  print(chunk.choices[0].delta.content)
  ```

+ Example using Image + Text as input:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+
+ # provide a remote URL
+ >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+ # or a base64-encoded image
+ >>> image_path = "/path/to/image.jpeg"
+ >>> with open(image_path, "rb") as f:
+ ... base64_image = base64.b64encode(f.read()).decode("utf-8")
+ >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+ >>> client = AsyncInferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
+ >>> output = await client.chat.completions.create(
+ ... messages=[
+ ... {
+ ... "role": "user",
+ ... "content": [
+ ... {
+ ... "type": "image_url",
+ ... "image_url": {"url": image_url},
+ ... },
+ ... {
+ ... "type": "text",
+ ... "text": "Describe this image in one sentence.",
+ ... },
+ ... ],
+ ... },
+ ... ],
+ ... )
+ >>> output
+ The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.
+ ```
+
  Example using tools:
  ```py
  # Must be run in an async context
@@ -877,6 +918,7 @@ class AsyncInferenceClient:
  top_logprobs=top_logprobs,
  top_p=top_p,
  stream=stream,
+ stream_options=stream_options,
  )
  payload = {key: value for key, value in payload.items() if value is not None}
  data = await self.post(model=model_url, json=payload, stream=stream)
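The hunks above wire the new `stream_options` argument through `chat_completion`. A minimal usage sketch, assuming `ChatCompletionInputStreamOptions` exposes an `include_usage` flag (the flag, model ID, and prompt are illustrative, not taken from this diff):

```py
# Hedged sketch: pass stream_options when streaming a chat completion (0.26 API).
import asyncio

from huggingface_hub import AsyncInferenceClient
from huggingface_hub.inference._generated.types import ChatCompletionInputStreamOptions


async def main() -> None:
    client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")  # illustrative model ID
    stream = await client.chat_completion(
        messages=[{"role": "user", "content": "Say hello in one word."}],
        stream=True,
        stream_options=ChatCompletionInputStreamOptions(include_usage=True),  # assumed field
    )
    async for chunk in stream:
        if chunk.choices:  # a final usage-only chunk may carry no choices
            print(chunk.choices[0].delta.content or "", end="")


asyncio.run(main())
```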
@@ -917,6 +959,14 @@ class AsyncInferenceClient:
  question: str,
  *,
  model: Optional[str] = None,
+ doc_stride: Optional[int] = None,
+ handle_impossible_answer: Optional[bool] = None,
+ lang: Optional[str] = None,
+ max_answer_len: Optional[int] = None,
+ max_question_len: Optional[int] = None,
+ max_seq_len: Optional[int] = None,
+ top_k: Optional[int] = None,
+ word_boxes: Optional[List[Union[List[float], str]]] = None,
  ) -> List[DocumentQuestionAnsweringOutputElement]:
  """
  Answer questions on document images.
@@ -930,7 +980,29 @@ class AsyncInferenceClient:
  The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
  Defaults to None.
-
+ doc_stride (`int`, *optional*):
+ If the words in the document are too long to fit with the question for the model, it will
+ be split in several chunks with some overlap. This argument controls the size of that
+ overlap.
+ handle_impossible_answer (`bool`, *optional*):
+ Whether to accept impossible as an answer.
+ lang (`str`, *optional*):
+ Language to use while running OCR.
+ max_answer_len (`int`, *optional*):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
+ considered).
+ max_question_len (`int`, *optional*):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ max_seq_len (`int`, *optional*):
+ The maximum length of the total sentence (context + question) in tokens of each chunk
+ passed to the model. The context will be split in several chunks (using doc_stride as
+ overlap) if needed.
+ top_k (`int`, *optional*):
+ The number of answers to return (will be chosen by order of likelihood). Can return less
+ than top_k answers if there are not enough options available within the context.
+ word_boxes (`List[Union[List[float], str]]`, *optional*):
+ A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+ skip the OCR step and use the provided bounding boxes instead.
  Returns:
  `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.

@@ -940,17 +1012,29 @@ class AsyncInferenceClient:
  `aiohttp.ClientResponseError`:
  If the request fails with an HTTP error status code other than HTTP 503.

+
  Example:
  ```py
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
  >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
- [DocumentQuestionAnsweringOutputElement(score=0.42515629529953003, answer='us-001', start=16, end=16)]
+ [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
  ```
  """
- payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
- response = await self.post(json=payload, model=model, task="document-question-answering")
+ inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+ parameters = {
+ "doc_stride": doc_stride,
+ "handle_impossible_answer": handle_impossible_answer,
+ "lang": lang,
+ "max_answer_len": max_answer_len,
+ "max_question_len": max_question_len,
+ "max_seq_len": max_seq_len,
+ "top_k": top_k,
+ "word_boxes": word_boxes,
+ }
+ payload = _prepare_payload(inputs, parameters=parameters)
+ response = await self.post(**payload, model=model, task="document-question-answering")
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

  async def feature_extraction(
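The `document_question_answering` hunks above add OCR and decoding controls. A hedged call sketch reusing the invoice image from the docstring example (parameter values are illustrative):

```py
# Hedged sketch: new document-question-answering keyword arguments in 0.26.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    answers = await client.document_question_answering(
        image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
        question="What is the invoice number?",
        lang="en",          # language hint for the OCR step
        top_k=3,            # ask for up to three candidate answers
        max_answer_len=20,  # ignore overly long answers
    )
    for answer in answers:
        print(answer.answer, answer.score)


asyncio.run(main())
```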
@@ -974,7 +1058,7 @@ class AsyncInferenceClient:
  a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
  Defaults to None.
  normalize (`bool`, *optional*):
- Whether to normalize the embeddings or not. Defaults to None.
+ Whether to normalize the embeddings or not.
  Only available on server powered by Text-Embedding-Inference.
  prompt_name (`str`, *optional*):
  The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
@@ -983,7 +1067,7 @@ class AsyncInferenceClient:
  then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
  because the prompt text will be prepended before any text to encode.
  truncate (`bool`, *optional*):
- Whether to truncate the embeddings or not. Defaults to None.
+ Whether to truncate the embeddings or not.
  Only available on server powered by Text-Embedding-Inference.
  truncation_direction (`Literal["Left", "Right"]`, *optional*):
  Which side of the input should be truncated when `truncate=True` is passed.
@@ -1009,20 +1093,25 @@ class AsyncInferenceClient:
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
  ```
  """
- payload: Dict = {"inputs": text}
- if normalize is not None:
- payload["normalize"] = normalize
- if prompt_name is not None:
- payload["prompt_name"] = prompt_name
- if truncate is not None:
- payload["truncate"] = truncate
- if truncation_direction is not None:
- payload["truncation_direction"] = truncation_direction
- response = await self.post(json=payload, model=model, task="feature-extraction")
+ parameters = {
+ "normalize": normalize,
+ "prompt_name": prompt_name,
+ "truncate": truncate,
+ "truncation_direction": truncation_direction,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(**payload, model=model, task="feature-extraction")
  np = _import_numpy()
  return np.array(_bytes_to_dict(response), dtype="float32")

- async def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutputElement]:
+ async def fill_mask(
+ self,
+ text: str,
+ *,
+ model: Optional[str] = None,
+ targets: Optional[List[str]] = None,
+ top_k: Optional[int] = None,
+ ) -> List[FillMaskOutputElement]:
  """
  Fill in a hole with a missing word (token to be precise).

@@ -1032,8 +1121,13 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
- Defaults to None.
-
+ targets (`List[str]`, *optional*):
+ When passed, the model will limit the scores to the passed targets instead of looking up
+ in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+ tokenized and the first resulting token will be used (with a warning, and that might be
+ slower).
+ top_k (`int`, *optional*):
+ When passed, overrides the number of predictions to return.
  Returns:
  `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
  probability, token reference, and completed text.
@@ -1056,7 +1150,9 @@ class AsyncInferenceClient:
  ]
  ```
  """
- response = await self.post(json={"inputs": text}, model=model, task="fill-mask")
+ parameters = {"targets": targets, "top_k": top_k}
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(**payload, model=model, task="fill-mask")
  return FillMaskOutputElement.parse_obj_as_list(response)

  async def image_classification(
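The `fill_mask` hunks above add `targets` and `top_k`. A hedged sketch (the mask token, targets, and output field names depend on the chosen model and are illustrative):

```py
# Hedged sketch: restrict fill-mask scoring to explicit target words (0.26 API).
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    results = await client.fill_mask(
        "The goal of life is <mask>.",     # mask token is model-specific
        targets=["happiness", "success"],  # only these candidates are scored
        top_k=2,
    )
    for item in results:
        print(item.sequence, item.score)   # completed text and its probability


asyncio.run(main())
```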
@@ -1064,6 +1160,8 @@ class AsyncInferenceClient:
  image: ContentT,
  *,
  model: Optional[str] = None,
+ function_to_apply: Optional[Literal["sigmoid", "softmax", "none"]] = None,
+ top_k: Optional[int] = None,
  ) -> List[ImageClassificationOutputElement]:
  """
  Perform image classification on the given image using the specified model.
@@ -1074,7 +1172,10 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
  deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
-
+ function_to_apply (`Literal["sigmoid", "softmax", "none"]`, *optional*):
+ The function to apply to the output scores.
+ top_k (`int`, *optional*):
+ When specified, limits the output to the top K most probable classes.
  Returns:
  `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.

@@ -1090,10 +1191,12 @@ class AsyncInferenceClient:
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
  >>> await client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
- [ImageClassificationOutputElement(score=0.9779096841812134, label='Blenheim spaniel'), ...]
+ [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
  ```
  """
- response = await self.post(data=image, model=model, task="image-classification")
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+ response = await self.post(**payload, model=model, task="image-classification")
  return ImageClassificationOutputElement.parse_obj_as_list(response)

  async def image_segmentation(
@@ -1101,6 +1204,10 @@ class AsyncInferenceClient:
  image: ContentT,
  *,
  model: Optional[str] = None,
+ mask_threshold: Optional[float] = None,
+ overlap_mask_area_threshold: Optional[float] = None,
+ subtask: Optional[Literal["instance", "panoptic", "semantic"]] = None,
+ threshold: Optional[float] = None,
  ) -> List[ImageSegmentationOutputElement]:
  """
  Perform image segmentation on the given image using the specified model.
@@ -1117,7 +1224,14 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
  deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
-
+ mask_threshold (`float`, *optional*):
+ Threshold to use when turning the predicted masks into binary values.
+ overlap_mask_area_threshold (`float`, *optional*):
+ Mask overlap threshold to eliminate small, disconnected segments.
+ subtask (`Literal["instance", "panoptic", "semantic"]`, *optional*):
+ Segmentation task to be performed, depending on model capabilities.
+ threshold (`float`, *optional*):
+ Probability threshold to filter out predicted masks.
  Returns:
  `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.

@@ -1132,14 +1246,21 @@ class AsyncInferenceClient:
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
- >>> await client.image_segmentation("cat.jpg"):
+ >>> await client.image_segmentation("cat.jpg")
  [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
  ```
  """
- response = await self.post(data=image, model=model, task="image-segmentation")
+ parameters = {
+ "mask_threshold": mask_threshold,
+ "overlap_mask_area_threshold": overlap_mask_area_threshold,
+ "subtask": subtask,
+ "threshold": threshold,
+ }
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+ response = await self.post(**payload, model=model, task="image-segmentation")
  output = ImageSegmentationOutputElement.parse_obj_as_list(response)
  for item in output:
- item.mask = _b64_to_image(item.mask)
+ item.mask = _b64_to_image(item.mask) # type: ignore [assignment]
  return output

  async def image_to_image(
@@ -1212,19 +1333,8 @@ class AsyncInferenceClient:
  "guidance_scale": guidance_scale,
  **kwargs,
  }
- if all(parameter is None for parameter in parameters.values()):
- # Either only an image to send => send as raw bytes
- data = image
- payload: Optional[Dict[str, Any]] = None
- else:
- # Or an image + some parameters => use base64 encoding
- data = None
- payload = {"inputs": _b64_encode(image)}
- for key, value in parameters.items():
- if value is not None:
- payload.setdefault("parameters", {})[key] = value
-
- response = await self.post(json=payload, data=data, model=model, task="image-to-image")
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+ response = await self.post(**payload, model=model, task="image-to-image")
  return _bytes_to_image(response)

  async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
@@ -1355,10 +1465,7 @@ class AsyncInferenceClient:
  return models_by_task

  async def object_detection(
- self,
- image: ContentT,
- *,
- model: Optional[str] = None,
+ self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
  ) -> List[ObjectDetectionOutputElement]:
  """
  Perform object detection on the given image using the specified model.
@@ -1375,7 +1482,8 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
  deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
-
+ threshold (`float`, *optional*):
+ The probability necessary to make a prediction.
  Returns:
  `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.

@@ -1392,17 +1500,31 @@ class AsyncInferenceClient:
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
- >>> await client.object_detection("people.jpg"):
+ >>> await client.object_detection("people.jpg")
  [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
  ```
  """
- # detect objects
- response = await self.post(data=image, model=model, task="object-detection")
+ parameters = {
+ "threshold": threshold,
+ }
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+ response = await self.post(**payload, model=model, task="object-detection")
  return ObjectDetectionOutputElement.parse_obj_as_list(response)

  async def question_answering(
- self, question: str, context: str, *, model: Optional[str] = None
- ) -> QuestionAnsweringOutputElement:
+ self,
+ question: str,
+ context: str,
+ *,
+ model: Optional[str] = None,
+ align_to_words: Optional[bool] = None,
+ doc_stride: Optional[int] = None,
+ handle_impossible_answer: Optional[bool] = None,
+ max_answer_len: Optional[int] = None,
+ max_question_len: Optional[int] = None,
+ max_seq_len: Optional[int] = None,
+ top_k: Optional[int] = None,
+ ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
  """
  Retrieve the answer to a question from a given text.

@@ -1414,10 +1536,31 @@ class AsyncInferenceClient:
  model (`str`):
  The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint.
-
+ align_to_words (`bool`, *optional*):
+ Attempts to align the answer to real words. Improves quality on space separated
+ languages. Might hurt on non-space-separated languages (like Japanese or Chinese).
+ doc_stride (`int`, *optional*):
+ If the context is too long to fit with the question for the model, it will be split in
+ several chunks with some overlap. This argument controls the size of that overlap.
+ handle_impossible_answer (`bool`, *optional*):
+ Whether to accept impossible as an answer.
+ max_answer_len (`int`, *optional*):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
+ considered).
+ max_question_len (`int`, *optional*):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ max_seq_len (`int`, *optional*):
+ The maximum length of the total sentence (context + question) in tokens of each chunk
+ passed to the model. The context will be split in several chunks (using docStride as
+ overlap) if needed.
+ top_k (`int`, *optional*):
+ The number of answers to return (will be chosen by order of likelihood). Note that we
+ return less than topk answers if there are not enough options available within the
+ context.
  Returns:
- [`QuestionAnsweringOutputElement`]: an question answering output containing the score, start index, end index, and answer.
-
+ Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
+ When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
+ When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
  Raises:
  [`InferenceTimeoutError`]:
  If the model is unavailable or the request times out.
@@ -1430,17 +1573,28 @@ class AsyncInferenceClient:
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
  >>> await client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
- QuestionAnsweringOutputElement(score=0.9326562285423279, start=11, end=16, answer='Clara')
+ QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
  ```
  """
-
- payload: Dict[str, Any] = {"question": question, "context": context}
+ parameters = {
+ "align_to_words": align_to_words,
+ "doc_stride": doc_stride,
+ "handle_impossible_answer": handle_impossible_answer,
+ "max_answer_len": max_answer_len,
+ "max_question_len": max_question_len,
+ "max_seq_len": max_seq_len,
+ "top_k": top_k,
+ }
+ inputs: Dict[str, Any] = {"question": question, "context": context}
+ payload = _prepare_payload(inputs, parameters=parameters)
  response = await self.post(
- json=payload,
+ **payload,
  model=model,
  task="question-answering",
  )
- return QuestionAnsweringOutputElement.parse_obj_as_instance(response)
+ # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
+ output = QuestionAnsweringOutputElement.parse_obj(response)
+ return output

  async def sentence_similarity(
  self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
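Because `question_answering` now returns a list when `top_k` is greater than 1 (see the hunk above), callers should be ready for either shape. A hedged sketch:

```py
# Hedged sketch: handle both return shapes of question_answering in 0.26.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    result = await client.question_answering(
        question="What's my name?",
        context="My name is Clara and I live in Berkeley.",
        top_k=2,  # > 1 returns a list of QuestionAnsweringOutputElement
    )
    # Normalize to a list so the same code handles both return shapes.
    for item in result if isinstance(result, list) else [result]:
        print(item.answer, item.score)


asyncio.run(main())
```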
@@ -1490,12 +1644,23 @@ class AsyncInferenceClient:
  )
  return _bytes_to_list(response)

+ @_deprecate_arguments(
+ version="0.29",
+ deprecated_args=["parameters"],
+ custom_message=(
+ "The `parameters` argument is deprecated and will be removed in a future version. "
+ "Provide individual parameters instead: `clean_up_tokenization_spaces`, `generate_parameters`, and `truncation`."
+ ),
+ )
  async def summarization(
  self,
  text: str,
  *,
  parameters: Optional[Dict[str, Any]] = None,
  model: Optional[str] = None,
+ clean_up_tokenization_spaces: Optional[bool] = None,
+ generate_parameters: Optional[Dict[str, Any]] = None,
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
  ) -> SummarizationOutput:
  """
  Generate a summary of a given text using a specified model.
@@ -1508,8 +1673,13 @@ class AsyncInferenceClient:
  for more details.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
-
+ Inference Endpoint. If not provided, the default recommended model for summarization will be used.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether to clean up the potential extra spaces in the text output.
+ generate_parameters (`Dict[str, Any]`, *optional*):
+ Additional parametrization of the text generation algorithm.
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
+ The truncation strategy to use.
  Returns:
  [`SummarizationOutput`]: The generated summary text.

@@ -1528,14 +1698,23 @@ class AsyncInferenceClient:
  SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
  ```
  """
- payload: Dict[str, Any] = {"inputs": text}
- if parameters is not None:
- payload["parameters"] = parameters
- response = await self.post(json=payload, model=model, task="summarization")
+ if parameters is None:
+ parameters = {
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+ "generate_parameters": generate_parameters,
+ "truncation": truncation,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(**payload, model=model, task="summarization")
  return SummarizationOutput.parse_obj_as_list(response)[0]

  async def table_question_answering(
- self, table: Dict[str, Any], query: str, *, model: Optional[str] = None
+ self,
+ table: Dict[str, Any],
+ query: str,
+ *,
+ model: Optional[str] = None,
+ parameters: Optional[Dict[str, Any]] = None,
  ) -> TableQuestionAnsweringOutputElement:
  """
  Retrieve the answer to a question from information given in a table.
@@ -1549,6 +1728,8 @@ class AsyncInferenceClient:
  model (`str`):
  The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
  Hub or a URL to a deployed Inference Endpoint.
+ parameters (`Dict[str, Any]`, *optional*):
+ Additional inference parameters. Defaults to None.

  Returns:
  [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1570,11 +1751,13 @@ class AsyncInferenceClient:
  TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
  ```
  """
+ inputs = {
+ "query": query,
+ "table": table,
+ }
+ payload = _prepare_payload(inputs, parameters=parameters)
  response = await self.post(
- json={
- "query": query,
- "table": table,
- },
+ **payload,
  model=model,
  task="table-question-answering",
  )
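The `@_deprecate_arguments` decorator above schedules the `parameters` dict of `summarization` for removal in 0.29. A hedged migration sketch from the old call style to the new explicit keyword arguments:

```py
# Hedged sketch: migrate summarization() away from the deprecated `parameters` dict.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()

    # 0.25.x style (still accepted, but now emits a deprecation warning):
    # await client.summarization(text, parameters={"truncation": "longest_first"})

    # 0.26+ style: pass the options directly.
    summary = await client.summarization(
        "The Eiffel tower is one of the most famous landmarks in the world ...",
        truncation="longest_first",
        clean_up_tokenization_spaces=True,
    )
    print(summary.generated_text)


asyncio.run(main())
```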
@@ -1623,7 +1806,11 @@ class AsyncInferenceClient:
  ["5", "5", "5"]
  ```
  """
- response = await self.post(json={"table": table}, model=model, task="tabular-classification")
+ response = await self.post(
+ json={"table": table},
+ model=model,
+ task="tabular-classification",
+ )
  return _bytes_to_list(response)

  async def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
@@ -1668,7 +1855,12 @@ class AsyncInferenceClient:
  return _bytes_to_list(response)

  async def text_classification(
- self, text: str, *, model: Optional[str] = None
+ self,
+ text: str,
+ *,
+ model: Optional[str] = None,
+ top_k: Optional[int] = None,
+ function_to_apply: Optional["TextClassificationOutputTransform"] = None,
  ) -> List[TextClassificationOutputElement]:
  """
  Perform text classification (e.g. sentiment-analysis) on the given text.
@@ -1680,6 +1872,10 @@ class AsyncInferenceClient:
  The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
  Defaults to None.
+ top_k (`int`, *optional*):
+ When specified, limits the output to the top K most probable classes.
+ function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
+ The function to apply to the output.

  Returns:
  `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -1702,7 +1898,16 @@ class AsyncInferenceClient:
  ]
  ```
  """
- response = await self.post(json={"inputs": text}, model=model, task="text-classification")
+ parameters = {
+ "function_to_apply": function_to_apply,
+ "top_k": top_k,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(
+ **payload,
+ model=model,
+ task="text-classification",
+ )
  return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]

  @overload
@@ -2212,6 +2417,9 @@ class AsyncInferenceClient:
  num_inference_steps: Optional[float] = None,
  guidance_scale: Optional[float] = None,
  model: Optional[str] = None,
+ scheduler: Optional[str] = None,
+ target_size: Optional[TextToImageTargetSize] = None,
+ seed: Optional[int] = None,
  **kwargs,
  ) -> "Image":
  """
@@ -2240,7 +2448,14 @@ class AsyncInferenceClient:
  usually at the expense of lower image quality.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+ Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
+ Defaults to None.
+ scheduler (`str`, *optional*):
+ Override the scheduler with a compatible one.
+ target_size (`TextToImageTargetSize`, *optional*):
+ The size in pixel of the output image
+ seed (`int`, *optional*):
+ Seed for the random number generator.

  Returns:
  `Image`: The generated image.
@@ -2268,22 +2483,44 @@ class AsyncInferenceClient:
  >>> image.save("better_astronaut.png")
  ```
  """
- payload = {"inputs": prompt}
+
  parameters = {
  "negative_prompt": negative_prompt,
  "height": height,
  "width": width,
  "num_inference_steps": num_inference_steps,
  "guidance_scale": guidance_scale,
+ "scheduler": scheduler,
+ "target_size": target_size,
+ "seed": seed,
  **kwargs,
  }
- for key, value in parameters.items():
- if value is not None:
- payload.setdefault("parameters", {})[key] = value # type: ignore
- response = await self.post(json=payload, model=model, task="text-to-image")
+ payload = _prepare_payload(prompt, parameters=parameters)
+ response = await self.post(**payload, model=model, task="text-to-image")
  return _bytes_to_image(response)

- async def text_to_speech(self, text: str, *, model: Optional[str] = None) -> bytes:
+ async def text_to_speech(
+ self,
+ text: str,
+ *,
+ model: Optional[str] = None,
+ do_sample: Optional[bool] = None,
+ early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+ epsilon_cutoff: Optional[float] = None,
+ eta_cutoff: Optional[float] = None,
+ max_length: Optional[int] = None,
+ max_new_tokens: Optional[int] = None,
+ min_length: Optional[int] = None,
+ min_new_tokens: Optional[int] = None,
+ num_beam_groups: Optional[int] = None,
+ num_beams: Optional[int] = None,
+ penalty_alpha: Optional[float] = None,
+ temperature: Optional[float] = None,
+ top_k: Optional[int] = None,
+ top_p: Optional[float] = None,
+ typical_p: Optional[float] = None,
+ use_cache: Optional[bool] = None,
+ ) -> bytes:
  """
  Synthesize an audio of a voice pronouncing a given text.

@@ -2292,7 +2529,56 @@ class AsyncInferenceClient:
  The text to synthesize.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+ Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
+ Defaults to None.
+ do_sample (`bool`, *optional*):
+ Whether to use sampling instead of greedy decoding when generating new tokens.
+ early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"`, *optional*):
+ Controls the stopping condition for beam-based methods.
+ epsilon_cutoff (`float`, *optional*):
+ If set to float strictly between 0 and 1, only tokens with a conditional probability
+ greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+ 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+ Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+ eta_cutoff (`float`, *optional*):
+ Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+ float strictly between 0 and 1, a token is only considered if it is greater than either
+ eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+ term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+ the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+ See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+ for more details.
+ max_length (`int`, *optional*):
+ The maximum length (in tokens) of the generated text, including the input.
+ max_new_tokens (`int`, *optional*):
+ The maximum number of tokens to generate. Takes precedence over maxLength.
+ min_length (`int`, *optional*):
+ The minimum length (in tokens) of the generated text, including the input.
+ min_new_tokens (`int`, *optional*):
+ The minimum number of tokens to generate. Takes precedence over maxLength.
+ num_beam_groups (`int`, *optional*):
+ Number of groups to divide num_beams into in order to ensure diversity among different
+ groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+ num_beams (`int`, *optional*):
+ Number of beams to use for beam search.
+ penalty_alpha (`float`, *optional*):
+ The value balances the model confidence and the degeneration penalty in contrastive
+ search decoding.
+ temperature (`float`, *optional*):
+ The value used to modulate the next token probabilities.
+ top_k (`int`, *optional*):
+ The number of highest probability vocabulary tokens to keep for top-k-filtering.
+ top_p (`float`, *optional*):
+ If set to float < 1, only the smallest set of most probable tokens with probabilities
+ that add up to top_p or higher are kept for generation.
+ typical_p (`float`, *optional*):
+ Local typicality measures how similar the conditional probability of predicting a target token next is
+ to the expected conditional probability of predicting a random token next, given the partial text
+ already generated. If set to float < 1, the smallest set of the most locally typical tokens with
+ probabilities that add up to typical_p or higher are kept for generation. See [this
+ paper](https://hf.co/papers/2202.00666) for more details.
+ use_cache (`bool`, *optional*):
+ Whether the model should use the past last key/values attentions to speed up decoding

  Returns:
  `bytes`: The generated audio.
@@ -2314,10 +2600,36 @@ class AsyncInferenceClient:
  >>> Path("hello_world.flac").write_bytes(audio)
  ```
  """
- return await self.post(json={"inputs": text}, model=model, task="text-to-speech")
+ parameters = {
+ "do_sample": do_sample,
+ "early_stopping": early_stopping,
+ "epsilon_cutoff": epsilon_cutoff,
+ "eta_cutoff": eta_cutoff,
+ "max_length": max_length,
+ "max_new_tokens": max_new_tokens,
+ "min_length": min_length,
+ "min_new_tokens": min_new_tokens,
+ "num_beam_groups": num_beam_groups,
+ "num_beams": num_beams,
+ "penalty_alpha": penalty_alpha,
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ "typical_p": typical_p,
+ "use_cache": use_cache,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(**payload, model=model, task="text-to-speech")
+ return response

  async def token_classification(
- self, text: str, *, model: Optional[str] = None
+ self,
+ text: str,
+ *,
+ model: Optional[str] = None,
+ aggregation_strategy: Optional[Literal["none", "simple", "first", "average", "max"]] = None,
+ ignore_labels: Optional[List[str]] = None,
+ stride: Optional[int] = None,
  ) -> List[TokenClassificationOutputElement]:
  """
  Perform token classification on the given text.
@@ -2330,6 +2642,12 @@ class AsyncInferenceClient:
  The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
  Defaults to None.
+ aggregation_strategy (`Literal["none", "simple", "first", "average", "max"]`, *optional*):
+ The strategy used to fuse tokens based on model predictions.
+ ignore_labels (`List[str]`, *optional*):
+ A list of labels to ignore.
+ stride (`int`, *optional*):
+ The number of overlapping tokens between chunks when splitting the input text.

  Returns:
  `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
@@ -2364,16 +2682,30 @@ class AsyncInferenceClient:
  ]
  ```
  """
- payload: Dict[str, Any] = {"inputs": text}
+
+ parameters = {
+ "aggregation_strategy": aggregation_strategy,
+ "ignore_labels": ignore_labels,
+ "stride": stride,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
  response = await self.post(
- json=payload,
+ **payload,
  model=model,
  task="token-classification",
  )
  return TokenClassificationOutputElement.parse_obj_as_list(response)

  async def translation(
- self, text: str, *, model: Optional[str] = None, src_lang: Optional[str] = None, tgt_lang: Optional[str] = None
+ self,
+ text: str,
+ *,
+ model: Optional[str] = None,
+ src_lang: Optional[str] = None,
+ tgt_lang: Optional[str] = None,
+ clean_up_tokenization_spaces: Optional[bool] = None,
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
+ generate_parameters: Optional[Dict[str, Any]] = None,
  ) -> TranslationOutput:
  """
  Convert text from one language to another.
@@ -2382,7 +2714,6 @@ class AsyncInferenceClient:
  your specific use case. Source and target languages usually depend on the model.
  However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
  you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
- You can find this information in the model card.

  Args:
  text (`str`):
@@ -2392,9 +2723,15 @@ class AsyncInferenceClient:
  a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
  Defaults to None.
  src_lang (`str`, *optional*):
- Source language of the translation task, i.e. input language. Cannot be passed without `tgt_lang`.
+ The source language of the text. Required for models that can translate from multiple languages.
  tgt_lang (`str`, *optional*):
- Target language of the translation task, i.e. output language. Cannot be passed without `src_lang`.
+ Target language to translate to. Required for models that can translate to multiple languages.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether to clean up the potential extra spaces in the text output.
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
+ The truncation strategy to use.
+ generate_parameters (`Dict[str, Any]`, *optional*):
+ Additional parametrization of the text generation algorithm.

  Returns:
  [`TranslationOutput`]: The generated translated text.
@@ -2430,12 +2767,15 @@ class AsyncInferenceClient:

  if src_lang is None and tgt_lang is not None:
  raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
-
- # If both `src_lang` and `tgt_lang` are given, pass them to the request body
- payload: Dict = {"inputs": text}
- if src_lang and tgt_lang:
- payload["parameters"] = {"src_lang": src_lang, "tgt_lang": tgt_lang}
- response = await self.post(json=payload, model=model, task="translation")
+ parameters = {
+ "src_lang": src_lang,
+ "tgt_lang": tgt_lang,
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+ "truncation": truncation,
+ "generate_parameters": generate_parameters,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
+ response = await self.post(**payload, model=model, task="translation")
  return TranslationOutput.parse_obj_as_list(response)[0]

  async def visual_question_answering(
@@ -2444,6 +2784,7 @@ class AsyncInferenceClient:
  question: str,
  *,
  model: Optional[str] = None,
+ top_k: Optional[int] = None,
  ) -> List[VisualQuestionAnsweringOutputElement]:
  """
  Answering open-ended questions based on an image.
@@ -2457,7 +2798,10 @@ class AsyncInferenceClient:
  The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
  Defaults to None.
-
+ top_k (`int`, *optional*):
+ The number of answers to return (will be chosen by order of likelihood). Note that we
+ return less than topk answers if there are not enough options available within the
+ context.
  Returns:
  `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.

@@ -2483,6 +2827,8 @@ class AsyncInferenceClient:
  ```
  """
  payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+ if top_k is not None:
+ payload.setdefault("parameters", {})["top_k"] = top_k
  response = await self.post(json=payload, model=model, task="visual-question-answering")
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

@@ -2513,7 +2859,7 @@ class AsyncInferenceClient:
  The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.

  Returns:
  `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -2573,15 +2919,14 @@ class AsyncInferenceClient:
  ```
  """

- parameters = {"candidate_labels": labels, "multi_label": multi_label}
- if hypothesis_template is not None:
- parameters["hypothesis_template"] = hypothesis_template
-
+ parameters = {
+ "candidate_labels": labels,
+ "multi_label": multi_label,
+ "hypothesis_template": hypothesis_template,
+ }
+ payload = _prepare_payload(text, parameters=parameters)
  response = await self.post(
- json={
- "inputs": text,
- "parameters": parameters,
- },
+ **payload,
  task="zero-shot-classification",
  model=model,
  )
@@ -2592,7 +2937,12 @@ class AsyncInferenceClient:
  ]

  async def zero_shot_image_classification(
- self, image: ContentT, labels: List[str], *, model: Optional[str] = None
+ self,
+ image: ContentT,
+ labels: List[str],
+ *,
+ model: Optional[str] = None,
+ hypothesis_template: Optional[str] = None,
  ) -> List[ZeroShotImageClassificationOutputElement]:
  """
  Provide input image and text labels to predict text labels for the image.
@@ -2604,8 +2954,10 @@ class AsyncInferenceClient:
  List of string possible labels. There must be at least 2 labels.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
-
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
+ hypothesis_template (`str`, *optional*):
+ The sentence used in conjunction with `labels` to attempt the text classification by replacing the
+ placeholder with the candidate labels.
  Returns:
  `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

@@ -2632,8 +2984,11 @@ class AsyncInferenceClient:
  if len(labels) < 2:
  raise ValueError("You must specify at least 2 classes to compare.")

+ inputs = {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}
+ parameters = {"hypothesis_template": hypothesis_template}
+ payload = _prepare_payload(inputs, parameters=parameters)
  response = await self.post(
- json={"image": _b64_encode(image), "parameters": {"candidate_labels": ",".join(labels)}},
+ **payload,
  model=model,
  task="zero-shot-image-classification",
  )
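The final hunk adds `hypothesis_template` to `zero_shot_image_classification`. A hedged sketch of how it would be passed (the template text and labels are illustrative):

```py
# Hedged sketch: custom hypothesis template for zero-shot image classification (0.26 API).
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    results = await client.zero_shot_image_classification(
        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
        labels=["dog", "cat", "bird"],
        hypothesis_template="This is a photo of a {}.",
    )
    for item in results:
        print(item.label, item.score)


asyncio.run(main())
```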