huggingface-hub 0.25.2__py3-none-any.whl → 0.26.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of huggingface-hub might be problematic.

Files changed (45)
  1. huggingface_hub/__init__.py +45 -11
  2. huggingface_hub/_login.py +172 -33
  3. huggingface_hub/commands/user.py +125 -9
  4. huggingface_hub/constants.py +1 -1
  5. huggingface_hub/errors.py +6 -9
  6. huggingface_hub/file_download.py +2 -372
  7. huggingface_hub/hf_api.py +170 -13
  8. huggingface_hub/hf_file_system.py +3 -3
  9. huggingface_hub/hub_mixin.py +2 -1
  10. huggingface_hub/inference/_client.py +500 -145
  11. huggingface_hub/inference/_common.py +42 -4
  12. huggingface_hub/inference/_generated/_async_client.py +499 -144
  13. huggingface_hub/inference/_generated/types/__init__.py +37 -7
  14. huggingface_hub/inference/_generated/types/audio_classification.py +8 -5
  15. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +9 -7
  16. huggingface_hub/inference/_generated/types/chat_completion.py +23 -4
  17. huggingface_hub/inference/_generated/types/image_classification.py +8 -5
  18. huggingface_hub/inference/_generated/types/image_segmentation.py +9 -7
  19. huggingface_hub/inference/_generated/types/image_to_image.py +7 -5
  20. huggingface_hub/inference/_generated/types/image_to_text.py +4 -4
  21. huggingface_hub/inference/_generated/types/object_detection.py +11 -5
  22. huggingface_hub/inference/_generated/types/summarization.py +11 -13
  23. huggingface_hub/inference/_generated/types/text_classification.py +10 -5
  24. huggingface_hub/inference/_generated/types/text_generation.py +1 -0
  25. huggingface_hub/inference/_generated/types/text_to_audio.py +2 -2
  26. huggingface_hub/inference/_generated/types/text_to_image.py +9 -7
  27. huggingface_hub/inference/_generated/types/text_to_speech.py +107 -0
  28. huggingface_hub/inference/_generated/types/translation.py +17 -11
  29. huggingface_hub/inference/_generated/types/video_classification.py +2 -2
  30. huggingface_hub/repocard.py +2 -1
  31. huggingface_hub/repocard_data.py +10 -2
  32. huggingface_hub/serialization/_torch.py +7 -4
  33. huggingface_hub/utils/__init__.py +4 -20
  34. huggingface_hub/utils/{_token.py → _auth.py} +86 -3
  35. huggingface_hub/utils/_headers.py +1 -1
  36. huggingface_hub/utils/_hf_folder.py +1 -1
  37. huggingface_hub/utils/_http.py +10 -4
  38. huggingface_hub/utils/_runtime.py +1 -10
  39. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/METADATA +12 -12
  40. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/RECORD +44 -44
  41. huggingface_hub/inference/_templating.py +0 -102
  42. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/LICENSE +0 -0
  43. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/WHEEL +0 -0
  44. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/entry_points.txt +0 -0
  45. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/top_level.txt +0 -0
@@ -37,17 +37,7 @@ import logging
37
37
  import re
38
38
  import time
39
39
  import warnings
40
- from typing import (
41
- TYPE_CHECKING,
42
- Any,
43
- Dict,
44
- Iterable,
45
- List,
46
- Literal,
47
- Optional,
48
- Union,
49
- overload,
50
- )
40
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union, overload
51
41
 
52
42
  from requests import HTTPError
53
43
  from requests.structures import CaseInsensitiveDict
@@ -67,6 +57,7 @@ from huggingface_hub.inference._common import (
67
57
  _get_unsupported_text_generation_kwargs,
68
58
  _import_numpy,
69
59
  _open_as_binary,
60
+ _prepare_payload,
70
61
  _set_unsupported_text_generation_kwargs,
71
62
  _stream_chat_completion_response,
72
63
  _stream_text_generation_response,
@@ -74,11 +65,12 @@ from huggingface_hub.inference._common import (
74
65
  )
75
66
  from huggingface_hub.inference._generated.types import (
76
67
  AudioClassificationOutputElement,
68
+ AudioClassificationOutputTransform,
77
69
  AudioToAudioOutputElement,
78
70
  AutomaticSpeechRecognitionOutput,
79
71
  ChatCompletionInputGrammarType,
80
- ChatCompletionInputTool,
81
- ChatCompletionInputToolTypeClass,
72
+ ChatCompletionInputStreamOptions,
73
+ ChatCompletionInputToolType,
82
74
  ChatCompletionOutput,
83
75
  ChatCompletionStreamOutput,
84
76
  DocumentQuestionAnsweringOutputElement,
@@ -91,21 +83,21 @@ from huggingface_hub.inference._generated.types import (
91
83
  SummarizationOutput,
92
84
  TableQuestionAnsweringOutputElement,
93
85
  TextClassificationOutputElement,
86
+ TextClassificationOutputTransform,
94
87
  TextGenerationInputGrammarType,
95
88
  TextGenerationOutput,
96
89
  TextGenerationStreamOutput,
90
+ TextToImageTargetSize,
91
+ TextToSpeechEarlyStoppingEnum,
97
92
  TokenClassificationOutputElement,
93
+ ToolElement,
98
94
  TranslationOutput,
99
95
  VisualQuestionAnsweringOutputElement,
100
96
  ZeroShotClassificationOutputElement,
101
97
  ZeroShotImageClassificationOutputElement,
102
98
  )
103
- from huggingface_hub.utils import (
104
- build_hf_headers,
105
- get_session,
106
- hf_raise_for_status,
107
- )
108
- from huggingface_hub.utils._deprecation import _deprecate_positional_args
99
+ from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
100
+ from huggingface_hub.utils._deprecation import _deprecate_arguments
109
101
 
110
102
 
111
103
  if TYPE_CHECKING:
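Reading the import hunks above together: `_prepare_payload` becomes a shared helper, `_deprecate_positional_args` is dropped in favour of `_deprecate_arguments`, and several generated chat-completion types are renamed. A minimal sketch of the renamed imports, using only names shown verbatim in this diff (whether they are also re-exported at the top-level `huggingface_hub` package is not shown here):

```py
# Names taken verbatim from the import block in this diff.
from huggingface_hub.inference._generated.types import (
    ChatCompletionInputStreamOptions,  # new streaming options type
    ChatCompletionInputToolType,       # replaces ChatCompletionInputToolTypeClass
    ToolElement,                       # replaces ChatCompletionInputTool
)
```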
@@ -157,7 +149,6 @@ class InferenceClient:
157
149
  follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
158
150
  """
159
151
 
160
- @_deprecate_positional_args(version="0.26")
161
152
  def __init__(
162
153
  self,
163
154
  model: Optional[str] = None,
@@ -333,6 +324,8 @@ class InferenceClient:
333
324
  audio: ContentT,
334
325
  *,
335
326
  model: Optional[str] = None,
327
+ top_k: Optional[int] = None,
328
+ function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
336
329
  ) -> List[AudioClassificationOutputElement]:
337
330
  """
338
331
  Perform audio classification on the provided audio content.
@@ -345,6 +338,10 @@ class InferenceClient:
345
338
  The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
346
339
  or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
347
340
  audio classification will be used.
341
+ top_k (`int`, *optional*):
342
+ When specified, limits the output to the top K most probable classes.
343
+ function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
344
+ The function to apply to the output.
348
345
 
349
346
  Returns:
350
347
  `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -367,7 +364,9 @@ class InferenceClient:
367
364
  ]
368
365
  ```
369
366
  """
370
- response = self.post(data=audio, model=model, task="audio-classification")
367
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
368
+ payload = _prepare_payload(audio, parameters=parameters, expect_binary=True)
369
+ response = self.post(**payload, model=model, task="audio-classification")
371
370
  return AudioClassificationOutputElement.parse_obj_as_list(response)
372
371
 
373
372
  def audio_to_audio(
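The three hunks above thread the new `top_k` and `function_to_apply` arguments through `audio_classification` via the shared `_prepare_payload` helper. A minimal usage sketch, assuming a local audio file and treating `"softmax"` as one of the allowed `AudioClassificationOutputTransform` values:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
results = client.audio_classification(
    "sample.flac",                # placeholder path; raw bytes or a URL also work (ContentT)
    top_k=3,                      # keep only the 3 most probable classes
    function_to_apply="softmax",  # assumed member of AudioClassificationOutputTransform
)
for item in results:
    print(item.label, item.score)
```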
@@ -452,7 +451,7 @@ class InferenceClient:
452
451
  @overload
453
452
  def chat_completion( # type: ignore
454
453
  self,
455
- messages: List[Dict[str, str]],
454
+ messages: List[Dict],
456
455
  *,
457
456
  model: Optional[str] = None,
458
457
  stream: Literal[False] = False,
@@ -465,10 +464,11 @@ class InferenceClient:
465
464
  response_format: Optional[ChatCompletionInputGrammarType] = None,
466
465
  seed: Optional[int] = None,
467
466
  stop: Optional[List[str]] = None,
467
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
468
468
  temperature: Optional[float] = None,
469
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
469
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
470
470
  tool_prompt: Optional[str] = None,
471
- tools: Optional[List[ChatCompletionInputTool]] = None,
471
+ tools: Optional[List[ToolElement]] = None,
472
472
  top_logprobs: Optional[int] = None,
473
473
  top_p: Optional[float] = None,
474
474
  ) -> ChatCompletionOutput: ...
@@ -476,7 +476,7 @@ class InferenceClient:
476
476
  @overload
477
477
  def chat_completion( # type: ignore
478
478
  self,
479
- messages: List[Dict[str, str]],
479
+ messages: List[Dict],
480
480
  *,
481
481
  model: Optional[str] = None,
482
482
  stream: Literal[True] = True,
@@ -489,10 +489,11 @@ class InferenceClient:
489
489
  response_format: Optional[ChatCompletionInputGrammarType] = None,
490
490
  seed: Optional[int] = None,
491
491
  stop: Optional[List[str]] = None,
492
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
492
493
  temperature: Optional[float] = None,
493
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
494
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
494
495
  tool_prompt: Optional[str] = None,
495
- tools: Optional[List[ChatCompletionInputTool]] = None,
496
+ tools: Optional[List[ToolElement]] = None,
496
497
  top_logprobs: Optional[int] = None,
497
498
  top_p: Optional[float] = None,
498
499
  ) -> Iterable[ChatCompletionStreamOutput]: ...
@@ -500,7 +501,7 @@ class InferenceClient:
500
501
  @overload
501
502
  def chat_completion(
502
503
  self,
503
- messages: List[Dict[str, str]],
504
+ messages: List[Dict],
504
505
  *,
505
506
  model: Optional[str] = None,
506
507
  stream: bool = False,
@@ -513,17 +514,18 @@ class InferenceClient:
513
514
  response_format: Optional[ChatCompletionInputGrammarType] = None,
514
515
  seed: Optional[int] = None,
515
516
  stop: Optional[List[str]] = None,
517
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
516
518
  temperature: Optional[float] = None,
517
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
519
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
518
520
  tool_prompt: Optional[str] = None,
519
- tools: Optional[List[ChatCompletionInputTool]] = None,
521
+ tools: Optional[List[ToolElement]] = None,
520
522
  top_logprobs: Optional[int] = None,
521
523
  top_p: Optional[float] = None,
522
524
  ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ...
523
525
 
524
526
  def chat_completion(
525
527
  self,
526
- messages: List[Dict[str, str]],
528
+ messages: List[Dict],
527
529
  *,
528
530
  model: Optional[str] = None,
529
531
  stream: bool = False,
@@ -537,10 +539,11 @@ class InferenceClient:
537
539
  response_format: Optional[ChatCompletionInputGrammarType] = None,
538
540
  seed: Optional[int] = None,
539
541
  stop: Optional[List[str]] = None,
542
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
540
543
  temperature: Optional[float] = None,
541
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
544
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
542
545
  tool_prompt: Optional[str] = None,
543
- tools: Optional[List[ChatCompletionInputTool]] = None,
546
+ tools: Optional[List[ToolElement]] = None,
544
547
  top_logprobs: Optional[int] = None,
545
548
  top_p: Optional[float] = None,
546
549
  ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
@@ -557,7 +560,7 @@ class InferenceClient:
557
560
  </Tip>
558
561
 
559
562
  Args:
560
- messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]):
563
+ messages (List of [`ChatCompletionInputMessage`]):
561
564
  Conversation history consisting of roles and content pairs.
562
565
  model (`str`, *optional*):
563
566
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
@@ -594,6 +597,8 @@ class InferenceClient:
594
597
  Defaults to None.
595
598
  stream (`bool`, *optional*):
596
599
  Enable realtime streaming of responses. Defaults to False.
600
+ stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
601
+ Options for streaming completions.
597
602
  temperature (`float`, *optional*):
598
603
  Controls randomness of the generations. Lower values ensure
599
604
  less random completions. Range: [0, 2]. Defaults to 1.0.
@@ -604,11 +609,11 @@ class InferenceClient:
604
609
  top_p (`float`, *optional*):
605
610
  Fraction of the most likely next words to sample from.
606
611
  Must be between 0 and 1. Defaults to 1.0.
607
- tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
612
+ tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*):
608
613
  The tool to use for the completion. Defaults to "auto".
609
614
  tool_prompt (`str`, *optional*):
610
615
  A prompt to be appended before the tools.
611
- tools (List of [`ChatCompletionInputTool`], *optional*):
616
+ tools (List of [`ToolElement`], *optional*):
612
617
  A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
613
618
  provide a list of functions the model may generate JSON inputs for.
614
619
 
@@ -658,7 +663,7 @@ class InferenceClient:
658
663
  )
659
664
  ```
660
665
 
661
- Example (stream=True):
666
+ Example using streaming:
662
667
  ```py
663
668
  >>> from huggingface_hub import InferenceClient
664
669
  >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -696,6 +701,40 @@ class InferenceClient:
696
701
  print(chunk.choices[0].delta.content)
697
702
  ```
698
703
 
704
+ Example using Image + Text as input:
705
+ ```py
706
+ >>> from huggingface_hub import InferenceClient
707
+
708
+ # provide a remote URL
709
+ >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
710
+ # or a base64-encoded image
711
+ >>> image_path = "/path/to/image.jpeg"
712
+ >>> with open(image_path, "rb") as f:
713
+ ... base64_image = base64.b64encode(f.read()).decode("utf-8")
714
+ >>> image_url = f"data:image/jpeg;base64,{base64_image}"
715
+
716
+ >>> client = InferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
717
+ >>> output = client.chat.completions.create(
718
+ ... messages=[
719
+ ... {
720
+ ... "role": "user",
721
+ ... "content": [
722
+ ... {
723
+ ... "type": "image_url",
724
+ ... "image_url": {"url": image_url},
725
+ ... },
726
+ ... {
727
+ ... "type": "text",
728
+ ... "text": "Describe this image in one sentence.",
729
+ ... },
730
+ ... ],
731
+ ... },
732
+ ... ],
733
+ ... )
734
+ >>> output
735
+ The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.
736
+ ```
737
+
699
738
  Example using tools:
700
739
  ```py
701
740
  >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
@@ -837,6 +876,7 @@ class InferenceClient:
837
876
  top_logprobs=top_logprobs,
838
877
  top_p=top_p,
839
878
  stream=stream,
879
+ stream_options=stream_options,
840
880
  )
841
881
  payload = {key: value for key, value in payload.items() if value is not None}
842
882
  data = self.post(model=model_url, json=payload, stream=stream)
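The chat-completion hunks above add a `stream_options` argument to every overload and forward it into the request payload. A hedged sketch; `include_usage` is assumed to be the relevant field of `ChatCompletionInputStreamOptions` (mirroring the OpenAI-style API) and is not shown in this diff:

```py
from huggingface_hub import InferenceClient
from huggingface_hub.inference._generated.types import ChatCompletionInputStreamOptions

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
stream = client.chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    stream=True,
    stream_options=ChatCompletionInputStreamOptions(include_usage=True),  # assumed field
    max_tokens=20,
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content, end="")
```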
@@ -877,6 +917,14 @@ class InferenceClient:
877
917
  question: str,
878
918
  *,
879
919
  model: Optional[str] = None,
920
+ doc_stride: Optional[int] = None,
921
+ handle_impossible_answer: Optional[bool] = None,
922
+ lang: Optional[str] = None,
923
+ max_answer_len: Optional[int] = None,
924
+ max_question_len: Optional[int] = None,
925
+ max_seq_len: Optional[int] = None,
926
+ top_k: Optional[int] = None,
927
+ word_boxes: Optional[List[Union[List[float], str]]] = None,
880
928
  ) -> List[DocumentQuestionAnsweringOutputElement]:
881
929
  """
882
930
  Answer questions on document images.
@@ -890,7 +938,29 @@ class InferenceClient:
890
938
  The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
891
939
  a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
892
940
  Defaults to None.
893
-
941
+ doc_stride (`int`, *optional*):
942
+ If the words in the document are too long to fit with the question for the model, it will
943
+ be split in several chunks with some overlap. This argument controls the size of that
944
+ overlap.
945
+ handle_impossible_answer (`bool`, *optional*):
946
+ Whether to accept impossible as an answer.
947
+ lang (`str`, *optional*):
948
+ Language to use while running OCR.
949
+ max_answer_len (`int`, *optional*):
950
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
951
+ considered).
952
+ max_question_len (`int`, *optional*):
953
+ The maximum length of the question after tokenization. It will be truncated if needed.
954
+ max_seq_len (`int`, *optional*):
955
+ The maximum length of the total sentence (context + question) in tokens of each chunk
956
+ passed to the model. The context will be split in several chunks (using doc_stride as
957
+ overlap) if needed.
958
+ top_k (`int`, *optional*):
959
+ The number of answers to return (will be chosen by order of likelihood). Can return less
960
+ than top_k answers if there are not enough options available within the context.
961
+ word_boxes (`List[Union[List[float], str]]`, *optional*):
962
+ A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
963
+ skip the OCR step and use the provided bounding boxes instead.
894
964
  Returns:
895
965
  `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
896
966
 
@@ -900,16 +970,28 @@ class InferenceClient:
900
970
  `HTTPError`:
901
971
  If the request fails with an HTTP error status code other than HTTP 503.
902
972
 
973
+
903
974
  Example:
904
975
  ```py
905
976
  >>> from huggingface_hub import InferenceClient
906
977
  >>> client = InferenceClient()
907
978
  >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
908
- [DocumentQuestionAnsweringOutputElement(score=0.42515629529953003, answer='us-001', start=16, end=16)]
979
+ [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
909
980
  ```
910
981
  """
911
- payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
912
- response = self.post(json=payload, model=model, task="document-question-answering")
982
+ inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
983
+ parameters = {
984
+ "doc_stride": doc_stride,
985
+ "handle_impossible_answer": handle_impossible_answer,
986
+ "lang": lang,
987
+ "max_answer_len": max_answer_len,
988
+ "max_question_len": max_question_len,
989
+ "max_seq_len": max_seq_len,
990
+ "top_k": top_k,
991
+ "word_boxes": word_boxes,
992
+ }
993
+ payload = _prepare_payload(inputs, parameters=parameters)
994
+ response = self.post(**payload, model=model, task="document-question-answering")
913
995
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
914
996
 
915
997
  def feature_extraction(
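The hunk above wires eight new OCR/QA tuning arguments into `document_question_answering` through `_prepare_payload`. A small sketch reusing the invoice example from the docstring; the parameter values are illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
answers = client.document_question_answering(
    image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
    question="What is the invoice number?",
    top_k=2,            # return up to two candidate answers
    max_answer_len=15,  # discard overly long spans
)
for answer in answers:
    print(answer.answer, answer.score)
```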
@@ -933,7 +1015,7 @@ class InferenceClient:
933
1015
  a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
934
1016
  Defaults to None.
935
1017
  normalize (`bool`, *optional*):
936
- Whether to normalize the embeddings or not. Defaults to None.
1018
+ Whether to normalize the embeddings or not.
937
1019
  Only available on server powered by Text-Embedding-Inference.
938
1020
  prompt_name (`str`, *optional*):
939
1021
  The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
@@ -942,7 +1024,7 @@ class InferenceClient:
942
1024
  then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
943
1025
  because the prompt text will be prepended before any text to encode.
944
1026
  truncate (`bool`, *optional*):
945
- Whether to truncate the embeddings or not. Defaults to None.
1027
+ Whether to truncate the embeddings or not.
946
1028
  Only available on server powered by Text-Embedding-Inference.
947
1029
  truncation_direction (`Literal["Left", "Right"]`, *optional*):
948
1030
  Which side of the input should be truncated when `truncate=True` is passed.
@@ -967,20 +1049,25 @@ class InferenceClient:
967
1049
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
968
1050
  ```
969
1051
  """
970
- payload: Dict = {"inputs": text}
971
- if normalize is not None:
972
- payload["normalize"] = normalize
973
- if prompt_name is not None:
974
- payload["prompt_name"] = prompt_name
975
- if truncate is not None:
976
- payload["truncate"] = truncate
977
- if truncation_direction is not None:
978
- payload["truncation_direction"] = truncation_direction
979
- response = self.post(json=payload, model=model, task="feature-extraction")
1052
+ parameters = {
1053
+ "normalize": normalize,
1054
+ "prompt_name": prompt_name,
1055
+ "truncate": truncate,
1056
+ "truncation_direction": truncation_direction,
1057
+ }
1058
+ payload = _prepare_payload(text, parameters=parameters)
1059
+ response = self.post(**payload, model=model, task="feature-extraction")
980
1060
  np = _import_numpy()
981
1061
  return np.array(_bytes_to_dict(response), dtype="float32")
982
1062
 
983
- def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutputElement]:
1063
+ def fill_mask(
1064
+ self,
1065
+ text: str,
1066
+ *,
1067
+ model: Optional[str] = None,
1068
+ targets: Optional[List[str]] = None,
1069
+ top_k: Optional[int] = None,
1070
+ ) -> List[FillMaskOutputElement]:
984
1071
  """
985
1072
  Fill in a hole with a missing word (token to be precise).
986
1073
 
@@ -990,8 +1077,13 @@ class InferenceClient:
990
1077
  model (`str`, *optional*):
991
1078
  The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
992
1079
  a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
993
- Defaults to None.
994
-
1080
+ targets (`List[str]`, *optional*):
1081
+ When passed, the model will limit the scores to the passed targets instead of looking up
1082
+ in the whole vocabulary. If the provided targets are not in the model vocab, they will be
1083
+ tokenized and the first resulting token will be used (with a warning, and that might be
1084
+ slower).
1085
+ top_k (`int`, *optional*):
1086
+ When passed, overrides the number of predictions to return.
995
1087
  Returns:
996
1088
  `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
997
1089
  probability, token reference, and completed text.
@@ -1013,7 +1105,9 @@ class InferenceClient:
1013
1105
  ]
1014
1106
  ```
1015
1107
  """
1016
- response = self.post(json={"inputs": text}, model=model, task="fill-mask")
1108
+ parameters = {"targets": targets, "top_k": top_k}
1109
+ payload = _prepare_payload(text, parameters=parameters)
1110
+ response = self.post(**payload, model=model, task="fill-mask")
1017
1111
  return FillMaskOutputElement.parse_obj_as_list(response)
1018
1112
 
1019
1113
  def image_classification(
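`fill_mask` gains `targets` and `top_k` above. A minimal sketch, assuming a BERT-style model whose mask token is `[MASK]`:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
predictions = client.fill_mask(
    "The goal of life is [MASK].",      # mask token depends on the model's tokenizer
    targets=["happiness", "survival"],  # restrict scoring to these candidates
    top_k=2,
)
for prediction in predictions:
    print(prediction.sequence, prediction.score)
```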
@@ -1021,6 +1115,8 @@ class InferenceClient:
1021
1115
  image: ContentT,
1022
1116
  *,
1023
1117
  model: Optional[str] = None,
1118
+ function_to_apply: Optional[Literal["sigmoid", "softmax", "none"]] = None,
1119
+ top_k: Optional[int] = None,
1024
1120
  ) -> List[ImageClassificationOutputElement]:
1025
1121
  """
1026
1122
  Perform image classification on the given image using the specified model.
@@ -1031,7 +1127,10 @@ class InferenceClient:
1031
1127
  model (`str`, *optional*):
1032
1128
  The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1033
1129
  deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
1034
-
1130
+ function_to_apply (`Literal["sigmoid", "softmax", "none"]`, *optional*):
1131
+ The function to apply to the output scores.
1132
+ top_k (`int`, *optional*):
1133
+ When specified, limits the output to the top K most probable classes.
1035
1134
  Returns:
1036
1135
  `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
1037
1136
 
@@ -1046,10 +1145,12 @@ class InferenceClient:
1046
1145
  >>> from huggingface_hub import InferenceClient
1047
1146
  >>> client = InferenceClient()
1048
1147
  >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
1049
- [ImageClassificationOutputElement(score=0.9779096841812134, label='Blenheim spaniel'), ...]
1148
+ [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
1050
1149
  ```
1051
1150
  """
1052
- response = self.post(data=image, model=model, task="image-classification")
1151
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
1152
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1153
+ response = self.post(**payload, model=model, task="image-classification")
1053
1154
  return ImageClassificationOutputElement.parse_obj_as_list(response)
1054
1155
 
1055
1156
  def image_segmentation(
@@ -1057,6 +1158,10 @@ class InferenceClient:
1057
1158
  image: ContentT,
1058
1159
  *,
1059
1160
  model: Optional[str] = None,
1161
+ mask_threshold: Optional[float] = None,
1162
+ overlap_mask_area_threshold: Optional[float] = None,
1163
+ subtask: Optional[Literal["instance", "panoptic", "semantic"]] = None,
1164
+ threshold: Optional[float] = None,
1060
1165
  ) -> List[ImageSegmentationOutputElement]:
1061
1166
  """
1062
1167
  Perform image segmentation on the given image using the specified model.
@@ -1073,7 +1178,14 @@ class InferenceClient:
1073
1178
  model (`str`, *optional*):
1074
1179
  The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1075
1180
  deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
1076
-
1181
+ mask_threshold (`float`, *optional*):
1182
+ Threshold to use when turning the predicted masks into binary values.
1183
+ overlap_mask_area_threshold (`float`, *optional*):
1184
+ Mask overlap threshold to eliminate small, disconnected segments.
1185
+ subtask (`Literal["instance", "panoptic", "semantic"]`, *optional*):
1186
+ Segmentation task to be performed, depending on model capabilities.
1187
+ threshold (`float`, *optional*):
1188
+ Probability threshold to filter out predicted masks.
1077
1189
  Returns:
1078
1190
  `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
1079
1191
 
@@ -1087,14 +1199,21 @@ class InferenceClient:
1087
1199
  ```py
1088
1200
  >>> from huggingface_hub import InferenceClient
1089
1201
  >>> client = InferenceClient()
1090
- >>> client.image_segmentation("cat.jpg"):
1202
+ >>> client.image_segmentation("cat.jpg")
1091
1203
  [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
1092
1204
  ```
1093
1205
  """
1094
- response = self.post(data=image, model=model, task="image-segmentation")
1206
+ parameters = {
1207
+ "mask_threshold": mask_threshold,
1208
+ "overlap_mask_area_threshold": overlap_mask_area_threshold,
1209
+ "subtask": subtask,
1210
+ "threshold": threshold,
1211
+ }
1212
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1213
+ response = self.post(**payload, model=model, task="image-segmentation")
1095
1214
  output = ImageSegmentationOutputElement.parse_obj_as_list(response)
1096
1215
  for item in output:
1097
- item.mask = _b64_to_image(item.mask)
1216
+ item.mask = _b64_to_image(item.mask) # type: ignore [assignment]
1098
1217
  return output
1099
1218
 
1100
1219
  def image_to_image(
@@ -1166,19 +1285,8 @@ class InferenceClient:
1166
1285
  "guidance_scale": guidance_scale,
1167
1286
  **kwargs,
1168
1287
  }
1169
- if all(parameter is None for parameter in parameters.values()):
1170
- # Either only an image to send => send as raw bytes
1171
- data = image
1172
- payload: Optional[Dict[str, Any]] = None
1173
- else:
1174
- # Or an image + some parameters => use base64 encoding
1175
- data = None
1176
- payload = {"inputs": _b64_encode(image)}
1177
- for key, value in parameters.items():
1178
- if value is not None:
1179
- payload.setdefault("parameters", {})[key] = value
1180
-
1181
- response = self.post(json=payload, data=data, model=model, task="image-to-image")
1288
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1289
+ response = self.post(**payload, model=model, task="image-to-image")
1182
1290
  return _bytes_to_image(response)
1183
1291
 
1184
1292
  def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
@@ -1302,10 +1410,7 @@ class InferenceClient:
1302
1410
  return models_by_task
1303
1411
 
1304
1412
  def object_detection(
1305
- self,
1306
- image: ContentT,
1307
- *,
1308
- model: Optional[str] = None,
1413
+ self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
1309
1414
  ) -> List[ObjectDetectionOutputElement]:
1310
1415
  """
1311
1416
  Perform object detection on the given image using the specified model.
@@ -1322,7 +1427,8 @@ class InferenceClient:
1322
1427
  model (`str`, *optional*):
1323
1428
  The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1324
1429
  deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
1325
-
1430
+ threshold (`float`, *optional*):
1431
+ The probability necessary to make a prediction.
1326
1432
  Returns:
1327
1433
  `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
1328
1434
 
@@ -1338,17 +1444,31 @@ class InferenceClient:
1338
1444
  ```py
1339
1445
  >>> from huggingface_hub import InferenceClient
1340
1446
  >>> client = InferenceClient()
1341
- >>> client.object_detection("people.jpg"):
1447
+ >>> client.object_detection("people.jpg")
1342
1448
  [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
1343
1449
  ```
1344
1450
  """
1345
- # detect objects
1346
- response = self.post(data=image, model=model, task="object-detection")
1451
+ parameters = {
1452
+ "threshold": threshold,
1453
+ }
1454
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1455
+ response = self.post(**payload, model=model, task="object-detection")
1347
1456
  return ObjectDetectionOutputElement.parse_obj_as_list(response)
1348
1457
 
1349
1458
  def question_answering(
1350
- self, question: str, context: str, *, model: Optional[str] = None
1351
- ) -> QuestionAnsweringOutputElement:
1459
+ self,
1460
+ question: str,
1461
+ context: str,
1462
+ *,
1463
+ model: Optional[str] = None,
1464
+ align_to_words: Optional[bool] = None,
1465
+ doc_stride: Optional[int] = None,
1466
+ handle_impossible_answer: Optional[bool] = None,
1467
+ max_answer_len: Optional[int] = None,
1468
+ max_question_len: Optional[int] = None,
1469
+ max_seq_len: Optional[int] = None,
1470
+ top_k: Optional[int] = None,
1471
+ ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
1352
1472
  """
1353
1473
  Retrieve the answer to a question from a given text.
1354
1474
 
@@ -1360,10 +1480,31 @@ class InferenceClient:
1360
1480
  model (`str`):
1361
1481
  The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1362
1482
  a deployed Inference Endpoint.
1363
-
1483
+ align_to_words (`bool`, *optional*):
1484
+ Attempts to align the answer to real words. Improves quality on space separated
1485
+ languages. Might hurt on non-space-separated languages (like Japanese or Chinese).
1486
+ doc_stride (`int`, *optional*):
1487
+ If the context is too long to fit with the question for the model, it will be split in
1488
+ several chunks with some overlap. This argument controls the size of that overlap.
1489
+ handle_impossible_answer (`bool`, *optional*):
1490
+ Whether to accept impossible as an answer.
1491
+ max_answer_len (`int`, *optional*):
1492
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
1493
+ considered).
1494
+ max_question_len (`int`, *optional*):
1495
+ The maximum length of the question after tokenization. It will be truncated if needed.
1496
+ max_seq_len (`int`, *optional*):
1497
+ The maximum length of the total sentence (context + question) in tokens of each chunk
1498
+ passed to the model. The context will be split in several chunks (using docStride as
1499
+ overlap) if needed.
1500
+ top_k (`int`, *optional*):
1501
+ The number of answers to return (will be chosen by order of likelihood). Note that we
1502
+ return less than topk answers if there are not enough options available within the
1503
+ context.
1364
1504
  Returns:
1365
- [`QuestionAnsweringOutputElement`]: an question answering output containing the score, start index, end index, and answer.
1366
-
1505
+ Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
1506
+ When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
1507
+ When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
1367
1508
  Raises:
1368
1509
  [`InferenceTimeoutError`]:
1369
1510
  If the model is unavailable or the request times out.
@@ -1375,17 +1516,28 @@ class InferenceClient:
1375
1516
  >>> from huggingface_hub import InferenceClient
1376
1517
  >>> client = InferenceClient()
1377
1518
  >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
1378
- QuestionAnsweringOutputElement(score=0.9326562285423279, start=11, end=16, answer='Clara')
1519
+ QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
1379
1520
  ```
1380
1521
  """
1381
-
1382
- payload: Dict[str, Any] = {"question": question, "context": context}
1522
+ parameters = {
1523
+ "align_to_words": align_to_words,
1524
+ "doc_stride": doc_stride,
1525
+ "handle_impossible_answer": handle_impossible_answer,
1526
+ "max_answer_len": max_answer_len,
1527
+ "max_question_len": max_question_len,
1528
+ "max_seq_len": max_seq_len,
1529
+ "top_k": top_k,
1530
+ }
1531
+ inputs: Dict[str, Any] = {"question": question, "context": context}
1532
+ payload = _prepare_payload(inputs, parameters=parameters)
1383
1533
  response = self.post(
1384
- json=payload,
1534
+ **payload,
1385
1535
  model=model,
1386
1536
  task="question-answering",
1387
1537
  )
1388
- return QuestionAnsweringOutputElement.parse_obj_as_instance(response)
1538
+ # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
1539
+ output = QuestionAnsweringOutputElement.parse_obj(response)
1540
+ return output
1389
1541
 
1390
1542
  def sentence_similarity(
1391
1543
  self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
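The `question_answering` hunks above add the transformers-style QA parameters and change the return annotation: a single element when `top_k` is 1 or unset, a list when `top_k` is greater than 1. A sketch of both shapes, reusing the docstring example:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()

# Default: a single QuestionAnsweringOutputElement, as before.
single = client.question_answering(
    question="What's my name?",
    context="My name is Clara and I live in Berkeley.",
)
print(single.answer)

# With top_k > 1: a list of QuestionAnsweringOutputElement.
several = client.question_answering(
    question="What's my name?",
    context="My name is Clara and I live in Berkeley.",
    top_k=3,
)
print([candidate.answer for candidate in several])
```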
@@ -1434,12 +1586,23 @@ class InferenceClient:
1434
1586
  )
1435
1587
  return _bytes_to_list(response)
1436
1588
 
1589
+ @_deprecate_arguments(
1590
+ version="0.29",
1591
+ deprecated_args=["parameters"],
1592
+ custom_message=(
1593
+ "The `parameters` argument is deprecated and will be removed in a future version. "
1594
+ "Provide individual parameters instead: `clean_up_tokenization_spaces`, `generate_parameters`, and `truncation`."
1595
+ ),
1596
+ )
1437
1597
  def summarization(
1438
1598
  self,
1439
1599
  text: str,
1440
1600
  *,
1441
1601
  parameters: Optional[Dict[str, Any]] = None,
1442
1602
  model: Optional[str] = None,
1603
+ clean_up_tokenization_spaces: Optional[bool] = None,
1604
+ generate_parameters: Optional[Dict[str, Any]] = None,
1605
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
1443
1606
  ) -> SummarizationOutput:
1444
1607
  """
1445
1608
  Generate a summary of a given text using a specified model.
@@ -1452,8 +1615,13 @@ class InferenceClient:
1452
1615
  for more details.
1453
1616
  model (`str`, *optional*):
1454
1617
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
1455
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
1456
-
1618
+ Inference Endpoint. If not provided, the default recommended model for summarization will be used.
1619
+ clean_up_tokenization_spaces (`bool`, *optional*):
1620
+ Whether to clean up the potential extra spaces in the text output.
1621
+ generate_parameters (`Dict[str, Any]`, *optional*):
1622
+ Additional parametrization of the text generation algorithm.
1623
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
1624
+ The truncation strategy to use.
1457
1625
  Returns:
1458
1626
  [`SummarizationOutput`]: The generated summary text.
1459
1627
 
@@ -1471,14 +1639,23 @@ class InferenceClient:
1471
1639
  SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
1472
1640
  ```
1473
1641
  """
1474
- payload: Dict[str, Any] = {"inputs": text}
1475
- if parameters is not None:
1476
- payload["parameters"] = parameters
1477
- response = self.post(json=payload, model=model, task="summarization")
1642
+ if parameters is None:
1643
+ parameters = {
1644
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
1645
+ "generate_parameters": generate_parameters,
1646
+ "truncation": truncation,
1647
+ }
1648
+ payload = _prepare_payload(text, parameters=parameters)
1649
+ response = self.post(**payload, model=model, task="summarization")
1478
1650
  return SummarizationOutput.parse_obj_as_list(response)[0]
1479
1651
 
1480
1652
  def table_question_answering(
1481
- self, table: Dict[str, Any], query: str, *, model: Optional[str] = None
1653
+ self,
1654
+ table: Dict[str, Any],
1655
+ query: str,
1656
+ *,
1657
+ model: Optional[str] = None,
1658
+ parameters: Optional[Dict[str, Any]] = None,
1482
1659
  ) -> TableQuestionAnsweringOutputElement:
1483
1660
  """
1484
1661
  Retrieve the answer to a question from information given in a table.
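The summarization hunks above deprecate the bulk `parameters` dict (slated for removal in 0.29, per the `_deprecate_arguments` decorator) in favour of explicit keyword arguments, and `table_question_answering` picks up an optional `parameters` dict of its own. A migration sketch for the summarization call; the input text is illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
text = "The Eiffel tower is one of the most famous landmarks in the world."

# Deprecated style (still accepted, but warns until removal in 0.29):
summary = client.summarization(text, parameters={"truncation": "longest_first"})

# New style with explicit keyword arguments:
summary = client.summarization(
    text,
    truncation="longest_first",
    clean_up_tokenization_spaces=True,
)
print(summary.generated_text)
```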
@@ -1492,6 +1669,8 @@ class InferenceClient:
1492
1669
  model (`str`):
1493
1670
  The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
1494
1671
  Hub or a URL to a deployed Inference Endpoint.
1672
+ parameters (`Dict[str, Any]`, *optional*):
1673
+ Additional inference parameters. Defaults to None.
1495
1674
 
1496
1675
  Returns:
1497
1676
  [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1512,11 +1691,13 @@ class InferenceClient:
1512
1691
  TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
1513
1692
  ```
1514
1693
  """
1694
+ inputs = {
1695
+ "query": query,
1696
+ "table": table,
1697
+ }
1698
+ payload = _prepare_payload(inputs, parameters=parameters)
1515
1699
  response = self.post(
1516
- json={
1517
- "query": query,
1518
- "table": table,
1519
- },
1700
+ **payload,
1520
1701
  model=model,
1521
1702
  task="table-question-answering",
1522
1703
  )
@@ -1564,7 +1745,11 @@ class InferenceClient:
1564
1745
  ["5", "5", "5"]
1565
1746
  ```
1566
1747
  """
1567
- response = self.post(json={"table": table}, model=model, task="tabular-classification")
1748
+ response = self.post(
1749
+ json={"table": table},
1750
+ model=model,
1751
+ task="tabular-classification",
1752
+ )
1568
1753
  return _bytes_to_list(response)
1569
1754
 
1570
1755
  def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
@@ -1607,7 +1792,14 @@ class InferenceClient:
1607
1792
  response = self.post(json={"table": table}, model=model, task="tabular-regression")
1608
1793
  return _bytes_to_list(response)
1609
1794
 
1610
- def text_classification(self, text: str, *, model: Optional[str] = None) -> List[TextClassificationOutputElement]:
1795
+ def text_classification(
1796
+ self,
1797
+ text: str,
1798
+ *,
1799
+ model: Optional[str] = None,
1800
+ top_k: Optional[int] = None,
1801
+ function_to_apply: Optional["TextClassificationOutputTransform"] = None,
1802
+ ) -> List[TextClassificationOutputElement]:
1611
1803
  """
1612
1804
  Perform text classification (e.g. sentiment-analysis) on the given text.
1613
1805
 
@@ -1618,6 +1810,10 @@ class InferenceClient:
1618
1810
  The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1619
1811
  a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
1620
1812
  Defaults to None.
1813
+ top_k (`int`, *optional*):
1814
+ When specified, limits the output to the top K most probable classes.
1815
+ function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
1816
+ The function to apply to the output.
1621
1817
 
1622
1818
  Returns:
1623
1819
  `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -1639,7 +1835,16 @@ class InferenceClient:
1639
1835
  ]
1640
1836
  ```
1641
1837
  """
1642
- response = self.post(json={"inputs": text}, model=model, task="text-classification")
1838
+ parameters = {
1839
+ "function_to_apply": function_to_apply,
1840
+ "top_k": top_k,
1841
+ }
1842
+ payload = _prepare_payload(text, parameters=parameters)
1843
+ response = self.post(
1844
+ **payload,
1845
+ model=model,
1846
+ task="text-classification",
1847
+ )
1643
1848
  return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]
1644
1849
 
1645
1850
  @overload
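`text_classification` follows the same pattern, adding `top_k` and `function_to_apply`. A minimal sketch; the input sentence is illustrative and `"none"` is assumed to be a valid `TextClassificationOutputTransform` value:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
result = client.text_classification(
    "I like you. I love you.",
    top_k=2,
    function_to_apply="none",  # assumed member of TextClassificationOutputTransform
)
print(result)
```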
@@ -2148,6 +2353,9 @@ class InferenceClient:
2148
2353
  num_inference_steps: Optional[float] = None,
2149
2354
  guidance_scale: Optional[float] = None,
2150
2355
  model: Optional[str] = None,
2356
+ scheduler: Optional[str] = None,
2357
+ target_size: Optional[TextToImageTargetSize] = None,
2358
+ seed: Optional[int] = None,
2151
2359
  **kwargs,
2152
2360
  ) -> "Image":
2153
2361
  """
@@ -2176,7 +2384,14 @@ class InferenceClient:
2176
2384
  usually at the expense of lower image quality.
2177
2385
  model (`str`, *optional*):
2178
2386
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2179
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2387
+ Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
2388
+ Defaults to None.
2389
+ scheduler (`str`, *optional*):
2390
+ Override the scheduler with a compatible one.
2391
+ target_size (`TextToImageTargetSize`, *optional*):
2392
+ The size in pixel of the output image
2393
+ seed (`int`, *optional*):
2394
+ Seed for the random number generator.
2180
2395
 
2181
2396
  Returns:
2182
2397
  `Image`: The generated image.
@@ -2203,22 +2418,44 @@ class InferenceClient:
2203
2418
  >>> image.save("better_astronaut.png")
2204
2419
  ```
2205
2420
  """
2206
- payload = {"inputs": prompt}
2421
+
2207
2422
  parameters = {
2208
2423
  "negative_prompt": negative_prompt,
2209
2424
  "height": height,
2210
2425
  "width": width,
2211
2426
  "num_inference_steps": num_inference_steps,
2212
2427
  "guidance_scale": guidance_scale,
2428
+ "scheduler": scheduler,
2429
+ "target_size": target_size,
2430
+ "seed": seed,
2213
2431
  **kwargs,
2214
2432
  }
2215
- for key, value in parameters.items():
2216
- if value is not None:
2217
- payload.setdefault("parameters", {})[key] = value # type: ignore
2218
- response = self.post(json=payload, model=model, task="text-to-image")
2433
+ payload = _prepare_payload(prompt, parameters=parameters)
2434
+ response = self.post(**payload, model=model, task="text-to-image")
2219
2435
  return _bytes_to_image(response)
2220
2436
 
2221
- def text_to_speech(self, text: str, *, model: Optional[str] = None) -> bytes:
2437
+ def text_to_speech(
2438
+ self,
2439
+ text: str,
2440
+ *,
2441
+ model: Optional[str] = None,
2442
+ do_sample: Optional[bool] = None,
2443
+ early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
2444
+ epsilon_cutoff: Optional[float] = None,
2445
+ eta_cutoff: Optional[float] = None,
2446
+ max_length: Optional[int] = None,
2447
+ max_new_tokens: Optional[int] = None,
2448
+ min_length: Optional[int] = None,
2449
+ min_new_tokens: Optional[int] = None,
2450
+ num_beam_groups: Optional[int] = None,
2451
+ num_beams: Optional[int] = None,
2452
+ penalty_alpha: Optional[float] = None,
2453
+ temperature: Optional[float] = None,
2454
+ top_k: Optional[int] = None,
2455
+ top_p: Optional[float] = None,
2456
+ typical_p: Optional[float] = None,
2457
+ use_cache: Optional[bool] = None,
2458
+ ) -> bytes:
2222
2459
  """
2223
2460
  Synthesize an audio of a voice pronouncing a given text.
2224
2461
 
@@ -2227,7 +2464,56 @@ class InferenceClient:
2227
2464
  The text to synthesize.
2228
2465
  model (`str`, *optional*):
2229
2466
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2230
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2467
+ Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
2468
+ Defaults to None.
2469
+ do_sample (`bool`, *optional*):
2470
+ Whether to use sampling instead of greedy decoding when generating new tokens.
2471
+ early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
2472
+ Controls the stopping condition for beam-based methods.
2473
+ epsilon_cutoff (`float`, *optional*):
2474
+ If set to float strictly between 0 and 1, only tokens with a conditional probability
2475
+ greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
2476
+ 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
2477
+ Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
2478
+ eta_cutoff (`float`, *optional*):
2479
+ Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
2480
+ float strictly between 0 and 1, a token is only considered if it is greater than either
2481
+ eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
2482
+ term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
2483
+ the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
2484
+ See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
2485
+ for more details.
2486
+ max_length (`int`, *optional*):
2487
+ The maximum length (in tokens) of the generated text, including the input.
2488
+ max_new_tokens (`int`, *optional*):
2489
+ The maximum number of tokens to generate. Takes precedence over maxLength.
2490
+ min_length (`int`, *optional*):
2491
+ The minimum length (in tokens) of the generated text, including the input.
2492
+ min_new_tokens (`int`, *optional*):
2493
+ The minimum number of tokens to generate. Takes precedence over maxLength.
2494
+ num_beam_groups (`int`, *optional*):
2495
+ Number of groups to divide num_beams into in order to ensure diversity among different
2496
+ groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
2497
+ num_beams (`int`, *optional*):
2498
+ Number of beams to use for beam search.
2499
+ penalty_alpha (`float`, *optional*):
2500
+ The value balances the model confidence and the degeneration penalty in contrastive
2501
+ search decoding.
2502
+ temperature (`float`, *optional*):
2503
+ The value used to modulate the next token probabilities.
2504
+ top_k (`int`, *optional*):
2505
+ The number of highest probability vocabulary tokens to keep for top-k-filtering.
2506
+ top_p (`float`, *optional*):
2507
+ If set to float < 1, only the smallest set of most probable tokens with probabilities
2508
+ that add up to top_p or higher are kept for generation.
2509
+ typical_p (`float`, *optional*):
2510
+ Local typicality measures how similar the conditional probability of predicting a target token next is
2511
+ to the expected conditional probability of predicting a random token next, given the partial text
2512
+ already generated. If set to float < 1, the smallest set of the most locally typical tokens with
2513
+ probabilities that add up to typical_p or higher are kept for generation. See [this
2514
+ paper](https://hf.co/papers/2202.00666) for more details.
2515
+ use_cache (`bool`, *optional*):
2516
+ Whether the model should use the past last key/values attentions to speed up decoding
2231
2517
 
2232
2518
  Returns:
2233
2519
  `bytes`: The generated audio.
@@ -2248,10 +2534,36 @@ class InferenceClient:
2248
2534
  >>> Path("hello_world.flac").write_bytes(audio)
2249
2535
  ```
2250
2536
  """
2251
- return self.post(json={"inputs": text}, model=model, task="text-to-speech")
2537
+ parameters = {
2538
+ "do_sample": do_sample,
2539
+ "early_stopping": early_stopping,
2540
+ "epsilon_cutoff": epsilon_cutoff,
2541
+ "eta_cutoff": eta_cutoff,
2542
+ "max_length": max_length,
2543
+ "max_new_tokens": max_new_tokens,
2544
+ "min_length": min_length,
2545
+ "min_new_tokens": min_new_tokens,
2546
+ "num_beam_groups": num_beam_groups,
2547
+ "num_beams": num_beams,
2548
+ "penalty_alpha": penalty_alpha,
2549
+ "temperature": temperature,
2550
+ "top_k": top_k,
2551
+ "top_p": top_p,
2552
+ "typical_p": typical_p,
2553
+ "use_cache": use_cache,
2554
+ }
2555
+ payload = _prepare_payload(text, parameters=parameters)
2556
+ response = self.post(**payload, model=model, task="text-to-speech")
2557
+ return response
2252
2558
 
2253
2559
  def token_classification(
2254
- self, text: str, *, model: Optional[str] = None
2560
+ self,
2561
+ text: str,
2562
+ *,
2563
+ model: Optional[str] = None,
2564
+ aggregation_strategy: Optional[Literal["none", "simple", "first", "average", "max"]] = None,
2565
+ ignore_labels: Optional[List[str]] = None,
2566
+ stride: Optional[int] = None,
2255
2567
  ) -> List[TokenClassificationOutputElement]:
2256
2568
  """
2257
2569
  Perform token classification on the given text.
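`text_to_speech` grows a long list of optional generation parameters above, all forwarded through `_prepare_payload`. A short sketch exercising a few of them; the `hello_world.flac` output path mirrors the docstring example and assumes the model returns FLAC audio:

```py
from pathlib import Path

from huggingface_hub import InferenceClient

client = InferenceClient()
audio = client.text_to_speech(
    "Hello world",
    do_sample=True,      # sample instead of greedy decoding
    temperature=0.7,     # soften the next-token distribution
    max_new_tokens=256,  # cap the number of generated tokens
)
Path("hello_world.flac").write_bytes(audio)
```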
@@ -2264,6 +2576,12 @@ class InferenceClient:
2264
2576
  The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
2265
2577
  a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
2266
2578
  Defaults to None.
2579
+ aggregation_strategy (`Literal["none", "simple", "first", "average", "max"]`, *optional*):
2580
+ The strategy used to fuse tokens based on model predictions.
2581
+ ignore_labels (`List[str]`, *optional*):
2582
+ A list of labels to ignore.
2583
+ stride (`int`, *optional*):
2584
+ The number of overlapping tokens between chunks when splitting the input text.
2267
2585
 
2268
2586
  Returns:
2269
2587
  `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
@@ -2297,16 +2615,30 @@ class InferenceClient:
2297
2615
  ]
2298
2616
  ```
2299
2617
  """
2300
- payload: Dict[str, Any] = {"inputs": text}
2618
+
2619
+ parameters = {
2620
+ "aggregation_strategy": aggregation_strategy,
2621
+ "ignore_labels": ignore_labels,
2622
+ "stride": stride,
2623
+ }
2624
+ payload = _prepare_payload(text, parameters=parameters)
2301
2625
  response = self.post(
2302
- json=payload,
2626
+ **payload,
2303
2627
  model=model,
2304
2628
  task="token-classification",
2305
2629
  )
2306
2630
  return TokenClassificationOutputElement.parse_obj_as_list(response)
2307
2631
 
2308
2632
  def translation(
2309
- self, text: str, *, model: Optional[str] = None, src_lang: Optional[str] = None, tgt_lang: Optional[str] = None
2633
+ self,
2634
+ text: str,
2635
+ *,
2636
+ model: Optional[str] = None,
2637
+ src_lang: Optional[str] = None,
2638
+ tgt_lang: Optional[str] = None,
2639
+ clean_up_tokenization_spaces: Optional[bool] = None,
2640
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
2641
+ generate_parameters: Optional[Dict[str, Any]] = None,
2310
2642
  ) -> TranslationOutput:
2311
2643
  """
2312
2644
  Convert text from one language to another.
@@ -2315,7 +2647,6 @@ class InferenceClient:
2315
2647
  your specific use case. Source and target languages usually depend on the model.
2316
2648
  However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
2317
2649
  you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
2318
- You can find this information in the model card.
2319
2650
 
2320
2651
  Args:
2321
2652
  text (`str`):
@@ -2325,9 +2656,15 @@ class InferenceClient:
2325
2656
  a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
2326
2657
  Defaults to None.
2327
2658
  src_lang (`str`, *optional*):
2328
- Source language of the translation task, i.e. input language. Cannot be passed without `tgt_lang`.
2659
+ The source language of the text. Required for models that can translate from multiple languages.
2329
2660
  tgt_lang (`str`, *optional*):
2330
- Target language of the translation task, i.e. output language. Cannot be passed without `src_lang`.
2661
+ Target language to translate to. Required for models that can translate to multiple languages.
2662
+ clean_up_tokenization_spaces (`bool`, *optional*):
2663
+ Whether to clean up the potential extra spaces in the text output.
2664
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
2665
+ The truncation strategy to use.
2666
+ generate_parameters (`Dict[str, Any]`, *optional*):
2667
+ Additional parametrization of the text generation algorithm.
2331
2668
 
2332
2669
  Returns:
2333
2670
  [`TranslationOutput`]: The generated translated text.
@@ -2362,12 +2699,15 @@ class InferenceClient:
2362
2699
 
2363
2700
  if src_lang is None and tgt_lang is not None:
2364
2701
  raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
2365
-
2366
- # If both `src_lang` and `tgt_lang` are given, pass them to the request body
2367
- payload: Dict = {"inputs": text}
2368
- if src_lang and tgt_lang:
2369
- payload["parameters"] = {"src_lang": src_lang, "tgt_lang": tgt_lang}
2370
- response = self.post(json=payload, model=model, task="translation")
2702
+ parameters = {
2703
+ "src_lang": src_lang,
2704
+ "tgt_lang": tgt_lang,
2705
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
2706
+ "truncation": truncation,
2707
+ "generate_parameters": generate_parameters,
2708
+ }
2709
+ payload = _prepare_payload(text, parameters=parameters)
2710
+ response = self.post(**payload, model=model, task="translation")
2371
2711
  return TranslationOutput.parse_obj_as_list(response)[0]
2372
2712
 
2373
2713
  def visual_question_answering(
@@ -2376,6 +2716,7 @@ class InferenceClient:
2376
2716
  question: str,
2377
2717
  *,
2378
2718
  model: Optional[str] = None,
2719
+ top_k: Optional[int] = None,
2379
2720
  ) -> List[VisualQuestionAnsweringOutputElement]:
2380
2721
  """
2381
2722
  Answering open-ended questions based on an image.
@@ -2389,7 +2730,10 @@ class InferenceClient:
2389
2730
  The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
2390
2731
  a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
2391
2732
  Defaults to None.
2392
-
2733
+ top_k (`int`, *optional*):
2734
+ The number of answers to return (will be chosen by order of likelihood). Note that we
2735
+ return less than topk answers if there are not enough options available within the
2736
+ context.
2393
2737
  Returns:
2394
2738
  `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
2395
2739
 
@@ -2414,6 +2758,8 @@ class InferenceClient:
2414
2758
  ```
2415
2759
  """
2416
2760
  payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
2761
+ if top_k is not None:
2762
+ payload.setdefault("parameters", {})["top_k"] = top_k
2417
2763
  response = self.post(json=payload, model=model, task="visual-question-answering")
2418
2764
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
2419
2765
 
@@ -2444,7 +2790,7 @@ class InferenceClient:
2444
2790
  The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
2445
2791
  model (`str`, *optional*):
2446
2792
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2447
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2793
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
2448
2794
 
2449
2795
  Returns:
2450
2796
  `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -2502,15 +2848,14 @@ class InferenceClient:
2502
2848
  ```
2503
2849
  """
2504
2850
 
2505
- parameters = {"candidate_labels": labels, "multi_label": multi_label}
2506
- if hypothesis_template is not None:
2507
- parameters["hypothesis_template"] = hypothesis_template
2508
-
2851
+ parameters = {
2852
+ "candidate_labels": labels,
2853
+ "multi_label": multi_label,
2854
+ "hypothesis_template": hypothesis_template,
2855
+ }
2856
+ payload = _prepare_payload(text, parameters=parameters)
2509
2857
  response = self.post(
2510
- json={
2511
- "inputs": text,
2512
- "parameters": parameters,
2513
- },
2858
+ **payload,
2514
2859
  task="zero-shot-classification",
2515
2860
  model=model,
2516
2861
  )
@@ -2521,7 +2866,12 @@ class InferenceClient:
2521
2866
  ]
2522
2867
 
2523
2868
  def zero_shot_image_classification(
2524
- self, image: ContentT, labels: List[str], *, model: Optional[str] = None
2869
+ self,
2870
+ image: ContentT,
2871
+ labels: List[str],
2872
+ *,
2873
+ model: Optional[str] = None,
2874
+ hypothesis_template: Optional[str] = None,
2525
2875
  ) -> List[ZeroShotImageClassificationOutputElement]:
2526
2876
  """
2527
2877
  Provide input image and text labels to predict text labels for the image.
@@ -2533,8 +2883,10 @@ class InferenceClient:
2533
2883
  List of string possible labels. There must be at least 2 labels.
2534
2884
  model (`str`, *optional*):
2535
2885
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2536
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2537
-
2886
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
2887
+ hypothesis_template (`str`, *optional*):
2888
+ The sentence used in conjunction with `labels` to attempt the text classification by replacing the
2889
+ placeholder with the candidate labels.
2538
2890
  Returns:
2539
2891
  `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
2540
2892
 
@@ -2560,8 +2912,11 @@ class InferenceClient:
2560
2912
  if len(labels) < 2:
2561
2913
  raise ValueError("You must specify at least 2 classes to compare.")
2562
2914
 
2915
+ inputs = {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}
2916
+ parameters = {"hypothesis_template": hypothesis_template}
2917
+ payload = _prepare_payload(inputs, parameters=parameters)
2563
2918
  response = self.post(
2564
- json={"image": _b64_encode(image), "parameters": {"candidate_labels": ",".join(labels)}},
2919
+ **payload,
2565
2920
  model=model,
2566
2921
  task="zero-shot-image-classification",
2567
2922
  )
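Finally, zero-shot image classification accepts a `hypothesis_template` and now sends the candidate labels under `candidateLabels` inside the inputs dict. A closing sketch; the image URL is reused from the image-classification example earlier in this diff and the template wording is illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
results = client.zero_shot_image_classification(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
    labels=["dog", "cat", "horse"],
    hypothesis_template="This is a photo of {}.",  # placeholder replaced by each label
)
for result in results:
    print(result.label, result.score)
```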