huggingface-hub 0.21.4__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of huggingface-hub might be problematic.

Files changed (97)
  1. huggingface_hub/__init__.py +217 -1
  2. huggingface_hub/_commit_api.py +14 -15
  3. huggingface_hub/_inference_endpoints.py +12 -11
  4. huggingface_hub/_login.py +1 -0
  5. huggingface_hub/_multi_commits.py +1 -0
  6. huggingface_hub/_snapshot_download.py +9 -1
  7. huggingface_hub/_tensorboard_logger.py +1 -0
  8. huggingface_hub/_webhooks_payload.py +1 -0
  9. huggingface_hub/_webhooks_server.py +1 -0
  10. huggingface_hub/commands/_cli_utils.py +1 -0
  11. huggingface_hub/commands/delete_cache.py +1 -0
  12. huggingface_hub/commands/download.py +1 -0
  13. huggingface_hub/commands/env.py +1 -0
  14. huggingface_hub/commands/scan_cache.py +1 -0
  15. huggingface_hub/commands/upload.py +1 -0
  16. huggingface_hub/community.py +1 -0
  17. huggingface_hub/constants.py +3 -1
  18. huggingface_hub/errors.py +38 -0
  19. huggingface_hub/file_download.py +102 -95
  20. huggingface_hub/hf_api.py +47 -35
  21. huggingface_hub/hf_file_system.py +77 -3
  22. huggingface_hub/hub_mixin.py +215 -54
  23. huggingface_hub/inference/_client.py +554 -239
  24. huggingface_hub/inference/_common.py +195 -41
  25. huggingface_hub/inference/_generated/_async_client.py +558 -239
  26. huggingface_hub/inference/_generated/types/__init__.py +115 -0
  27. huggingface_hub/inference/_generated/types/audio_classification.py +43 -0
  28. huggingface_hub/inference/_generated/types/audio_to_audio.py +31 -0
  29. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +116 -0
  30. huggingface_hub/inference/_generated/types/base.py +149 -0
  31. huggingface_hub/inference/_generated/types/chat_completion.py +106 -0
  32. huggingface_hub/inference/_generated/types/depth_estimation.py +29 -0
  33. huggingface_hub/inference/_generated/types/document_question_answering.py +85 -0
  34. huggingface_hub/inference/_generated/types/feature_extraction.py +19 -0
  35. huggingface_hub/inference/_generated/types/fill_mask.py +50 -0
  36. huggingface_hub/inference/_generated/types/image_classification.py +43 -0
  37. huggingface_hub/inference/_generated/types/image_segmentation.py +52 -0
  38. huggingface_hub/inference/_generated/types/image_to_image.py +55 -0
  39. huggingface_hub/inference/_generated/types/image_to_text.py +105 -0
  40. huggingface_hub/inference/_generated/types/object_detection.py +55 -0
  41. huggingface_hub/inference/_generated/types/question_answering.py +77 -0
  42. huggingface_hub/inference/_generated/types/sentence_similarity.py +28 -0
  43. huggingface_hub/inference/_generated/types/summarization.py +46 -0
  44. huggingface_hub/inference/_generated/types/table_question_answering.py +45 -0
  45. huggingface_hub/inference/_generated/types/text2text_generation.py +45 -0
  46. huggingface_hub/inference/_generated/types/text_classification.py +43 -0
  47. huggingface_hub/inference/_generated/types/text_generation.py +161 -0
  48. huggingface_hub/inference/_generated/types/text_to_audio.py +105 -0
  49. huggingface_hub/inference/_generated/types/text_to_image.py +57 -0
  50. huggingface_hub/inference/_generated/types/token_classification.py +53 -0
  51. huggingface_hub/inference/_generated/types/translation.py +46 -0
  52. huggingface_hub/inference/_generated/types/video_classification.py +47 -0
  53. huggingface_hub/inference/_generated/types/visual_question_answering.py +53 -0
  54. huggingface_hub/inference/_generated/types/zero_shot_classification.py +56 -0
  55. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +51 -0
  56. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +55 -0
  57. huggingface_hub/inference/_templating.py +105 -0
  58. huggingface_hub/inference/_types.py +4 -152
  59. huggingface_hub/keras_mixin.py +39 -17
  60. huggingface_hub/lfs.py +20 -8
  61. huggingface_hub/repocard.py +11 -3
  62. huggingface_hub/repocard_data.py +12 -2
  63. huggingface_hub/serialization/__init__.py +1 -0
  64. huggingface_hub/serialization/_base.py +1 -0
  65. huggingface_hub/serialization/_numpy.py +1 -0
  66. huggingface_hub/serialization/_tensorflow.py +1 -0
  67. huggingface_hub/serialization/_torch.py +1 -0
  68. huggingface_hub/utils/__init__.py +4 -1
  69. huggingface_hub/utils/_cache_manager.py +7 -0
  70. huggingface_hub/utils/_chunk_utils.py +1 -0
  71. huggingface_hub/utils/_datetime.py +1 -0
  72. huggingface_hub/utils/_errors.py +10 -1
  73. huggingface_hub/utils/_experimental.py +1 -0
  74. huggingface_hub/utils/_fixes.py +19 -3
  75. huggingface_hub/utils/_git_credential.py +1 -0
  76. huggingface_hub/utils/_headers.py +10 -3
  77. huggingface_hub/utils/_hf_folder.py +1 -0
  78. huggingface_hub/utils/_http.py +1 -0
  79. huggingface_hub/utils/_pagination.py +1 -0
  80. huggingface_hub/utils/_paths.py +1 -0
  81. huggingface_hub/utils/_runtime.py +22 -0
  82. huggingface_hub/utils/_subprocess.py +1 -0
  83. huggingface_hub/utils/_token.py +1 -0
  84. huggingface_hub/utils/_typing.py +29 -1
  85. huggingface_hub/utils/_validators.py +1 -0
  86. huggingface_hub/utils/endpoint_helpers.py +1 -0
  87. huggingface_hub/utils/logging.py +1 -1
  88. huggingface_hub/utils/sha.py +1 -0
  89. huggingface_hub/utils/tqdm.py +1 -0
  90. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0.dist-info}/METADATA +14 -15
  91. huggingface_hub-0.22.0.dist-info/RECORD +113 -0
  92. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0.dist-info}/WHEEL +1 -1
  93. huggingface_hub/inference/_text_generation.py +0 -551
  94. huggingface_hub-0.21.4.dist-info/RECORD +0 -81
  95. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0.dist-info}/LICENSE +0 -0
  96. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0.dist-info}/entry_points.txt +0 -0
  97. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0.dist-info}/top_level.txt +0 -0
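
The hunks that follow are from huggingface_hub/inference/_client.py and show the two headline changes of this release: a new `chat_completion` method on `InferenceClient`, and task methods that now return typed dataclasses (e.g. `AudioClassificationOutputElement`, `TextGenerationOutput`) parsed with `parse_obj_as_list`/`parse_obj_as_instance` instead of raw dicts and strings. Below is a minimal usage sketch of the new chat-completion API, based only on the signatures and docstring examples visible in this diff; it assumes huggingface_hub 0.22.0 is installed and that the chosen model is reachable through the Inference API.

```py
# Sketch of the chat_completion API added in 0.22.0 (assumed environment: huggingface_hub==0.22.0).
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # model id taken from the docstring example below
messages = [{"role": "user", "content": "What is the capital of France?"}]

# Non-streaming call: returns a ChatCompletionOutput dataclass rather than a raw dict.
output = client.chat_completion(messages, max_tokens=100)
print(output.choices[0].message.content)

# Streaming call: yields ChatCompletionStreamOutput chunks token by token.
for chunk in client.chat_completion(messages, max_tokens=10, stream=True):
    delta = chunk.choices[0].delta
    if delta.content:  # the final chunk carries only finish_reason, with content=None
        print(delta.content, end="")
```

The same pattern applies to the other task methods changed in this diff: attribute access (item.label, item.score, output.generated_text) replaces the dictionary lookups used in 0.21.4.
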
@@ -23,7 +23,6 @@
  # https://github.com/huggingface/unity-api#tasks
  #
  # Some TODO:
- # - validate inputs/options/parameters? with Pydantic for instance? or only optionally?
  # - add all tasks
  #
  # NOTE: the philosophy of this client is "let's make it as easy as possible to use it, even if less optimized". Some
@@ -37,7 +36,6 @@ import base64
  import logging
  import time
  import warnings
- from dataclasses import asdict
  from typing import (
  TYPE_CHECKING,
  Any,
@@ -54,10 +52,10 @@ from requests import HTTPError
  from requests.structures import CaseInsensitiveDict

  from huggingface_hub.constants import ALL_INFERENCE_API_FRAMEWORKS, INFERENCE_ENDPOINT, MAIN_INFERENCE_API_FRAMEWORKS
+ from huggingface_hub.errors import InferenceTimeoutError
  from huggingface_hub.inference._common import (
  TASKS_EXPECTING_IMAGES,
  ContentT,
- InferenceTimeoutError,
  ModelStatus,
  _b64_encode,
  _b64_to_image,
@@ -66,28 +64,45 @@ from huggingface_hub.inference._common import (
  _bytes_to_list,
  _fetch_recommended_models,
  _import_numpy,
+ _is_chat_completion_server,
  _is_tgi_server,
  _open_as_binary,
+ _set_as_non_chat_completion_server,
  _set_as_non_tgi,
+ _stream_chat_completion_response_from_bytes,
+ _stream_chat_completion_response_from_text_generation,
  _stream_text_generation_response,
- )
- from huggingface_hub.inference._text_generation import (
- TextGenerationParameters,
- TextGenerationRequest,
- TextGenerationResponse,
- TextGenerationStreamResponse,
  raise_text_generation_error,
  )
+ from huggingface_hub.inference._generated.types import (
+ AudioClassificationOutputElement,
+ AudioToAudioOutputElement,
+ AutomaticSpeechRecognitionOutput,
+ ChatCompletionOutput,
+ ChatCompletionOutputChoice,
+ ChatCompletionOutputChoiceMessage,
+ ChatCompletionStreamOutput,
+ DocumentQuestionAnsweringOutputElement,
+ FillMaskOutputElement,
+ ImageClassificationOutputElement,
+ ImageSegmentationOutputElement,
+ ImageToTextOutput,
+ ObjectDetectionOutputElement,
+ QuestionAnsweringOutputElement,
+ SummarizationOutput,
+ TableQuestionAnsweringOutputElement,
+ TextClassificationOutputElement,
+ TextGenerationOutput,
+ TextGenerationStreamOutput,
+ TokenClassificationOutputElement,
+ TranslationOutput,
+ VisualQuestionAnsweringOutputElement,
+ ZeroShotClassificationOutputElement,
+ ZeroShotImageClassificationOutputElement,
+ )
+ from huggingface_hub.inference._templating import render_chat_prompt
  from huggingface_hub.inference._types import (
- AudioToAudioOutput,
- ClassificationOutput,
- ConversationalOutput,
- FillMaskOutput,
- ImageSegmentationOutput,
- ObjectDetectionOutput,
- QuestionAnsweringOutput,
- TableQuestionAnsweringOutput,
- TokenClassificationOutput,
+ ConversationalOutput, # soon to be removed
  )
  from huggingface_hub.utils import (
  BadRequestError,
@@ -116,9 +131,9 @@ class InferenceClient:
  The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
  or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
  automatically selected for the task.
- token (`str`, *optional*):
- Hugging Face token. Will default to the locally saved token. Pass `token=False` if you don't want to send
- your token to the server.
+ token (`str` or `bool`, *optional*):
+ Hugging Face token. Will default to the locally saved token if not provided.
+ Pass `token=False` if you don't want to send your token to the server.
  timeout (`float`, `optional`):
  The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
  API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
@@ -138,6 +153,7 @@ class InferenceClient:
  cookies: Optional[Dict[str, str]] = None,
  ) -> None:
  self.model: Optional[str] = model
+ self.token: Union[str, bool, None] = token
  self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent'
  if headers is not None:
  self.headers.update(headers)
@@ -156,11 +172,10 @@ class InferenceClient:
  model: Optional[str] = None,
  task: Optional[str] = None,
  stream: Literal[False] = ...,
- ) -> bytes:
- pass
+ ) -> bytes: ...

  @overload
- def post(
+ def post( # type: ignore[misc]
  self,
  *,
  json: Optional[Union[str, Dict, List]] = None,
@@ -168,8 +183,18 @@ class InferenceClient:
  data: Optional[ContentT] = None,
  model: Optional[str] = None,
  task: Optional[str] = None,
  stream: Literal[True] = ...,
- ) -> Iterable[bytes]:
- pass
+ ) -> Iterable[bytes]: ...
+
+ @overload
+ def post(
+ self,
+ *,
+ json: Optional[Union[str, Dict, List]] = None,
+ data: Optional[ContentT] = None,
+ model: Optional[str] = None,
+ task: Optional[str] = None,
+ stream: bool = False,
+ ) -> Union[bytes, Iterable[bytes]]: ...

  def post(
  self,
@@ -268,7 +293,7 @@ class InferenceClient:
  audio: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[ClassificationOutput]:
+ ) -> List[AudioClassificationOutputElement]:
  """
  Perform audio classification on the provided audio content.

@@ -282,7 +307,7 @@ class InferenceClient:
  audio classification will be used.

  Returns:
- `List[Dict]`: The classification output containing the predicted label and its confidence.
+ `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.

  Raises:
  [`InferenceTimeoutError`]:
@@ -295,18 +320,22 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.audio_classification("audio.flac")
- [{'score': 0.4976358711719513, 'label': 'hap'}, {'score': 0.3677836060523987, 'label': 'neu'},...]
+ [
+ AudioClassificationOutputElement(score=0.4976358711719513, label='hap'),
+ AudioClassificationOutputElement(score=0.3677836060523987, label='neu'),
+ ...
+ ]
  ```
  """
  response = self.post(data=audio, model=model, task="audio-classification")
- return _bytes_to_list(response)
+ return AudioClassificationOutputElement.parse_obj_as_list(response)

  def audio_to_audio(
  self,
  audio: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[AudioToAudioOutput]:
+ ) -> List[AudioToAudioOutputElement]:
  """
  Performs multiple tasks related to audio-to-audio depending on the model (eg: speech enhancement, source separation).

@@ -320,7 +349,7 @@ class InferenceClient:
  audio_to_audio will be used.

  Returns:
- `List[Dict]`: A list of dictionary where each index contains audios label, content-type, and audio content in blob.
+ `List[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing audios label, content-type, and audio content in blob.

  Raises:
  `InferenceTimeoutError`:
@@ -335,13 +364,13 @@ class InferenceClient:
  >>> audio_output = client.audio_to_audio("audio.flac")
  >>> for i, item in enumerate(audio_output):
  >>> with open(f"output_{i}.flac", "wb") as f:
- f.write(item["blob"])
+ f.write(item.blob)
  ```
  """
  response = self.post(data=audio, model=model, task="audio-to-audio")
- audio_output = _bytes_to_list(response)
+ audio_output = AudioToAudioOutputElement.parse_obj_as_list(response)
  for item in audio_output:
- item["blob"] = base64.b64decode(item["blob"])
+ item.blob = base64.b64decode(item.blob)
  return audio_output

  def automatic_speech_recognition(
@@ -349,7 +378,7 @@ class InferenceClient:
  audio: ContentT,
  *,
  model: Optional[str] = None,
- ) -> str:
+ ) -> AutomaticSpeechRecognitionOutput:
  """
  Perform automatic speech recognition (ASR or audio-to-text) on the given audio content.

@@ -361,7 +390,7 @@ class InferenceClient:
  Inference Endpoint. If not provided, the default recommended model for ASR will be used.

  Returns:
- str: The transcribed text.
+ [`AutomaticSpeechRecognitionOutput`]: An item containing the transcribed text and optionally the timestamp chunks.

  Raises:
  [`InferenceTimeoutError`]:
@@ -373,12 +402,265 @@ class InferenceClient:
  ```py
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
- >>> client.automatic_speech_recognition("hello_world.flac")
+ >>> client.automatic_speech_recognition("hello_world.flac").text
  "hello world"
  ```
  """
  response = self.post(data=audio, model=model, task="automatic-speech-recognition")
- return _bytes_to_dict(response)["text"]
+ return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
+
+ @overload
+ def chat_completion( # type: ignore
+ self,
+ messages: List[Dict[str, str]],
+ *,
+ model: Optional[str] = None,
+ stream: Literal[False] = False,
+ max_tokens: int = 20,
+ seed: Optional[int] = None,
+ stop: Optional[Union[List[str], str]] = None,
+ temperature: float = 1.0,
+ top_p: Optional[float] = None,
+ ) -> ChatCompletionOutput: ...
+
+ @overload
+ def chat_completion( # type: ignore
+ self,
+ messages: List[Dict[str, str]],
+ *,
+ model: Optional[str] = None,
+ stream: Literal[True] = True,
+ max_tokens: int = 20,
+ seed: Optional[int] = None,
+ stop: Optional[Union[List[str], str]] = None,
+ temperature: float = 1.0,
+ top_p: Optional[float] = None,
+ ) -> Iterable[ChatCompletionStreamOutput]: ...
+
+ @overload
+ def chat_completion(
+ self,
+ messages: List[Dict[str, str]],
+ *,
+ model: Optional[str] = None,
+ stream: bool = False,
+ max_tokens: int = 20,
+ seed: Optional[int] = None,
+ stop: Optional[Union[List[str], str]] = None,
+ temperature: float = 1.0,
+ top_p: Optional[float] = None,
+ ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ...
+
+ def chat_completion(
+ self,
+ messages: List[Dict[str, str]],
+ *,
+ model: Optional[str] = None,
+ stream: bool = False,
+ max_tokens: int = 20,
+ seed: Optional[int] = None,
+ stop: Optional[Union[List[str], str]] = None,
+ temperature: float = 1.0,
+ top_p: Optional[float] = None,
+ ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
+ """
+ A method for completing conversations using a specified language model.
+
+ <Tip>
+
+ If the model is served by a server supporting chat-completion, the method will directly call the server's
+ `/v1/chat/completions` endpoint. If the server does not support chat-completion, the method will render the
+ chat template client-side based on the information fetched from the Hub API. In this case, you will need to
+ have `minijinja` template engine installed. Run `pip install "huggingface_hub[inference]"` or `pip install minijinja`
+ to install it.
+
+ </Tip>
+
+ Args:
+ messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]):
+ Conversation history consisting of roles and content pairs.
+ model (`str`, *optional*):
+ The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+ Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
+ See https://huggingface.co/tasks/text-generation for more details.
+ frequency_penalty (`float`, optional):
+ Penalizes new tokens based on their existing frequency
+ in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
+ max_tokens (`int`, optional):
+ Maximum number of tokens allowed in the response. Defaults to 20.
+ seed (Optional[`int`], optional):
+ Seed for reproducible control flow. Defaults to None.
+ stop (Optional[`str`], optional):
+ Up to four strings which trigger the end of the response.
+ Defaults to None.
+ stream (`bool`, optional):
+ Enable realtime streaming of responses. Defaults to False.
+ temperature (`float`, optional):
+ Controls randomness of the generations. Lower values ensure
+ less random completions. Range: [0, 2]. Defaults to 1.0.
+ top_p (`float`, optional):
+ Fraction of the most likely next words to sample from.
+ Must be between 0 and 1. Defaults to 1.0.
+
+ Returns:
+ `Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]`:
+ Generated text returned from the server:
+ - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
+ - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
+
+ Raises:
+ [`InferenceTimeoutError`]:
+ If the model is unavailable or the request times out.
+ `HTTPError`:
+ If the request fails with an HTTP error status code other than HTTP 503.
+
+ Example:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+ >>> client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+ >>> client.chat_completion(messages, max_tokens=100)
+ ChatCompletionOutput(
+ choices=[
+ ChatCompletionOutputChoice(
+ finish_reason='eos_token',
+ index=0,
+ message=ChatCompletionOutputChoiceMessage(
+ content='The capital of France is Paris. The official name of the city is "Ville de Paris" (City of Paris) and the name of the country\'s governing body, which is located in Paris, is "La République française" (The French Republic). \nI hope that helps! Let me know if you need any further information.'
+ )
+ )
+ ],
+ created=1710498360
+ )
+
+ >>> for token in client.chat_completion(messages, max_tokens=10, stream=True):
+ ... print(token)
+ ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+ ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+ (...)
+ ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+ ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason='length')], created=1710498504)
+ ```
+ """
+ # determine model
+ model = model or self.model or self.get_recommended_model("text-generation")
+
+ if _is_chat_completion_server(model):
+ # First, let's consider the server has a `/v1/chat/completions` endpoint.
+ # If that's the case, we don't have to render the chat template client-side.
+ model_url = self._resolve_url(model) + "/v1/chat/completions"
+
+ try:
+ data = self.post(
+ model=model_url,
+ json=dict(
+ model="tgi", # random string
+ messages=messages,
+ max_tokens=max_tokens,
+ seed=seed,
+ stop=stop,
+ temperature=temperature,
+ top_p=top_p,
+ stream=stream,
+ ),
+ stream=stream,
+ )
+ except HTTPError:
+ # Let's consider the server is not a chat completion server.
+ # Then we call again `chat_completion` which will render the chat template client side.
+ # (can be HTTP 500, HTTP 400, HTTP 404 depending on the server)
+ _set_as_non_chat_completion_server(model)
+ return self.chat_completion(
+ messages=messages,
+ model=model,
+ stream=stream,
+ max_tokens=max_tokens,
+ seed=seed,
+ stop=stop,
+ temperature=temperature,
+ top_p=top_p,
+ )
+
+ if stream:
+ return _stream_chat_completion_response_from_bytes(data) # type: ignore[arg-type]
+
+ return ChatCompletionOutput.parse_obj_as_instance(data) # type: ignore[arg-type]
+
+ # At this point, we know the server is not a chat completion server.
+ # We need to render the chat template client side based on the information we can fetch from
+ # the Hub API.
+
+ model_id = None
+ if model.startswith(("http://", "https://")):
+ # If URL, we need to know which model is served. This is not always possible.
+ # A workaround is to list the user Inference Endpoints and check if one of them correspond to the model URL.
+ # If not, we raise an error.
+ # TODO: fix when we have a proper API for this (at least for Inference Endpoints)
+ # TODO: what if Sagemaker URL?
+ # TODO: what if Azure URL?
+ from ..hf_api import HfApi
+
+ for endpoint in HfApi(token=self.token).list_inference_endpoints():
+ if endpoint.url == model:
+ model_id = endpoint.repository
+ break
+ else:
+ model_id = model
+
+ if model_id is None:
+ # If we don't have the model ID, we can't fetch the chat template.
+ # We raise an error.
+ raise ValueError(
+ "Request can't be processed as the model ID can't be inferred from model URL. "
+ "This is needed to fetch the chat template from the Hub since the model is not "
+ "served with a Chat-completion API."
+ )
+
+ # fetch chat template + tokens
+ prompt = render_chat_prompt(model_id=model_id, token=self.token, messages=messages)
+
+ # generate response
+ stop_sequences = [stop] if isinstance(stop, str) else stop
+ text_generation_output = self.text_generation(
+ prompt=prompt,
+ details=True,
+ stream=stream,
+ model=model,
+ max_new_tokens=max_tokens,
+ seed=seed,
+ stop_sequences=stop_sequences,
+ temperature=temperature,
+ top_p=top_p,
+ )
+
+ created = int(time.time())
+
+ if stream:
+ return _stream_chat_completion_response_from_text_generation(text_generation_output) # type: ignore [arg-type]
+
+ if isinstance(text_generation_output, TextGenerationOutput):
+ # General use case => format ChatCompletionOutput from text generation details
+ content: str = text_generation_output.generated_text
+ finish_reason: str = text_generation_output.details.finish_reason # type: ignore[union-attr]
+ else:
+ # Corner case: if server doesn't support details (e.g. if not a TGI server), we only receive an output string.
+ # In such a case, `finish_reason` is set to `"unk"`.
+ content = text_generation_output # type: ignore[assignment]
+ finish_reason = "unk"
+
+ return ChatCompletionOutput(
+ created=created,
+ choices=[
+ ChatCompletionOutputChoice(
+ finish_reason=finish_reason, # type: ignore
+ index=0,
+ message=ChatCompletionOutputChoiceMessage(
+ content=content,
+ role="assistant",
+ ),
+ )
+ ],
+ )

  def conversational(
  self,
@@ -392,6 +674,13 @@ class InferenceClient:
  """
  Generate conversational responses based on the given input text (i.e. chat with the API).

+ <Tip warning={true}>
+
+ [`InferenceClient.conversational`] API is deprecated and will be removed in a future release. Please use
+ [`InferenceClient.chat_completion`] instead.
+
+ </Tip>
+
  Args:
  text (`str`):
  The last input from the user in the conversation.
@@ -431,6 +720,11 @@ class InferenceClient:
  ... )
  ```
  """
+ warnings.warn(
+ "'InferenceClient.conversational' is deprecated and will be removed starting from huggingface_hub>=0.25. "
+ "Please use the more appropriate 'InferenceClient.chat_completion' API instead.",
+ FutureWarning,
+ )
  payload: Dict[str, Any] = {"inputs": {"text": text}}
  if generated_responses is not None:
  payload["inputs"]["generated_responses"] = generated_responses
@@ -441,57 +735,13 @@ class InferenceClient:
  response = self.post(json=payload, model=model, task="conversational")
  return _bytes_to_dict(response) # type: ignore

- def visual_question_answering(
- self,
- image: ContentT,
- question: str,
- *,
- model: Optional[str] = None,
- ) -> List[str]:
- """
- Answering open-ended questions based on an image.
-
- Args:
- image (`Union[str, Path, bytes, BinaryIO]`):
- The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
- question (`str`):
- Question to be answered.
- model (`str`, *optional*):
- The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
- a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
- Defaults to None.
-
- Returns:
- `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
-
- Raises:
- `InferenceTimeoutError`:
- If the model is unavailable or the request times out.
- `HTTPError`:
- If the request fails with an HTTP error status code other than HTTP 503.
-
- Example:
- ```py
- >>> from huggingface_hub import InferenceClient
- >>> client = InferenceClient()
- >>> client.visual_question_answering(
- ... image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
- ... question="What is the animal doing?"
- ... )
- [{'score': 0.778609573841095, 'answer': 'laying down'},{'score': 0.6957435607910156, 'answer': 'sitting'}, ...]
- ```
- """
- payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
- response = self.post(json=payload, model=model, task="visual-question-answering")
- return _bytes_to_list(response)
-
  def document_question_answering(
  self,
  image: ContentT,
  question: str,
  *,
  model: Optional[str] = None,
- ) -> List[QuestionAnsweringOutput]:
+ ) -> List[DocumentQuestionAnsweringOutputElement]:
  """
  Answer questions on document images.

@@ -506,7 +756,7 @@ class InferenceClient:
  Defaults to None.

  Returns:
- `List[Dict]`: a list of dictionaries containing the predicted label, associated probability, word ids, and page number.
+ `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.

  Raises:
  [`InferenceTimeoutError`]:
@@ -519,12 +769,12 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
- [{'score': 0.42515629529953003, 'answer': 'us-001', 'start': 16, 'end': 16}]
+ [DocumentQuestionAnsweringOutputElement(score=0.42515629529953003, answer='us-001', start=16, end=16)]
  ```
  """
  payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
  response = self.post(json=payload, model=model, task="document-question-answering")
- return _bytes_to_list(response)
+ return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

  def feature_extraction(self, text: str, *, model: Optional[str] = None) -> "np.ndarray":
  """
@@ -562,7 +812,7 @@ class InferenceClient:
  np = _import_numpy()
  return np.array(_bytes_to_dict(response), dtype="float32")

- def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutput]:
+ def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutputElement]:
  """
  Fill in a hole with a missing word (token to be precise).

@@ -575,7 +825,7 @@ class InferenceClient:
  Defaults to None.

  Returns:
- `List[Dict]`: a list of fill mask output dictionaries containing the predicted label, associated
+ `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
  probability, token reference, and completed text.

  Raises:
@@ -589,25 +839,21 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.fill_mask("The goal of life is <mask>.")
- [{'score': 0.06897063553333282,
- 'token': 11098,
- 'token_str': ' happiness',
- 'sequence': 'The goal of life is happiness.'},
- {'score': 0.06554922461509705,
- 'token': 45075,
- 'token_str': ' immortality',
- 'sequence': 'The goal of life is immortality.'}]
+ [
+ FillMaskOutputElement(score=0.06897063553333282, token=11098, token_str=' happiness', sequence='The goal of life is happiness.'),
+ FillMaskOutputElement(score=0.06554922461509705, token=45075, token_str=' immortality', sequence='The goal of life is immortality.')
+ ]
  ```
  """
  response = self.post(json={"inputs": text}, model=model, task="fill-mask")
- return _bytes_to_list(response)
+ return FillMaskOutputElement.parse_obj_as_list(response)

  def image_classification(
  self,
  image: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[ClassificationOutput]:
+ ) -> List[ImageClassificationOutputElement]:
  """
  Perform image classification on the given image using the specified model.

@@ -619,7 +865,7 @@ class InferenceClient:
  deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.

  Returns:
- `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
+ `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.

  Raises:
  [`InferenceTimeoutError`]:
@@ -632,18 +878,18 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
- [{'score': 0.9779096841812134, 'label': 'Blenheim spaniel'}, ...]
+ [ImageClassificationOutputElement(score=0.9779096841812134, label='Blenheim spaniel'), ...]
  ```
  """
  response = self.post(data=image, model=model, task="image-classification")
- return _bytes_to_list(response)
+ return ImageClassificationOutputElement.parse_obj_as_list(response)

  def image_segmentation(
  self,
  image: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[ImageSegmentationOutput]:
+ ) -> List[ImageSegmentationOutputElement]:
  """
  Perform image segmentation on the given image using the specified model.

@@ -661,7 +907,7 @@ class InferenceClient:
  deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.

  Returns:
- `List[Dict]`: A list of dictionaries containing the segmented masks and associated attributes.
+ `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.

  Raises:
  [`InferenceTimeoutError`]:
@@ -674,19 +920,13 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.image_segmentation("cat.jpg"):
- [{'score': 0.989008, 'label': 'LABEL_184', 'mask': <PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>}, ...]
+ [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
  ```
  """
-
- # Segment
  response = self.post(data=image, model=model, task="image-segmentation")
- output = _bytes_to_dict(response)
-
- # Parse masks as PIL Image
- if not isinstance(output, list):
- raise ValueError(f"Server output must be a list. Got {type(output)}: {str(output)[:200]}...")
+ output = ImageSegmentationOutputElement.parse_obj_as_list(response)
  for item in output:
- item["mask"] = _b64_to_image(item["mask"])
+ item.mask = _b64_to_image(item.mask)
  return output

  def image_to_image(
@@ -773,7 +1013,7 @@ class InferenceClient:
  response = self.post(json=payload, data=data, model=model, task="image-to-image")
  return _bytes_to_image(response)

- def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> str:
+ def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
  """
  Takes an input image and return text.

@@ -788,7 +1028,7 @@ class InferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.

  Returns:
- `str`: The generated text.
+ [`ImageToTextOutput`]: The generated text.

  Raises:
  [`InferenceTimeoutError`]:
@@ -807,7 +1047,7 @@ class InferenceClient:
  ```
  """
  response = self.post(data=image, model=model, task="image-to-text")
- return _bytes_to_dict(response)[0]["generated_text"]
+ return ImageToTextOutput.parse_obj_as_instance(response)

  def list_deployed_models(
  self, frameworks: Union[None, str, Literal["all"], List[str]] = None
@@ -889,7 +1129,7 @@ class InferenceClient:
  image: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[ObjectDetectionOutput]:
+ ) -> List[ObjectDetectionOutputElement]:
  """
  Perform object detection on the given image using the specified model.

@@ -907,7 +1147,7 @@ class InferenceClient:
  deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.

  Returns:
- `List[ObjectDetectionOutput]`: A list of dictionaries containing the bounding boxes and associated attributes.
+ `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.

  Raises:
  [`InferenceTimeoutError`]:
@@ -922,19 +1162,16 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.object_detection("people.jpg"):
- [{"score":0.9486683011054993,"label":"person","box":{"xmin":59,"ymin":39,"xmax":420,"ymax":510}}, ... ]
+ [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
  ```
  """
  # detect objects
  response = self.post(data=image, model=model, task="object-detection")
- output = _bytes_to_dict(response)
- if not isinstance(output, list):
- raise ValueError(f"Server output must be a list. Got {type(output)}: {str(output)[:200]}...")
- return output
+ return ObjectDetectionOutputElement.parse_obj_as_list(response)

  def question_answering(
  self, question: str, context: str, *, model: Optional[str] = None
- ) -> QuestionAnsweringOutput:
+ ) -> QuestionAnsweringOutputElement:
  """
  Retrieve the answer to a question from a given text.

@@ -948,7 +1185,7 @@ class InferenceClient:
  a deployed Inference Endpoint.

  Returns:
- `Dict`: a dictionary of question answering output containing the score, start index, end index, and answer.
+ [`QuestionAnsweringOutputElement`]: an question answering output containing the score, start index, end index, and answer.

  Raises:
  [`InferenceTimeoutError`]:
@@ -961,7 +1198,7 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
- {'score': 0.9326562285423279, 'start': 11, 'end': 16, 'answer': 'Clara'}
+ QuestionAnsweringOutputElement(score=0.9326562285423279, start=11, end=16, answer='Clara')
  ```
  """

@@ -971,7 +1208,7 @@ class InferenceClient:
  model=model,
  task="question-answering",
  )
- return _bytes_to_dict(response) # type: ignore
+ return QuestionAnsweringOutputElement.parse_obj_as_instance(response)

  def sentence_similarity(
  self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
@@ -1026,7 +1263,7 @@ class InferenceClient:
  *,
  parameters: Optional[Dict[str, Any]] = None,
  model: Optional[str] = None,
- ) -> str:
+ ) -> SummarizationOutput:
  """
  Generate a summary of a given text using a specified model.

@@ -1041,7 +1278,7 @@ class InferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.

  Returns:
- `str`: The generated summary text.
+ [`SummarizationOutput`]: The generated summary text.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1054,18 +1291,18 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.summarization("The Eiffel tower...")
- 'The Eiffel tower is one of the most famous landmarks in the world....'
+ SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
  ```
  """
  payload: Dict[str, Any] = {"inputs": text}
  if parameters is not None:
  payload["parameters"] = parameters
  response = self.post(json=payload, model=model, task="summarization")
- return _bytes_to_dict(response)[0]["summary_text"]
+ return SummarizationOutput.parse_obj_as_list(response)[0]

  def table_question_answering(
  self, table: Dict[str, Any], query: str, *, model: Optional[str] = None
- ) -> TableQuestionAnsweringOutput:
+ ) -> TableQuestionAnsweringOutputElement:
  """
  Retrieve the answer to a question from information given in a table.

@@ -1080,7 +1317,7 @@ class InferenceClient:
  Hub or a URL to a deployed Inference Endpoint.

  Returns:
- `Dict`: a dictionary of table question answering output containing the answer, coordinates, cells and the aggregator used.
+ [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1095,7 +1332,7 @@ class InferenceClient:
  >>> query = "How many stars does the transformers repository have?"
  >>> table = {"Repository": ["Transformers", "Datasets", "Tokenizers"], "Stars": ["36542", "4512", "3934"]}
  >>> client.table_question_answering(table, query, model="google/tapas-base-finetuned-wtq")
- {'answer': 'AVERAGE > 36542', 'coordinates': [[0, 1]], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
+ TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
  ```
  """
  response = self.post(
@@ -1106,7 +1343,7 @@ class InferenceClient:
  model=model,
  task="table-question-answering",
  )
- return _bytes_to_dict(response) # type: ignore
+ return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)

  def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
  """
@@ -1193,7 +1430,7 @@ class InferenceClient:
  response = self.post(json={"table": table}, model=model, task="tabular-regression")
  return _bytes_to_list(response)

- def text_classification(self, text: str, *, model: Optional[str] = None) -> List[ClassificationOutput]:
+ def text_classification(self, text: str, *, model: Optional[str] = None) -> List[TextClassificationOutputElement]:
  """
  Perform text classification (e.g. sentiment-analysis) on the given text.

@@ -1206,7 +1443,7 @@ class InferenceClient:
  Defaults to None.

  Returns:
- `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
+ `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1219,11 +1456,14 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.text_classification("I like you")
- [{'label': 'POSITIVE', 'score': 0.9998695850372314}, {'label': 'NEGATIVE', 'score': 0.0001304351753788069}]
+ [
+ TextClassificationOutputElement(label='POSITIVE', score=0.9998695850372314),
+ TextClassificationOutputElement(label='NEGATIVE', score=0.0001304351753788069),
+ ]
  ```
  """
  response = self.post(json={"inputs": text}, model=model, task="text-classification")
- return _bytes_to_list(response)[0]
+ return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]

  @overload
  def text_generation( # type: ignore
@@ -1246,8 +1486,7 @@ class InferenceClient:
  truncate: Optional[int] = None,
  typical_p: Optional[float] = None,
  watermark: bool = False,
- ) -> str:
- ...
+ ) -> str: ...

  @overload
  def text_generation( # type: ignore
@@ -1270,8 +1509,7 @@ class InferenceClient:
  truncate: Optional[int] = None,
  typical_p: Optional[float] = None,
  watermark: bool = False,
- ) -> TextGenerationResponse:
- ...
+ ) -> TextGenerationOutput: ...

  @overload
  def text_generation( # type: ignore
@@ -1294,11 +1532,10 @@ class InferenceClient:
  truncate: Optional[int] = None,
  typical_p: Optional[float] = None,
  watermark: bool = False,
- ) -> Iterable[str]:
- ...
+ ) -> Iterable[str]: ...

  @overload
- def text_generation(
+ def text_generation( # type: ignore
  self,
  prompt: str,
  *,
@@ -1318,8 +1555,30 @@ class InferenceClient:
1318
1555
  truncate: Optional[int] = None,
1319
1556
  typical_p: Optional[float] = None,
1320
1557
  watermark: bool = False,
1321
- ) -> Iterable[TextGenerationStreamResponse]:
1322
- ...
1558
+ ) -> Iterable[TextGenerationStreamOutput]: ...
1559
+
1560
+ @overload
1561
+ def text_generation(
1562
+ self,
1563
+ prompt: str,
1564
+ *,
1565
+ details: Literal[True] = ...,
1566
+ stream: bool = ...,
1567
+ model: Optional[str] = None,
1568
+ do_sample: bool = False,
1569
+ max_new_tokens: int = 20,
1570
+ best_of: Optional[int] = None,
1571
+ repetition_penalty: Optional[float] = None,
1572
+ return_full_text: bool = False,
1573
+ seed: Optional[int] = None,
1574
+ stop_sequences: Optional[List[str]] = None,
1575
+ temperature: Optional[float] = None,
1576
+ top_k: Optional[int] = None,
1577
+ top_p: Optional[float] = None,
1578
+ truncate: Optional[int] = None,
1579
+ typical_p: Optional[float] = None,
1580
+ watermark: bool = False,
1581
+ ) -> Union[TextGenerationOutput, Iterable[TextGenerationStreamOutput]]: ...
1323
1582
 
1324
1583
  def text_generation(
1325
1584
  self,
@@ -1342,13 +1601,10 @@ class InferenceClient:
1342
1601
  typical_p: Optional[float] = None,
1343
1602
  watermark: bool = False,
1344
1603
  decoder_input_details: bool = False,
1345
- ) -> Union[str, TextGenerationResponse, Iterable[str], Iterable[TextGenerationStreamResponse]]:
1604
+ ) -> Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]:
1346
1605
  """
1347
1606
  Given a prompt, generate the following text.
1348
1607
 
1349
- It is recommended to have Pydantic installed in order to get inputs validated. This is preferable as it allow
1350
- early failures.
1351
-
1352
1608
  API endpoint is supposed to run with the `text-generation-inference` backend (TGI). This backend is the
1353
1609
  go-to solution to run large language models at scale. However, for some smaller models (e.g. "gpt2") the
1354
1610
  default `transformers` + `api-inference` solution is still in use. Both approaches have very similar APIs, but
@@ -1406,12 +1662,12 @@ class InferenceClient:
1406
1662
  into account. Defaults to `False`.
1407
1663
 
1408
1664
  Returns:
1409
- `Union[str, TextGenerationResponse, Iterable[str], Iterable[TextGenerationStreamResponse]]`:
1665
+ `Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]`:
1410
1666
  Generated text returned from the server:
1411
1667
  - if `stream=False` and `details=False`, the generated text is returned as a `str` (default)
1412
1668
  - if `stream=True` and `details=False`, the generated text is returned token by token as a `Iterable[str]`
1413
- - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.inference._text_generation.TextGenerationResponse`]
1414
- - if `details=True` and `stream=True`, the generated text is returned token by token as a iterable of [`~huggingface_hub.inference._text_generation.TextGenerationStreamResponse`]
1669
+ - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.TextGenerationOutput`]
1670
+ - if `details=True` and `stream=True`, the generated text is returned token by token as a iterable of [`~huggingface_hub.TextGenerationStreamOutput`]
1415
1671
 
1416
1672
  Raises:
1417
1673
  `ValidationError`:
@@ -1448,23 +1704,23 @@ class InferenceClient:
1448
1704
 
1449
1705
  # Case 3: get more details about the generation process.
1450
1706
  >>> client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True)
1451
- TextGenerationResponse(
1707
+ TextGenerationOutput(
1452
1708
  generated_text='100% open source and built to be easy to use.',
1453
- details=Details(
1454
- finish_reason=<FinishReason.Length: 'length'>,
1709
+ details=TextGenerationDetails(
1710
+ finish_reason='length',
1455
1711
  generated_tokens=12,
1456
1712
  seed=None,
1457
1713
  prefill=[
1458
- InputToken(id=487, text='The', logprob=None),
1459
- InputToken(id=53789, text=' hugging', logprob=-13.171875),
1714
+ TextGenerationPrefillToken(id=487, text='The', logprob=None),
1715
+ TextGenerationPrefillToken(id=53789, text=' hugging', logprob=-13.171875),
1460
1716
  (...)
1461
- InputToken(id=204, text=' ', logprob=-7.0390625)
1717
+ TextGenerationPrefillToken(id=204, text=' ', logprob=-7.0390625)
1462
1718
  ],
1463
1719
  tokens=[
1464
- Token(id=1425, text='100', logprob=-1.0175781, special=False),
1465
- Token(id=16, text='%', logprob=-0.0463562, special=False),
1720
+ TokenElement(id=1425, text='100', logprob=-1.0175781, special=False),
1721
+ TokenElement(id=16, text='%', logprob=-0.0463562, special=False),
1466
1722
  (...)
1467
- Token(id=25, text='.', logprob=-0.5703125, special=False)
1723
+ TokenElement(id=25, text='.', logprob=-0.5703125, special=False)
1468
1724
  ],
1469
1725
  best_of_sequences=None
1470
1726
  )
@@ -1475,30 +1731,27 @@ class InferenceClient:
1475
1731
  >>> for details in client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True, stream=True):
1476
1732
  ... print(details)
1477
1733
  ...
1478
- TextGenerationStreamResponse(token=Token(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
1479
- TextGenerationStreamResponse(token=Token(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
1480
- TextGenerationStreamResponse(token=Token(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
1481
- TextGenerationStreamResponse(token=Token(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
1482
- TextGenerationStreamResponse(token=Token(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
1483
- TextGenerationStreamResponse(token=Token(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
1484
- TextGenerationStreamResponse(token=Token(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
1485
- TextGenerationStreamResponse(token=Token(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
1486
- TextGenerationStreamResponse(token=Token(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
1487
- TextGenerationStreamResponse(token=Token(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
1488
- TextGenerationStreamResponse(token=Token(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
1489
- TextGenerationStreamResponse(token=Token(
1734
+ TextGenerationStreamOutput(token=TokenElement(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
1735
+ TextGenerationStreamOutput(token=TokenElement(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
1736
+ TextGenerationStreamOutput(token=TokenElement(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
1737
+ TextGenerationStreamOutput(token=TokenElement(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
1738
+ TextGenerationStreamOutput(token=TokenElement(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
1739
+ TextGenerationStreamOutput(token=TokenElement(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
1740
+ TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
1741
+ TextGenerationStreamOutput(token=TokenElement(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
1742
+ TextGenerationStreamOutput(token=TokenElement(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
1743
+ TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
1744
+ TextGenerationStreamOutput(token=TokenElement(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
1745
+ TextGenerationStreamOutput(token=TokenElement(
1490
1746
  id=25,
1491
1747
  text='.',
1492
1748
  logprob=-0.5703125,
1493
1749
  special=False),
1494
1750
  generated_text='100% open source and built to be easy to use.',
1495
- details=StreamDetails(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=12, seed=None)
1751
+ details=TextGenerationStreamDetails(finish_reason='length', generated_tokens=12, seed=None)
1496
1752
  )
1497
1753
  ```
1498
1754
  """
1499
- # NOTE: Text-generation integration is taken from the text-generation-inference project. It has more features
1500
- # like input/output validation (if Pydantic is installed). See `_text_generation.py` header for more details.
1501
-
1502
1755
  if decoder_input_details and not details:
1503
1756
  warnings.warn(
1504
1757
  "`decoder_input_details=True` has been passed to the server but `details=False` is set meaning that"
@@ -1506,34 +1759,38 @@ class InferenceClient:
  )
  decoder_input_details = False
 
- # Validate parameters
- parameters = TextGenerationParameters(
- best_of=best_of,
- details=details,
- do_sample=do_sample,
- max_new_tokens=max_new_tokens,
- repetition_penalty=repetition_penalty,
- return_full_text=return_full_text,
- seed=seed,
- stop=stop_sequences if stop_sequences is not None else [],
- temperature=temperature,
- top_k=top_k,
- top_p=top_p,
- truncate=truncate,
- typical_p=typical_p,
- watermark=watermark,
- decoder_input_details=decoder_input_details,
- )
- request = TextGenerationRequest(inputs=prompt, stream=stream, parameters=parameters)
- payload = asdict(request)
+ # Build payload
+ payload = {
+ "inputs": prompt,
+ "parameters": {
+ "best_of": best_of,
+ "decoder_input_details": decoder_input_details,
+ "details": details,
+ "do_sample": do_sample,
+ "max_new_tokens": max_new_tokens,
+ "repetition_penalty": repetition_penalty,
+ "return_full_text": return_full_text,
+ "seed": seed,
+ "stop": stop_sequences if stop_sequences is not None else [],
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ "truncate": truncate,
+ "typical_p": typical_p,
+ "watermark": watermark,
+ },
+ "stream": stream,
+ }
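The request body is now assembled as a plain dict (serialized as-is) instead of going through `TextGenerationParameters`/`TextGenerationRequest` and `asdict`. A standalone sketch of this construction plus the non-TGI pruning step shown just below; all parameter values here are made up for illustration:

```py
import warnings

# Illustrative values only; the structure mirrors the payload built above.
prompt, stream = "How do you make cheese?", False
payload = {
    "inputs": prompt,
    "parameters": {"max_new_tokens": 12, "details": True, "watermark": None, "stop": [],
                   "decoder_input_details": None, "best_of": None, "return_full_text": None},
    "stream": stream,
}

is_tgi_server = False  # assumption: a plain transformers-backed endpoint, not TGI
if not is_tgi_server:
    parameters = payload["parameters"]
    ignored = [k for k in ("watermark", "details", "decoder_input_details", "best_of", "stop", "return_full_text")
               if parameters[k] is not None]
    for k in ignored:
        del parameters[k]
    if ignored:
        warnings.warn(f"Not a TGI server, ignoring parameters {ignored}.")
```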
 
  # Remove some parameters if not a TGI server
  if not _is_tgi_server(model):
+ parameters: Dict = payload["parameters"]  # type: ignore [assignment]
+
  ignored_parameters = []
- for key in "watermark", "stop", "details", "decoder_input_details", "best_of":
- if payload["parameters"][key] is not None:
+ for key in "watermark", "details", "decoder_input_details", "best_of", "stop", "return_full_text":
+ if parameters[key] is not None:
  ignored_parameters.append(key)
- del payload["parameters"][key]
+ del parameters[key]
  if len(ignored_parameters) > 0:
  warnings.warn(
  "API endpoint/model for text-generation is not served via TGI. Ignoring parameters"
@@ -1585,8 +1842,8 @@ class InferenceClient:
  if stream:
  return _stream_text_generation_response(bytes_output, details)  # type: ignore
 
- data = _bytes_to_dict(bytes_output)[0]
- return TextGenerationResponse(**data) if details else data["generated_text"]
+ data = _bytes_to_dict(bytes_output)[0]  # type: ignore[arg-type]
+ return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]
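For the non-streaming path, `details=True` now parses the server response into a `TextGenerationOutput` via `parse_obj_as_instance` (previously a `TextGenerationResponse` built with `**data`), while `details=False` still returns the bare string. A hedged usage sketch; the field names follow the example output earlier in the docstring:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()

# details=False (default): behaviour unchanged, a plain string is returned.
text = client.text_generation("How do you make cheese?", max_new_tokens=12)

# details=True: a TextGenerationOutput instance instead of the old TextGenerationResponse.
output = client.text_generation("How do you make cheese?", max_new_tokens=12, details=True)
print(output.generated_text)
print(output.details.generated_tokens)  # assumed field, mirroring the streaming details shown above
```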
 
  def text_to_image(
  self,
@@ -1700,7 +1957,9 @@ class InferenceClient:
  """
  return self.post(json={"inputs": text}, model=model, task="text-to-speech")
 
- def token_classification(self, text: str, *, model: Optional[str] = None) -> List[TokenClassificationOutput]:
+ def token_classification(
+ self, text: str, *, model: Optional[str] = None
+ ) -> List[TokenClassificationOutputElement]:
  """
  Perform token classification on the given text.
  Usually used for sentence parsing, either grammatical, or Named Entity Recognition (NER) to understand keywords contained within text.
@@ -1714,7 +1973,7 @@ class InferenceClient:
  Defaults to None.
 
  Returns:
- `List[Dict]`: List of token classification outputs containing the entity group, confidence score, word, start and end index.
+ `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
 
  Raises:
  [`InferenceTimeoutError`]:
@@ -1727,16 +1986,22 @@ class InferenceClient:
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
  >>> client.token_classification("My name is Sarah Jessica Parker but you can call me Jessica")
- [{'entity_group': 'PER',
- 'score': 0.9971321225166321,
- 'word': 'Sarah Jessica Parker',
- 'start': 11,
- 'end': 31},
- {'entity_group': 'PER',
- 'score': 0.9773476123809814,
- 'word': 'Jessica',
- 'start': 52,
- 'end': 59}]
+ [
+ TokenClassificationOutputElement(
+ entity_group='PER',
+ score=0.9971321225166321,
+ word='Sarah Jessica Parker',
+ start=11,
+ end=31,
+ ),
+ TokenClassificationOutputElement(
+ entity_group='PER',
+ score=0.9773476123809814,
+ word='Jessica',
+ start=52,
+ end=59,
+ )
+ ]
  ```
  """
  payload: Dict[str, Any] = {"inputs": text}
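Since the call now returns `TokenClassificationOutputElement` dataclasses rather than raw dicts, fields are read as attributes. A short hedged sketch based on the example above:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
entities = client.token_classification("My name is Sarah Jessica Parker but you can call me Jessica")
for entity in entities:
    # entity_group / word / score / start / end mirror the repr shown in the example
    print(f"{entity.entity_group}: {entity.word!r} ({entity.score:.3f}) [{entity.start}:{entity.end}]")
```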
@@ -1745,11 +2010,11 @@ class InferenceClient:
  model=model,
  task="token-classification",
  )
- return _bytes_to_list(response)
+ return TokenClassificationOutputElement.parse_obj_as_list(response)
 
  def translation(
  self, text: str, *, model: Optional[str] = None, src_lang: Optional[str] = None, tgt_lang: Optional[str] = None
- ) -> str:
+ ) -> TranslationOutput:
  """
  Convert text from one language to another.
 
@@ -1772,7 +2037,7 @@ class InferenceClient:
  Target language of the translation task, i.e. output language. Cannot be passed without `src_lang`.
 
  Returns:
- `str`: The generated translated text.
+ [`TranslationOutput`]: The generated translated text.
 
  Raises:
  [`InferenceTimeoutError`]:
@@ -1789,7 +2054,7 @@ class InferenceClient:
  >>> client.translation("My name is Wolfgang and I live in Berlin")
  'Mein Name ist Wolfgang und ich lebe in Berlin.'
  >>> client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
- "Je m'appelle Wolfgang et je vis à Berlin."
+ TranslationOutput(translation_text='Je m\'appelle Wolfgang et je vis à Berlin.')
  ```
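Callers that previously used the returned string directly now receive a `TranslationOutput` and read its `translation_text` field, as in this hedged sketch:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
result = client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
print(result.translation_text)  # field name taken from the repr shown above
```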
 
  Specifying languages:
@@ -1810,11 +2075,58 @@ class InferenceClient:
  if src_lang and tgt_lang:
  payload["parameters"] = {"src_lang": src_lang, "tgt_lang": tgt_lang}
  response = self.post(json=payload, model=model, task="translation")
- return _bytes_to_dict(response)[0]["translation_text"]
+ return TranslationOutput.parse_obj_as_list(response)[0]
+
+ def visual_question_answering(
+ self,
+ image: ContentT,
+ question: str,
+ *,
+ model: Optional[str] = None,
+ ) -> List[VisualQuestionAnsweringOutputElement]:
+ """
+ Answer open-ended questions based on an image.
+
+ Args:
+ image (`Union[str, Path, bytes, BinaryIO]`):
+ The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+ question (`str`):
+ Question to be answered.
+ model (`str`, *optional*):
+ The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+ a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
+ Defaults to None.
+
+ Returns:
+ `List[VisualQuestionAnsweringOutputElement]`: A list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
+
+ Raises:
+ `InferenceTimeoutError`:
+ If the model is unavailable or the request times out.
+ `HTTPError`:
+ If the request fails with an HTTP error status code other than HTTP 503.
+
+ Example:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> client = InferenceClient()
+ >>> client.visual_question_answering(
+ ... image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
+ ... question="What is the animal doing?"
+ ... )
+ [
+ VisualQuestionAnsweringOutputElement(score=0.778609573841095, answer='laying down'),
+ VisualQuestionAnsweringOutputElement(score=0.6957435607910156, answer='sitting'),
+ ]
+ ```
+ """
+ payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+ response = self.post(json=payload, model=model, task="visual-question-answering")
+ return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
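A hedged usage sketch for the newly added method; the image URL and question come from the docstring example, and the `answer`/`score` fields follow the repr shown there:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
answers = client.visual_question_answering(
    image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
    question="What is the animal doing?",
)
best = max(answers, key=lambda a: a.score)  # candidates are scored independently
print(best.answer, round(best.score, 3))
```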
 
  def zero_shot_classification(
  self, text: str, labels: List[str], *, multi_label: bool = False, model: Optional[str] = None
- ) -> List[ClassificationOutput]:
+ ) -> List[ZeroShotClassificationOutputElement]:
  """
  Provide as input a text and a set of candidate labels to classify the input text.
 
@@ -1830,7 +2142,7 @@ class InferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
 
  Returns:
- `List[Dict]`: List of classification outputs containing the predicted labels and their confidence.
+ `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
  Raises:
  [`InferenceTimeoutError`]:
@@ -1850,19 +2162,19 @@ class InferenceClient:
  >>> labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
  >>> client.zero_shot_classification(text, labels)
  [
- {"label": "scientific discovery", "score": 0.7961668968200684},
- {"label": "space & cosmos", "score": 0.18570658564567566},
- {"label": "microbiology", "score": 0.00730885099619627},
- {"label": "archeology", "score": 0.006258360575884581},
- {"label": "robots", "score": 0.004559356719255447},
+ ZeroShotClassificationOutputElement(label='scientific discovery', score=0.7961668968200684),
+ ZeroShotClassificationOutputElement(label='space & cosmos', score=0.18570658564567566),
+ ZeroShotClassificationOutputElement(label='microbiology', score=0.00730885099619627),
+ ZeroShotClassificationOutputElement(label='archeology', score=0.006258360575884581),
+ ZeroShotClassificationOutputElement(label='robots', score=0.004559356719255447),
  ]
  >>> client.zero_shot_classification(text, labels, multi_label=True)
  [
- {"label": "scientific discovery", "score": 0.9829297661781311},
- {"label": "space & cosmos", "score": 0.755190908908844},
- {"label": "microbiology", "score": 0.0005462635890580714},
- {"label": "archeology", "score": 0.00047131875180639327},
- {"label": "robots", "score": 0.00030448526376858354},
+ ZeroShotClassificationOutputElement(label='scientific discovery', score=0.9829297661781311),
+ ZeroShotClassificationOutputElement(label='space & cosmos', score=0.755190908908844),
+ ZeroShotClassificationOutputElement(label='microbiology', score=0.0005462635890580714),
+ ZeroShotClassificationOutputElement(label='archeology', score=0.00047131875180639327),
+ ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
  ]
  ```
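Because each prediction is now a `ZeroShotClassificationOutputElement` rather than a dict, `label` and `score` are accessed as attributes. A hedged sketch picking the top label (the input text here is shortened for illustration):

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
text = "A new model offers an explanation for how the Galilean satellites formed."  # illustrative input
labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
predictions = client.zero_shot_classification(text, labels)
top = max(predictions, key=lambda p: p.score)
print(top.label, round(top.score, 3))
```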
  """
@@ -1882,11 +2194,14 @@ class InferenceClient:
  task="zero-shot-classification",
  )
  output = _bytes_to_dict(response)
- return [{"label": label, "score": score} for label, score in zip(output["labels"], output["scores"])]
+ return [
+ ZeroShotClassificationOutputElement.parse_obj_as_instance({"label": label, "score": score})
+ for label, score in zip(output["labels"], output["scores"])
+ ]
 
  def zero_shot_image_classification(
  self, image: ContentT, labels: List[str], *, model: Optional[str] = None
- ) -> List[ClassificationOutput]:
+ ) -> List[ZeroShotImageClassificationOutputElement]:
  """
  Provide input image and text labels to predict text labels for the image.
 
@@ -1900,7 +2215,7 @@ class InferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
 
  Returns:
- `List[Dict]`: List of classification outputs containing the predicted labels and their confidence.
+ `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
 
  Raises:
  [`InferenceTimeoutError`]:
@@ -1917,7 +2232,7 @@ class InferenceClient:
  ... "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
  ... labels=["dog", "cat", "horse"],
  ... )
- [{"label": "dog", "score": 0.956}, ...]
+ [ZeroShotImageClassificationOutputElement(label='dog', score=0.956), ...]
  ```
  """
  # Raise ValueError if input is less than 2 labels
@@ -1929,7 +2244,7 @@ class InferenceClient:
  model=model,
  task="zero-shot-image-classification",
  )
- return _bytes_to_list(response)
+ return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
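The image variant follows the same pattern: the raw response is parsed into `ZeroShotImageClassificationOutputElement` items, so a hedged consumption sketch looks like:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
predictions = client.zero_shot_image_classification(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
    labels=["dog", "cat", "horse"],
)
for prediction in predictions:
    print(prediction.label, round(prediction.score, 3))
```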
 
  def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
  model = model or self.model