huggingface-hub 0.21.4__py3-none-any.whl → 0.22.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of huggingface-hub might be problematic.

Files changed (96)
  1. huggingface_hub/__init__.py +217 -1
  2. huggingface_hub/_commit_api.py +14 -15
  3. huggingface_hub/_inference_endpoints.py +12 -11
  4. huggingface_hub/_login.py +1 -0
  5. huggingface_hub/_multi_commits.py +1 -0
  6. huggingface_hub/_snapshot_download.py +9 -1
  7. huggingface_hub/_tensorboard_logger.py +1 -0
  8. huggingface_hub/_webhooks_payload.py +1 -0
  9. huggingface_hub/_webhooks_server.py +1 -0
  10. huggingface_hub/commands/_cli_utils.py +1 -0
  11. huggingface_hub/commands/delete_cache.py +1 -0
  12. huggingface_hub/commands/download.py +1 -0
  13. huggingface_hub/commands/env.py +1 -0
  14. huggingface_hub/commands/scan_cache.py +1 -0
  15. huggingface_hub/commands/upload.py +1 -0
  16. huggingface_hub/community.py +1 -0
  17. huggingface_hub/constants.py +3 -1
  18. huggingface_hub/errors.py +38 -0
  19. huggingface_hub/file_download.py +24 -24
  20. huggingface_hub/hf_api.py +47 -35
  21. huggingface_hub/hub_mixin.py +210 -54
  22. huggingface_hub/inference/_client.py +554 -239
  23. huggingface_hub/inference/_common.py +195 -41
  24. huggingface_hub/inference/_generated/_async_client.py +558 -239
  25. huggingface_hub/inference/_generated/types/__init__.py +115 -0
  26. huggingface_hub/inference/_generated/types/audio_classification.py +43 -0
  27. huggingface_hub/inference/_generated/types/audio_to_audio.py +31 -0
  28. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +116 -0
  29. huggingface_hub/inference/_generated/types/base.py +149 -0
  30. huggingface_hub/inference/_generated/types/chat_completion.py +106 -0
  31. huggingface_hub/inference/_generated/types/depth_estimation.py +29 -0
  32. huggingface_hub/inference/_generated/types/document_question_answering.py +85 -0
  33. huggingface_hub/inference/_generated/types/feature_extraction.py +19 -0
  34. huggingface_hub/inference/_generated/types/fill_mask.py +50 -0
  35. huggingface_hub/inference/_generated/types/image_classification.py +43 -0
  36. huggingface_hub/inference/_generated/types/image_segmentation.py +52 -0
  37. huggingface_hub/inference/_generated/types/image_to_image.py +55 -0
  38. huggingface_hub/inference/_generated/types/image_to_text.py +105 -0
  39. huggingface_hub/inference/_generated/types/object_detection.py +55 -0
  40. huggingface_hub/inference/_generated/types/question_answering.py +77 -0
  41. huggingface_hub/inference/_generated/types/sentence_similarity.py +28 -0
  42. huggingface_hub/inference/_generated/types/summarization.py +46 -0
  43. huggingface_hub/inference/_generated/types/table_question_answering.py +45 -0
  44. huggingface_hub/inference/_generated/types/text2text_generation.py +45 -0
  45. huggingface_hub/inference/_generated/types/text_classification.py +43 -0
  46. huggingface_hub/inference/_generated/types/text_generation.py +161 -0
  47. huggingface_hub/inference/_generated/types/text_to_audio.py +105 -0
  48. huggingface_hub/inference/_generated/types/text_to_image.py +57 -0
  49. huggingface_hub/inference/_generated/types/token_classification.py +53 -0
  50. huggingface_hub/inference/_generated/types/translation.py +46 -0
  51. huggingface_hub/inference/_generated/types/video_classification.py +47 -0
  52. huggingface_hub/inference/_generated/types/visual_question_answering.py +53 -0
  53. huggingface_hub/inference/_generated/types/zero_shot_classification.py +56 -0
  54. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +51 -0
  55. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +55 -0
  56. huggingface_hub/inference/_templating.py +105 -0
  57. huggingface_hub/inference/_types.py +4 -152
  58. huggingface_hub/keras_mixin.py +39 -17
  59. huggingface_hub/lfs.py +20 -8
  60. huggingface_hub/repocard.py +11 -3
  61. huggingface_hub/repocard_data.py +12 -2
  62. huggingface_hub/serialization/__init__.py +1 -0
  63. huggingface_hub/serialization/_base.py +1 -0
  64. huggingface_hub/serialization/_numpy.py +1 -0
  65. huggingface_hub/serialization/_tensorflow.py +1 -0
  66. huggingface_hub/serialization/_torch.py +1 -0
  67. huggingface_hub/utils/__init__.py +4 -1
  68. huggingface_hub/utils/_cache_manager.py +7 -0
  69. huggingface_hub/utils/_chunk_utils.py +1 -0
  70. huggingface_hub/utils/_datetime.py +1 -0
  71. huggingface_hub/utils/_errors.py +10 -1
  72. huggingface_hub/utils/_experimental.py +1 -0
  73. huggingface_hub/utils/_fixes.py +19 -3
  74. huggingface_hub/utils/_git_credential.py +1 -0
  75. huggingface_hub/utils/_headers.py +10 -3
  76. huggingface_hub/utils/_hf_folder.py +1 -0
  77. huggingface_hub/utils/_http.py +1 -0
  78. huggingface_hub/utils/_pagination.py +1 -0
  79. huggingface_hub/utils/_paths.py +1 -0
  80. huggingface_hub/utils/_runtime.py +22 -0
  81. huggingface_hub/utils/_subprocess.py +1 -0
  82. huggingface_hub/utils/_token.py +1 -0
  83. huggingface_hub/utils/_typing.py +29 -1
  84. huggingface_hub/utils/_validators.py +1 -0
  85. huggingface_hub/utils/endpoint_helpers.py +1 -0
  86. huggingface_hub/utils/logging.py +1 -1
  87. huggingface_hub/utils/sha.py +1 -0
  88. huggingface_hub/utils/tqdm.py +1 -0
  89. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0rc0.dist-info}/METADATA +14 -15
  90. huggingface_hub-0.22.0rc0.dist-info/RECORD +113 -0
  91. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0rc0.dist-info}/WHEEL +1 -1
  92. huggingface_hub/inference/_text_generation.py +0 -551
  93. huggingface_hub-0.21.4.dist-info/RECORD +0 -81
  94. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0rc0.dist-info}/LICENSE +0 -0
  95. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0rc0.dist-info}/entry_points.txt +0 -0
  96. {huggingface_hub-0.21.4.dist-info → huggingface_hub-0.22.0rc0.dist-info}/top_level.txt +0 -0
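The headline change in this release is in the inference client (`huggingface_hub/inference/_client.py` and `_generated/_async_client.py` above): a new `chat_completion` method and dataclass-typed task outputs parsed from the server response. A minimal usage sketch, adapted from the docstring examples in the diff below; the model name and attribute access are taken from those docstrings, not from testing the released wheel, and a locally configured HF token is assumed:

# Sketch of the new chat_completion API in 0.22.0rc0, based on the docstrings in the diff below.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient("HuggingFaceH4/zephyr-7b-beta")
    messages = [{"role": "user", "content": "What is the capital of France?"}]

    # stream=False returns a ChatCompletionOutput dataclass instead of a raw dict.
    output = await client.chat_completion(messages, max_tokens=100)
    print(output.choices[0].message.content)

    # stream=True yields ChatCompletionStreamOutput items token by token.
    async for chunk in await client.chat_completion(messages, max_tokens=10, stream=True):
        print(chunk.choices[0].delta.content)


asyncio.run(main())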
huggingface_hub/inference/_generated/_async_client.py
@@ -23,7 +23,6 @@ import base64
 import logging
 import time
 import warnings
-from dataclasses import asdict
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -39,11 +38,13 @@ from typing import (
 from requests.structures import CaseInsensitiveDict
 
 from huggingface_hub.constants import ALL_INFERENCE_API_FRAMEWORKS, INFERENCE_ENDPOINT, MAIN_INFERENCE_API_FRAMEWORKS
+from huggingface_hub.errors import InferenceTimeoutError
 from huggingface_hub.inference._common import (
     TASKS_EXPECTING_IMAGES,
     ContentT,
-    InferenceTimeoutError,
     ModelStatus,
+    _async_stream_chat_completion_response_from_bytes,
+    _async_stream_chat_completion_response_from_text_generation,
     _async_stream_text_generation_response,
     _b64_encode,
     _b64_to_image,
@@ -52,27 +53,42 @@ from huggingface_hub.inference._common import (
     _bytes_to_list,
     _fetch_recommended_models,
     _import_numpy,
+    _is_chat_completion_server,
     _is_tgi_server,
     _open_as_binary,
+    _set_as_non_chat_completion_server,
     _set_as_non_tgi,
-)
-from huggingface_hub.inference._text_generation import (
-    TextGenerationParameters,
-    TextGenerationRequest,
-    TextGenerationResponse,
-    TextGenerationStreamResponse,
     raise_text_generation_error,
 )
+from huggingface_hub.inference._generated.types import (
+    AudioClassificationOutputElement,
+    AudioToAudioOutputElement,
+    AutomaticSpeechRecognitionOutput,
+    ChatCompletionOutput,
+    ChatCompletionOutputChoice,
+    ChatCompletionOutputChoiceMessage,
+    ChatCompletionStreamOutput,
+    DocumentQuestionAnsweringOutputElement,
+    FillMaskOutputElement,
+    ImageClassificationOutputElement,
+    ImageSegmentationOutputElement,
+    ImageToTextOutput,
+    ObjectDetectionOutputElement,
+    QuestionAnsweringOutputElement,
+    SummarizationOutput,
+    TableQuestionAnsweringOutputElement,
+    TextClassificationOutputElement,
+    TextGenerationOutput,
+    TextGenerationStreamOutput,
+    TokenClassificationOutputElement,
+    TranslationOutput,
+    VisualQuestionAnsweringOutputElement,
+    ZeroShotClassificationOutputElement,
+    ZeroShotImageClassificationOutputElement,
+)
+from huggingface_hub.inference._templating import render_chat_prompt
 from huggingface_hub.inference._types import (
-    AudioToAudioOutput,
-    ClassificationOutput,
-    ConversationalOutput,
-    FillMaskOutput,
-    ImageSegmentationOutput,
-    ObjectDetectionOutput,
-    QuestionAnsweringOutput,
-    TableQuestionAnsweringOutput,
-    TokenClassificationOutput,
+    ConversationalOutput,  # soon to be removed
 )
 from huggingface_hub.utils import (
     build_hf_headers,
@@ -100,9 +116,9 @@ class AsyncInferenceClient:
             The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
             or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
             automatically selected for the task.
-        token (`str`, *optional*):
-            Hugging Face token. Will default to the locally saved token. Pass `token=False` if you don't want to send
-            your token to the server.
+        token (`str` or `bool`, *optional*):
+            Hugging Face token. Will default to the locally saved token if not provided.
+            Pass `token=False` if you don't want to send your token to the server.
         timeout (`float`, `optional`):
             The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
             API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
@@ -122,6 +138,7 @@ class AsyncInferenceClient:
         cookies: Optional[Dict[str, str]] = None,
     ) -> None:
         self.model: Optional[str] = model
+        self.token: Union[str, bool, None] = token
         self.headers = CaseInsensitiveDict(build_hf_headers(token=token))  # contains 'authorization' + 'user-agent'
         if headers is not None:
             self.headers.update(headers)
@@ -140,11 +157,10 @@ class AsyncInferenceClient:
         model: Optional[str] = None,
         task: Optional[str] = None,
         stream: Literal[False] = ...,
-    ) -> bytes:
-        pass
+    ) -> bytes: ...
 
     @overload
-    async def post(
+    async def post(  # type: ignore[misc]
         self,
         *,
         json: Optional[Union[str, Dict, List]] = None,
@@ -152,8 +168,18 @@ class AsyncInferenceClient:
         model: Optional[str] = None,
         task: Optional[str] = None,
         stream: Literal[True] = ...,
-    ) -> AsyncIterable[bytes]:
-        pass
+    ) -> AsyncIterable[bytes]: ...
+
+    @overload
+    async def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, AsyncIterable[bytes]]: ...
 
     async def post(
         self,
@@ -263,7 +289,7 @@ class AsyncInferenceClient:
         audio: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> List[ClassificationOutput]:
+    ) -> List[AudioClassificationOutputElement]:
         """
         Perform audio classification on the provided audio content.
 
@@ -277,7 +303,7 @@ class AsyncInferenceClient:
                 audio classification will be used.
 
         Returns:
-            `List[Dict]`: The classification output containing the predicted label and its confidence.
+            `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -291,18 +317,22 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.audio_classification("audio.flac")
-        [{'score': 0.4976358711719513, 'label': 'hap'}, {'score': 0.3677836060523987, 'label': 'neu'},...]
+        [
+            AudioClassificationOutputElement(score=0.4976358711719513, label='hap'),
+            AudioClassificationOutputElement(score=0.3677836060523987, label='neu'),
+            ...
+        ]
         ```
         """
         response = await self.post(data=audio, model=model, task="audio-classification")
-        return _bytes_to_list(response)
+        return AudioClassificationOutputElement.parse_obj_as_list(response)
 
     async def audio_to_audio(
         self,
         audio: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> List[AudioToAudioOutput]:
+    ) -> List[AudioToAudioOutputElement]:
         """
         Performs multiple tasks related to audio-to-audio depending on the model (eg: speech enhancement, source separation).
 
@@ -316,7 +346,7 @@ class AsyncInferenceClient:
                 audio_to_audio will be used.
 
         Returns:
-            `List[Dict]`: A list of dictionary where each index contains audios label, content-type, and audio content in blob.
+            `List[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing audios label, content-type, and audio content in blob.
 
         Raises:
             `InferenceTimeoutError`:
@@ -332,13 +362,13 @@ class AsyncInferenceClient:
         >>> audio_output = await client.audio_to_audio("audio.flac")
         >>> async for i, item in enumerate(audio_output):
         >>>     with open(f"output_{i}.flac", "wb") as f:
-                    f.write(item["blob"])
+                    f.write(item.blob)
         ```
         """
         response = await self.post(data=audio, model=model, task="audio-to-audio")
-        audio_output = _bytes_to_list(response)
+        audio_output = AudioToAudioOutputElement.parse_obj_as_list(response)
         for item in audio_output:
-            item["blob"] = base64.b64decode(item["blob"])
+            item.blob = base64.b64decode(item.blob)
         return audio_output
 
     async def automatic_speech_recognition(
@@ -346,7 +376,7 @@ class AsyncInferenceClient:
         audio: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> str:
+    ) -> AutomaticSpeechRecognitionOutput:
         """
         Perform automatic speech recognition (ASR or audio-to-text) on the given audio content.
 
@@ -358,7 +388,7 @@ class AsyncInferenceClient:
                 Inference Endpoint. If not provided, the default recommended model for ASR will be used.
 
         Returns:
-            str: The transcribed text.
+            [`AutomaticSpeechRecognitionOutput`]: An item containing the transcribed text and optionally the timestamp chunks.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -371,12 +401,266 @@ class AsyncInferenceClient:
         # Must be run in an async context
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
-        >>> await client.automatic_speech_recognition("hello_world.flac")
+        >>> await client.automatic_speech_recognition("hello_world.flac").text
         "hello world"
         ```
         """
         response = await self.post(data=audio, model=model, task="automatic-speech-recognition")
-        return _bytes_to_dict(response)["text"]
+        return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
+
+    @overload
+    async def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict[str, str]],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[False] = False,
+        max_tokens: int = 20,
+        seed: Optional[int] = None,
+        stop: Optional[Union[List[str], str]] = None,
+        temperature: float = 1.0,
+        top_p: Optional[float] = None,
+    ) -> ChatCompletionOutput: ...
+
+    @overload
+    async def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict[str, str]],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[True] = True,
+        max_tokens: int = 20,
+        seed: Optional[int] = None,
+        stop: Optional[Union[List[str], str]] = None,
+        temperature: float = 1.0,
+        top_p: Optional[float] = None,
+    ) -> AsyncIterable[ChatCompletionStreamOutput]: ...
+
+    @overload
+    async def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        max_tokens: int = 20,
+        seed: Optional[int] = None,
+        stop: Optional[Union[List[str], str]] = None,
+        temperature: float = 1.0,
+        top_p: Optional[float] = None,
+    ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ...
+
+    async def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        max_tokens: int = 20,
+        seed: Optional[int] = None,
+        stop: Optional[Union[List[str], str]] = None,
+        temperature: float = 1.0,
+        top_p: Optional[float] = None,
+    ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
+        """
+        A method for completing conversations using a specified language model.
+
+        <Tip>
+
+        If the model is served by a server supporting chat-completion, the method will directly call the server's
+        `/v1/chat/completions` endpoint. If the server does not support chat-completion, the method will render the
+        chat template client-side based on the information fetched from the Hub API. In this case, you will need to
+        have `minijinja` template engine installed. Run `pip install "huggingface_hub[inference]"` or `pip install minijinja`
+        to install it.
+
+        </Tip>
+
+        Args:
+            messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]):
+                Conversation history consisting of roles and content pairs.
+            model (`str`, *optional*):
+                The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
+                See https://huggingface.co/tasks/text-generation for more details.
+            frequency_penalty (`float`, optional):
+                Penalizes new tokens based on their existing frequency
+                in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
+            max_tokens (`int`, optional):
+                Maximum number of tokens allowed in the response. Defaults to 20.
+            seed (Optional[`int`], optional):
+                Seed for reproducible control flow. Defaults to None.
+            stop (Optional[`str`], optional):
+                Up to four strings which trigger the end of the response.
+                Defaults to None.
+            stream (`bool`, optional):
+                Enable realtime streaming of responses. Defaults to False.
+            temperature (`float`, optional):
+                Controls randomness of the generations. Lower values ensure
+                less random completions. Range: [0, 2]. Defaults to 1.0.
+            top_p (`float`, optional):
+                Fraction of the most likely next words to sample from.
+                Must be between 0 and 1. Defaults to 1.0.
+
+        Returns:
+            `Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]`:
+            Generated text returned from the server:
+            - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
+            - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = AsyncInferenceClient("HuggingFaceH4/zephyr-7b-beta")
+        >>> await client.chat_completion(messages, max_tokens=100)
+        ChatCompletionOutput(
+            choices=[
+                ChatCompletionOutputChoice(
+                    finish_reason='eos_token',
+                    index=0,
+                    message=ChatCompletionOutputChoiceMessage(
+                        content='The capital of France is Paris. The official name of the city is "Ville de Paris" (City of Paris) and the name of the country\'s governing body, which is located in Paris, is "La République française" (The French Republic). \nI hope that helps! Let me know if you need any further information.'
+                    )
+                )
+            ],
+            created=1710498360
+        )
+
+        >>> async for token in await client.chat_completion(messages, max_tokens=10, stream=True):
+        ...     print(token)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        (...)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason='length')], created=1710498504)
+        ```
+        """
+        # determine model
+        model = model or self.model or self.get_recommended_model("text-generation")
+
+        if _is_chat_completion_server(model):
+            # First, let's consider the server has a `/v1/chat/completions` endpoint.
+            # If that's the case, we don't have to render the chat template client-side.
+            model_url = self._resolve_url(model) + "/v1/chat/completions"
+
+            try:
+                data = await self.post(
+                    model=model_url,
+                    json=dict(
+                        model="tgi",  # random string
+                        messages=messages,
+                        max_tokens=max_tokens,
+                        seed=seed,
+                        stop=stop,
+                        temperature=temperature,
+                        top_p=top_p,
+                        stream=stream,
+                    ),
+                    stream=stream,
+                )
+            except _import_aiohttp().ClientResponseError:
+                # Let's consider the server is not a chat completion server.
+                # Then we call again `chat_completion` which will render the chat template client side.
+                # (can be HTTP 500, HTTP 400, HTTP 404 depending on the server)
+                _set_as_non_chat_completion_server(model)
+                return await self.chat_completion(
+                    messages=messages,
+                    model=model,
+                    stream=stream,
+                    max_tokens=max_tokens,
+                    seed=seed,
+                    stop=stop,
+                    temperature=temperature,
+                    top_p=top_p,
+                )
+
+            if stream:
+                return _async_stream_chat_completion_response_from_bytes(data)  # type: ignore[arg-type]
+
+            return ChatCompletionOutput.parse_obj_as_instance(data)  # type: ignore[arg-type]
+
+        # At this point, we know the server is not a chat completion server.
+        # We need to render the chat template client side based on the information we can fetch from
+        # the Hub API.
+
+        model_id = None
+        if model.startswith(("http://", "https://")):
+            # If URL, we need to know which model is served. This is not always possible.
+            # A workaround is to list the user Inference Endpoints and check if one of them correspond to the model URL.
+            # If not, we raise an error.
+            # TODO: fix when we have a proper API for this (at least for Inference Endpoints)
+            # TODO: what if Sagemaker URL?
+            # TODO: what if Azure URL?
+            from ..hf_api import HfApi
+
+            for endpoint in HfApi(token=self.token).list_inference_endpoints():
+                if endpoint.url == model:
+                    model_id = endpoint.repository
+                    break
+        else:
+            model_id = model
+
+        if model_id is None:
+            # If we don't have the model ID, we can't fetch the chat template.
+            # We raise an error.
+            raise ValueError(
+                "Request can't be processed as the model ID can't be inferred from model URL. "
+                "This is needed to fetch the chat template from the Hub since the model is not "
+                "served with a Chat-completion API."
+            )
+
+        # fetch chat template + tokens
+        prompt = render_chat_prompt(model_id=model_id, token=self.token, messages=messages)
+
+        # generate response
+        stop_sequences = [stop] if isinstance(stop, str) else stop
+        text_generation_output = await self.text_generation(
+            prompt=prompt,
+            details=True,
+            stream=stream,
+            model=model,
+            max_new_tokens=max_tokens,
+            seed=seed,
+            stop_sequences=stop_sequences,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        created = int(time.time())
+
+        if stream:
+            return _async_stream_chat_completion_response_from_text_generation(text_generation_output)  # type: ignore [arg-type]
+
+        if isinstance(text_generation_output, TextGenerationOutput):
+            # General use case => format ChatCompletionOutput from text generation details
+            content: str = text_generation_output.generated_text
+            finish_reason: str = text_generation_output.details.finish_reason  # type: ignore[union-attr]
+        else:
+            # Corner case: if server doesn't support details (e.g. if not a TGI server), we only receive an output string.
+            # In such a case, `finish_reason` is set to `"unk"`.
+            content = text_generation_output  # type: ignore[assignment]
+            finish_reason = "unk"
+
+        return ChatCompletionOutput(
+            created=created,
+            choices=[
+                ChatCompletionOutputChoice(
+                    finish_reason=finish_reason,  # type: ignore
+                    index=0,
+                    message=ChatCompletionOutputChoiceMessage(
+                        content=content,
+                        role="assistant",
+                    ),
+                )
+            ],
+        )
 
     async def conversational(
         self,
@@ -390,6 +674,13 @@ class AsyncInferenceClient:
         """
         Generate conversational responses based on the given input text (i.e. chat with the API).
 
+        <Tip warning={true}>
+
+        [`InferenceClient.conversational`] API is deprecated and will be removed in a future release. Please use
+        [`InferenceClient.chat_completion`] instead.
+
+        </Tip>
+
         Args:
             text (`str`):
                 The last input from the user in the conversation.
@@ -430,6 +721,11 @@ class AsyncInferenceClient:
         ... )
         ```
         """
+        warnings.warn(
+            "'InferenceClient.conversational' is deprecated and will be removed starting from huggingface_hub>=0.25. "
+            "Please use the more appropriate 'InferenceClient.chat_completion' API instead.",
+            FutureWarning,
+        )
         payload: Dict[str, Any] = {"inputs": {"text": text}}
         if generated_responses is not None:
             payload["inputs"]["generated_responses"] = generated_responses
@@ -440,58 +736,13 @@ class AsyncInferenceClient:
         response = await self.post(json=payload, model=model, task="conversational")
         return _bytes_to_dict(response)  # type: ignore
 
-    async def visual_question_answering(
-        self,
-        image: ContentT,
-        question: str,
-        *,
-        model: Optional[str] = None,
-    ) -> List[str]:
-        """
-        Answering open-ended questions based on an image.
-
-        Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
-            question (`str`):
-                Question to be answered.
-            model (`str`, *optional*):
-                The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
-                a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
-                Defaults to None.
-
-        Returns:
-            `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
-
-        Raises:
-            `InferenceTimeoutError`:
-                If the model is unavailable or the request times out.
-            `aiohttp.ClientResponseError`:
-                If the request fails with an HTTP error status code other than HTTP 503.
-
-        Example:
-        ```py
-        # Must be run in an async context
-        >>> from huggingface_hub import AsyncInferenceClient
-        >>> client = AsyncInferenceClient()
-        >>> await client.visual_question_answering(
-        ...     image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
-        ...     question="What is the animal doing?"
-        ... )
-        [{'score': 0.778609573841095, 'answer': 'laying down'},{'score': 0.6957435607910156, 'answer': 'sitting'}, ...]
-        ```
-        """
-        payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
-        response = await self.post(json=payload, model=model, task="visual-question-answering")
-        return _bytes_to_list(response)
-
     async def document_question_answering(
         self,
         image: ContentT,
         question: str,
         *,
         model: Optional[str] = None,
-    ) -> List[QuestionAnsweringOutput]:
+    ) -> List[DocumentQuestionAnsweringOutputElement]:
         """
         Answer questions on document images.
 
@@ -506,7 +757,7 @@ class AsyncInferenceClient:
                 Defaults to None.
 
         Returns:
-            `List[Dict]`: a list of dictionaries containing the predicted label, associated probability, word ids, and page number.
+            `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -520,12 +771,12 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [{'score': 0.42515629529953003, 'answer': 'us-001', 'start': 16, 'end': 16}]
+        [DocumentQuestionAnsweringOutputElement(score=0.42515629529953003, answer='us-001', start=16, end=16)]
         ```
         """
         payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
         response = await self.post(json=payload, model=model, task="document-question-answering")
-        return _bytes_to_list(response)
+        return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
 
     async def feature_extraction(self, text: str, *, model: Optional[str] = None) -> "np.ndarray":
         """
@@ -564,7 +815,7 @@ class AsyncInferenceClient:
         np = _import_numpy()
         return np.array(_bytes_to_dict(response), dtype="float32")
 
-    async def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutput]:
+    async def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutputElement]:
         """
         Fill in a hole with a missing word (token to be precise).
 
@@ -577,7 +828,7 @@ class AsyncInferenceClient:
                 Defaults to None.
 
         Returns:
-            `List[Dict]`: a list of fill mask output dictionaries containing the predicted label, associated
+            `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
             probability, token reference, and completed text.
 
         Raises:
@@ -592,25 +843,21 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.fill_mask("The goal of life is <mask>.")
-        [{'score': 0.06897063553333282,
-         'token': 11098,
-         'token_str': ' happiness',
-         'sequence': 'The goal of life is happiness.'},
-        {'score': 0.06554922461509705,
-         'token': 45075,
-         'token_str': ' immortality',
-         'sequence': 'The goal of life is immortality.'}]
+        [
+            FillMaskOutputElement(score=0.06897063553333282, token=11098, token_str=' happiness', sequence='The goal of life is happiness.'),
+            FillMaskOutputElement(score=0.06554922461509705, token=45075, token_str=' immortality', sequence='The goal of life is immortality.')
+        ]
         ```
         """
         response = await self.post(json={"inputs": text}, model=model, task="fill-mask")
-        return _bytes_to_list(response)
+        return FillMaskOutputElement.parse_obj_as_list(response)
 
     async def image_classification(
         self,
         image: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> List[ClassificationOutput]:
+    ) -> List[ImageClassificationOutputElement]:
         """
         Perform image classification on the given image using the specified model.
 
@@ -622,7 +869,7 @@ class AsyncInferenceClient:
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
 
         Returns:
-            `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
+            `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -636,18 +883,18 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
-        [{'score': 0.9779096841812134, 'label': 'Blenheim spaniel'}, ...]
+        [ImageClassificationOutputElement(score=0.9779096841812134, label='Blenheim spaniel'), ...]
         ```
         """
         response = await self.post(data=image, model=model, task="image-classification")
-        return _bytes_to_list(response)
+        return ImageClassificationOutputElement.parse_obj_as_list(response)
 
     async def image_segmentation(
         self,
         image: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> List[ImageSegmentationOutput]:
+    ) -> List[ImageSegmentationOutputElement]:
         """
         Perform image segmentation on the given image using the specified model.
 
@@ -665,7 +912,7 @@ class AsyncInferenceClient:
                 deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
 
         Returns:
-            `List[Dict]`: A list of dictionaries containing the segmented masks and associated attributes.
+            `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -679,19 +926,13 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.image_segmentation("cat.jpg"):
-        [{'score': 0.989008, 'label': 'LABEL_184', 'mask': <PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>}, ...]
+        [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
         ```
         """
-
-        # Segment
         response = await self.post(data=image, model=model, task="image-segmentation")
-        output = _bytes_to_dict(response)
-
-        # Parse masks as PIL Image
-        if not isinstance(output, list):
-            raise ValueError(f"Server output must be a list. Got {type(output)}: {str(output)[:200]}...")
+        output = ImageSegmentationOutputElement.parse_obj_as_list(response)
         for item in output:
-            item["mask"] = _b64_to_image(item["mask"])
+            item.mask = _b64_to_image(item.mask)
         return output
 
     async def image_to_image(
@@ -779,7 +1020,7 @@ class AsyncInferenceClient:
         response = await self.post(json=payload, data=data, model=model, task="image-to-image")
         return _bytes_to_image(response)
 
-    async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> str:
+    async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
         """
         Takes an input image and return text.
 
@@ -794,7 +1035,7 @@ class AsyncInferenceClient:
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
 
         Returns:
-            `str`: The generated text.
+            [`ImageToTextOutput`]: The generated text.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -814,7 +1055,7 @@ class AsyncInferenceClient:
         ```
         """
         response = await self.post(data=image, model=model, task="image-to-text")
-        return _bytes_to_dict(response)[0]["generated_text"]
+        return ImageToTextOutput.parse_obj_as_instance(response)
 
     async def list_deployed_models(
         self, frameworks: Union[None, str, Literal["all"], List[str]] = None
@@ -902,7 +1143,7 @@ class AsyncInferenceClient:
         image: ContentT,
         *,
         model: Optional[str] = None,
-    ) -> List[ObjectDetectionOutput]:
+    ) -> List[ObjectDetectionOutputElement]:
         """
         Perform object detection on the given image using the specified model.
 
@@ -920,7 +1161,7 @@ class AsyncInferenceClient:
                 deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
 
         Returns:
-            `List[ObjectDetectionOutput]`: A list of dictionaries containing the bounding boxes and associated attributes.
+            `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -936,19 +1177,16 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.object_detection("people.jpg"):
-        [{"score":0.9486683011054993,"label":"person","box":{"xmin":59,"ymin":39,"xmax":420,"ymax":510}}, ... ]
+        [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
        ```
        """
        # detect objects
        response = await self.post(data=image, model=model, task="object-detection")
-        output = _bytes_to_dict(response)
-        if not isinstance(output, list):
-            raise ValueError(f"Server output must be a list. Got {type(output)}: {str(output)[:200]}...")
-        return output
+        return ObjectDetectionOutputElement.parse_obj_as_list(response)
 
     async def question_answering(
         self, question: str, context: str, *, model: Optional[str] = None
-    ) -> QuestionAnsweringOutput:
+    ) -> QuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from a given text.
 
@@ -962,7 +1200,7 @@ class AsyncInferenceClient:
                 a deployed Inference Endpoint.
 
         Returns:
-            `Dict`: a dictionary of question answering output containing the score, start index, end index, and answer.
+            [`QuestionAnsweringOutputElement`]: an question answering output containing the score, start index, end index, and answer.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -976,7 +1214,7 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
-        {'score': 0.9326562285423279, 'start': 11, 'end': 16, 'answer': 'Clara'}
+        QuestionAnsweringOutputElement(score=0.9326562285423279, start=11, end=16, answer='Clara')
         ```
         """
 
@@ -986,7 +1224,7 @@ class AsyncInferenceClient:
             model=model,
             task="question-answering",
         )
-        return _bytes_to_dict(response)  # type: ignore
+        return QuestionAnsweringOutputElement.parse_obj_as_instance(response)
 
     async def sentence_similarity(
         self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
@@ -1042,7 +1280,7 @@ class AsyncInferenceClient:
         *,
         parameters: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-    ) -> str:
+    ) -> SummarizationOutput:
         """
         Generate a summary of a given text using a specified model.
 
@@ -1057,7 +1295,7 @@ class AsyncInferenceClient:
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
 
         Returns:
-            `str`: The generated summary text.
+            [`SummarizationOutput`]: The generated summary text.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -1071,18 +1309,18 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.summarization("The Eiffel tower...")
-        'The Eiffel tower is one of the most famous landmarks in the world....'
+        SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
         ```
         """
         payload: Dict[str, Any] = {"inputs": text}
         if parameters is not None:
             payload["parameters"] = parameters
         response = await self.post(json=payload, model=model, task="summarization")
-        return _bytes_to_dict(response)[0]["summary_text"]
+        return SummarizationOutput.parse_obj_as_list(response)[0]
 
     async def table_question_answering(
         self, table: Dict[str, Any], query: str, *, model: Optional[str] = None
-    ) -> TableQuestionAnsweringOutput:
+    ) -> TableQuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from information given in a table.
 
@@ -1097,7 +1335,7 @@ class AsyncInferenceClient:
                 Hub or a URL to a deployed Inference Endpoint.
 
         Returns:
-            `Dict`: a dictionary of table question answering output containing the answer, coordinates, cells and the aggregator used.
+            [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -1113,7 +1351,7 @@ class AsyncInferenceClient:
         >>> query = "How many stars does the transformers repository have?"
         >>> table = {"Repository": ["Transformers", "Datasets", "Tokenizers"], "Stars": ["36542", "4512", "3934"]}
         >>> await client.table_question_answering(table, query, model="google/tapas-base-finetuned-wtq")
-        {'answer': 'AVERAGE > 36542', 'coordinates': [[0, 1]], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
+        TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
         ```
         """
         response = await self.post(
@@ -1124,7 +1362,7 @@ class AsyncInferenceClient:
             model=model,
             task="table-question-answering",
         )
-        return _bytes_to_dict(response)  # type: ignore
+        return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)
 
     async def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
         """
@@ -1213,7 +1451,9 @@ class AsyncInferenceClient:
         response = await self.post(json={"table": table}, model=model, task="tabular-regression")
         return _bytes_to_list(response)
 
-    async def text_classification(self, text: str, *, model: Optional[str] = None) -> List[ClassificationOutput]:
+    async def text_classification(
+        self, text: str, *, model: Optional[str] = None
+    ) -> List[TextClassificationOutputElement]:
         """
         Perform text classification (e.g. sentiment-analysis) on the given text.
 
@@ -1226,7 +1466,7 @@ class AsyncInferenceClient:
                 Defaults to None.
 
         Returns:
-            `List[Dict]`: a list of dictionaries containing the predicted label and associated probability.
+            `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
 
         Raises:
             [`InferenceTimeoutError`]:
@@ -1240,11 +1480,14 @@ class AsyncInferenceClient:
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.text_classification("I like you")
-        [{'label': 'POSITIVE', 'score': 0.9998695850372314}, {'label': 'NEGATIVE', 'score': 0.0001304351753788069}]
+        [
+            TextClassificationOutputElement(label='POSITIVE', score=0.9998695850372314),
+            TextClassificationOutputElement(label='NEGATIVE', score=0.0001304351753788069),
+        ]
         ```
         """
         response = await self.post(json={"inputs": text}, model=model, task="text-classification")
-        return _bytes_to_list(response)[0]
+        return TextClassificationOutputElement.parse_obj_as_list(response)[0]  # type: ignore [return-value]
 
     @overload
     async def text_generation(  # type: ignore
@@ -1267,8 +1510,7 @@ class AsyncInferenceClient:
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
-    ) -> str:
-        ...
+    ) -> str: ...
 
     @overload
     async def text_generation(  # type: ignore
@@ -1291,8 +1533,7 @@ class AsyncInferenceClient:
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
-    ) -> TextGenerationResponse:
-        ...
+    ) -> TextGenerationOutput: ...
 
     @overload
     async def text_generation(  # type: ignore
@@ -1315,11 +1556,10 @@ class AsyncInferenceClient:
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
-    ) -> AsyncIterable[str]:
-        ...
+    ) -> AsyncIterable[str]: ...
 
     @overload
-    async def text_generation(
+    async def text_generation(  # type: ignore
         self,
         prompt: str,
         *,
@@ -1339,8 +1579,30 @@ class AsyncInferenceClient:
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
-    ) -> AsyncIterable[TextGenerationStreamResponse]:
-        ...
+    ) -> AsyncIterable[TextGenerationStreamOutput]: ...
+
+    @overload
+    async def text_generation(
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: bool = ...,
+        model: Optional[str] = None,
+        do_sample: bool = False,
+        max_new_tokens: int = 20,
+        best_of: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: bool = False,
+        seed: Optional[int] = None,
+        stop_sequences: Optional[List[str]] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: bool = False,
+    ) -> Union[TextGenerationOutput, AsyncIterable[TextGenerationStreamOutput]]: ...
 
     async def text_generation(
         self,
@@ -1363,13 +1625,10 @@ class AsyncInferenceClient:
         typical_p: Optional[float] = None,
         watermark: bool = False,
         decoder_input_details: bool = False,
-    ) -> Union[str, TextGenerationResponse, AsyncIterable[str], AsyncIterable[TextGenerationStreamResponse]]:
+    ) -> Union[str, TextGenerationOutput, AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]:
         """
         Given a prompt, generate the following text.
 
-        It is recommended to have Pydantic installed in order to get inputs validated. This is preferable as it allow
-        early failures.
-
         API endpoint is supposed to run with the `text-generation-inference` backend (TGI). This backend is the
         go-to solution to run large language models at scale. However, for some smaller models (e.g. "gpt2") the
         default `transformers` + `api-inference` solution is still in use. Both approaches have very similar APIs, but
@@ -1427,12 +1686,12 @@ class AsyncInferenceClient:
                 into account. Defaults to `False`.
 
         Returns:
-            `Union[str, TextGenerationResponse, Iterable[str], Iterable[TextGenerationStreamResponse]]`:
+            `Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]`:
             Generated text returned from the server:
            - if `stream=False` and `details=False`, the generated text is returned as a `str` (default)
            - if `stream=True` and `details=False`, the generated text is returned token by token as a `Iterable[str]`
-            - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.inference._text_generation.TextGenerationResponse`]
-            - if `details=True` and `stream=True`, the generated text is returned token by token as a iterable of [`~huggingface_hub.inference._text_generation.TextGenerationStreamResponse`]
+            - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.TextGenerationOutput`]
+            - if `details=True` and `stream=True`, the generated text is returned token by token as a iterable of [`~huggingface_hub.TextGenerationStreamOutput`]
 
         Raises:
             `ValidationError`:
@@ -1470,23 +1729,23 @@ class AsyncInferenceClient:
 
         # Case 3: get more details about the generation process.
         >>> await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True)
-        TextGenerationResponse(
+        TextGenerationOutput(
             generated_text='100% open source and built to be easy to use.',
-            details=Details(
-                finish_reason=<FinishReason.Length: 'length'>,
+            details=TextGenerationDetails(
+                finish_reason='length',
                 generated_tokens=12,
                 seed=None,
                 prefill=[
-                    InputToken(id=487, text='The', logprob=None),
-                    InputToken(id=53789, text=' hugging', logprob=-13.171875),
+                    TextGenerationPrefillToken(id=487, text='The', logprob=None),
+                    TextGenerationPrefillToken(id=53789, text=' hugging', logprob=-13.171875),
                     (...)
-                    InputToken(id=204, text=' ', logprob=-7.0390625)
+                    TextGenerationPrefillToken(id=204, text=' ', logprob=-7.0390625)
                 ],
                 tokens=[
-                    Token(id=1425, text='100', logprob=-1.0175781, special=False),
-                    Token(id=16, text='%', logprob=-0.0463562, special=False),
+                    TokenElement(id=1425, text='100', logprob=-1.0175781, special=False),
+                    TokenElement(id=16, text='%', logprob=-0.0463562, special=False),
                     (...)
-                    Token(id=25, text='.', logprob=-0.5703125, special=False)
+                    TokenElement(id=25, text='.', logprob=-0.5703125, special=False)
                 ],
                 best_of_sequences=None
             )
@@ -1497,30 +1756,27 @@ class AsyncInferenceClient:
         >>> async for details in await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True, stream=True):
         ...     print(details)
         ...
-        TextGenerationStreamResponse(token=Token(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
-        TextGenerationStreamResponse(token=Token(
+        TextGenerationStreamOutput(token=TokenElement(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(
             id=25,
             text='.',
             logprob=-0.5703125,
             special=False),
             generated_text='100% open source and built to be easy to use.',
-            details=StreamDetails(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=12, seed=None)
+            details=TextGenerationStreamDetails(finish_reason='length', generated_tokens=12, seed=None)
         )
         ```
         """
-        # NOTE: Text-generation integration is taken from the text-generation-inference project. It has more features
-        # like input/output validation (if Pydantic is installed). See `_text_generation.py` header for more details.
-
         if decoder_input_details and not details:
             warnings.warn(
                 "`decoder_input_details=True` has been passed to the server but `details=False` is set meaning that"
@@ -1528,34 +1784,38 @@ class AsyncInferenceClient:
  )
  decoder_input_details = False

- # Validate parameters
- parameters = TextGenerationParameters(
- best_of=best_of,
- details=details,
- do_sample=do_sample,
- max_new_tokens=max_new_tokens,
- repetition_penalty=repetition_penalty,
- return_full_text=return_full_text,
- seed=seed,
- stop=stop_sequences if stop_sequences is not None else [],
- temperature=temperature,
- top_k=top_k,
- top_p=top_p,
- truncate=truncate,
- typical_p=typical_p,
- watermark=watermark,
- decoder_input_details=decoder_input_details,
- )
- request = TextGenerationRequest(inputs=prompt, stream=stream, parameters=parameters)
- payload = asdict(request)
+ # Build payload
+ payload = {
+ "inputs": prompt,
+ "parameters": {
+ "best_of": best_of,
+ "decoder_input_details": decoder_input_details,
+ "details": details,
+ "do_sample": do_sample,
+ "max_new_tokens": max_new_tokens,
+ "repetition_penalty": repetition_penalty,
+ "return_full_text": return_full_text,
+ "seed": seed,
+ "stop": stop_sequences if stop_sequences is not None else [],
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ "truncate": truncate,
+ "typical_p": typical_p,
+ "watermark": watermark,
+ },
+ "stream": stream,
+ }

  # Remove some parameters if not a TGI server
  if not _is_tgi_server(model):
+ parameters: Dict = payload["parameters"] # type: ignore [assignment]
+
  ignored_parameters = []
- for key in "watermark", "stop", "details", "decoder_input_details", "best_of":
- if payload["parameters"][key] is not None:
+ for key in "watermark", "details", "decoder_input_details", "best_of", "stop", "return_full_text":
+ if parameters[key] is not None:
  ignored_parameters.append(key)
- del payload["parameters"][key]
+ del parameters[key]
  if len(ignored_parameters) > 0:
  warnings.warn(
  "API endpoint/model for text-generation is not served via TGI. Ignoring parameters"
@@ -1608,8 +1868,8 @@ class AsyncInferenceClient:
  if stream:
  return _async_stream_text_generation_response(bytes_output, details) # type: ignore

- data = _bytes_to_dict(bytes_output)[0]
- return TextGenerationResponse(**data) if details else data["generated_text"]
+ data = _bytes_to_dict(bytes_output)[0] # type: ignore[arg-type]
+ return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]

  async def text_to_image(
  self,
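Taken together, the `text_generation` hunks above swap the old text-generation-inference dataclasses (`TextGenerationResponse`, `TextGenerationStreamResponse`, `Token`, `StreamDetails`) for the newly generated types (`TextGenerationOutput`, `TextGenerationStreamOutput`, `TokenElement`, `TextGenerationStreamDetails`), with `finish_reason` now a plain string. The sketch below shows what calling code against the new types could look like; it is a minimal illustration, and the model ID and prompt are assumptions, not taken from the diff.

```py
# Minimal sketch of text_generation usage against the new output types (0.22.0rc0).
# The model ID and prompt are illustrative only.
import asyncio
from huggingface_hub import AsyncInferenceClient

async def main() -> None:
    client = AsyncInferenceClient(model="HuggingFaceH4/zephyr-7b-beta")  # hypothetical choice

    # Non-streaming with details=True now returns a TextGenerationOutput dataclass.
    output = await client.text_generation("The huggingface_hub library is ", details=True, max_new_tokens=12)
    print(output.generated_text, output.details.finish_reason)

    # Streaming yields TextGenerationStreamOutput items; details.finish_reason is a plain string.
    async for chunk in await client.text_generation(
        "The huggingface_hub library is ", stream=True, details=True, max_new_tokens=12
    ):
        print(chunk.token.text, end="")
        if chunk.details is not None:
            print(f"\n(finished: {chunk.details.finish_reason})")

asyncio.run(main())
```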
@@ -1725,7 +1985,9 @@ class AsyncInferenceClient:
  """
  return await self.post(json={"inputs": text}, model=model, task="text-to-speech")

- async def token_classification(self, text: str, *, model: Optional[str] = None) -> List[TokenClassificationOutput]:
+ async def token_classification(
+ self, text: str, *, model: Optional[str] = None
+ ) -> List[TokenClassificationOutputElement]:
  """
  Perform token classification on the given text.
  Usually used for sentence parsing, either grammatical, or Named Entity Recognition (NER) to understand keywords contained within text.
@@ -1739,7 +2001,7 @@ class AsyncInferenceClient:
  Defaults to None.

  Returns:
- `List[Dict]`: List of token classification outputs containing the entity group, confidence score, word, start and end index.
+ `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1753,16 +2015,22 @@ class AsyncInferenceClient:
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
  >>> await client.token_classification("My name is Sarah Jessica Parker but you can call me Jessica")
- [{'entity_group': 'PER',
- 'score': 0.9971321225166321,
- 'word': 'Sarah Jessica Parker',
- 'start': 11,
- 'end': 31},
- {'entity_group': 'PER',
- 'score': 0.9773476123809814,
- 'word': 'Jessica',
- 'start': 52,
- 'end': 59}]
+ [
+ TokenClassificationOutputElement(
+ entity_group='PER',
+ score=0.9971321225166321,
+ word='Sarah Jessica Parker',
+ start=11,
+ end=31,
+ ),
+ TokenClassificationOutputElement(
+ entity_group='PER',
+ score=0.9773476123809814,
+ word='Jessica',
+ start=52,
+ end=59,
+ )
+ ]
  ```
  """
  payload: Dict[str, Any] = {"inputs": text}
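As the updated docstring example above shows, `token_classification` now returns `TokenClassificationOutputElement` items instead of raw dicts (the return statement itself changes in the next hunk), so callers read attributes rather than keys. A hedged sketch, reusing the sentence from the example:

```py
# Sketch only: iterate the dataclass items returned by the updated method,
# reading attributes where older code read dict keys.
from huggingface_hub import AsyncInferenceClient

async def show_entities() -> None:
    client = AsyncInferenceClient()
    entities = await client.token_classification(
        "My name is Sarah Jessica Parker but you can call me Jessica"
    )
    for entity in entities:
        # Old style: entity["entity_group"], entity["score"]
        print(f"{entity.word!r} -> {entity.entity_group} ({entity.score:.3f})")
```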
@@ -1771,11 +2039,11 @@ class AsyncInferenceClient:
  model=model,
  task="token-classification",
  )
- return _bytes_to_list(response)
+ return TokenClassificationOutputElement.parse_obj_as_list(response)

  async def translation(
  self, text: str, *, model: Optional[str] = None, src_lang: Optional[str] = None, tgt_lang: Optional[str] = None
- ) -> str:
+ ) -> TranslationOutput:
  """
  Convert text from one language to another.

@@ -1798,7 +2066,7 @@ class AsyncInferenceClient:
  Target language of the translation task, i.e. output language. Cannot be passed without `src_lang`.

  Returns:
- `str`: The generated translated text.
+ [`TranslationOutput`]: The generated translated text.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1816,7 +2084,7 @@ class AsyncInferenceClient:
  >>> await client.translation("My name is Wolfgang and I live in Berlin")
  'Mein Name ist Wolfgang und ich lebe in Berlin.'
  >>> await client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
- "Je m'appelle Wolfgang et je vis à Berlin."
+ TranslationOutput(translation_text='Je m\'appelle Wolfgang et je vis à Berlin.')
  ```

  Specifying languages:
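Beyond the docstring update above, the hunks around `translation` change its return type from a bare string to a [`TranslationOutput`], so code that expects plain text now reads `.translation_text`. A hedged migration sketch, reusing the model ID from the example:

```py
# Sketch only: the updated translation method returns a dataclass,
# so the translated string now lives on the .translation_text attribute.
from huggingface_hub import AsyncInferenceClient

async def translate() -> str:
    client = AsyncInferenceClient()
    result = await client.translation(
        "My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr"
    )
    return result.translation_text  # previously the method returned this string directly
```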
@@ -1837,11 +2105,59 @@ class AsyncInferenceClient:
  if src_lang and tgt_lang:
  payload["parameters"] = {"src_lang": src_lang, "tgt_lang": tgt_lang}
  response = await self.post(json=payload, model=model, task="translation")
- return _bytes_to_dict(response)[0]["translation_text"]
+ return TranslationOutput.parse_obj_as_list(response)[0]
+
+ async def visual_question_answering(
+ self,
+ image: ContentT,
+ question: str,
+ *,
+ model: Optional[str] = None,
+ ) -> List[VisualQuestionAnsweringOutputElement]:
+ """
+ Answering open-ended questions based on an image.
+
+ Args:
+ image (`Union[str, Path, bytes, BinaryIO]`):
+ The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+ question (`str`):
+ Question to be answered.
+ model (`str`, *optional*):
+ The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+ a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
+ Defaults to None.
+
+ Returns:
+ `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
+
+ Raises:
+ `InferenceTimeoutError`:
+ If the model is unavailable or the request times out.
+ `aiohttp.ClientResponseError`:
+ If the request fails with an HTTP error status code other than HTTP 503.
+
+ Example:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> client = AsyncInferenceClient()
+ >>> await client.visual_question_answering(
+ ... image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
+ ... question="What is the animal doing?"
+ ... )
+ [
+ VisualQuestionAnsweringOutputElement(score=0.778609573841095, answer='laying down'),
+ VisualQuestionAnsweringOutputElement(score=0.6957435607910156, answer='sitting'),
+ ]
+ ```
+ """
+ payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+ response = await self.post(json=payload, model=model, task="visual-question-answering")
+ return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
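The `visual_question_answering` method added above accepts the same `ContentT` image inputs as the other vision tasks (raw bytes, a local file path, or a URL). The sketch below is a hedged variation on the docstring example that uses a local file and keeps only the top answer; the file name is illustrative.

```py
# Sketch only: call the new visual_question_answering method with a local image
# file and return the highest-scoring answer.
from huggingface_hub import AsyncInferenceClient

async def best_answer() -> str:
    client = AsyncInferenceClient()
    answers = await client.visual_question_answering(
        image="cat.jpg",  # illustrative local path; raw bytes or a URL also work
        question="What is the animal doing?",
    )
    return answers[0].answer  # elements expose .answer and .score
```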

  async def zero_shot_classification(
  self, text: str, labels: List[str], *, multi_label: bool = False, model: Optional[str] = None
- ) -> List[ClassificationOutput]:
+ ) -> List[ZeroShotClassificationOutputElement]:
  """
  Provide as input a text and a set of candidate labels to classify the input text.

@@ -1857,7 +2173,7 @@ class AsyncInferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.

  Returns:
- `List[Dict]`: List of classification outputs containing the predicted labels and their confidence.
+ `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1878,19 +2194,19 @@ class AsyncInferenceClient:
  >>> labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
  >>> await client.zero_shot_classification(text, labels)
  [
- {"label": "scientific discovery", "score": 0.7961668968200684},
- {"label": "space & cosmos", "score": 0.18570658564567566},
- {"label": "microbiology", "score": 0.00730885099619627},
- {"label": "archeology", "score": 0.006258360575884581},
- {"label": "robots", "score": 0.004559356719255447},
+ ZeroShotClassificationOutputElement(label='scientific discovery', score=0.7961668968200684),
+ ZeroShotClassificationOutputElement(label='space & cosmos', score=0.18570658564567566),
+ ZeroShotClassificationOutputElement(label='microbiology', score=0.00730885099619627),
+ ZeroShotClassificationOutputElement(label='archeology', score=0.006258360575884581),
+ ZeroShotClassificationOutputElement(label='robots', score=0.004559356719255447),
  ]
  >>> await client.zero_shot_classification(text, labels, multi_label=True)
  [
- {"label": "scientific discovery", "score": 0.9829297661781311},
- {"label": "space & cosmos", "score": 0.755190908908844},
- {"label": "microbiology", "score": 0.0005462635890580714},
- {"label": "archeology", "score": 0.00047131875180639327},
- {"label": "robots", "score": 0.00030448526376858354},
+ ZeroShotClassificationOutputElement(label='scientific discovery', score=0.9829297661781311),
+ ZeroShotClassificationOutputElement(label='space & cosmos', score=0.755190908908844),
+ ZeroShotClassificationOutputElement(label='microbiology', score=0.0005462635890580714),
+ ZeroShotClassificationOutputElement(label='archeology', score=0.00047131875180639327),
+ ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
  ]
  ```
  """
@@ -1910,11 +2226,14 @@ class AsyncInferenceClient:
  task="zero-shot-classification",
  )
  output = _bytes_to_dict(response)
- return [{"label": label, "score": score} for label, score in zip(output["labels"], output["scores"])]
+ return [
+ ZeroShotClassificationOutputElement.parse_obj_as_instance({"label": label, "score": score})
+ for label, score in zip(output["labels"], output["scores"])
+ ]

  async def zero_shot_image_classification(
  self, image: ContentT, labels: List[str], *, model: Optional[str] = None
- ) -> List[ClassificationOutput]:
+ ) -> List[ZeroShotImageClassificationOutputElement]:
  """
  Provide input image and text labels to predict text labels for the image.

@@ -1928,7 +2247,7 @@ class AsyncInferenceClient:
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.

  Returns:
- `List[Dict]`: List of classification outputs containing the predicted labels and their confidence.
+ `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

  Raises:
  [`InferenceTimeoutError`]:
@@ -1946,7 +2265,7 @@ class AsyncInferenceClient:
  ... "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
  ... labels=["dog", "cat", "horse"],
  ... )
- [{"label": "dog", "score": 0.956}, ...]
+ [ZeroShotImageClassificationOutputElement(label='dog', score=0.956),...]
  ```
  """
  # Raise ValueError if input is less than 2 labels
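The same dict-to-dataclass change applies to `zero_shot_image_classification`. A hedged sketch reusing the image URL from the docstring example above:

```py
# Sketch only: pick the highest-scoring label from the dataclass results.
from huggingface_hub import AsyncInferenceClient

async def best_label() -> str:
    client = AsyncInferenceClient()
    results = await client.zero_shot_image_classification(
        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
        labels=["dog", "cat", "horse"],
    )
    return max(results, key=lambda r: r.score).label
```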
@@ -1958,7 +2277,7 @@ class AsyncInferenceClient:
  model=model,
  task="zero-shot-image-classification",
  )
- return _bytes_to_list(response)
+ return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)

  def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
  model = model or self.model