huggingface-hub 0.29.0rc2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. huggingface_hub/__init__.py +160 -46
  2. huggingface_hub/_commit_api.py +277 -71
  3. huggingface_hub/_commit_scheduler.py +15 -15
  4. huggingface_hub/_inference_endpoints.py +33 -22
  5. huggingface_hub/_jobs_api.py +301 -0
  6. huggingface_hub/_local_folder.py +18 -3
  7. huggingface_hub/_login.py +31 -63
  8. huggingface_hub/_oauth.py +460 -0
  9. huggingface_hub/_snapshot_download.py +241 -81
  10. huggingface_hub/_space_api.py +18 -10
  11. huggingface_hub/_tensorboard_logger.py +15 -19
  12. huggingface_hub/_upload_large_folder.py +196 -76
  13. huggingface_hub/_webhooks_payload.py +3 -3
  14. huggingface_hub/_webhooks_server.py +15 -25
  15. huggingface_hub/{commands → cli}/__init__.py +1 -15
  16. huggingface_hub/cli/_cli_utils.py +173 -0
  17. huggingface_hub/cli/auth.py +147 -0
  18. huggingface_hub/cli/cache.py +841 -0
  19. huggingface_hub/cli/download.py +189 -0
  20. huggingface_hub/cli/hf.py +60 -0
  21. huggingface_hub/cli/inference_endpoints.py +377 -0
  22. huggingface_hub/cli/jobs.py +772 -0
  23. huggingface_hub/cli/lfs.py +175 -0
  24. huggingface_hub/cli/repo.py +315 -0
  25. huggingface_hub/cli/repo_files.py +94 -0
  26. huggingface_hub/{commands/env.py → cli/system.py} +10 -13
  27. huggingface_hub/cli/upload.py +294 -0
  28. huggingface_hub/cli/upload_large_folder.py +117 -0
  29. huggingface_hub/community.py +20 -12
  30. huggingface_hub/constants.py +83 -59
  31. huggingface_hub/dataclasses.py +609 -0
  32. huggingface_hub/errors.py +99 -30
  33. huggingface_hub/fastai_utils.py +30 -41
  34. huggingface_hub/file_download.py +606 -346
  35. huggingface_hub/hf_api.py +2445 -1132
  36. huggingface_hub/hf_file_system.py +269 -152
  37. huggingface_hub/hub_mixin.py +61 -66
  38. huggingface_hub/inference/_client.py +501 -630
  39. huggingface_hub/inference/_common.py +133 -121
  40. huggingface_hub/inference/_generated/_async_client.py +536 -722
  41. huggingface_hub/inference/_generated/types/__init__.py +6 -1
  42. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +5 -6
  43. huggingface_hub/inference/_generated/types/base.py +10 -7
  44. huggingface_hub/inference/_generated/types/chat_completion.py +77 -31
  45. huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
  46. huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
  47. huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
  48. huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
  49. huggingface_hub/inference/_generated/types/image_to_image.py +8 -2
  50. huggingface_hub/inference/_generated/types/image_to_text.py +2 -3
  51. huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
  52. huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
  53. huggingface_hub/inference/_generated/types/summarization.py +2 -2
  54. huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
  55. huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
  56. huggingface_hub/inference/_generated/types/text_generation.py +11 -11
  57. huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
  58. huggingface_hub/inference/_generated/types/text_to_speech.py +1 -2
  59. huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
  60. huggingface_hub/inference/_generated/types/token_classification.py +2 -2
  61. huggingface_hub/inference/_generated/types/translation.py +2 -2
  62. huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
  63. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
  64. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
  65. huggingface_hub/inference/_mcp/__init__.py +0 -0
  66. huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
  67. huggingface_hub/inference/_mcp/agent.py +100 -0
  68. huggingface_hub/inference/_mcp/cli.py +247 -0
  69. huggingface_hub/inference/_mcp/constants.py +81 -0
  70. huggingface_hub/inference/_mcp/mcp_client.py +395 -0
  71. huggingface_hub/inference/_mcp/types.py +45 -0
  72. huggingface_hub/inference/_mcp/utils.py +128 -0
  73. huggingface_hub/inference/_providers/__init__.py +149 -20
  74. huggingface_hub/inference/_providers/_common.py +160 -37
  75. huggingface_hub/inference/_providers/black_forest_labs.py +12 -9
  76. huggingface_hub/inference/_providers/cerebras.py +6 -0
  77. huggingface_hub/inference/_providers/clarifai.py +13 -0
  78. huggingface_hub/inference/_providers/cohere.py +32 -0
  79. huggingface_hub/inference/_providers/fal_ai.py +231 -22
  80. huggingface_hub/inference/_providers/featherless_ai.py +38 -0
  81. huggingface_hub/inference/_providers/fireworks_ai.py +22 -1
  82. huggingface_hub/inference/_providers/groq.py +9 -0
  83. huggingface_hub/inference/_providers/hf_inference.py +143 -33
  84. huggingface_hub/inference/_providers/hyperbolic.py +9 -5
  85. huggingface_hub/inference/_providers/nebius.py +47 -5
  86. huggingface_hub/inference/_providers/novita.py +48 -5
  87. huggingface_hub/inference/_providers/nscale.py +44 -0
  88. huggingface_hub/inference/_providers/openai.py +25 -0
  89. huggingface_hub/inference/_providers/publicai.py +6 -0
  90. huggingface_hub/inference/_providers/replicate.py +46 -9
  91. huggingface_hub/inference/_providers/sambanova.py +37 -1
  92. huggingface_hub/inference/_providers/scaleway.py +28 -0
  93. huggingface_hub/inference/_providers/together.py +34 -5
  94. huggingface_hub/inference/_providers/wavespeed.py +138 -0
  95. huggingface_hub/inference/_providers/zai_org.py +17 -0
  96. huggingface_hub/lfs.py +33 -100
  97. huggingface_hub/repocard.py +34 -38
  98. huggingface_hub/repocard_data.py +79 -59
  99. huggingface_hub/serialization/__init__.py +0 -1
  100. huggingface_hub/serialization/_base.py +12 -15
  101. huggingface_hub/serialization/_dduf.py +8 -8
  102. huggingface_hub/serialization/_torch.py +69 -69
  103. huggingface_hub/utils/__init__.py +27 -8
  104. huggingface_hub/utils/_auth.py +7 -7
  105. huggingface_hub/utils/_cache_manager.py +92 -147
  106. huggingface_hub/utils/_chunk_utils.py +2 -3
  107. huggingface_hub/utils/_deprecation.py +1 -1
  108. huggingface_hub/utils/_dotenv.py +55 -0
  109. huggingface_hub/utils/_experimental.py +7 -5
  110. huggingface_hub/utils/_fixes.py +0 -10
  111. huggingface_hub/utils/_git_credential.py +5 -5
  112. huggingface_hub/utils/_headers.py +8 -30
  113. huggingface_hub/utils/_http.py +399 -237
  114. huggingface_hub/utils/_pagination.py +6 -6
  115. huggingface_hub/utils/_parsing.py +98 -0
  116. huggingface_hub/utils/_paths.py +5 -5
  117. huggingface_hub/utils/_runtime.py +74 -22
  118. huggingface_hub/utils/_safetensors.py +21 -21
  119. huggingface_hub/utils/_subprocess.py +13 -11
  120. huggingface_hub/utils/_telemetry.py +4 -4
  121. huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
  122. huggingface_hub/utils/_typing.py +25 -5
  123. huggingface_hub/utils/_validators.py +55 -74
  124. huggingface_hub/utils/_verification.py +167 -0
  125. huggingface_hub/utils/_xet.py +235 -0
  126. huggingface_hub/utils/_xet_progress_reporting.py +162 -0
  127. huggingface_hub/utils/insecure_hashlib.py +3 -5
  128. huggingface_hub/utils/logging.py +8 -11
  129. huggingface_hub/utils/tqdm.py +33 -4
  130. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -82
  131. huggingface_hub-1.1.3.dist-info/RECORD +155 -0
  132. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
  133. huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
  134. huggingface_hub/commands/delete_cache.py +0 -428
  135. huggingface_hub/commands/download.py +0 -200
  136. huggingface_hub/commands/huggingface_cli.py +0 -61
  137. huggingface_hub/commands/lfs.py +0 -200
  138. huggingface_hub/commands/repo_files.py +0 -128
  139. huggingface_hub/commands/scan_cache.py +0 -181
  140. huggingface_hub/commands/tag.py +0 -159
  141. huggingface_hub/commands/upload.py +0 -299
  142. huggingface_hub/commands/upload_large_folder.py +0 -129
  143. huggingface_hub/commands/user.py +0 -304
  144. huggingface_hub/commands/version.py +0 -37
  145. huggingface_hub/inference_api.py +0 -217
  146. huggingface_hub/keras_mixin.py +0 -500
  147. huggingface_hub/repository.py +0 -1477
  148. huggingface_hub/serialization/_tensorflow.py +0 -95
  149. huggingface_hub/utils/_hf_folder.py +0 -68
  150. huggingface_hub-0.29.0rc2.dist-info/RECORD +0 -131
  151. huggingface_hub-0.29.0rc2.dist-info/entry_points.txt +0 -6
  152. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
  153. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
@@ -21,16 +21,19 @@
  import asyncio
  import base64
  import logging
+ import os
  import re
  import warnings
- from typing import TYPE_CHECKING, Any, AsyncIterable, Dict, List, Literal, Optional, Set, Union, overload
+ from contextlib import AsyncExitStack
+ from typing import TYPE_CHECKING, Any, AsyncIterable, Literal, Optional, Union, overload
+
+ import httpx

  from huggingface_hub import constants
- from huggingface_hub.errors import InferenceTimeoutError
+ from huggingface_hub.errors import BadRequestError, HfHubHTTPError, InferenceTimeoutError
  from huggingface_hub.inference._common import (
  TASKS_EXPECTING_IMAGES,
  ContentT,
- ModelStatus,
  RequestParameters,
  _async_stream_chat_completion_response,
  _async_stream_text_generation_response,
@@ -41,7 +44,6 @@ from huggingface_hub.inference._common import (
  _bytes_to_list,
  _get_unsupported_text_generation_kwargs,
  _import_numpy,
- _open_as_binary,
  _set_unsupported_text_generation_kwargs,
  raise_text_generation_error,
  )
@@ -51,6 +53,7 @@ from huggingface_hub.inference._generated.types import (
  AudioToAudioOutputElement,
  AutomaticSpeechRecognitionOutput,
  ChatCompletionInputGrammarType,
+ ChatCompletionInputMessage,
  ChatCompletionInputStreamOptions,
  ChatCompletionInputTool,
  ChatCompletionInputToolChoiceClass,
@@ -65,6 +68,7 @@ from huggingface_hub.inference._generated.types import (
  ImageSegmentationSubtask,
  ImageToImageTargetSize,
  ImageToTextOutput,
+ ImageToVideoTargetSize,
  ObjectDetectionOutputElement,
  Padding,
  QuestionAnsweringOutputElement,
@@ -85,16 +89,20 @@ from huggingface_hub.inference._generated.types import (
  ZeroShotClassificationOutputElement,
  ZeroShotImageClassificationOutputElement,
  )
- from huggingface_hub.inference._providers import PROVIDER_T, HFInferenceTask, get_provider_helper
- from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
- from huggingface_hub.utils._deprecation import _deprecate_arguments, _deprecate_method
+ from huggingface_hub.inference._providers import PROVIDER_OR_POLICY_T, get_provider_helper
+ from huggingface_hub.utils import (
+ build_hf_headers,
+ get_async_session,
+ hf_raise_for_status,
+ validate_hf_hub_args,
+ )
+ from huggingface_hub.utils._auth import get_token

- from .._common import _async_yield_from, _import_aiohttp
+ from .._common import _async_yield_from


  if TYPE_CHECKING:
  import numpy as np
- from aiohttp import ClientResponse, ClientSession
  from PIL.Image import Image

  logger = logging.getLogger(__name__)
@@ -116,30 +124,25 @@ class AsyncInferenceClient:
  or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
  automatically selected for the task.
  Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
- arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
- path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
- documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
+ arguments are mutually exclusive. If a URL is passed as `model` or `base_url` for chat completion, the `(/v1)/chat/completions` suffix path will be appended to the URL.
  provider (`str`, *optional*):
- Name of the provider to use for inference. Can be `"black-forest-labs"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, "sambanova"` or `"together"`.
- defaults to hf-inference (Hugging Face Serverless Inference API).
+ Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"clarifai"`, `"cohere"`, `"fal-ai"`, `"featherless-ai"`, `"fireworks-ai"`, `"groq"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"nscale"`, `"openai"`, `"publicai"`, `"replicate"`, `"sambanova"`, `"scaleway"`, `"together"`, `"wavespeed"` or `"zai-org"`.
+ Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order in https://hf.co/settings/inference-providers.
  If model is a URL or `base_url` is passed, then `provider` is not used.
- token (`str` or `bool`, *optional*):
+ token (`str`, *optional*):
  Hugging Face token. Will default to the locally saved token if not provided.
- Pass `token=False` if you don't want to send your token to the server.
  Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
  arguments are mutually exclusive and have the exact same behavior.
  timeout (`float`, `optional`):
- The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
- API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
- headers (`Dict[str, str]`, `optional`):
+ The maximum number of seconds to wait for a response from the server. Defaults to None, meaning it will loop until the server is available.
+ headers (`dict[str, str]`, `optional`):
  Additional headers to send to the server. By default only the authorization and user-agent headers are sent.
  Values in this dictionary will override the default values.
- cookies (`Dict[str, str]`, `optional`):
+ bill_to (`str`, `optional`):
+ The billing account to use for the requests. By default the requests are billed on the user's account.
+ Requests can only be billed to an organization the user is a member of, and which has subscribed to Enterprise Hub.
+ cookies (`dict[str, str]`, `optional`):
  Additional cookies to send to the server.
- trust_env ('bool', 'optional'):
- Trust environment settings for proxy configuration if the parameter is `True` (`False` by default).
- proxies (`Any`, `optional`):
- Proxies to use for the request.
  base_url (`str`, `optional`):
  Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
  follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
@@ -148,17 +151,17 @@ class AsyncInferenceClient:
  follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
  """

+ @validate_hf_hub_args
  def __init__(
  self,
  model: Optional[str] = None,
  *,
- provider: Optional[PROVIDER_T] = None,
+ provider: Optional[PROVIDER_OR_POLICY_T] = None,
  token: Optional[str] = None,
  timeout: Optional[float] = None,
- headers: Optional[Dict[str, str]] = None,
- cookies: Optional[Dict[str, str]] = None,
- trust_env: bool = False,
- proxies: Optional[Any] = None,
+ headers: Optional[dict[str, str]] = None,
+ cookies: Optional[dict[str, str]] = None,
+ bill_to: Optional[str] = None,
  # OpenAI compatibility
  base_url: Optional[str] = None,
  api_key: Optional[str] = None,
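For illustration, a minimal construction sketch matching the updated signature above. The model ID and organization name are assumptions, not values taken from this diff:

```python
from huggingface_hub import AsyncInferenceClient

# Hedged sketch: model ID and org name below are illustrative assumptions.
client = AsyncInferenceClient(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed model ID
    provider="auto",      # default policy: first provider enabled for this model on the Hub
    bill_to="my-org",     # new in 1.x: bill requests to an Enterprise Hub organization (assumed name)
    timeout=60,
    headers={"x-custom-header": "demo"},  # merged on top of the default auth/user-agent headers
)
```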
@@ -176,101 +179,78 @@ class AsyncInferenceClient:
  " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
  " It has the exact same behavior as `token`."
  )
+ token = token if token is not None else api_key
+ if isinstance(token, bool):
+ # Legacy behavior: previously it was possible to pass `token=False` to disable authentication. This is not
+ # supported anymore as authentication is required. Better to explicitly raise here rather than risking
+ # sending the locally saved token without the user knowing about it.
+ if token is False:
+ raise ValueError(
+ "Cannot use `token=False` to disable authentication as authentication is required to run Inference."
+ )
+ warnings.warn(
+ "Using `token=True` to automatically use the locally saved token is deprecated and will be removed in a future release. "
+ "Please use `token=None` instead (default).",
+ DeprecationWarning,
+ )
+ token = get_token()

  self.model: Optional[str] = base_url or model
- self.token: Optional[str] = token if token is not None else api_key
- self.headers = headers if headers is not None else {}
+ self.token: Optional[str] = token
+
+ self.headers = {**headers} if headers is not None else {}
+ if bill_to is not None:
+ if (
+ constants.HUGGINGFACE_HEADER_X_BILL_TO in self.headers
+ and self.headers[constants.HUGGINGFACE_HEADER_X_BILL_TO] != bill_to
+ ):
+ warnings.warn(
+ f"Overriding existing '{self.headers[constants.HUGGINGFACE_HEADER_X_BILL_TO]}' value in headers with '{bill_to}'.",
+ UserWarning,
+ )
+ self.headers[constants.HUGGINGFACE_HEADER_X_BILL_TO] = bill_to
+
+ if token is not None and not token.startswith("hf_"):
+ warnings.warn(
+ "You've provided an external provider's API key, so requests will be billed directly by the provider. "
+ "The `bill_to` parameter is only applicable for Hugging Face billing and will be ignored.",
+ UserWarning,
+ )

  # Configure provider
- self.provider = provider if provider is not None else "hf-inference"
+ self.provider = provider

  self.cookies = cookies
  self.timeout = timeout
- self.trust_env = trust_env
- self.proxies = proxies

- # Keep track of the sessions to close them properly
- self._sessions: Dict["ClientSession", Set["ClientResponse"]] = dict()
+ self.exit_stack = AsyncExitStack()
+ self._async_client: Optional[httpx.AsyncClient] = None

  def __repr__(self):
  return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"

- @overload
- async def post( # type: ignore[misc]
- self,
- *,
- json: Optional[Union[str, Dict, List]] = None,
- data: Optional[ContentT] = None,
- model: Optional[str] = None,
- task: Optional[str] = None,
- stream: Literal[False] = ...,
- ) -> bytes: ...
+ async def __aenter__(self):
+ return self

- @overload
- async def post( # type: ignore[misc]
- self,
- *,
- json: Optional[Union[str, Dict, List]] = None,
- data: Optional[ContentT] = None,
- model: Optional[str] = None,
- task: Optional[str] = None,
- stream: Literal[True] = ...,
- ) -> AsyncIterable[bytes]: ...
+ async def __aexit__(self, exc_type, exc_value, traceback):
+ await self.close()

- @overload
- async def post(
- self,
- *,
- json: Optional[Union[str, Dict, List]] = None,
- data: Optional[ContentT] = None,
- model: Optional[str] = None,
- task: Optional[str] = None,
- stream: bool = False,
- ) -> Union[bytes, AsyncIterable[bytes]]: ...
-
- @_deprecate_method(
- version="0.31.0",
- message=(
- "Making direct POST requests to the inference server is not supported anymore. "
- "Please use task methods instead (e.g. `InferenceClient.chat_completion`). "
- "If your use case is not supported, please open an issue in https://github.com/huggingface/huggingface_hub."
- ),
- )
- async def post(
- self,
- *,
- json: Optional[Union[str, Dict, List]] = None,
- data: Optional[ContentT] = None,
- model: Optional[str] = None,
- task: Optional[str] = None,
- stream: bool = False,
- ) -> Union[bytes, AsyncIterable[bytes]]:
+ async def close(self):
+ """Close the client.
+
+ This method is automatically called when using the client as a context manager.
  """
- Make a POST request to the inference server.
+ await self.exit_stack.aclose()

- This method is deprecated and will be removed in the future.
- Please use task methods instead (e.g. `InferenceClient.chat_completion`).
+ async def _get_async_client(self):
+ """Get a unique async client for this AsyncInferenceClient instance.
+
+ Returns the same client instance on subsequent calls, ensuring proper
+ connection reuse and resource management through the exit stack.
  """
- if self.provider != "hf-inference":
- raise ValueError(
- "Cannot use `post` with another provider than `hf-inference`. "
- "`InferenceClient.post` is deprecated and should not be used directly anymore."
- )
- provider_helper = HFInferenceTask(task or "unknown")
- mapped_model = provider_helper._prepare_mapped_model(model or self.model)
- url = provider_helper._prepare_url(self.token, mapped_model) # type: ignore[arg-type]
- headers = provider_helper._prepare_headers(self.headers, self.token) # type: ignore[arg-type]
- return await self._inner_post(
- request_parameters=RequestParameters(
- url=url,
- task=task or "unknown",
- model=model or "unknown",
- json=json,
- data=data,
- headers=headers,
- ),
- stream=stream,
- )
+ if self._async_client is None:
+ self._async_client = await self.exit_stack.enter_async_context(get_async_session())
+ return self._async_client

  @overload
  async def _inner_post( # type: ignore[misc]
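A minimal lifecycle sketch matching the exit-stack logic above: the shared httpx client is created lazily by `_get_async_client()` and released by `close()` or the async context manager. The audio file name is an assumption:

```python
import asyncio

from huggingface_hub import AsyncInferenceClient

async def main():
    # Hedged usage sketch; "sample.flac" is an assumed local file.
    async with AsyncInferenceClient() as client:
        labels = await client.audio_classification("sample.flac")
        print(labels)
    # __aexit__ awaited client.close(), which closes the AsyncExitStack
    # (and with it the underlying httpx.AsyncClient and any open streams).

asyncio.run(main())
```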
@@ -280,84 +260,59 @@ class AsyncInferenceClient:
  @overload
  async def _inner_post( # type: ignore[misc]
  self, request_parameters: RequestParameters, *, stream: Literal[True] = ...
- ) -> AsyncIterable[bytes]: ...
+ ) -> AsyncIterable[str]: ...

  @overload
  async def _inner_post(
  self, request_parameters: RequestParameters, *, stream: bool = False
- ) -> Union[bytes, AsyncIterable[bytes]]: ...
+ ) -> Union[bytes, AsyncIterable[str]]: ...

  async def _inner_post(
  self, request_parameters: RequestParameters, *, stream: bool = False
- ) -> Union[bytes, AsyncIterable[bytes]]:
+ ) -> Union[bytes, AsyncIterable[str]]:
  """Make a request to the inference server."""

- aiohttp = _import_aiohttp()
-
  # TODO: this should be handled in provider helpers directly
  if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
  request_parameters.headers["Accept"] = "image/png"

- while True:
- with _open_as_binary(request_parameters.data) as data_as_binary:
- # Do not use context manager as we don't want to close the connection immediately when returning
- # a stream
- session = self._get_client_session(headers=request_parameters.headers)
-
- try:
- response = await session.post(
- request_parameters.url, json=request_parameters.json, data=data_as_binary, proxy=self.proxies
+ try:
+ client = await self._get_async_client()
+ if stream:
+ response = await self.exit_stack.enter_async_context(
+ client.stream(
+ "POST",
+ request_parameters.url,
+ json=request_parameters.json,
+ data=request_parameters.data,
+ headers=request_parameters.headers,
+ cookies=self.cookies,
+ timeout=self.timeout,
  )
- response_error_payload = None
- if response.status != 200:
- try:
- response_error_payload = await response.json() # get payload before connection closed
- except Exception:
- pass
- response.raise_for_status()
- if stream:
- return _async_yield_from(session, response)
- else:
- content = await response.read()
- await session.close()
- return content
- except asyncio.TimeoutError as error:
- await session.close()
- # Convert any `TimeoutError` to a `InferenceTimeoutError`
- raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error # type: ignore
- except aiohttp.ClientResponseError as error:
- error.response_error_payload = response_error_payload
- await session.close()
- raise error
- except Exception:
- await session.close()
- raise
-
- async def __aenter__(self):
- return self
-
- async def __aexit__(self, exc_type, exc_value, traceback):
- await self.close()
-
- def __del__(self):
- if len(self._sessions) > 0:
- warnings.warn(
- "Deleting 'AsyncInferenceClient' client but some sessions are still open. "
- "This can happen if you've stopped streaming data from the server before the stream was complete. "
- "To close the client properly, you must call `await client.close()` "
- "or use an async context (e.g. `async with AsyncInferenceClient(): ...`."
- )
-
- async def close(self):
- """Close all open sessions.
-
- By default, 'aiohttp.ClientSession' objects are closed automatically when a call is completed. However, if you
- are streaming data from the server and you stop before the stream is complete, you must call this method to
- close the session properly.
-
- Another possibility is to use an async context (e.g. `async with AsyncInferenceClient(): ...`).
- """
- await asyncio.gather(*[session.close() for session in self._sessions.keys()])
+ )
+ hf_raise_for_status(response)
+ return _async_yield_from(client, response)
+ else:
+ response = await client.post(
+ request_parameters.url,
+ json=request_parameters.json,
+ data=request_parameters.data,
+ headers=request_parameters.headers,
+ cookies=self.cookies,
+ timeout=self.timeout,
+ )
+ hf_raise_for_status(response)
+ return response.content
+ except asyncio.TimeoutError as error:
+ # Convert any `TimeoutError` to a `InferenceTimeoutError`
+ raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error # type: ignore
+ except HfHubHTTPError as error:
+ if error.response.status_code == 422 and request_parameters.task != "unknown":
+ msg = str(error.args[0])
+ if len(error.response.text) > 0:
+ msg += f"{os.linesep}{error.response.text}{os.linesep}"
+ error.args = (msg,) + error.args[1:]
+ raise

  async def audio_classification(
  self,
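A sketch of the new error surface in `_inner_post` above: HTTP failures now surface as `HfHubHTTPError` (with the httpx response attached) instead of `aiohttp.ClientResponseError`, and timeouts as `InferenceTimeoutError`. The file path is an assumption:

```python
from huggingface_hub import AsyncInferenceClient
from huggingface_hub.errors import HfHubHTTPError, InferenceTimeoutError

async def classify_safely(path: str):
    client = AsyncInferenceClient()
    try:
        return await client.audio_classification(path)
    except InferenceTimeoutError:
        return None  # model unavailable or the request timed out
    except HfHubHTTPError as err:
        # the underlying httpx response is attached, as used by the 422 handling above
        print(err.response.status_code, err.response.text)
        raise
```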
@@ -366,7 +321,7 @@ class AsyncInferenceClient:
  model: Optional[str] = None,
  top_k: Optional[int] = None,
  function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
- ) -> List[AudioClassificationOutputElement]:
+ ) -> list[AudioClassificationOutputElement]:
  """
  Perform audio classification on the provided audio content.

@@ -384,12 +339,12 @@ class AsyncInferenceClient:
  The function to apply to the model outputs in order to retrieve the scores.

  Returns:
- `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
+ `list[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.

  Raises:
  [`InferenceTimeoutError`]:
  If the model is unavailable or the request times out.
- `aiohttp.ClientResponseError`:
+ [`HfHubHTTPError`]:
  If the request fails with an HTTP error status code other than HTTP 503.

  Example:
@@ -405,12 +360,13 @@ class AsyncInferenceClient:
  ]
  ```
  """
- provider_helper = get_provider_helper(self.provider, task="audio-classification")
+ model_id = model or self.model
+ provider_helper = get_provider_helper(self.provider, task="audio-classification", model=model_id)
  request_parameters = provider_helper.prepare_request(
  inputs=audio,
  parameters={"function_to_apply": function_to_apply, "top_k": top_k},
  headers=self.headers,
- model=model or self.model,
+ model=model_id,
  api_key=self.token,
  )
  response = await self._inner_post(request_parameters)
@@ -421,7 +377,7 @@ class AsyncInferenceClient:
  audio: ContentT,
  *,
  model: Optional[str] = None,
- ) -> List[AudioToAudioOutputElement]:
+ ) -> list[AudioToAudioOutputElement]:
  """
  Performs multiple tasks related to audio-to-audio depending on the model (eg: speech enhancement, source separation).

@@ -435,12 +391,12 @@ class AsyncInferenceClient:
  audio_to_audio will be used.

  Returns:
- `List[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing audios label, content-type, and audio content in blob.
+ `list[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing audios label, content-type, and audio content in blob.

  Raises:
  `InferenceTimeoutError`:
  If the model is unavailable or the request times out.
- `aiohttp.ClientResponseError`:
+ [`HfHubHTTPError`]:
  If the request fails with an HTTP error status code other than HTTP 503.

  Example:
@@ -454,12 +410,13 @@ class AsyncInferenceClient:
  f.write(item.blob)
  ```
  """
- provider_helper = get_provider_helper(self.provider, task="audio-to-audio")
+ model_id = model or self.model
+ provider_helper = get_provider_helper(self.provider, task="audio-to-audio", model=model_id)
  request_parameters = provider_helper.prepare_request(
  inputs=audio,
  parameters={},
  headers=self.headers,
- model=model or self.model,
+ model=model_id,
  api_key=self.token,
  )
  response = await self._inner_post(request_parameters)
@@ -473,7 +430,7 @@ class AsyncInferenceClient:
  audio: ContentT,
  *,
  model: Optional[str] = None,
- extra_body: Optional[Dict] = None,
+ extra_body: Optional[dict] = None,
  ) -> AutomaticSpeechRecognitionOutput:
  """
  Perform automatic speech recognition (ASR or audio-to-text) on the given audio content.
@@ -484,7 +441,7 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for ASR. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. If not provided, the default recommended model for ASR will be used.
- extra_body (`Dict`, *optional*):
+ extra_body (`dict`, *optional*):
  Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
  for supported parameters.
  Returns:
@@ -493,7 +450,7 @@ class AsyncInferenceClient:
  Raises:
  [`InferenceTimeoutError`]:
  If the model is unavailable or the request times out.
- `aiohttp.ClientResponseError`:
+ [`HfHubHTTPError`]:
  If the request fails with an HTTP error status code other than HTTP 503.

  Example:
@@ -505,12 +462,13 @@ class AsyncInferenceClient:
  "hello world"
  ```
  """
- provider_helper = get_provider_helper(self.provider, task="automatic-speech-recognition")
+ model_id = model or self.model
+ provider_helper = get_provider_helper(self.provider, task="automatic-speech-recognition", model=model_id)
  request_parameters = provider_helper.prepare_request(
  inputs=audio,
  parameters={**(extra_body or {})},
  headers=self.headers,
- model=model or self.model,
+ model=model_id,
  api_key=self.token,
  )
  response = await self._inner_post(request_parameters)
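A hedged sketch of the `extra_body` pass-through added to `automatic_speech_recognition` above; the keys inside `extra_body` are provider-specific and purely illustrative:

```python
from huggingface_hub import AsyncInferenceClient

async def transcribe():
    # Provider choice and extra_body keys are assumptions; check your provider's documentation.
    client = AsyncInferenceClient(provider="fal-ai")
    output = await client.automatic_speech_recognition(
        "hello_world.flac",             # assumed local audio file
        extra_body={"language": "en"},  # hypothetical provider-specific parameter
    )
    return output.text
```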
@@ -519,121 +477,117 @@ class AsyncInferenceClient:
  @overload
  async def chat_completion( # type: ignore
  self,
- messages: List[Dict],
+ messages: list[Union[dict, ChatCompletionInputMessage]],
  *,
  model: Optional[str] = None,
  stream: Literal[False] = False,
  frequency_penalty: Optional[float] = None,
- logit_bias: Optional[List[float]] = None,
+ logit_bias: Optional[list[float]] = None,
  logprobs: Optional[bool] = None,
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
- stop: Optional[List[str]] = None,
+ stop: Optional[list[str]] = None,
  stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
  tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[list[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
- extra_body: Optional[Dict] = None,
+ extra_body: Optional[dict] = None,
  ) -> ChatCompletionOutput: ...

  @overload
  async def chat_completion( # type: ignore
  self,
- messages: List[Dict],
+ messages: list[Union[dict, ChatCompletionInputMessage]],
  *,
  model: Optional[str] = None,
  stream: Literal[True] = True,
  frequency_penalty: Optional[float] = None,
- logit_bias: Optional[List[float]] = None,
+ logit_bias: Optional[list[float]] = None,
  logprobs: Optional[bool] = None,
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
- stop: Optional[List[str]] = None,
+ stop: Optional[list[str]] = None,
  stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
  tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[list[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
- extra_body: Optional[Dict] = None,
+ extra_body: Optional[dict] = None,
  ) -> AsyncIterable[ChatCompletionStreamOutput]: ...

  @overload
  async def chat_completion(
  self,
- messages: List[Dict],
+ messages: list[Union[dict, ChatCompletionInputMessage]],
  *,
  model: Optional[str] = None,
  stream: bool = False,
  frequency_penalty: Optional[float] = None,
- logit_bias: Optional[List[float]] = None,
+ logit_bias: Optional[list[float]] = None,
  logprobs: Optional[bool] = None,
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
- stop: Optional[List[str]] = None,
+ stop: Optional[list[str]] = None,
  stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
  tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[list[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
- extra_body: Optional[Dict] = None,
+ extra_body: Optional[dict] = None,
  ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ...

  async def chat_completion(
  self,
- messages: List[Dict],
+ messages: list[Union[dict, ChatCompletionInputMessage]],
  *,
  model: Optional[str] = None,
  stream: bool = False,
  # Parameters from ChatCompletionInput (handled manually)
  frequency_penalty: Optional[float] = None,
- logit_bias: Optional[List[float]] = None,
+ logit_bias: Optional[list[float]] = None,
  logprobs: Optional[bool] = None,
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
  response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
- stop: Optional[List[str]] = None,
+ stop: Optional[list[str]] = None,
  stream_options: Optional[ChatCompletionInputStreamOptions] = None,
  temperature: Optional[float] = None,
  tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
  tool_prompt: Optional[str] = None,
- tools: Optional[List[ChatCompletionInputTool]] = None,
+ tools: Optional[list[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
  top_p: Optional[float] = None,
- extra_body: Optional[Dict] = None,
+ extra_body: Optional[dict] = None,
  ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
  """
  A method for completing conversations using a specified language model.

- <Tip>
-
- The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
- Inputs and outputs are strictly the same and using either syntax will yield the same results.
- Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
- for more details about OpenAI's compatibility.
+ > [!TIP]
+ > The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+ > Inputs and outputs are strictly the same and using either syntax will yield the same results.
+ > Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+ > for more details about OpenAI's compatibility.

- </Tip>
-
- <Tip>
- You can pass provider-specific parameters to the model by using the `extra_body` argument.
- </Tip>
+ > [!TIP]
+ > You can pass provider-specific parameters to the model by using the `extra_body` argument.

  Args:
  messages (List of [`ChatCompletionInputMessage`]):
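For reference, a sketch of the OpenAI-style alias mentioned in the tip above (`client.chat.completions.create`), here with streaming; the model ID is an illustrative assumption:

```python
from huggingface_hub import AsyncInferenceClient

async def stream_reply():
    client = AsyncInferenceClient()
    # Hedged sketch: model ID is an assumption, not taken from this diff.
    stream = await client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": "Say hello"}],
        stream=True,
        max_tokens=64,
    )
    async for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="")
```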
@@ -647,7 +601,7 @@ class AsyncInferenceClient:
  frequency_penalty (`float`, *optional*):
  Penalizes new tokens based on their existing frequency
  in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
- logit_bias (`List[float]`, *optional*):
+ logit_bias (`list[float]`, *optional*):
  Adjusts the likelihood of specific tokens appearing in the generated output.
  logprobs (`bool`, *optional*):
  Whether to return log probabilities of the output tokens or not. If true, returns the log
@@ -663,7 +617,7 @@ class AsyncInferenceClient:
  Grammar constraints. Can be either a JSONSchema or a regex.
  seed (Optional[`int`], *optional*):
  Seed for reproducible control flow. Defaults to None.
- stop (`List[str]`, *optional*):
+ stop (`list[str]`, *optional*):
  Up to four strings which trigger the end of the response.
  Defaults to None.
  stream (`bool`, *optional*):
@@ -687,7 +641,7 @@ class AsyncInferenceClient:
  tools (List of [`ChatCompletionInputTool`], *optional*):
  A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
  provide a list of functions the model may generate JSON inputs for.
- extra_body (`Dict`, *optional*):
+ extra_body (`dict`, *optional*):
  Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
  for supported parameters.
  Returns:
@@ -699,7 +653,7 @@ class AsyncInferenceClient:
  Raises:
  [`InferenceTimeoutError`]:
  If the model is unavailable or the request times out.
- `aiohttp.ClientResponseError`:
+ [`HfHubHTTPError`]:
  If the request fails with an HTTP error status code other than HTTP 503.

  Example:
@@ -931,7 +885,7 @@ class AsyncInferenceClient:
  >>> messages = [
  ... {
  ... "role": "user",
- ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
+ ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I see and when?",
  ... },
  ... ]
  >>> response_format = {
@@ -950,20 +904,26 @@ class AsyncInferenceClient:
  ... messages=messages,
  ... response_format=response_format,
  ... max_tokens=500,
- )
+ ... )
  >>> response.choices[0].message.content
  '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
  ```
  """
- # Get the provider helper
- provider_helper = get_provider_helper(self.provider, task="conversational")
-
  # Since `chat_completion(..., model=xxx)` is also a payload parameter for the server, we need to handle 'model' differently.
  # `self.model` takes precedence over 'model' argument for building URL.
  # `model` takes precedence for payload value.
  model_id_or_url = self.model or model
  payload_model = model or self.model

+ # Get the provider helper
+ provider_helper = get_provider_helper(
+ self.provider,
+ task="conversational",
+ model=model_id_or_url
+ if model_id_or_url is not None and model_id_or_url.startswith(("http://", "https://"))
+ else payload_model,
+ )
+
  # Prepare the payload
  parameters = {
  "model": payload_model,
@@ -1013,8 +973,8 @@ class AsyncInferenceClient:
1013
973
  max_question_len: Optional[int] = None,
1014
974
  max_seq_len: Optional[int] = None,
1015
975
  top_k: Optional[int] = None,
1016
- word_boxes: Optional[List[Union[List[float], str]]] = None,
1017
- ) -> List[DocumentQuestionAnsweringOutputElement]:
976
+ word_boxes: Optional[list[Union[list[float], str]]] = None,
977
+ ) -> list[DocumentQuestionAnsweringOutputElement]:
1018
978
  """
1019
979
  Answer questions on document images.
1020
980
 
@@ -1044,16 +1004,16 @@ class AsyncInferenceClient:
1044
1004
  top_k (`int`, *optional*):
1045
1005
  The number of answers to return (will be chosen by order of likelihood). Can return less than top_k
1046
1006
  answers if there are not enough options available within the context.
1047
- word_boxes (`List[Union[List[float], str`, *optional*):
1007
+ word_boxes (`list[Union[list[float], str`, *optional*):
1048
1008
  A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
1049
1009
  step and use the provided bounding boxes instead.
1050
1010
  Returns:
1051
- `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
1011
+ `list[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
1052
1012
 
1053
1013
  Raises:
1054
1014
  [`InferenceTimeoutError`]:
1055
1015
  If the model is unavailable or the request times out.
1056
- `aiohttp.ClientResponseError`:
1016
+ [`HfHubHTTPError`]:
1057
1017
  If the request fails with an HTTP error status code other than HTTP 503.
1058
1018
 
1059
1019
 
@@ -1066,8 +1026,9 @@ class AsyncInferenceClient:
1066
1026
  [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
1067
1027
  ```
1068
1028
  """
1069
- inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
1070
- provider_helper = get_provider_helper(self.provider, task="document-question-answering")
1029
+ model_id = model or self.model
1030
+ provider_helper = get_provider_helper(self.provider, task="document-question-answering", model=model_id)
1031
+ inputs: dict[str, Any] = {"question": question, "image": _b64_encode(image)}
1071
1032
  request_parameters = provider_helper.prepare_request(
1072
1033
  inputs=inputs,
1073
1034
  parameters={
@@ -1081,7 +1042,7 @@ class AsyncInferenceClient:
1081
1042
  "word_boxes": word_boxes,
1082
1043
  },
1083
1044
  headers=self.headers,
1084
- model=model or self.model,
1045
+ model=model_id,
1085
1046
  api_key=self.token,
1086
1047
  )
1087
1048
  response = await self._inner_post(request_parameters)
@@ -1104,8 +1065,8 @@ class AsyncInferenceClient:
1104
1065
  text (`str`):
1105
1066
  The text to embed.
1106
1067
  model (`str`, *optional*):
1107
- The model to use for the conversational task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1108
- a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
1068
+ The model to use for the feature extraction task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1069
+ a deployed Inference Endpoint. If not provided, the default recommended feature extraction model will be used.
1109
1070
  Defaults to None.
1110
1071
  normalize (`bool`, *optional*):
1111
1072
  Whether to normalize the embeddings or not.
@@ -1128,7 +1089,7 @@ class AsyncInferenceClient:
1128
1089
  Raises:
1129
1090
  [`InferenceTimeoutError`]:
1130
1091
  If the model is unavailable or the request times out.
1131
- `aiohttp.ClientResponseError`:
1092
+ [`HfHubHTTPError`]:
1132
1093
  If the request fails with an HTTP error status code other than HTTP 503.
1133
1094
 
1134
1095
  Example:
@@ -1143,7 +1104,8 @@ class AsyncInferenceClient:
1143
1104
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
1144
1105
  ```
1145
1106
  """
1146
- provider_helper = get_provider_helper(self.provider, task="feature-extraction")
1107
+ model_id = model or self.model
1108
+ provider_helper = get_provider_helper(self.provider, task="feature-extraction", model=model_id)
1147
1109
  request_parameters = provider_helper.prepare_request(
1148
1110
  inputs=text,
1149
1111
  parameters={
@@ -1153,21 +1115,21 @@ class AsyncInferenceClient:
1153
1115
  "truncation_direction": truncation_direction,
1154
1116
  },
1155
1117
  headers=self.headers,
1156
- model=model or self.model,
1118
+ model=model_id,
1157
1119
  api_key=self.token,
1158
1120
  )
1159
1121
  response = await self._inner_post(request_parameters)
1160
1122
  np = _import_numpy()
1161
- return np.array(_bytes_to_dict(response), dtype="float32")
1123
+ return np.array(provider_helper.get_response(response), dtype="float32")
1162
1124
 
1163
1125
  async def fill_mask(
1164
1126
  self,
1165
1127
  text: str,
1166
1128
  *,
1167
1129
  model: Optional[str] = None,
1168
- targets: Optional[List[str]] = None,
1130
+ targets: Optional[list[str]] = None,
1169
1131
  top_k: Optional[int] = None,
1170
- ) -> List[FillMaskOutputElement]:
1132
+ ) -> list[FillMaskOutputElement]:
1171
1133
  """
1172
1134
  Fill in a hole with a missing word (token to be precise).
1173
1135
 
@@ -1177,20 +1139,20 @@ class AsyncInferenceClient:
1177
1139
  model (`str`, *optional*):
1178
1140
  The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1179
1141
  a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
1180
- targets (`List[str`, *optional*):
1142
+ targets (`list[str`, *optional*):
1181
1143
  When passed, the model will limit the scores to the passed targets instead of looking up in the whole
1182
1144
  vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
1183
1145
  resulting token will be used (with a warning, and that might be slower).
1184
1146
  top_k (`int`, *optional*):
1185
1147
  When passed, overrides the number of predictions to return.
1186
1148
  Returns:
1187
- `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
1149
+ `list[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
1188
1150
  probability, token reference, and completed text.
1189
1151
 
1190
1152
  Raises:
1191
1153
  [`InferenceTimeoutError`]:
1192
1154
  If the model is unavailable or the request times out.
1193
- `aiohttp.ClientResponseError`:
1155
+ [`HfHubHTTPError`]:
1194
1156
  If the request fails with an HTTP error status code other than HTTP 503.
1195
1157
 
1196
1158
  Example:
@@ -1205,12 +1167,13 @@ class AsyncInferenceClient:
1205
1167
  ]
1206
1168
  ```
1207
1169
  """
1208
- provider_helper = get_provider_helper(self.provider, task="fill-mask")
1170
+ model_id = model or self.model
1171
+ provider_helper = get_provider_helper(self.provider, task="fill-mask", model=model_id)
1209
1172
  request_parameters = provider_helper.prepare_request(
1210
1173
  inputs=text,
1211
1174
  parameters={"targets": targets, "top_k": top_k},
1212
1175
  headers=self.headers,
1213
- model=model or self.model,
1176
+ model=model_id,
1214
1177
  api_key=self.token,
1215
1178
  )
1216
1179
  response = await self._inner_post(request_parameters)
@@ -1223,13 +1186,13 @@ class AsyncInferenceClient:
1223
1186
  model: Optional[str] = None,
1224
1187
  function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
1225
1188
  top_k: Optional[int] = None,
1226
- ) -> List[ImageClassificationOutputElement]:
1189
+ ) -> list[ImageClassificationOutputElement]:
1227
1190
  """
1228
1191
  Perform image classification on the given image using the specified model.
1229
1192
 
1230
1193
  Args:
1231
- image (`Union[str, Path, bytes, BinaryIO]`):
1232
- The image to classify. It can be raw bytes, an image file, or a URL to an online image.
1194
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1195
+ The image to classify. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1233
1196
  model (`str`, *optional*):
1234
1197
  The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1235
1198
  deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
@@ -1238,12 +1201,12 @@ class AsyncInferenceClient:
1238
1201
  top_k (`int`, *optional*):
1239
1202
  When specified, limits the output to the top K most probable classes.
1240
1203
  Returns:
1241
- `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
1204
+ `list[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
1242
1205
 
1243
1206
  Raises:
1244
1207
  [`InferenceTimeoutError`]:
1245
1208
  If the model is unavailable or the request times out.
1246
- `aiohttp.ClientResponseError`:
1209
+ [`HfHubHTTPError`]:
1247
1210
  If the request fails with an HTTP error status code other than HTTP 503.
1248
1211
 
1249
1212
  Example:
@@ -1255,12 +1218,13 @@ class AsyncInferenceClient:
1255
1218
  [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
1256
1219
  ```
1257
1220
  """
1258
- provider_helper = get_provider_helper(self.provider, task="image-classification")
1221
+ model_id = model or self.model
1222
+ provider_helper = get_provider_helper(self.provider, task="image-classification", model=model_id)
1259
1223
  request_parameters = provider_helper.prepare_request(
1260
1224
  inputs=image,
1261
1225
  parameters={"function_to_apply": function_to_apply, "top_k": top_k},
1262
1226
  headers=self.headers,
1263
- model=model or self.model,
1227
+ model=model_id,
1264
1228
  api_key=self.token,
1265
1229
  )
1266
1230
  response = await self._inner_post(request_parameters)
@@ -1275,19 +1239,16 @@ class AsyncInferenceClient:
1275
1239
  overlap_mask_area_threshold: Optional[float] = None,
1276
1240
  subtask: Optional["ImageSegmentationSubtask"] = None,
1277
1241
  threshold: Optional[float] = None,
1278
- ) -> List[ImageSegmentationOutputElement]:
1242
+ ) -> list[ImageSegmentationOutputElement]:
1279
1243
  """
1280
1244
  Perform image segmentation on the given image using the specified model.
1281
1245
 
1282
- <Tip warning={true}>
1283
-
1284
- You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1285
-
1286
- </Tip>
1246
+ > [!WARNING]
1247
+ > You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1287
1248
 
1288
1249
  Args:
1289
- image (`Union[str, Path, bytes, BinaryIO]`):
1290
- The image to segment. It can be raw bytes, an image file, or a URL to an online image.
1250
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1251
+ The image to segment. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1291
1252
  model (`str`, *optional*):
1292
1253
  The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1293
1254
  deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
@@ -1300,12 +1261,12 @@ class AsyncInferenceClient:
1300
1261
  threshold (`float`, *optional*):
1301
1262
  Probability threshold to filter out predicted masks.
1302
1263
  Returns:
1303
- `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
1264
+ `list[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
1304
1265
 
1305
1266
  Raises:
1306
1267
  [`InferenceTimeoutError`]:
1307
1268
  If the model is unavailable or the request times out.
1308
- `aiohttp.ClientResponseError`:
1269
+ [`HfHubHTTPError`]:
1309
1270
  If the request fails with an HTTP error status code other than HTTP 503.
1310
1271
 
1311
1272
  Example:
@@ -1317,7 +1278,8 @@ class AsyncInferenceClient:
1317
1278
  [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
1318
1279
  ```
1319
1280
  """
1320
- provider_helper = get_provider_helper(self.provider, task="audio-classification")
1281
+ model_id = model or self.model
1282
+ provider_helper = get_provider_helper(self.provider, task="image-segmentation", model=model_id)
1321
1283
  request_parameters = provider_helper.prepare_request(
1322
1284
  inputs=image,
1323
1285
  parameters={
@@ -1327,10 +1289,11 @@ class AsyncInferenceClient:
1327
1289
  "threshold": threshold,
1328
1290
  },
1329
1291
  headers=self.headers,
1330
- model=model or self.model,
1292
+ model=model_id,
1331
1293
  api_key=self.token,
1332
1294
  )
1333
1295
  response = await self._inner_post(request_parameters)
1296
+ response = provider_helper.get_response(response, request_parameters)
1334
1297
  output = ImageSegmentationOutputElement.parse_obj_as_list(response)
1335
1298
  for item in output:
1336
1299
  item.mask = _b64_to_image(item.mask) # type: ignore [assignment]
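Because each `mask` is decoded with `_b64_to_image` right above, callers receive ready-to-use PIL images. A short usage sketch, assuming Pillow is installed and an arbitrary local file:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def save_masks(path: str) -> None:
    client = AsyncInferenceClient()
    segments = await client.image_segmentation(path)
    for i, segment in enumerate(segments):
        # `segment.mask` is already a PIL image after the base64 decoding above.
        segment.mask.save(f"mask_{i}_{segment.label}.png")
        print(segment.label, segment.score)


asyncio.run(save_masks("street.jpg"))
```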
@@ -1351,15 +1314,12 @@ class AsyncInferenceClient:
1351
1314
  """
1352
1315
  Perform image-to-image translation using a specified model.
1353
1316
 
1354
- <Tip warning={true}>
1355
-
1356
- You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1357
-
1358
- </Tip>
1317
+ > [!WARNING]
1318
+ > You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1359
1319
 
1360
1320
  Args:
1361
- image (`Union[str, Path, bytes, BinaryIO]`):
1362
- The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
1321
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1322
+ The input image for translation. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1363
1323
  prompt (`str`, *optional*):
1364
1324
  The text prompt to guide the image generation.
1365
1325
  negative_prompt (`str`, *optional*):
@@ -1374,7 +1334,8 @@ class AsyncInferenceClient:
1374
1334
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
1375
1335
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
1376
1336
  target_size (`ImageToImageTargetSize`, *optional*):
1377
- The size in pixel of the output image.
1337
+ The size in pixels of the output image. This parameter is only supported by some providers and for
1338
+ specific models. It will be ignored when unsupported.
1378
1339
 
1379
1340
  Returns:
1380
1341
  `Image`: The translated image.
@@ -1382,7 +1343,7 @@ class AsyncInferenceClient:
1382
1343
  Raises:
1383
1344
  [`InferenceTimeoutError`]:
1384
1345
  If the model is unavailable or the request times out.
1385
- `aiohttp.ClientResponseError`:
1346
+ [`HfHubHTTPError`]:
1386
1347
  If the request fails with an HTTP error status code other than HTTP 503.
1387
1348
 
1388
1349
  Example:
@@ -1393,8 +1354,10 @@ class AsyncInferenceClient:
1393
1354
  >>> image = await client.image_to_image("cat.jpg", prompt="turn the cat into a tiger")
1394
1355
  >>> image.save("tiger.jpg")
1395
1356
  ```
1357
+
1396
1358
  """
1397
- provider_helper = get_provider_helper(self.provider, task="image-to-image")
1359
+ model_id = model or self.model
1360
+ provider_helper = get_provider_helper(self.provider, task="image-to-image", model=model_id)
1398
1361
  request_parameters = provider_helper.prepare_request(
1399
1362
  inputs=image,
1400
1363
  parameters={
@@ -1406,22 +1369,103 @@ class AsyncInferenceClient:
1406
1369
  **kwargs,
1407
1370
  },
1408
1371
  headers=self.headers,
1409
- model=model or self.model,
1372
+ model=model_id,
1410
1373
  api_key=self.token,
1411
1374
  )
1412
1375
  response = await self._inner_post(request_parameters)
1376
+ response = provider_helper.get_response(response, request_parameters)
1413
1377
  return _bytes_to_image(response)
1414
1378
 
1379
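The widened input hint (`PIL.Image.Image` is now accepted alongside paths, bytes, and URLs) means an image can be opened locally and passed straight to `image_to_image`. A sketch under that assumption:

```py
import asyncio

from PIL import Image

from huggingface_hub import AsyncInferenceClient


async def stylize(path: str) -> None:
    client = AsyncInferenceClient()
    source = Image.open(path)  # a PIL image is now a valid input, no manual byte conversion needed
    result = await client.image_to_image(source, prompt="turn the cat into a tiger")
    result.save("tiger.jpg")


asyncio.run(stylize("cat.jpg"))
```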
+ async def image_to_video(
1380
+ self,
1381
+ image: ContentT,
1382
+ *,
1383
+ model: Optional[str] = None,
1384
+ prompt: Optional[str] = None,
1385
+ negative_prompt: Optional[str] = None,
1386
+ num_frames: Optional[float] = None,
1387
+ num_inference_steps: Optional[int] = None,
1388
+ guidance_scale: Optional[float] = None,
1389
+ seed: Optional[int] = None,
1390
+ target_size: Optional[ImageToVideoTargetSize] = None,
1391
+ **kwargs,
1392
+ ) -> bytes:
1393
+ """
1394
+ Generate a video from an input image.
1395
+
1396
+ Args:
1397
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1398
+ The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1399
+ model (`str`, *optional*):
1400
+ The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
1401
+ Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
1402
+ prompt (`str`, *optional*):
1403
+ The text prompt to guide the video generation.
1404
+ negative_prompt (`str`, *optional*):
1405
+ One prompt to guide what NOT to include in video generation.
1406
+ num_frames (`float`, *optional*):
1407
+ The num_frames parameter determines how many video frames are generated.
1408
+ num_inference_steps (`int`, *optional*):
1409
+ For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
1410
+ quality image at the expense of slower inference.
1411
+ guidance_scale (`float`, *optional*):
1412
+ For diffusion models. A higher guidance scale value encourages the model to generate videos closely
1413
+ linked to the text prompt at the expense of lower image quality.
1414
+ seed (`int`, *optional*):
1415
+ The seed to use for the video generation.
1416
+ target_size (`ImageToVideoTargetSize`, *optional*):
1417
+ The size in pixels of the output video frames.
1423
+
1424
+ Returns:
1425
+ `bytes`: The generated video.
1426
+
1427
+ Examples:
1428
+ ```py
1429
+ # Must be run in an async context
1430
+ >>> from huggingface_hub import AsyncInferenceClient
1431
+ >>> client = AsyncInferenceClient()
1432
+ >>> video = await client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger")
1433
+ >>> with open("tiger.mp4", "wb") as f:
1434
+ ... f.write(video)
1435
+ ```
1436
+ """
1437
+ model_id = model or self.model
1438
+ provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id)
1439
+ request_parameters = provider_helper.prepare_request(
1440
+ inputs=image,
1441
+ parameters={
1442
+ "prompt": prompt,
1443
+ "negative_prompt": negative_prompt,
1444
+ "num_frames": num_frames,
1445
+ "num_inference_steps": num_inference_steps,
1446
+ "guidance_scale": guidance_scale,
1447
+ "seed": seed,
1448
+ "target_size": target_size,
1449
+ **kwargs,
1450
+ },
1451
+ headers=self.headers,
1452
+ model=model_id,
1453
+ api_key=self.token,
1454
+ )
1455
+ response = await self._inner_post(request_parameters)
1456
+ response = provider_helper.get_response(response, request_parameters)
1457
+ return response
1458
+
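Complementing the docstring example of the new `image_to_video` method, the optional generation knobs can be combined; whether a given provider honors `num_frames`, `seed`, or `target_size` varies, so the values below are illustrative only:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def animate(path: str) -> None:
    client = AsyncInferenceClient()
    video = await client.image_to_video(
        path,
        model="Wan-AI/Wan2.2-I2V-A14B",  # same model as the docstring example
        prompt="the cat slowly turns its head",
        num_frames=49,                   # provider support for these knobs varies
        seed=42,
    )
    with open("cat.mp4", "wb") as f:
        f.write(video)                   # raw bytes, as returned by the method


asyncio.run(animate("cat.jpg"))
```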
1415
1459
  async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
1416
1460
  """
1417
1461
  Takes an input image and return text.
1418
1462
 
1419
1463
  Models can have very different outputs depending on your use case (image captioning, optical character recognition
1420
- (OCR), Pix2Struct, etc). Please have a look to the model card to learn more about a model's specificities.
1464
+ (OCR), Pix2Struct, etc.). Please have a look at the model card to learn more about a model's specificities.
1421
1465
 
1422
1466
  Args:
1423
- image (`Union[str, Path, bytes, BinaryIO]`):
1424
- The input image to caption. It can be raw bytes, an image file, or a URL to an online image..
1467
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1468
+ The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1425
1469
  model (`str`, *optional*):
1426
1470
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
1427
1471
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -1432,7 +1476,7 @@ class AsyncInferenceClient:
1432
1476
  Raises:
1433
1477
  [`InferenceTimeoutError`]:
1434
1478
  If the model is unavailable or the request times out.
1435
- `aiohttp.ClientResponseError`:
1479
+ [`HfHubHTTPError`]:
1436
1480
  If the request fails with an HTTP error status code other than HTTP 503.
1437
1481
 
1438
1482
  Example:
@@ -1446,45 +1490,43 @@ class AsyncInferenceClient:
1446
1490
  'a dog laying on the grass next to a flower pot '
1447
1491
  ```
1448
1492
  """
1449
- provider_helper = get_provider_helper(self.provider, task="image-to-text")
1493
+ model_id = model or self.model
1494
+ provider_helper = get_provider_helper(self.provider, task="image-to-text", model=model_id)
1450
1495
  request_parameters = provider_helper.prepare_request(
1451
1496
  inputs=image,
1452
1497
  parameters={},
1453
1498
  headers=self.headers,
1454
- model=model or self.model,
1499
+ model=model_id,
1455
1500
  api_key=self.token,
1456
1501
  )
1457
1502
  response = await self._inner_post(request_parameters)
1458
- output = ImageToTextOutput.parse_obj(response)
1459
- return output[0] if isinstance(output, list) else output
1503
+ output_list: list[ImageToTextOutput] = ImageToTextOutput.parse_obj_as_list(response)
1504
+ return output_list[0]
1460
1505
 
1461
1506
  async def object_detection(
1462
1507
  self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
1463
- ) -> List[ObjectDetectionOutputElement]:
1508
+ ) -> list[ObjectDetectionOutputElement]:
1464
1509
  """
1465
1510
  Perform object detection on the given image using the specified model.
1466
1511
 
1467
- <Tip warning={true}>
1468
-
1469
- You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1470
-
1471
- </Tip>
1512
+ > [!WARNING]
1513
+ > You must have `PIL` installed if you want to work with images (`pip install Pillow`).
1472
1514
 
1473
1515
  Args:
1474
- image (`Union[str, Path, bytes, BinaryIO]`):
1475
- The image to detect objects on. It can be raw bytes, an image file, or a URL to an online image.
1516
+ image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
1517
+ The image to detect objects on. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
1476
1518
  model (`str`, *optional*):
1477
1519
  The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1478
1520
  deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
1479
1521
  threshold (`float`, *optional*):
1480
1522
  The probability necessary to make a prediction.
1481
1523
  Returns:
1482
- `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
1524
+ `list[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
1483
1525
 
1484
1526
  Raises:
1485
1527
  [`InferenceTimeoutError`]:
1486
1528
  If the model is unavailable or the request times out.
1487
- `aiohttp.ClientResponseError`:
1529
+ [`HfHubHTTPError`]:
1488
1530
  If the request fails with an HTTP error status code other than HTTP 503.
1489
1531
  `ValueError`:
1490
1532
  If the request output is not a List.
@@ -1498,12 +1540,13 @@ class AsyncInferenceClient:
1498
1540
  [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
1499
1541
  ```
1500
1542
  """
1501
- provider_helper = get_provider_helper(self.provider, task="object-detection")
1543
+ model_id = model or self.model
1544
+ provider_helper = get_provider_helper(self.provider, task="object-detection", model=model_id)
1502
1545
  request_parameters = provider_helper.prepare_request(
1503
1546
  inputs=image,
1504
1547
  parameters={"threshold": threshold},
1505
1548
  headers=self.headers,
1506
- model=model or self.model,
1549
+ model=model_id,
1507
1550
  api_key=self.token,
1508
1551
  )
1509
1552
  response = await self._inner_post(request_parameters)
@@ -1522,7 +1565,7 @@ class AsyncInferenceClient:
1522
1565
  max_question_len: Optional[int] = None,
1523
1566
  max_seq_len: Optional[int] = None,
1524
1567
  top_k: Optional[int] = None,
1525
- ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
1568
+ ) -> Union[QuestionAnsweringOutputElement, list[QuestionAnsweringOutputElement]]:
1526
1569
  """
1527
1570
  Retrieve the answer to a question from a given text.
1528
1571
 
@@ -1554,13 +1597,13 @@ class AsyncInferenceClient:
1554
1597
  topk answers if there are not enough options available within the context.
1555
1598
 
1556
1599
  Returns:
1557
- Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
1600
+ Union[`QuestionAnsweringOutputElement`, list[`QuestionAnsweringOutputElement`]]:
1558
1601
  When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
1559
1602
  When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
1560
1603
  Raises:
1561
1604
  [`InferenceTimeoutError`]:
1562
1605
  If the model is unavailable or the request times out.
1563
- `aiohttp.ClientResponseError`:
1606
+ [`HfHubHTTPError`]:
1564
1607
  If the request fails with an HTTP error status code other than HTTP 503.
1565
1608
 
1566
1609
  Example:
@@ -1572,9 +1615,10 @@ class AsyncInferenceClient:
1572
1615
  QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
1573
1616
  ```
1574
1617
  """
1575
- provider_helper = get_provider_helper(self.provider, task="question-answering")
1618
+ model_id = model or self.model
1619
+ provider_helper = get_provider_helper(self.provider, task="question-answering", model=model_id)
1576
1620
  request_parameters = provider_helper.prepare_request(
1577
- inputs=None,
1621
+ inputs={"question": question, "context": context},
1578
1622
  parameters={
1579
1623
  "align_to_words": align_to_words,
1580
1624
  "doc_stride": doc_stride,
@@ -1584,9 +1628,8 @@ class AsyncInferenceClient:
1584
1628
  "max_seq_len": max_seq_len,
1585
1629
  "top_k": top_k,
1586
1630
  },
1587
- extra_payload={"question": question, "context": context},
1588
1631
  headers=self.headers,
1589
- model=model or self.model,
1632
+ model=model_id,
1590
1633
  api_key=self.token,
1591
1634
  )
1592
1635
  response = await self._inner_post(request_parameters)
@@ -1595,28 +1638,28 @@ class AsyncInferenceClient:
1595
1638
  return output
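Since the return type of `question_answering` depends on `top_k`, callers requesting several answers should expect a list. A minimal sketch:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def ask() -> None:
    client = AsyncInferenceClient()
    context = "My name is Clara and I live in Berkeley."

    best = await client.question_answering(question="What's my name?", context=context)
    print(best.answer, best.score)  # single QuestionAnsweringOutputElement when top_k is 1 or unset

    top3 = await client.question_answering(question="What's my name?", context=context, top_k=3)
    for candidate in top3:          # a list when top_k > 1
        print(candidate.answer, candidate.score)


asyncio.run(ask())
```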
1596
1639
 
1597
1640
  async def sentence_similarity(
1598
- self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
1599
- ) -> List[float]:
1641
+ self, sentence: str, other_sentences: list[str], *, model: Optional[str] = None
1642
+ ) -> list[float]:
1600
1643
  """
1601
1644
  Compute the semantic similarity between a sentence and a list of other sentences by comparing their embeddings.
1602
1645
 
1603
1646
  Args:
1604
1647
  sentence (`str`):
1605
1648
  The main sentence to compare to others.
1606
- other_sentences (`List[str]`):
1649
+ other_sentences (`list[str]`):
1607
1650
  The list of sentences to compare to.
1608
1651
  model (`str`, *optional*):
1609
- The model to use for the conversational task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1610
- a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
1652
+ The model to use for the sentence similarity task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1653
+ a deployed Inference Endpoint. If not provided, the default recommended sentence similarity model will be used.
1611
1654
  Defaults to None.
1612
1655
 
1613
1656
  Returns:
1614
- `List[float]`: The embedding representing the input text.
1657
+ `list[float]`: The embedding representing the input text.
1615
1658
 
1616
1659
  Raises:
1617
1660
  [`InferenceTimeoutError`]:
1618
1661
  If the model is unavailable or the request times out.
1619
- `aiohttp.ClientResponseError`:
1662
+ [`HfHubHTTPError`]:
1620
1663
  If the request fails with an HTTP error status code other than HTTP 503.
1621
1664
 
1622
1665
  Example:
@@ -1635,13 +1678,14 @@ class AsyncInferenceClient:
1635
1678
  [0.7785726189613342, 0.45876261591911316, 0.2906220555305481]
1636
1679
  ```
1637
1680
  """
1638
- provider_helper = get_provider_helper(self.provider, task="sentence-similarity")
1681
+ model_id = model or self.model
1682
+ provider_helper = get_provider_helper(self.provider, task="sentence-similarity", model=model_id)
1639
1683
  request_parameters = provider_helper.prepare_request(
1640
- inputs=None,
1684
+ inputs={"source_sentence": sentence, "sentences": other_sentences},
1641
1685
  parameters={},
1642
- extra_payload={"source_sentence": sentence, "sentences": other_sentences},
1686
+ extra_payload={},
1643
1687
  headers=self.headers,
1644
- model=model or self.model,
1688
+ model=model_id,
1645
1689
  api_key=self.token,
1646
1690
  )
1647
1691
  response = await self._inner_post(request_parameters)
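The similarity scores are returned in the same order as `other_sentences`, so picking the closest candidate is straightforward; a small sketch:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def closest_match() -> None:
    client = AsyncInferenceClient()
    candidates = ["That is a happy dog", "That is a very happy person", "Today is a sunny day"]
    scores = await client.sentence_similarity("That is a happy person", candidates)
    best = max(range(len(scores)), key=scores.__getitem__)  # scores align with `candidates`
    print(candidates[best], scores[best])


asyncio.run(closest_match())
```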
@@ -1653,7 +1697,7 @@ class AsyncInferenceClient:
1653
1697
  *,
1654
1698
  model: Optional[str] = None,
1655
1699
  clean_up_tokenization_spaces: Optional[bool] = None,
1656
- generate_parameters: Optional[Dict[str, Any]] = None,
1700
+ generate_parameters: Optional[dict[str, Any]] = None,
1657
1701
  truncation: Optional["SummarizationTruncationStrategy"] = None,
1658
1702
  ) -> SummarizationOutput:
1659
1703
  """
@@ -1667,7 +1711,7 @@ class AsyncInferenceClient:
1667
1711
  Inference Endpoint. If not provided, the default recommended model for summarization will be used.
1668
1712
  clean_up_tokenization_spaces (`bool`, *optional*):
1669
1713
  Whether to clean up the potential extra spaces in the text output.
1670
- generate_parameters (`Dict[str, Any]`, *optional*):
1714
+ generate_parameters (`dict[str, Any]`, *optional*):
1671
1715
  Additional parametrization of the text generation algorithm.
1672
1716
  truncation (`"SummarizationTruncationStrategy"`, *optional*):
1673
1717
  The truncation strategy to use.
@@ -1677,7 +1721,7 @@ class AsyncInferenceClient:
1677
1721
  Raises:
1678
1722
  [`InferenceTimeoutError`]:
1679
1723
  If the model is unavailable or the request times out.
1680
- `aiohttp.ClientResponseError`:
1724
+ [`HfHubHTTPError`]:
1681
1725
  If the request fails with an HTTP error status code other than HTTP 503.
1682
1726
 
1683
1727
  Example:
@@ -1694,12 +1738,13 @@ class AsyncInferenceClient:
1694
1738
  "generate_parameters": generate_parameters,
1695
1739
  "truncation": truncation,
1696
1740
  }
1697
- provider_helper = get_provider_helper(self.provider, task="summarization")
1741
+ model_id = model or self.model
1742
+ provider_helper = get_provider_helper(self.provider, task="summarization", model=model_id)
1698
1743
  request_parameters = provider_helper.prepare_request(
1699
1744
  inputs=text,
1700
1745
  parameters=parameters,
1701
1746
  headers=self.headers,
1702
- model=model or self.model,
1747
+ model=model_id,
1703
1748
  api_key=self.token,
1704
1749
  )
1705
1750
  response = await self._inner_post(request_parameters)
@@ -1707,7 +1752,7 @@ class AsyncInferenceClient:
1707
1752
 
1708
1753
  async def table_question_answering(
1709
1754
  self,
1710
- table: Dict[str, Any],
1755
+ table: dict[str, Any],
1711
1756
  query: str,
1712
1757
  *,
1713
1758
  model: Optional[str] = None,
@@ -1742,7 +1787,7 @@ class AsyncInferenceClient:
1742
1787
  Raises:
1743
1788
  [`InferenceTimeoutError`]:
1744
1789
  If the model is unavailable or the request times out.
1745
- `aiohttp.ClientResponseError`:
1790
+ [`HfHubHTTPError`]:
1746
1791
  If the request fails with an HTTP error status code other than HTTP 503.
1747
1792
 
1748
1793
  Example:
@@ -1756,24 +1801,24 @@ class AsyncInferenceClient:
1756
1801
  TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
1757
1802
  ```
1758
1803
  """
1759
- provider_helper = get_provider_helper(self.provider, task="table-question-answering")
1804
+ model_id = model or self.model
1805
+ provider_helper = get_provider_helper(self.provider, task="table-question-answering", model=model_id)
1760
1806
  request_parameters = provider_helper.prepare_request(
1761
- inputs=None,
1807
+ inputs={"query": query, "table": table},
1762
1808
  parameters={"model": model, "padding": padding, "sequential": sequential, "truncation": truncation},
1763
- extra_payload={"query": query, "table": table},
1764
1809
  headers=self.headers,
1765
- model=model or self.model,
1810
+ model=model_id,
1766
1811
  api_key=self.token,
1767
1812
  )
1768
1813
  response = await self._inner_post(request_parameters)
1769
1814
  return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)
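The `table` argument is a plain dict mapping column names to equal-length lists of string cells, now sent as part of `inputs` rather than `extra_payload`. The user-facing call is unchanged; a sketch:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def query_table() -> None:
    client = AsyncInferenceClient()
    table = {
        "Repository": ["Transformers", "Datasets", "Tokenizers"],
        "Stars": ["36542", "4512", "3934"],  # cell values are strings
    }
    answer = await client.table_question_answering(
        table, query="How many stars does the transformers repository have?"
    )
    print(answer.answer, answer.cells, answer.aggregator)


asyncio.run(query_table())
```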
1770
1815
 
1771
- async def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
1816
+ async def tabular_classification(self, table: dict[str, Any], *, model: Optional[str] = None) -> list[str]:
1772
1817
  """
1773
1818
  Classifying a target category (a group) based on a set of attributes.
1774
1819
 
1775
1820
  Args:
1776
- table (`Dict[str, Any]`):
1821
+ table (`dict[str, Any]`):
1777
1822
  Set of attributes to classify.
1778
1823
  model (`str`, *optional*):
1779
1824
  The model to use for the tabular classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
@@ -1786,7 +1831,7 @@ class AsyncInferenceClient:
1786
1831
  Raises:
1787
1832
  [`InferenceTimeoutError`]:
1788
1833
  If the model is unavailable or the request times out.
1789
- `aiohttp.ClientResponseError`:
1834
+ [`HfHubHTTPError`]:
1790
1835
  If the request fails with an HTTP error status code other than HTTP 503.
1791
1836
 
1792
1837
  Example:
@@ -1811,24 +1856,25 @@ class AsyncInferenceClient:
1811
1856
  ["5", "5", "5"]
1812
1857
  ```
1813
1858
  """
1814
- provider_helper = get_provider_helper(self.provider, task="tabular-classification")
1859
+ model_id = model or self.model
1860
+ provider_helper = get_provider_helper(self.provider, task="tabular-classification", model=model_id)
1815
1861
  request_parameters = provider_helper.prepare_request(
1816
1862
  inputs=None,
1817
1863
  extra_payload={"table": table},
1818
1864
  parameters={},
1819
1865
  headers=self.headers,
1820
- model=model or self.model,
1866
+ model=model_id,
1821
1867
  api_key=self.token,
1822
1868
  )
1823
1869
  response = await self._inner_post(request_parameters)
1824
1870
  return _bytes_to_list(response)
1825
1871
 
1826
- async def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
1872
+ async def tabular_regression(self, table: dict[str, Any], *, model: Optional[str] = None) -> list[float]:
1827
1873
  """
1828
1874
  Predicting a numerical target value given a set of attributes/features in a table.
1829
1875
 
1830
1876
  Args:
1831
- table (`Dict[str, Any]`):
1877
+ table (`dict[str, Any]`):
1832
1878
  Set of attributes stored in a table. The attributes used to predict the target can be both numerical and categorical.
1833
1879
  model (`str`, *optional*):
1834
1880
  The model to use for the tabular regression task. Can be a model ID hosted on the Hugging Face Hub or a URL to
@@ -1841,7 +1887,7 @@ class AsyncInferenceClient:
1841
1887
  Raises:
1842
1888
  [`InferenceTimeoutError`]:
1843
1889
  If the model is unavailable or the request times out.
1844
- `aiohttp.ClientResponseError`:
1890
+ [`HfHubHTTPError`]:
1845
1891
  If the request fails with an HTTP error status code other than HTTP 503.
1846
1892
 
1847
1893
  Example:
@@ -1861,13 +1907,14 @@ class AsyncInferenceClient:
1861
1907
  [110, 120, 130]
1862
1908
  ```
1863
1909
  """
1864
- provider_helper = get_provider_helper(self.provider, task="tabular-regression")
1910
+ model_id = model or self.model
1911
+ provider_helper = get_provider_helper(self.provider, task="tabular-regression", model=model_id)
1865
1912
  request_parameters = provider_helper.prepare_request(
1866
1913
  inputs=None,
1867
1914
  parameters={},
1868
1915
  extra_payload={"table": table},
1869
1916
  headers=self.headers,
1870
- model=model or self.model,
1917
+ model=model_id,
1871
1918
  api_key=self.token,
1872
1919
  )
1873
1920
  response = await self._inner_post(request_parameters)
@@ -1880,7 +1927,7 @@ class AsyncInferenceClient:
1880
1927
  model: Optional[str] = None,
1881
1928
  top_k: Optional[int] = None,
1882
1929
  function_to_apply: Optional["TextClassificationOutputTransform"] = None,
1883
- ) -> List[TextClassificationOutputElement]:
1930
+ ) -> list[TextClassificationOutputElement]:
1884
1931
  """
1885
1932
  Perform text classification (e.g. sentiment-analysis) on the given text.
1886
1933
 
@@ -1897,12 +1944,12 @@ class AsyncInferenceClient:
1897
1944
  The function to apply to the model outputs in order to retrieve the scores.
1898
1945
 
1899
1946
  Returns:
1900
- `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
1947
+ `list[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
1901
1948
 
1902
1949
  Raises:
1903
1950
  [`InferenceTimeoutError`]:
1904
1951
  If the model is unavailable or the request times out.
1905
- `aiohttp.ClientResponseError`:
1952
+ [`HfHubHTTPError`]:
1906
1953
  If the request fails with an HTTP error status code other than HTTP 503.
1907
1954
 
1908
1955
  Example:
@@ -1917,7 +1964,8 @@ class AsyncInferenceClient:
1917
1964
  ]
1918
1965
  ```
1919
1966
  """
1920
- provider_helper = get_provider_helper(self.provider, task="text-classification")
1967
+ model_id = model or self.model
1968
+ provider_helper = get_provider_helper(self.provider, task="text-classification", model=model_id)
1921
1969
  request_parameters = provider_helper.prepare_request(
1922
1970
  inputs=text,
1923
1971
  parameters={
@@ -1925,33 +1973,33 @@ class AsyncInferenceClient:
1925
1973
  "top_k": top_k,
1926
1974
  },
1927
1975
  headers=self.headers,
1928
- model=model or self.model,
1976
+ model=model_id,
1929
1977
  api_key=self.token,
1930
1978
  )
1931
1979
  response = await self._inner_post(request_parameters)
1932
1980
  return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]
1933
1981
 
1934
1982
  @overload
1935
- async def text_generation( # type: ignore
1983
+ async def text_generation(
1936
1984
  self,
1937
1985
  prompt: str,
1938
1986
  *,
1939
- details: Literal[False] = ...,
1940
- stream: Literal[False] = ...,
1987
+ details: Literal[True],
1988
+ stream: Literal[True],
1941
1989
  model: Optional[str] = None,
1942
1990
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
1943
1991
  adapter_id: Optional[str] = None,
1944
1992
  best_of: Optional[int] = None,
1945
1993
  decoder_input_details: Optional[bool] = None,
1946
- do_sample: Optional[bool] = False, # Manual default value
1994
+ do_sample: Optional[bool] = None,
1947
1995
  frequency_penalty: Optional[float] = None,
1948
1996
  grammar: Optional[TextGenerationInputGrammarType] = None,
1949
1997
  max_new_tokens: Optional[int] = None,
1950
1998
  repetition_penalty: Optional[float] = None,
1951
- return_full_text: Optional[bool] = False, # Manual default value
1999
+ return_full_text: Optional[bool] = None,
1952
2000
  seed: Optional[int] = None,
1953
- stop: Optional[List[str]] = None,
1954
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2001
+ stop: Optional[list[str]] = None,
2002
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
1955
2003
  temperature: Optional[float] = None,
1956
2004
  top_k: Optional[int] = None,
1957
2005
  top_n_tokens: Optional[int] = None,
@@ -1959,29 +2007,29 @@ class AsyncInferenceClient:
1959
2007
  truncate: Optional[int] = None,
1960
2008
  typical_p: Optional[float] = None,
1961
2009
  watermark: Optional[bool] = None,
1962
- ) -> str: ...
2010
+ ) -> AsyncIterable[TextGenerationStreamOutput]: ...
1963
2011
 
1964
2012
  @overload
1965
- async def text_generation( # type: ignore
2013
+ async def text_generation(
1966
2014
  self,
1967
2015
  prompt: str,
1968
2016
  *,
1969
- details: Literal[True] = ...,
1970
- stream: Literal[False] = ...,
2017
+ details: Literal[True],
2018
+ stream: Optional[Literal[False]] = None,
1971
2019
  model: Optional[str] = None,
1972
2020
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
1973
2021
  adapter_id: Optional[str] = None,
1974
2022
  best_of: Optional[int] = None,
1975
2023
  decoder_input_details: Optional[bool] = None,
1976
- do_sample: Optional[bool] = False, # Manual default value
2024
+ do_sample: Optional[bool] = None,
1977
2025
  frequency_penalty: Optional[float] = None,
1978
2026
  grammar: Optional[TextGenerationInputGrammarType] = None,
1979
2027
  max_new_tokens: Optional[int] = None,
1980
2028
  repetition_penalty: Optional[float] = None,
1981
- return_full_text: Optional[bool] = False, # Manual default value
2029
+ return_full_text: Optional[bool] = None,
1982
2030
  seed: Optional[int] = None,
1983
- stop: Optional[List[str]] = None,
1984
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2031
+ stop: Optional[list[str]] = None,
2032
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
1985
2033
  temperature: Optional[float] = None,
1986
2034
  top_k: Optional[int] = None,
1987
2035
  top_n_tokens: Optional[int] = None,
@@ -1992,26 +2040,26 @@ class AsyncInferenceClient:
1992
2040
  ) -> TextGenerationOutput: ...
1993
2041
 
1994
2042
  @overload
1995
- async def text_generation( # type: ignore
2043
+ async def text_generation(
1996
2044
  self,
1997
2045
  prompt: str,
1998
2046
  *,
1999
- details: Literal[False] = ...,
2000
- stream: Literal[True] = ...,
2047
+ details: Optional[Literal[False]] = None,
2048
+ stream: Literal[True],
2001
2049
  model: Optional[str] = None,
2002
2050
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
2003
2051
  adapter_id: Optional[str] = None,
2004
2052
  best_of: Optional[int] = None,
2005
2053
  decoder_input_details: Optional[bool] = None,
2006
- do_sample: Optional[bool] = False, # Manual default value
2054
+ do_sample: Optional[bool] = None,
2007
2055
  frequency_penalty: Optional[float] = None,
2008
2056
  grammar: Optional[TextGenerationInputGrammarType] = None,
2009
2057
  max_new_tokens: Optional[int] = None,
2010
2058
  repetition_penalty: Optional[float] = None,
2011
- return_full_text: Optional[bool] = False, # Manual default value
2059
+ return_full_text: Optional[bool] = None,
2012
2060
  seed: Optional[int] = None,
2013
- stop: Optional[List[str]] = None,
2014
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2061
+ stop: Optional[list[str]] = None,
2062
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
2015
2063
  temperature: Optional[float] = None,
2016
2064
  top_k: Optional[int] = None,
2017
2065
  top_n_tokens: Optional[int] = None,
@@ -2022,26 +2070,26 @@ class AsyncInferenceClient:
2022
2070
  ) -> AsyncIterable[str]: ...
2023
2071
 
2024
2072
  @overload
2025
- async def text_generation( # type: ignore
2073
+ async def text_generation(
2026
2074
  self,
2027
2075
  prompt: str,
2028
2076
  *,
2029
- details: Literal[True] = ...,
2030
- stream: Literal[True] = ...,
2077
+ details: Optional[Literal[False]] = None,
2078
+ stream: Optional[Literal[False]] = None,
2031
2079
  model: Optional[str] = None,
2032
2080
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
2033
2081
  adapter_id: Optional[str] = None,
2034
2082
  best_of: Optional[int] = None,
2035
2083
  decoder_input_details: Optional[bool] = None,
2036
- do_sample: Optional[bool] = False, # Manual default value
2084
+ do_sample: Optional[bool] = None,
2037
2085
  frequency_penalty: Optional[float] = None,
2038
2086
  grammar: Optional[TextGenerationInputGrammarType] = None,
2039
2087
  max_new_tokens: Optional[int] = None,
2040
2088
  repetition_penalty: Optional[float] = None,
2041
- return_full_text: Optional[bool] = False, # Manual default value
2089
+ return_full_text: Optional[bool] = None,
2042
2090
  seed: Optional[int] = None,
2043
- stop: Optional[List[str]] = None,
2044
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2091
+ stop: Optional[list[str]] = None,
2092
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
2045
2093
  temperature: Optional[float] = None,
2046
2094
  top_k: Optional[int] = None,
2047
2095
  top_n_tokens: Optional[int] = None,
@@ -2049,29 +2097,29 @@ class AsyncInferenceClient:
2049
2097
  truncate: Optional[int] = None,
2050
2098
  typical_p: Optional[float] = None,
2051
2099
  watermark: Optional[bool] = None,
2052
- ) -> AsyncIterable[TextGenerationStreamOutput]: ...
2100
+ ) -> str: ...
2053
2101
 
2054
2102
  @overload
2055
2103
  async def text_generation(
2056
2104
  self,
2057
2105
  prompt: str,
2058
2106
  *,
2059
- details: Literal[True] = ...,
2060
- stream: bool = ...,
2107
+ details: Optional[bool] = None,
2108
+ stream: Optional[bool] = None,
2061
2109
  model: Optional[str] = None,
2062
2110
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
2063
2111
  adapter_id: Optional[str] = None,
2064
2112
  best_of: Optional[int] = None,
2065
2113
  decoder_input_details: Optional[bool] = None,
2066
- do_sample: Optional[bool] = False, # Manual default value
2114
+ do_sample: Optional[bool] = None,
2067
2115
  frequency_penalty: Optional[float] = None,
2068
2116
  grammar: Optional[TextGenerationInputGrammarType] = None,
2069
2117
  max_new_tokens: Optional[int] = None,
2070
2118
  repetition_penalty: Optional[float] = None,
2071
- return_full_text: Optional[bool] = False, # Manual default value
2119
+ return_full_text: Optional[bool] = None,
2072
2120
  seed: Optional[int] = None,
2073
- stop: Optional[List[str]] = None,
2074
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2121
+ stop: Optional[list[str]] = None,
2122
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
2075
2123
  temperature: Optional[float] = None,
2076
2124
  top_k: Optional[int] = None,
2077
2125
  top_n_tokens: Optional[int] = None,
@@ -2079,28 +2127,28 @@ class AsyncInferenceClient:
2079
2127
  truncate: Optional[int] = None,
2080
2128
  typical_p: Optional[float] = None,
2081
2129
  watermark: Optional[bool] = None,
2082
- ) -> Union[TextGenerationOutput, AsyncIterable[TextGenerationStreamOutput]]: ...
2130
+ ) -> Union[str, TextGenerationOutput, AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]: ...
2083
2131
 
2084
2132
  async def text_generation(
2085
2133
  self,
2086
2134
  prompt: str,
2087
2135
  *,
2088
- details: bool = False,
2089
- stream: bool = False,
2136
+ details: Optional[bool] = None,
2137
+ stream: Optional[bool] = None,
2090
2138
  model: Optional[str] = None,
2091
2139
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
2092
2140
  adapter_id: Optional[str] = None,
2093
2141
  best_of: Optional[int] = None,
2094
2142
  decoder_input_details: Optional[bool] = None,
2095
- do_sample: Optional[bool] = False, # Manual default value
2143
+ do_sample: Optional[bool] = None,
2096
2144
  frequency_penalty: Optional[float] = None,
2097
2145
  grammar: Optional[TextGenerationInputGrammarType] = None,
2098
2146
  max_new_tokens: Optional[int] = None,
2099
2147
  repetition_penalty: Optional[float] = None,
2100
- return_full_text: Optional[bool] = False, # Manual default value
2148
+ return_full_text: Optional[bool] = None,
2101
2149
  seed: Optional[int] = None,
2102
- stop: Optional[List[str]] = None,
2103
- stop_sequences: Optional[List[str]] = None, # Deprecated, use `stop` instead
2150
+ stop: Optional[list[str]] = None,
2151
+ stop_sequences: Optional[list[str]] = None, # Deprecated, use `stop` instead
2104
2152
  temperature: Optional[float] = None,
2105
2153
  top_k: Optional[int] = None,
2106
2154
  top_n_tokens: Optional[int] = None,
@@ -2112,12 +2160,9 @@ class AsyncInferenceClient:
2112
2160
  """
2113
2161
  Given a prompt, generate the following text.
2114
2162
 
2115
- <Tip>
2116
-
2117
- If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
2118
- It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
2119
-
2120
- </Tip>
2163
+ > [!TIP]
2164
+ > If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
2165
+ > It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
2121
2166
 
2122
2167
  Args:
2123
2168
  prompt (`str`):
@@ -2156,9 +2201,9 @@ class AsyncInferenceClient:
2156
2201
  Whether to prepend the prompt to the generated text
2157
2202
  seed (`int`, *optional*):
2158
2203
  Random sampling seed
2159
- stop (`List[str]`, *optional*):
2204
+ stop (`list[str]`, *optional*):
2160
2205
  Stop generating tokens if a member of `stop` is generated.
2161
- stop_sequences (`List[str]`, *optional*):
2206
+ stop_sequences (`list[str]`, *optional*):
2162
2207
  Deprecated argument. Use `stop` instead.
2163
2208
  temperature (`float`, *optional*):
2164
2209
  The value used to module the logits distribution.
@@ -2175,14 +2220,14 @@ class AsyncInferenceClient:
2175
2220
  typical_p (`float`, *optional`):
2176
2221
  Typical Decoding mass
2177
2222
  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
2178
- watermark (`bool`, *optional`):
2223
+ watermark (`bool`, *optional*):
2179
2224
  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
2180
2225
 
2181
2226
  Returns:
2182
- `Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]`:
2227
+ `Union[str, TextGenerationOutput, AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]`:
2183
2228
  Generated text returned from the server:
2184
2229
  - if `stream=False` and `details=False`, the generated text is returned as a `str` (default)
2185
- - if `stream=True` and `details=False`, the generated text is returned token by token as a `Iterable[str]`
2230
+ - if `stream=True` and `details=False`, the generated text is returned token by token as an `AsyncIterable[str]`
2186
2231
  - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.TextGenerationOutput`]
2187
2232
  - if `details=True` and `stream=True`, the generated text is returned token by token as a iterable of [`~huggingface_hub.TextGenerationStreamOutput`]
2188
2233
 
@@ -2191,7 +2236,7 @@ class AsyncInferenceClient:
2191
2236
  If input values are not valid. No HTTP call is made to the server.
2192
2237
  [`InferenceTimeoutError`]:
2193
2238
  If the model is unavailable or the request times out.
2194
- `aiohttp.ClientResponseError`:
2239
+ [`HfHubHTTPError`]:
2195
2240
  If the request fails with an HTTP error status code other than HTTP 503.
2196
2241
 
2197
2242
  Example:
@@ -2326,7 +2371,7 @@ class AsyncInferenceClient:
2326
2371
  "repetition_penalty": repetition_penalty,
2327
2372
  "return_full_text": return_full_text,
2328
2373
  "seed": seed,
2329
- "stop": stop if stop is not None else [],
2374
+ "stop": stop,
2330
2375
  "temperature": temperature,
2331
2376
  "top_k": top_k,
2332
2377
  "top_n_tokens": top_n_tokens,
@@ -2367,29 +2412,30 @@ class AsyncInferenceClient:
2367
2412
  " Please pass `stream=False` as input."
2368
2413
  )
2369
2414
 
2370
- provider_helper = get_provider_helper(self.provider, task="text-generation")
2415
+ model_id = model or self.model
2416
+ provider_helper = get_provider_helper(self.provider, task="text-generation", model=model_id)
2371
2417
  request_parameters = provider_helper.prepare_request(
2372
2418
  inputs=prompt,
2373
2419
  parameters=parameters,
2374
2420
  extra_payload={"stream": stream},
2375
2421
  headers=self.headers,
2376
- model=model or self.model,
2422
+ model=model_id,
2377
2423
  api_key=self.token,
2378
2424
  )
2379
2425
 
2380
2426
  # Handle errors separately for more precise error messages
2381
2427
  try:
2382
- bytes_output = await self._inner_post(request_parameters, stream=stream)
2383
- except _import_aiohttp().ClientResponseError as e:
2384
- match = MODEL_KWARGS_NOT_USED_REGEX.search(e.response_error_payload["error"])
2385
- if e.status == 400 and match:
2428
+ bytes_output = await self._inner_post(request_parameters, stream=stream or False)
2429
+ except HfHubHTTPError as e:
2430
+ match = MODEL_KWARGS_NOT_USED_REGEX.search(str(e))
2431
+ if isinstance(e, BadRequestError) and match:
2386
2432
  unused_params = [kwarg.strip("' ") for kwarg in match.group(1).split(",")]
2387
2433
  _set_unsupported_text_generation_kwargs(model, unused_params)
2388
2434
  return await self.text_generation( # type: ignore
2389
2435
  prompt=prompt,
2390
2436
  details=details,
2391
2437
  stream=stream,
2392
- model=model or self.model,
2438
+ model=model_id,
2393
2439
  adapter_id=adapter_id,
2394
2440
  best_of=best_of,
2395
2441
  decoder_input_details=decoder_input_details,
@@ -2420,8 +2466,8 @@ class AsyncInferenceClient:
2420
2466
  # Data can be a single element (dict) or an iterable of dicts where we select the first element of.
2421
2467
  if isinstance(data, list):
2422
2468
  data = data[0]
2423
-
2424
- return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]
2469
+ response = provider_helper.get_response(data, request_parameters)
2470
+ return TextGenerationOutput.parse_obj_as_instance(response) if details else response["generated_text"]
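The overloads above encode four return shapes for `text_generation`, driven by `details` and `stream`. A sketch of each combination against the client's default model (output handling kept minimal):

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def demo(prompt: str) -> None:
    client = AsyncInferenceClient()

    text = await client.text_generation(prompt, max_new_tokens=12)         # str
    detailed = await client.text_generation(prompt, details=True)          # TextGenerationOutput
    print(text)
    print(detailed.generated_text)

    async for token in await client.text_generation(prompt, stream=True):  # AsyncIterable[str]
        print(token, end="")

    async for chunk in await client.text_generation(prompt, stream=True, details=True):
        print(chunk.token.text, end="")                                    # TextGenerationStreamOutput


asyncio.run(demo("The huggingface_hub library is "))
```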
2425
2471
 
2426
2472
  async def text_to_image(
2427
2473
  self,
@@ -2435,20 +2481,16 @@ class AsyncInferenceClient:
2435
2481
  model: Optional[str] = None,
2436
2482
  scheduler: Optional[str] = None,
2437
2483
  seed: Optional[int] = None,
2438
- extra_body: Optional[Dict[str, Any]] = None,
2484
+ extra_body: Optional[dict[str, Any]] = None,
2439
2485
  ) -> "Image":
2440
2486
  """
2441
2487
  Generate an image based on a given text using a specified model.
2442
2488
 
2443
- <Tip warning={true}>
2444
-
2445
- You must have `PIL` installed if you want to work with images (`pip install Pillow`).
2489
+ > [!WARNING]
2490
+ > You must have `PIL` installed if you want to work with images (`pip install Pillow`).
2446
2491
 
2447
- </Tip>
2448
-
2449
- <Tip>
2450
- You can pass provider-specific parameters to the model by using the `extra_body` argument.
2451
- </Tip>
2492
+ > [!TIP]
2493
+ > You can pass provider-specific parameters to the model by using the `extra_body` argument.
2452
2494
 
2453
2495
  Args:
2454
2496
  prompt (`str`):
@@ -2473,7 +2515,7 @@ class AsyncInferenceClient:
2473
2515
  Override the scheduler with a compatible one.
2474
2516
  seed (`int`, *optional*):
2475
2517
  Seed for the random number generator.
2476
- extra_body (`Dict[str, Any]`, *optional*):
2518
+ extra_body (`dict[str, Any]`, *optional*):
2477
2519
  Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
2478
2520
  for supported parameters.
2479
2521
 
@@ -2483,7 +2525,7 @@ class AsyncInferenceClient:
2483
2525
  Raises:
2484
2526
  [`InferenceTimeoutError`]:
2485
2527
  If the model is unavailable or the request times out.
2486
- `aiohttp.ClientResponseError`:
2528
+ [`HfHubHTTPError`]:
2487
2529
  If the request fails with an HTTP error status code other than HTTP 503.
2488
2530
 
2489
2531
  Example:
@@ -2544,8 +2586,10 @@ class AsyncInferenceClient:
2544
2586
  ... )
2545
2587
  >>> image.save("astronaut.png")
2546
2588
  ```
2589
+
2547
2590
  """
2548
- provider_helper = get_provider_helper(self.provider, task="text-to-image")
2591
+ model_id = model or self.model
2592
+ provider_helper = get_provider_helper(self.provider, task="text-to-image", model=model_id)
2549
2593
  request_parameters = provider_helper.prepare_request(
2550
2594
  inputs=prompt,
2551
2595
  parameters={
@@ -2559,11 +2603,11 @@ class AsyncInferenceClient:
2559
2603
  **(extra_body or {}),
2560
2604
  },
2561
2605
  headers=self.headers,
2562
- model=model or self.model,
2606
+ model=model_id,
2563
2607
  api_key=self.token,
2564
2608
  )
2565
2609
  response = await self._inner_post(request_parameters)
2566
- response = provider_helper.get_response(response)
2610
+ response = provider_helper.get_response(response, request_parameters)
2567
2611
  return _bytes_to_image(response)
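`extra_body` is merged into the payload after the named parameters above, which is the documented way to pass provider-specific options. The provider, the default model resolution, and the key used in `extra_body` below are assumptions for illustration:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def paint() -> None:
    # "fal-ai" is just an example provider; any supported text-to-image setup works the same way.
    client = AsyncInferenceClient(provider="fal-ai")
    image = await client.text_to_image(
        "An astronaut riding a horse on the moon.",
        negative_prompt="blurry, low quality",
        seed=0,
        extra_body={"safety_tolerance": 2},  # hypothetical provider-specific option
    )
    image.save("astronaut.png")


asyncio.run(paint())
```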
2568
2612
 
2569
2613
  async def text_to_video(
@@ -2572,18 +2616,17 @@ class AsyncInferenceClient:
2572
2616
  *,
2573
2617
  model: Optional[str] = None,
2574
2618
  guidance_scale: Optional[float] = None,
2575
- negative_prompt: Optional[List[str]] = None,
2619
+ negative_prompt: Optional[list[str]] = None,
2576
2620
  num_frames: Optional[float] = None,
2577
2621
  num_inference_steps: Optional[int] = None,
2578
2622
  seed: Optional[int] = None,
2579
- extra_body: Optional[Dict[str, Any]] = None,
2623
+ extra_body: Optional[dict[str, Any]] = None,
2580
2624
  ) -> bytes:
2581
2625
  """
2582
2626
  Generate a video based on a given text.
2583
2627
 
2584
- <Tip>
2585
- You can pass provider-specific parameters to the model by using the `extra_body` argument.
2586
- </Tip>
2628
+ > [!TIP]
2629
+ > You can pass provider-specific parameters to the model by using the `extra_body` argument.
2587
2630
 
2588
2631
  Args:
2589
2632
  prompt (`str`):
@@ -2595,7 +2638,7 @@ class AsyncInferenceClient:
2595
2638
  guidance_scale (`float`, *optional*):
2596
2639
  A higher guidance scale value encourages the model to generate videos closely linked to the text
2597
2640
  prompt, but values too high may cause saturation and other artifacts.
2598
- negative_prompt (`List[str]`, *optional*):
2641
+ negative_prompt (`list[str]`, *optional*):
2599
2642
  One or several prompt to guide what NOT to include in video generation.
2600
2643
  num_frames (`float`, *optional*):
2601
2644
  The num_frames parameter determines how many video frames are generated.
@@ -2604,7 +2647,7 @@ class AsyncInferenceClient:
2604
2647
  expense of slower inference.
2605
2648
  seed (`int`, *optional*):
2606
2649
  Seed for the random number generator.
2607
- extra_body (`Dict[str, Any]`, *optional*):
2650
+ extra_body (`dict[str, Any]`, *optional*):
2608
2651
  Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
2609
2652
  for supported parameters.
2610
2653
 
@@ -2642,8 +2685,10 @@ class AsyncInferenceClient:
2642
2685
  >>> with open("cat.mp4", "wb") as file:
2643
2686
  ... file.write(video)
2644
2687
  ```
2688
+
2645
2689
  """
2646
- provider_helper = get_provider_helper(self.provider, task="text-to-video")
2690
+ model_id = model or self.model
2691
+ provider_helper = get_provider_helper(self.provider, task="text-to-video", model=model_id)
2647
2692
  request_parameters = provider_helper.prepare_request(
2648
2693
  inputs=prompt,
2649
2694
  parameters={
@@ -2655,11 +2700,11 @@ class AsyncInferenceClient:
2655
2700
  **(extra_body or {}),
2656
2701
  },
2657
2702
  headers=self.headers,
2658
- model=model or self.model,
2703
+ model=model_id,
2659
2704
  api_key=self.token,
2660
2705
  )
2661
2706
  response = await self._inner_post(request_parameters)
2662
- response = provider_helper.get_response(response)
2707
+ response = provider_helper.get_response(response, request_parameters)
2663
2708
  return response
2664
2709
 
2665
2710
  async def text_to_speech(
@@ -2683,14 +2728,13 @@ class AsyncInferenceClient:
2683
2728
  top_p: Optional[float] = None,
2684
2729
  typical_p: Optional[float] = None,
2685
2730
  use_cache: Optional[bool] = None,
2686
- extra_body: Optional[Dict[str, Any]] = None,
2731
+ extra_body: Optional[dict[str, Any]] = None,
2687
2732
  ) -> bytes:
2688
2733
  """
2689
2734
  Synthesize an audio of a voice pronouncing a given text.
2690
2735
 
2691
- <Tip>
2692
- You can pass provider-specific parameters to the model by using the `extra_body` argument.
2693
- </Tip>
2736
+ > [!TIP]
2737
+ > You can pass provider-specific parameters to the model by using the `extra_body` argument.
2694
2738
 
2695
2739
  Args:
2696
2740
  text (`str`):
@@ -2745,7 +2789,7 @@ class AsyncInferenceClient:
2745
2789
  paper](https://hf.co/papers/2202.00666) for more details.
2746
2790
  use_cache (`bool`, *optional*):
2747
2791
  Whether the model should use the past last key/values attentions to speed up decoding
2748
- extra_body (`Dict[str, Any]`, *optional*):
2792
+ extra_body (`dict[str, Any]`, *optional*):
2749
2793
  Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
2750
2794
  for supported parameters.
2751
2795
  Returns:
@@ -2754,7 +2798,7 @@ class AsyncInferenceClient:
2754
2798
  Raises:
2755
2799
  [`InferenceTimeoutError`]:
2756
2800
  If the model is unavailable or the request times out.
2757
- `aiohttp.ClientResponseError`:
2801
+ [`HfHubHTTPError`]:
2758
2802
  If the request fails with an HTTP error status code other than HTTP 503.
2759
2803
 
2760
2804
  Example:
@@ -2841,7 +2885,8 @@ class AsyncInferenceClient:
2841
2885
  ... f.write(audio)
2842
2886
  ```
2843
2887
  """
2844
- provider_helper = get_provider_helper(self.provider, task="text-to-speech")
2888
+ model_id = model or self.model
2889
+ provider_helper = get_provider_helper(self.provider, task="text-to-speech", model=model_id)
2845
2890
  request_parameters = provider_helper.prepare_request(
2846
2891
  inputs=text,
2847
2892
  parameters={
@@ -2864,7 +2909,7 @@ class AsyncInferenceClient:
2864
2909
  **(extra_body or {}),
2865
2910
  },
2866
2911
  headers=self.headers,
2867
- model=model or self.model,
2912
+ model=model_id,
2868
2913
  api_key=self.token,
2869
2914
  )
2870
2915
  response = await self._inner_post(request_parameters)
@@ -2877,9 +2922,9 @@ class AsyncInferenceClient:
2877
2922
  *,
2878
2923
  model: Optional[str] = None,
2879
2924
  aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
2880
- ignore_labels: Optional[List[str]] = None,
2925
+ ignore_labels: Optional[list[str]] = None,
2881
2926
  stride: Optional[int] = None,
2882
- ) -> List[TokenClassificationOutputElement]:
2927
+ ) -> list[TokenClassificationOutputElement]:
2883
2928
  """
2884
2929
  Perform token classification on the given text.
2885
2930
  Usually used for sentence parsing, either grammatical, or Named Entity Recognition (NER) to understand keywords contained within text.
@@ -2893,18 +2938,18 @@ class AsyncInferenceClient:
2893
2938
  Defaults to None.
2894
2939
  aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
2895
2940
  The strategy used to fuse tokens based on model predictions
2896
- ignore_labels (`List[str`, *optional*):
2941
+ ignore_labels (`list[str`, *optional*):
2897
2942
  A list of labels to ignore
2898
2943
  stride (`int`, *optional*):
2899
2944
  The number of overlapping tokens between chunks when splitting the input text.
2900
2945
 
2901
2946
  Returns:
2902
- `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
2947
+ `list[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
2903
2948
 
2904
2949
  Raises:
2905
2950
  [`InferenceTimeoutError`]:
2906
2951
  If the model is unavailable or the request times out.
2907
- `aiohttp.ClientResponseError`:
2952
+ [`HfHubHTTPError`]:
2908
2953
  If the request fails with an HTTP error status code other than HTTP 503.
2909
2954
 
2910
2955
  Example:
@@ -2931,7 +2976,8 @@ class AsyncInferenceClient:
2931
2976
  ]
2932
2977
  ```
2933
2978
  """
2934
- provider_helper = get_provider_helper(self.provider, task="token-classification")
2979
+ model_id = model or self.model
2980
+ provider_helper = get_provider_helper(self.provider, task="token-classification", model=model_id)
2935
2981
  request_parameters = provider_helper.prepare_request(
2936
2982
  inputs=text,
2937
2983
  parameters={
@@ -2940,7 +2986,7 @@ class AsyncInferenceClient:
2940
2986
  "stride": stride,
2941
2987
  },
2942
2988
  headers=self.headers,
2943
- model=model or self.model,
2989
+ model=model_id,
2944
2990
  api_key=self.token,
2945
2991
  )
2946
2992
  response = await self._inner_post(request_parameters)
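A short named-entity-recognition sketch on top of the call above; each returned element exposes the entity group, confidence score, and character offsets:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def extract_entities(text: str) -> None:
    client = AsyncInferenceClient()
    entities = await client.token_classification(text, aggregation_strategy="simple")
    for entity in entities:
        # start/end are character offsets into the original text
        print(f"{entity.entity_group}: {text[entity.start:entity.end]} ({entity.score:.2f})")


asyncio.run(extract_entities("My name is Sarah Jessica Parker but you can call me Jessica"))
```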
@@ -2955,7 +3001,7 @@ class AsyncInferenceClient:
2955
3001
  tgt_lang: Optional[str] = None,
2956
3002
  clean_up_tokenization_spaces: Optional[bool] = None,
2957
3003
  truncation: Optional["TranslationTruncationStrategy"] = None,
2958
- generate_parameters: Optional[Dict[str, Any]] = None,
3004
+ generate_parameters: Optional[dict[str, Any]] = None,
2959
3005
  ) -> TranslationOutput:
2960
3006
  """
2961
3007
  Convert text from one language to another.
@@ -2980,7 +3026,7 @@ class AsyncInferenceClient:
2980
3026
  Whether to clean up the potential extra spaces in the text output.
2981
3027
  truncation (`"TranslationTruncationStrategy"`, *optional*):
2982
3028
  The truncation strategy to use.
2983
- generate_parameters (`Dict[str, Any]`, *optional*):
3029
+ generate_parameters (`dict[str, Any]`, *optional*):
2984
3030
  Additional parametrization of the text generation algorithm.
2985
3031
 
2986
3032
  Returns:
@@ -2989,7 +3035,7 @@ class AsyncInferenceClient:
2989
3035
  Raises:
2990
3036
  [`InferenceTimeoutError`]:
2991
3037
  If the model is unavailable or the request times out.
2992
- `aiohttp.ClientResponseError`:
3038
+ [`HfHubHTTPError`]:
2993
3039
  If the request fails with an HTTP error status code other than HTTP 503.
2994
3040
  `ValueError`:
2995
3041
  If only one of the `src_lang` and `tgt_lang` arguments are provided.
@@ -3018,7 +3064,8 @@ class AsyncInferenceClient:
3018
3064
  if src_lang is None and tgt_lang is not None:
3019
3065
  raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
3020
3066
 
3021
- provider_helper = get_provider_helper(self.provider, task="translation")
3067
+ model_id = model or self.model
3068
+ provider_helper = get_provider_helper(self.provider, task="translation", model=model_id)
3022
3069
  request_parameters = provider_helper.prepare_request(
3023
3070
  inputs=text,
3024
3071
  parameters={
@@ -3029,7 +3076,7 @@ class AsyncInferenceClient:
3029
3076
  "generate_parameters": generate_parameters,
3030
3077
  },
3031
3078
  headers=self.headers,
3032
- model=model or self.model,
3079
+ model=model_id,
3033
3080
  api_key=self.token,
3034
3081
  )
3035
3082
  response = await self._inner_post(request_parameters)
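As the `ValueError` above enforces, `src_lang` and `tgt_lang` must be given together or not at all. A sketch using a multilingual model, where the model ID and its language codes are assumptions:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def translate() -> None:
    client = AsyncInferenceClient()
    result = await client.translation(
        "My name is Wolfgang and I live in Berlin",
        model="facebook/nllb-200-distilled-600M",  # example model; language codes follow its convention
        src_lang="eng_Latn",
        tgt_lang="fra_Latn",
    )
    print(result.translation_text)


asyncio.run(translate())
```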
@@ -3042,13 +3089,13 @@ class AsyncInferenceClient:
         *,
         model: Optional[str] = None,
         top_k: Optional[int] = None,
-    ) -> List[VisualQuestionAnsweringOutputElement]:
+    ) -> list[VisualQuestionAnsweringOutputElement]:
         """
         Answering open-ended questions based on an image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image for the context. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             question (`str`):
                 Question to be answered.
             model (`str`, *optional*):
@@ -3059,12 +3106,12 @@ class AsyncInferenceClient:
                 The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                 topk answers if there are not enough options available within the context.
         Returns:
-            `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
+            `list[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.

         Raises:
             `InferenceTimeoutError`:
                 If the model is unavailable or the request times out.
-            `aiohttp.ClientResponseError`:
+            [`HfHubHTTPError`]:
                 If the request fails with an HTTP error status code other than HTTP 503.

         Example:
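Per the updated signature and docstring above, `visual_question_answering` now also accepts a `PIL.Image.Image` and returns a plain `list`. A short sketch under those assumptions; the image path, question, and model ID are placeholders:

```py
# Sketch of the updated visual_question_answering call; the image path and
# model ID are placeholders, and PIL input follows the new docstring.
import asyncio

from huggingface_hub import AsyncInferenceClient
from PIL import Image


async def main() -> None:
    client = AsyncInferenceClient()
    image = Image.open("cat.jpg")  # PIL images are now accepted directly
    answers = await client.visual_question_answering(
        image=image,
        question="What is the animal doing?",
        model="dandelin/vilt-b32-finetuned-vqa",  # placeholder model ID
        top_k=3,
    )
    for answer in answers:  # list[VisualQuestionAnsweringOutputElement]
        print(answer)


asyncio.run(main())
```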
@@ -3082,44 +3129,37 @@ class AsyncInferenceClient:
         ]
         ```
         """
-        provider_helper = get_provider_helper(self.provider, task="visual-question-answering")
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="visual-question-answering", model=model_id)
         request_parameters = provider_helper.prepare_request(
             inputs=image,
             parameters={"top_k": top_k},
             headers=self.headers,
-            model=model or self.model,
+            model=model_id,
             api_key=self.token,
             extra_payload={"question": question, "image": _b64_encode(image)},
         )
         response = await self._inner_post(request_parameters)
         return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

-    @_deprecate_arguments(
-        version="0.30.0",
-        deprecated_args=["labels"],
-        custom_message="`labels`has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
-    )
     async def zero_shot_classification(
         self,
         text: str,
-        # temporarily keeping it optional for backward compatibility.
-        candidate_labels: List[str] = None,  # type: ignore
+        candidate_labels: list[str],
         *,
         multi_label: Optional[bool] = False,
         hypothesis_template: Optional[str] = None,
         model: Optional[str] = None,
-        # deprecated argument
-        labels: List[str] = None,  # type: ignore
-    ) -> List[ZeroShotClassificationOutputElement]:
+    ) -> list[ZeroShotClassificationOutputElement]:
         """
         Provide as input a text and a set of candidate labels to classify the input text.

         Args:
             text (`str`):
                 The input text to classify.
-            candidate_labels (`List[str]`):
+            candidate_labels (`list[str]`):
                 The set of possible class labels to classify the text into.
-            labels (`List[str]`, *optional*):
+            labels (`list[str]`, *optional*):
                 (deprecated) List of strings. Each string is the verbalization of a possible label for the input text.
             multi_label (`bool`, *optional*):
                 Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of
@@ -3134,12 +3174,12 @@ class AsyncInferenceClient:


         Returns:
-            `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
+            `list[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.

         Raises:
             [`InferenceTimeoutError`]:
                 If the model is unavailable or the request times out.
-            `aiohttp.ClientResponseError`:
+            [`HfHubHTTPError`]:
                 If the request fails with an HTTP error status code other than HTTP 503.

         Example with `multi_label=False`:
@@ -3190,17 +3230,8 @@ class AsyncInferenceClient:
         ]
         ```
         """
-        # handle deprecation
-        if labels is not None:
-            if candidate_labels is not None:
-                raise ValueError(
-                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
-                )
-            candidate_labels = labels
-        elif candidate_labels is None:
-            raise ValueError("Must specify `candidate_labels`")
-
-        provider_helper = get_provider_helper(self.provider, task="zero-shot-classification")
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-classification", model=model_id)
         request_parameters = provider_helper.prepare_request(
             inputs=text,
             parameters={
@@ -3209,7 +3240,7 @@ class AsyncInferenceClient:
                 "hypothesis_template": hypothesis_template,
             },
             headers=self.headers,
-            model=model or self.model,
+            model=model_id,
             api_key=self.token,
         )
         response = await self._inner_post(request_parameters)
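With the `@_deprecate_arguments` decorator and the `labels` fallback removed above, `candidate_labels` is now a required argument of `zero_shot_classification`. A minimal sketch of the new calling convention; the labels and model ID are illustrative:

```py
# Sketch of zero_shot_classification after removal of the deprecated `labels`
# argument; the candidate labels and model ID are illustrative choices.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    results = await client.zero_shot_classification(
        "I really enjoyed this movie, the plot was fantastic.",
        candidate_labels=["positive", "negative", "neutral"],  # now required
        multi_label=False,
        model="facebook/bart-large-mnli",  # placeholder model ID
    )
    for result in results:  # list[ZeroShotClassificationOutputElement]
        print(result.label, result.score)


asyncio.run(main())
```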
@@ -3219,31 +3250,25 @@ class AsyncInferenceClient:
             for label, score in zip(output["labels"], output["scores"])
         ]

-    @_deprecate_arguments(
-        version="0.30.0",
-        deprecated_args=["labels"],
-        custom_message="`labels`has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
-    )
     async def zero_shot_image_classification(
         self,
         image: ContentT,
-        # temporarily keeping it optional for backward compatibility.
-        candidate_labels: List[str] = None,  # type: ignore
+        candidate_labels: list[str],
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
         # deprecated argument
-        labels: List[str] = None,  # type: ignore
-    ) -> List[ZeroShotImageClassificationOutputElement]:
+        labels: list[str] = None,  # type: ignore
+    ) -> list[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
-            candidate_labels (`List[str]`):
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
+            candidate_labels (`list[str]`):
                 The candidate labels for this image
-            labels (`List[str]`, *optional*):
+            labels (`list[str]`, *optional*):
                 (deprecated) List of string possible labels. There must be at least 2 labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
@@ -3253,12 +3278,12 @@ class AsyncInferenceClient:
                 replacing the placeholder with the candidate labels.

         Returns:
-            `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
+            `list[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

         Raises:
             [`InferenceTimeoutError`]:
                 If the model is unavailable or the request times out.
-            `aiohttp.ClientResponseError`:
+            [`HfHubHTTPError`]:
                 If the request fails with an HTTP error status code other than HTTP 503.

         Example:
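`zero_shot_image_classification` follows the same pattern: `candidate_labels` is now required and, as the hunk below shows, at least two labels must be passed. A sketch assuming an image URL input; the URL, labels, and `hypothesis_template` value are illustrative placeholders:

```py
# Sketch of zero_shot_image_classification with the now-required
# candidate_labels; the URL, labels, and template are illustrative.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    results = await client.zero_shot_image_classification(
        "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg",  # placeholder URL
        candidate_labels=["cat", "dog"],  # at least 2 labels are required
        hypothesis_template="A photo of a {}.",  # assumed placeholder syntax
    )
    print(results)  # list[ZeroShotImageClassificationOutputElement]


asyncio.run(main())
```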
@@ -3274,20 +3299,12 @@ class AsyncInferenceClient:
         [ZeroShotImageClassificationOutputElement(label='dog', score=0.956),...]
         ```
         """
-        # handle deprecation
-        if labels is not None:
-            if candidate_labels is not None:
-                raise ValueError(
-                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
-                )
-            candidate_labels = labels
-        elif candidate_labels is None:
-            raise ValueError("Must specify `candidate_labels`")
         # Raise ValueError if input is less than 2 labels
         if len(candidate_labels) < 2:
             raise ValueError("You must specify at least 2 classes to compare.")

-        provider_helper = get_provider_helper(self.provider, task="zero-shot-image-classification")
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-image-classification", model=model_id)
         request_parameters = provider_helper.prepare_request(
             inputs=image,
             parameters={
@@ -3295,150 +3312,13 @@ class AsyncInferenceClient:
                 "hypothesis_template": hypothesis_template,
             },
             headers=self.headers,
-            model=model or self.model,
+            model=model_id,
             api_key=self.token,
         )
         response = await self._inner_post(request_parameters)
         return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)

-    @_deprecate_method(
-        version="0.33.0",
-        message=(
-            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
-            " Use `HfApi.list_models(..., inference_provider='...')` to list warm models per provider."
-        ),
-    )
-    async def list_deployed_models(
-        self, frameworks: Union[None, str, Literal["all"], List[str]] = None
-    ) -> Dict[str, List[str]]:
-        """
-        List models deployed on the HF Serverless Inference API service.
-
-        This helper checks deployed models framework by framework. By default, it will check the 4 main frameworks that
-        are supported and account for 95% of the hosted models. However, if you want a complete list of models you can
-        specify `frameworks="all"` as input. Alternatively, if you know before-hand which framework you are interested
-        in, you can also restrict to search to this one (e.g. `frameworks="text-generation-inference"`). The more
-        frameworks are checked, the more time it will take.
-
-        <Tip warning={true}>
-
-        This endpoint method does not return a live list of all models available for the HF Inference API service.
-        It searches over a cached list of models that were recently available and the list may not be up to date.
-        If you want to know the live status of a specific model, use [`~InferenceClient.get_model_status`].
-
-        </Tip>
-
-        <Tip>
-
-        This endpoint method is mostly useful for discoverability. If you already know which model you want to use and want to
-        check its availability, you can directly use [`~InferenceClient.get_model_status`].
-
-        </Tip>
-
-        Args:
-            frameworks (`Literal["all"]` or `List[str]` or `str`, *optional*):
-                The frameworks to filter on. By default only a subset of the available frameworks are tested. If set to
-                "all", all available frameworks will be tested. It is also possible to provide a single framework or a
-                custom set of frameworks to check.
-
-        Returns:
-            `Dict[str, List[str]]`: A dictionary mapping task names to a sorted list of model IDs.
-
-        Example:
-        ```py
-        # Must be run in an async contextthon
-        >>> from huggingface_hub import AsyncInferenceClient
-        >>> client = AsyncInferenceClient()
-
-        # Discover zero-shot-classification models currently deployed
-        >>> models = await client.list_deployed_models()
-        >>> models["zero-shot-classification"]
-        ['Narsil/deberta-large-mnli-zero-cls', 'facebook/bart-large-mnli', ...]
-
-        # List from only 1 framework
-        >>> await client.list_deployed_models("text-generation-inference")
-        {'text-generation': ['bigcode/starcoder', 'meta-llama/Llama-2-70b-chat-hf', ...], ...}
-        ```
-        """
-        if self.provider != "hf-inference":
-            raise ValueError(f"Listing deployed models is not supported on '{self.provider}'.")
-
-        # Resolve which frameworks to check
-        if frameworks is None:
-            frameworks = constants.MAIN_INFERENCE_API_FRAMEWORKS
-        elif frameworks == "all":
-            frameworks = constants.ALL_INFERENCE_API_FRAMEWORKS
-        elif isinstance(frameworks, str):
-            frameworks = [frameworks]
-        frameworks = list(set(frameworks))
-
-        # Fetch them iteratively
-        models_by_task: Dict[str, List[str]] = {}
-
-        def _unpack_response(framework: str, items: List[Dict]) -> None:
-            for model in items:
-                if framework == "sentence-transformers":
-                    # Model running with the `sentence-transformers` framework can work with both tasks even if not
-                    # branded as such in the API response
-                    models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
-                    models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
-                else:
-                    models_by_task.setdefault(model["task"], []).append(model["model_id"])
-
-        for framework in frameworks:
-            response = get_session().get(
-                f"{constants.INFERENCE_ENDPOINT}/framework/{framework}", headers=build_hf_headers(token=self.token)
-            )
-            hf_raise_for_status(response)
-            _unpack_response(framework, response.json())
-
-        # Sort alphabetically for discoverability and return
-        for task, models in models_by_task.items():
-            models_by_task[task] = sorted(set(models), key=lambda x: x.lower())
-        return models_by_task
-
-    def _get_client_session(self, headers: Optional[Dict] = None) -> "ClientSession":
-        aiohttp = _import_aiohttp()
-        client_headers = self.headers.copy()
-        if headers is not None:
-            client_headers.update(headers)
-
-        # Return a new aiohttp ClientSession with correct settings.
-        session = aiohttp.ClientSession(
-            headers=client_headers,
-            cookies=self.cookies,
-            timeout=aiohttp.ClientTimeout(self.timeout),
-            trust_env=self.trust_env,
-        )
-
-        # Keep track of sessions to close them later
-        self._sessions[session] = set()
-
-        # Override the `._request` method to register responses to be closed
-        session._wrapped_request = session._request
-
-        async def _request(method, url, **kwargs):
-            response = await session._wrapped_request(method, url, **kwargs)
-            self._sessions[session].add(response)
-            return response
-
-        session._request = _request
-
-        # Override the 'close' method to
-        # 1. close ongoing responses
-        # 2. deregister the session when closed
-        session._close = session.close
-
-        async def close_session():
-            for response in self._sessions[session]:
-                response.close()
-            await session._close()
-            self._sessions.pop(session, None)
-
-        session.close = close_session
-        return session
-
-    async def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+    async def get_endpoint_info(self, *, model: Optional[str] = None) -> dict[str, Any]:
         """
         Get information about the deployed endpoint.

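The removed `list_deployed_models` helper (and the aiohttp-based `_get_client_session` plumbing it relied on) pointed users to `HfApi.list_models(..., inference_provider='...')` in its deprecation message. A sketch of that replacement path; the provider name, task filter, and limit are illustrative choices:

```py
# Sketch of the replacement suggested by the removed deprecation message:
# list warm models per provider via HfApi.list_models. Filters are illustrative.
from huggingface_hub import HfApi

api = HfApi()
models = api.list_models(
    inference_provider="hf-inference",  # provider name, assumed to match the client's
    pipeline_tag="zero-shot-classification",
    limit=5,
)
for model in models:
    print(model.id)
```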
@@ -3451,7 +3331,7 @@ class AsyncInferenceClient:
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.

         Returns:
-            `Dict[str, Any]`: Information about the endpoint.
+            `dict[str, Any]`: Information about the endpoint.

         Example:
         ```py
@@ -3493,17 +3373,16 @@ class AsyncInferenceClient:
         else:
             url = f"{constants.INFERENCE_ENDPOINT}/models/{model}/info"

-        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
-            response = await client.get(url, proxy=self.proxies)
-            response.raise_for_status()
-            return await response.json()
+        client = await self._get_async_client()
+        response = await client.get(url, headers=build_hf_headers(token=self.token))
+        hf_raise_for_status(response)
+        return response.json()

     async def health_check(self, model: Optional[str] = None) -> bool:
         """
         Check the health of the deployed endpoint.

         Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
-        For Inference API, please use [`InferenceClient.get_model_status`] instead.

         Args:
             model (`str`, *optional*):
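`get_endpoint_info` and `health_check` now go through the shared async HTTP client and `hf_raise_for_status`, so failures surface as [`HfHubHTTPError`]. A sketch against a dedicated Inference Endpoint; the endpoint URL is a placeholder:

```py
# Sketch of get_endpoint_info/health_check against an Inference Endpoint URL;
# the endpoint URL is a placeholder and errors now surface as HfHubHTTPError.
import asyncio

from huggingface_hub import AsyncInferenceClient
from huggingface_hub.errors import HfHubHTTPError

ENDPOINT_URL = "https://my-endpoint.endpoints.huggingface.cloud"  # placeholder


async def main() -> None:
    client = AsyncInferenceClient(model=ENDPOINT_URL)
    if await client.health_check():
        try:
            info = await client.get_endpoint_info()
            print(info)  # plain dict describing the deployed endpoint
        except HfHubHTTPError as e:
            print(f"Endpoint info request failed: {e}")
    else:
        print("Endpoint is not healthy.")


asyncio.run(main())
```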
@@ -3528,77 +3407,12 @@ class AsyncInferenceClient:
         if model is None:
             raise ValueError("Model id not provided.")
         if not model.startswith(("http://", "https://")):
-            raise ValueError(
-                "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
-            )
+            raise ValueError("Model must be an Inference Endpoint URL.")
         url = model.rstrip("/") + "/health"

-        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
-            response = await client.get(url, proxy=self.proxies)
-            return response.status == 200
-
-    @_deprecate_method(
-        version="0.33.0",
-        message=(
-            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
-            " Use `HfApi.model_info` to get the model status both with HF Inference API and external providers."
-        ),
-    )
-    async def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
-        """
-        Get the status of a model hosted on the HF Inference API.
-
-        <Tip>
-
-        This endpoint is mostly useful when you already know which model you want to use and want to check its
-        availability. If you want to discover already deployed models, you should rather use [`~InferenceClient.list_deployed_models`].
-
-        </Tip>
-
-        Args:
-            model (`str`, *optional*):
-                Identifier of the model for witch the status gonna be checked. If model is not provided,
-                the model associated with this instance of [`InferenceClient`] will be used. Only HF Inference API service can be checked so the
-                identifier cannot be a URL.
-
-
-        Returns:
-            [`ModelStatus`]: An instance of ModelStatus dataclass, containing information,
-            about the state of the model: load, state, compute type and framework.
-
-        Example:
-        ```py
-        # Must be run in an async context
-        >>> from huggingface_hub import AsyncInferenceClient
-        >>> client = AsyncInferenceClient()
-        >>> await client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
-        ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
-        ```
-        """
-        if self.provider != "hf-inference":
-            raise ValueError(f"Getting model status is not supported on '{self.provider}'.")
-
-        model = model or self.model
-        if model is None:
-            raise ValueError("Model id not provided.")
-        if model.startswith("https://"):
-            raise NotImplementedError("Model status is only available for Inference API endpoints.")
-        url = f"{constants.INFERENCE_ENDPOINT}/status/{model}"
-
-        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
-            response = await client.get(url, proxy=self.proxies)
-            response.raise_for_status()
-            response_data = await response.json()
-
-        if "error" in response_data:
-            raise ValueError(response_data["error"])
-
-        return ModelStatus(
-            loaded=response_data["loaded"],
-            state=response_data["state"],
-            compute_type=response_data["compute_type"],
-            framework=response_data["framework"],
-        )
+        client = await self._get_async_client()
+        response = await client.get(url, headers=build_hf_headers(token=self.token))
+        return response.status_code == 200

     @property
     def chat(self) -> "ProxyClientChat":