huggingface-hub 0.27.1__py3-none-any.whl → 0.28.0rc0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

This version of huggingface-hub might be problematic.

Files changed (39)
  1. huggingface_hub/__init__.py +418 -12
  2. huggingface_hub/_commit_api.py +33 -4
  3. huggingface_hub/_inference_endpoints.py +8 -2
  4. huggingface_hub/_local_folder.py +14 -3
  5. huggingface_hub/commands/scan_cache.py +1 -1
  6. huggingface_hub/commands/upload_large_folder.py +1 -1
  7. huggingface_hub/constants.py +7 -2
  8. huggingface_hub/file_download.py +1 -2
  9. huggingface_hub/hf_api.py +64 -83
  10. huggingface_hub/inference/_client.py +706 -450
  11. huggingface_hub/inference/_common.py +32 -64
  12. huggingface_hub/inference/_generated/_async_client.py +722 -470
  13. huggingface_hub/inference/_generated/types/__init__.py +1 -0
  14. huggingface_hub/inference/_generated/types/image_to_image.py +3 -3
  15. huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
  16. huggingface_hub/inference/_generated/types/text_to_image.py +3 -3
  17. huggingface_hub/inference/_generated/types/text_to_speech.py +3 -6
  18. huggingface_hub/inference/_generated/types/text_to_video.py +47 -0
  19. huggingface_hub/inference/_generated/types/visual_question_answering.py +1 -1
  20. huggingface_hub/inference/_providers/__init__.py +89 -0
  21. huggingface_hub/inference/_providers/fal_ai.py +155 -0
  22. huggingface_hub/inference/_providers/hf_inference.py +202 -0
  23. huggingface_hub/inference/_providers/replicate.py +144 -0
  24. huggingface_hub/inference/_providers/sambanova.py +85 -0
  25. huggingface_hub/inference/_providers/together.py +148 -0
  26. huggingface_hub/py.typed +0 -0
  27. huggingface_hub/repocard.py +1 -1
  28. huggingface_hub/repocard_data.py +2 -1
  29. huggingface_hub/serialization/_base.py +1 -1
  30. huggingface_hub/serialization/_torch.py +1 -1
  31. huggingface_hub/utils/_fixes.py +25 -13
  32. huggingface_hub/utils/_http.py +2 -2
  33. huggingface_hub/utils/logging.py +1 -1
  34. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/METADATA +4 -4
  35. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/RECORD +39 -31
  36. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/LICENSE +0 -0
  37. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/WHEEL +0 -0
  38. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/entry_points.txt +0 -0
  39. {huggingface_hub-0.27.1.dist-info → huggingface_hub-0.28.0rc0.dist-info}/top_level.txt +0 -0
@@ -26,14 +26,13 @@ import time
26
26
  import warnings
27
27
  from typing import TYPE_CHECKING, Any, AsyncIterable, Dict, List, Literal, Optional, Set, Union, overload
28
28
 
29
- from requests.structures import CaseInsensitiveDict
30
-
31
29
  from huggingface_hub.constants import ALL_INFERENCE_API_FRAMEWORKS, INFERENCE_ENDPOINT, MAIN_INFERENCE_API_FRAMEWORKS
32
30
  from huggingface_hub.errors import InferenceTimeoutError
33
31
  from huggingface_hub.inference._common import (
34
32
  TASKS_EXPECTING_IMAGES,
35
33
  ContentT,
36
34
  ModelStatus,
35
+ RequestParameters,
37
36
  _async_stream_chat_completion_response,
38
37
  _async_stream_text_generation_response,
39
38
  _b64_encode,
@@ -41,11 +40,9 @@ from huggingface_hub.inference._common import (
41
40
  _bytes_to_dict,
42
41
  _bytes_to_image,
43
42
  _bytes_to_list,
44
- _fetch_recommended_models,
45
43
  _get_unsupported_text_generation_kwargs,
46
44
  _import_numpy,
47
45
  _open_as_binary,
48
- _prepare_payload,
49
46
  _set_unsupported_text_generation_kwargs,
50
47
  raise_text_generation_error,
51
48
  )
@@ -90,8 +87,9 @@ from huggingface_hub.inference._generated.types import (
90
87
  ZeroShotClassificationOutputElement,
91
88
  ZeroShotImageClassificationOutputElement,
92
89
  )
93
- from huggingface_hub.utils import build_hf_headers
94
- from huggingface_hub.utils._deprecation import _deprecate_arguments
90
+ from huggingface_hub.inference._providers import PROVIDER_T, HFInferenceTask, get_provider_helper
91
+ from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
92
+ from huggingface_hub.utils._deprecation import _deprecate_arguments, _deprecate_method
95
93
 
96
94
  from .._common import _async_yield_from, _import_aiohttp
97
95
 
@@ -112,7 +110,7 @@ class AsyncInferenceClient:
112
110
  Initialize a new Inference Client.
113
111
 
114
112
  [`InferenceClient`] aims to provide a unified experience to perform inference. The client can be used
115
- seamlessly with either the (free) Inference API or self-hosted Inference Endpoints.
113
+ seamlessly with either the (free) Inference API, self-hosted Inference Endpoints, or third-party Inference Providers.
116
114
 
117
115
  Args:
118
116
  model (`str`, `optional`):
@@ -123,6 +121,10 @@ class AsyncInferenceClient:
123
121
  arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
124
122
  path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
125
123
  documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
124
+ provider (`str`, *optional*):
125
+ Name of the provider to use for inference. Can be `"replicate"`, `"together"`, `"fal-ai"`, `"sambanova"` or `"hf-inference"`.
126
+ Defaults to `"hf-inference"` (Hugging Face Serverless Inference API).
127
+ If `model` is a URL or `base_url` is passed, then `provider` is not used.
126
128
  token (`str` or `bool`, *optional*):
127
129
  Hugging Face token. Will default to the locally saved token if not provided.
128
130
  Pass `token=False` if you don't want to send your token to the server.
@@ -152,7 +154,8 @@ class AsyncInferenceClient:
152
154
  self,
153
155
  model: Optional[str] = None,
154
156
  *,
155
- token: Union[str, bool, None] = None,
157
+ provider: Optional[PROVIDER_T] = None,
158
+ token: Optional[str] = None,
156
159
  timeout: Optional[float] = None,
157
160
  headers: Optional[Dict[str, str]] = None,
158
161
  cookies: Optional[Dict[str, str]] = None,
@@ -177,12 +180,12 @@ class AsyncInferenceClient:
177
180
  )
178
181
 
179
182
  self.model: Optional[str] = model
180
- self.token: Union[str, bool, None] = token if token is not None else api_key
181
- self.headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
182
- build_hf_headers(token=self.token) # 'authorization' + 'user-agent'
183
- )
184
- if headers is not None:
185
- self.headers.update(headers)
183
+ self.token: Optional[str] = token if token is not None else api_key
184
+ self.headers = headers if headers is not None else {}
185
+
186
+ # Configure provider
187
+ self.provider = provider if provider is not None else "hf-inference"
188
+
186
189
  self.cookies = cookies
187
190
  self.timeout = timeout
188
191
  self.trust_env = trust_env
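For orientation, here is a minimal sketch of how the new `provider` and `api_key` arguments added in this hunk are meant to be combined. Provider names come from the docstring above; the model ID is the illustrative one used in the docstring examples further down.

```py
# Sketch only — exercises the new `provider` / `api_key` constructor arguments.
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient(
    provider="together",            # "replicate", "together", "fal-ai", "sambanova" or "hf-inference" (default)
    api_key="<provider-or-hf-token>",
)
response = await client.chat_completion(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
)
print(response.choices[0].message.content)
```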
@@ -230,6 +233,14 @@ class AsyncInferenceClient:
230
233
  stream: bool = False,
231
234
  ) -> Union[bytes, AsyncIterable[bytes]]: ...
232
235
 
236
+ @_deprecate_method(
237
+ version="0.31.0",
238
+ message=(
239
+ "Making direct POST requests to the inference server is not supported anymore. "
240
+ "Please use task methods instead (e.g. `InferenceClient.chat_completion`). "
241
+ "If your use case is not supported, please open an issue in https://github.com/huggingface/huggingface_hub."
242
+ ),
243
+ )
233
244
  async def post(
234
245
  self,
235
246
  *,
@@ -242,56 +253,67 @@ class AsyncInferenceClient:
242
253
  """
243
254
  Make a POST request to the inference server.
244
255
 
245
- Args:
246
- json (`Union[str, Dict, List]`, *optional*):
247
- The JSON data to send in the request body, specific to each task. Defaults to None.
248
- data (`Union[str, Path, bytes, BinaryIO]`, *optional*):
249
- The content to send in the request body, specific to each task.
250
- It can be raw bytes, a pointer to an opened file, a local file path,
251
- or a URL to an online resource (image, audio file,...). If both `json` and `data` are passed,
252
- `data` will take precedence. At least `json` or `data` must be provided. Defaults to None.
253
- model (`str`, *optional*):
254
- The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
255
- Inference Endpoint. Will override the model defined at the instance level. Defaults to None.
256
- task (`str`, *optional*):
257
- The task to perform on the inference. All available tasks can be found
258
- [here](https://huggingface.co/tasks). Used only to default to a recommended model if `model` is not
259
- provided. At least `model` or `task` must be provided. Defaults to None.
260
- stream (`bool`, *optional*):
261
- Whether to iterate over streaming APIs.
256
+ This method is deprecated and will be removed in the future.
257
+ Please use task methods instead (e.g. `InferenceClient.chat_completion`).
258
+ """
259
+ if self.provider != "hf-inference":
260
+ raise ValueError(
261
+ "Cannot use `post` with another provider than `hf-inference`. "
262
+ "`InferenceClient.post` is deprecated and should not be used directly anymore."
263
+ )
264
+ provider_helper = HFInferenceTask(task or "unknown")
265
+ url = provider_helper.build_url(provider_helper.map_model(model))
266
+ headers = provider_helper.prepare_headers(headers=self.headers, api_key=self.token)
267
+ return await self._inner_post(
268
+ request_parameters=RequestParameters(
269
+ url=url,
270
+ task=task or "unknown",
271
+ model=model or "unknown",
272
+ json=json,
273
+ data=data,
274
+ headers=headers,
275
+ ),
276
+ stream=stream,
277
+ )
262
278
 
263
- Returns:
264
- bytes: The raw bytes returned by the server.
279
+ @overload
280
+ async def _inner_post( # type: ignore[misc]
281
+ self, request_parameters: RequestParameters, *, stream: Literal[False] = ...
282
+ ) -> bytes: ...
265
283
 
266
- Raises:
267
- [`InferenceTimeoutError`]:
268
- If the model is unavailable or the request times out.
269
- `aiohttp.ClientResponseError`:
270
- If the request fails with an HTTP error status code other than HTTP 503.
271
- """
284
+ @overload
285
+ async def _inner_post( # type: ignore[misc]
286
+ self, request_parameters: RequestParameters, *, stream: Literal[True] = ...
287
+ ) -> AsyncIterable[bytes]: ...
272
288
 
273
- aiohttp = _import_aiohttp()
289
+ @overload
290
+ async def _inner_post(
291
+ self, request_parameters: RequestParameters, *, stream: bool = False
292
+ ) -> Union[bytes, AsyncIterable[bytes]]: ...
274
293
 
275
- url = self._resolve_url(model, task)
294
+ async def _inner_post(
295
+ self, request_parameters: RequestParameters, *, stream: bool = False
296
+ ) -> Union[bytes, AsyncIterable[bytes]]:
297
+ """Make a request to the inference server."""
276
298
 
277
- if data is not None and json is not None:
278
- warnings.warn("Ignoring `json` as `data` is passed as binary.")
299
+ aiohttp = _import_aiohttp()
279
300
 
280
- # Set Accept header if relevant
281
- headers = dict()
282
- if task in TASKS_EXPECTING_IMAGES and "Accept" not in headers:
283
- headers["Accept"] = "image/png"
301
+ # TODO: this should be handled in provider helpers directly
302
+ if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
303
+ request_parameters.headers["Accept"] = "image/png"
284
304
 
285
305
  t0 = time.time()
286
306
  timeout = self.timeout
287
307
  while True:
288
- with _open_as_binary(data) as data_as_binary:
308
+ with _open_as_binary(request_parameters.data) as data_as_binary:
289
309
  # Do not use context manager as we don't want to close the connection immediately when returning
290
310
  # a stream
291
- session = self._get_client_session(headers=headers)
311
+ session = self._get_client_session(headers=request_parameters.headers)
292
312
 
293
313
  try:
294
- response = await session.post(url, json=json, data=data_as_binary, proxy=self.proxies)
314
+ response = await session.post(
315
+ request_parameters.url, json=request_parameters.json, data=data_as_binary, proxy=self.proxies
316
+ )
295
317
  response_error_payload = None
296
318
  if response.status != 200:
297
319
  try:
@@ -308,25 +330,27 @@ class AsyncInferenceClient:
308
330
  except asyncio.TimeoutError as error:
309
331
  await session.close()
310
332
  # Convert any `TimeoutError` to a `InferenceTimeoutError`
311
- raise InferenceTimeoutError(f"Inference call timed out: {url}") from error # type: ignore
333
+ raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error # type: ignore
312
334
  except aiohttp.ClientResponseError as error:
313
335
  error.response_error_payload = response_error_payload
314
336
  await session.close()
315
- if response.status == 422 and task is not None:
316
- error.message += f". Make sure '{task}' task is supported by the model."
337
+ if response.status == 422 and request_parameters.task != "unknown":
338
+ error.message += f". Make sure '{request_parameters.task}' task is supported by the model."
317
339
  if response.status == 503:
318
340
  # If Model is unavailable, either raise a TimeoutError...
319
341
  if timeout is not None and time.time() - t0 > timeout:
320
342
  raise InferenceTimeoutError(
321
- f"Model not loaded on the server: {url}. Please retry with a higher timeout"
343
+ f"Model not loaded on the server: {request_parameters.url}. Please retry with a higher timeout"
322
344
  f" (current: {self.timeout}).",
323
345
  request=error.request,
324
346
  response=error.response,
325
347
  ) from error
326
348
  # ...or wait 1s and retry
327
349
  logger.info(f"Waiting for model to be loaded on the server: {error}")
328
- if "X-wait-for-model" not in headers and url.startswith(INFERENCE_ENDPOINT):
329
- headers["X-wait-for-model"] = "1"
350
+ if "X-wait-for-model" not in request_parameters.headers and request_parameters.url.startswith(
351
+ INFERENCE_ENDPOINT
352
+ ):
353
+ request_parameters.headers["X-wait-for-model"] = "1"
330
354
  await asyncio.sleep(1)
331
355
  if timeout is not None:
332
356
  timeout = max(self.timeout - (time.time() - t0), 1) # type: ignore
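As a hedged before/after illustration of the deprecation above: `post()` keeps working for the default `hf-inference` provider until it is removed in 0.31.0, but new code is expected to call the task methods, which now build the request through the provider helpers and `_inner_post()`. The model ID and input text are illustrative.

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient()

# Before: raw POST (now emits a deprecation warning, removal planned for 0.31.0)
raw = await client.post(
    json={"inputs": "I like you."},
    model="distilbert-base-uncased-finetuned-sst-2-english",
    task="text-classification",
)

# After: the equivalent task method
result = await client.text_classification(
    "I like you.",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
```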
@@ -408,9 +432,15 @@ class AsyncInferenceClient:
408
432
  ]
409
433
  ```
410
434
  """
411
- parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
412
- payload = _prepare_payload(audio, parameters=parameters, expect_binary=True)
413
- response = await self.post(**payload, model=model, task="audio-classification")
435
+ provider_helper = get_provider_helper(self.provider, task="audio-classification")
436
+ request_parameters = provider_helper.prepare_request(
437
+ inputs=audio,
438
+ parameters={"function_to_apply": function_to_apply, "top_k": top_k},
439
+ headers=self.headers,
440
+ model=model or self.model,
441
+ api_key=self.token,
442
+ )
443
+ response = await self._inner_post(request_parameters)
414
444
  return AudioClassificationOutputElement.parse_obj_as_list(response)
415
445
 
416
446
  async def audio_to_audio(
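Every task method in this file is refactored along the same lines as `audio_classification` above: resolve a provider helper via `get_provider_helper()`, let it produce a `RequestParameters` object with `prepare_request()`, then send it through the private `_inner_post()`. From the caller's side nothing changes; a short usage sketch with an illustrative audio file and `top_k`:

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient()  # provider defaults to "hf-inference"

# Internally: get_provider_helper("hf-inference", task="audio-classification"),
# then provider_helper.prepare_request(...) -> RequestParameters -> _inner_post().
labels = await client.audio_classification("sample.flac", top_k=3)
for item in labels:
    print(item.label, item.score)
```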
@@ -451,7 +481,15 @@ class AsyncInferenceClient:
451
481
  f.write(item.blob)
452
482
  ```
453
483
  """
454
- response = await self.post(data=audio, model=model, task="audio-to-audio")
484
+ provider_helper = get_provider_helper(self.provider, task="audio-to-audio")
485
+ request_parameters = provider_helper.prepare_request(
486
+ inputs=audio,
487
+ parameters={},
488
+ headers=self.headers,
489
+ model=model or self.model,
490
+ api_key=self.token,
491
+ )
492
+ response = await self._inner_post(request_parameters)
455
493
  audio_output = AudioToAudioOutputElement.parse_obj_as_list(response)
456
494
  for item in audio_output:
457
495
  item.blob = base64.b64decode(item.blob)
@@ -472,7 +510,8 @@ class AsyncInferenceClient:
472
510
  model (`str`, *optional*):
473
511
  The model to use for ASR. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
474
512
  Inference Endpoint. If not provided, the default recommended model for ASR will be used.
475
-
513
+ parameters (Dict[str, Any], *optional*):
514
+ Additional parameters to pass to the model.
476
515
  Returns:
477
516
  [`AutomaticSpeechRecognitionOutput`]: An item containing the transcribed text and optionally the timestamp chunks.
478
517
 
@@ -491,7 +530,15 @@ class AsyncInferenceClient:
491
530
  "hello world"
492
531
  ```
493
532
  """
494
- response = await self.post(data=audio, model=model, task="automatic-speech-recognition")
533
+ provider_helper = get_provider_helper(self.provider, task="automatic-speech-recognition")
534
+ request_parameters = provider_helper.prepare_request(
535
+ inputs=audio,
536
+ parameters={},
537
+ headers=self.headers,
538
+ model=model or self.model,
539
+ api_key=self.token,
540
+ )
541
+ response = await self._inner_post(request_parameters)
495
542
  return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
496
543
 
497
544
  @overload
@@ -605,6 +652,10 @@ class AsyncInferenceClient:
605
652
 
606
653
  </Tip>
607
654
 
655
+ <Tip>
656
+ Some parameters might not be supported by some providers.
657
+ </Tip>
658
+
608
659
  Args:
609
660
  messages (List of [`ChatCompletionInputMessage`]):
610
661
  Conversation history consisting of roles and content pairs.
@@ -612,25 +663,20 @@ class AsyncInferenceClient:
612
663
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
613
664
  Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
614
665
  See https://huggingface.co/tasks/text-generation for more details.
615
-
616
666
  If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
617
667
  custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
618
668
  frequency_penalty (`float`, *optional*):
619
669
  Penalizes new tokens based on their existing frequency
620
670
  in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
621
671
  logit_bias (`List[float]`, *optional*):
622
- Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens
623
- (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,
624
- the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
625
- but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should
626
- result in a ban or exclusive selection of the relevant token. Defaults to None.
672
+ Adjusts the likelihood of specific tokens appearing in the generated output.
627
673
  logprobs (`bool`, *optional*):
628
674
  Whether to return log probabilities of the output tokens or not. If true, returns the log
629
675
  probabilities of each output token returned in the content of message.
630
676
  max_tokens (`int`, *optional*):
631
677
  Maximum number of tokens allowed in the response. Defaults to 100.
632
678
  n (`int`, *optional*):
633
- UNUSED.
679
+ The number of completions to generate for each prompt.
634
680
  presence_penalty (`float`, *optional*):
635
681
  Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
636
682
  text so far, increasing the model's likelihood to talk about new topics.
@@ -638,7 +684,7 @@ class AsyncInferenceClient:
638
684
  Grammar constraints. Can be either a JSONSchema or a regex.
639
685
  seed (Optional[`int`], *optional*):
640
686
  Seed for reproducible control flow. Defaults to None.
641
- stop (Optional[`str`], *optional*):
687
+ stop (`List[str]`, *optional*):
642
688
  Up to four strings which trigger the end of the response.
643
689
  Defaults to None.
644
690
  stream (`bool`, *optional*):
@@ -750,6 +796,32 @@ class AsyncInferenceClient:
750
796
  print(chunk.choices[0].delta.content)
751
797
  ```
752
798
 
799
+ Example using a third-party provider directly. Usage will be billed on your Together AI account.
800
+ ```py
801
+ >>> from huggingface_hub import InferenceClient
802
+ >>> client = InferenceClient(
803
+ ... provider="together", # Use Together AI provider
804
+ ... api_key="<together_api_key>", # Pass your Together API key directly
805
+ ... )
806
+ >>> client.chat_completion(
807
+ ... model="meta-llama/Meta-Llama-3-8B-Instruct",
808
+ ... messages=[{"role": "user", "content": "What is the capital of France?"}],
809
+ ... )
810
+ ```
811
+
812
+ Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
813
+ ```py
814
+ >>> from huggingface_hub import InferenceClient
815
+ >>> client = InferenceClient(
816
+ ... provider="sambanova", # Use Sambanova provider
817
+ ... api_key="hf_...", # Pass your HF token
818
+ ... )
819
+ >>> client.chat_completion(
820
+ ... model="meta-llama/Meta-Llama-3-8B-Instruct",
821
+ ... messages=[{"role": "user", "content": "What is the capital of France?"}],
822
+ ... )
823
+ ```
824
+
753
825
  Example using Image + Text as input:
754
826
  ```py
755
827
  # Must be run in an async context
@@ -901,68 +973,50 @@ class AsyncInferenceClient:
901
973
  '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
902
974
  ```
903
975
  """
904
- model_url = self._resolve_chat_completion_url(model)
905
-
906
- # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing.
907
- # If it's a ID on the Hub => use it. Otherwise, we use a random string.
908
- model_id = model or self.model or "tgi"
909
- if model_id.startswith(("http://", "https://")):
910
- model_id = "tgi" # dummy value
911
-
912
- payload = dict(
913
- model=model_id,
914
- messages=messages,
915
- frequency_penalty=frequency_penalty,
916
- logit_bias=logit_bias,
917
- logprobs=logprobs,
918
- max_tokens=max_tokens,
919
- n=n,
920
- presence_penalty=presence_penalty,
921
- response_format=response_format,
922
- seed=seed,
923
- stop=stop,
924
- temperature=temperature,
925
- tool_choice=tool_choice,
926
- tool_prompt=tool_prompt,
927
- tools=tools,
928
- top_logprobs=top_logprobs,
929
- top_p=top_p,
930
- stream=stream,
931
- stream_options=stream_options,
976
+ # Get the provider helper
977
+ provider_helper = get_provider_helper(self.provider, task="conversational")
978
+
979
+ # Since `chat_completion(..., model=xxx)` is also a payload parameter for the server, we need to handle 'model' differently.
980
+ # `self.base_url` and `self.model` take precedence over 'model' argument for building URL.
981
+ # `model` takes precedence for payload value.
982
+ model_id_or_url = self.base_url or self.model or model
983
+ payload_model = model or self.model
984
+
985
+ # Prepare the payload
986
+ parameters = {
987
+ "model": payload_model,
988
+ "frequency_penalty": frequency_penalty,
989
+ "logit_bias": logit_bias,
990
+ "logprobs": logprobs,
991
+ "max_tokens": max_tokens,
992
+ "n": n,
993
+ "presence_penalty": presence_penalty,
994
+ "response_format": response_format,
995
+ "seed": seed,
996
+ "stop": stop,
997
+ "temperature": temperature,
998
+ "tool_choice": tool_choice,
999
+ "tool_prompt": tool_prompt,
1000
+ "tools": tools,
1001
+ "top_logprobs": top_logprobs,
1002
+ "top_p": top_p,
1003
+ "stream": stream,
1004
+ "stream_options": stream_options,
1005
+ }
1006
+ request_parameters = provider_helper.prepare_request(
1007
+ inputs=messages,
1008
+ parameters=parameters,
1009
+ headers=self.headers,
1010
+ model=model_id_or_url,
1011
+ api_key=self.token,
932
1012
  )
933
- payload = {key: value for key, value in payload.items() if value is not None}
934
- data = await self.post(model=model_url, json=payload, stream=stream)
1013
+ data = await self._inner_post(request_parameters, stream=stream)
935
1014
 
936
1015
  if stream:
937
1016
  return _async_stream_chat_completion_response(data) # type: ignore[arg-type]
938
1017
 
939
1018
  return ChatCompletionOutput.parse_obj_as_instance(data) # type: ignore[arg-type]
940
1019
 
941
- def _resolve_chat_completion_url(self, model: Optional[str] = None) -> str:
942
- # Since `chat_completion(..., model=xxx)` is also a payload parameter for the server, we need to handle 'model' differently.
943
- # `self.base_url` and `self.model` takes precedence over 'model' argument only in `chat_completion`.
944
- model_id_or_url = self.base_url or self.model or model or self.get_recommended_model("text-generation")
945
-
946
- # Resolve URL if it's a model ID
947
- model_url = (
948
- model_id_or_url
949
- if model_id_or_url.startswith(("http://", "https://"))
950
- else self._resolve_url(model_id_or_url, task="text-generation")
951
- )
952
-
953
- # Strip trailing /
954
- model_url = model_url.rstrip("/")
955
-
956
- # Append /chat/completions if not already present
957
- if model_url.endswith("/v1"):
958
- model_url += "/chat/completions"
959
-
960
- # Append /v1/chat/completions if not already present
961
- if not model_url.endswith("/chat/completions"):
962
- model_url += "/v1/chat/completions"
963
-
964
- return model_url
965
-
966
1020
  async def document_question_answering(
967
1021
  self,
968
1022
  image: ContentT,
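The precedence rules spelled out in the comments of the `chat_completion` hunk above (`base_url`/`self.model` win for routing, the `model` argument wins for the payload) are easier to see in a quick sketch; the endpoint URL and model ID are illustrative.

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

# `base_url` decides where the request is sent...
client = AsyncInferenceClient(base_url="http://localhost:8080")
await client.chat_completion(
    # ...while `model` is still forwarded in the JSON payload (useful for debugging/routing server-side).
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
```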
@@ -1030,18 +1084,24 @@ class AsyncInferenceClient:
1030
1084
  ```
1031
1085
  """
1032
1086
  inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
1033
- parameters = {
1034
- "doc_stride": doc_stride,
1035
- "handle_impossible_answer": handle_impossible_answer,
1036
- "lang": lang,
1037
- "max_answer_len": max_answer_len,
1038
- "max_question_len": max_question_len,
1039
- "max_seq_len": max_seq_len,
1040
- "top_k": top_k,
1041
- "word_boxes": word_boxes,
1042
- }
1043
- payload = _prepare_payload(inputs, parameters=parameters)
1044
- response = await self.post(**payload, model=model, task="document-question-answering")
1087
+ provider_helper = get_provider_helper(self.provider, task="document-question-answering")
1088
+ request_parameters = provider_helper.prepare_request(
1089
+ inputs=inputs,
1090
+ parameters={
1091
+ "doc_stride": doc_stride,
1092
+ "handle_impossible_answer": handle_impossible_answer,
1093
+ "lang": lang,
1094
+ "max_answer_len": max_answer_len,
1095
+ "max_question_len": max_question_len,
1096
+ "max_seq_len": max_seq_len,
1097
+ "top_k": top_k,
1098
+ "word_boxes": word_boxes,
1099
+ },
1100
+ headers=self.headers,
1101
+ model=model or self.model,
1102
+ api_key=self.token,
1103
+ )
1104
+ response = await self._inner_post(request_parameters)
1045
1105
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
1046
1106
 
1047
1107
  async def feature_extraction(
@@ -1100,14 +1160,20 @@ class AsyncInferenceClient:
1100
1160
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
1101
1161
  ```
1102
1162
  """
1103
- parameters = {
1104
- "normalize": normalize,
1105
- "prompt_name": prompt_name,
1106
- "truncate": truncate,
1107
- "truncation_direction": truncation_direction,
1108
- }
1109
- payload = _prepare_payload(text, parameters=parameters)
1110
- response = await self.post(**payload, model=model, task="feature-extraction")
1163
+ provider_helper = get_provider_helper(self.provider, task="feature-extraction")
1164
+ request_parameters = provider_helper.prepare_request(
1165
+ inputs=text,
1166
+ parameters={
1167
+ "normalize": normalize,
1168
+ "prompt_name": prompt_name,
1169
+ "truncate": truncate,
1170
+ "truncation_direction": truncation_direction,
1171
+ },
1172
+ headers=self.headers,
1173
+ model=model or self.model,
1174
+ api_key=self.token,
1175
+ )
1176
+ response = await self._inner_post(request_parameters)
1111
1177
  np = _import_numpy()
1112
1178
  return np.array(_bytes_to_dict(response), dtype="float32")
1113
1179
 
@@ -1156,9 +1222,15 @@ class AsyncInferenceClient:
1156
1222
  ]
1157
1223
  ```
1158
1224
  """
1159
- parameters = {"targets": targets, "top_k": top_k}
1160
- payload = _prepare_payload(text, parameters=parameters)
1161
- response = await self.post(**payload, model=model, task="fill-mask")
1225
+ provider_helper = get_provider_helper(self.provider, task="fill-mask")
1226
+ request_parameters = provider_helper.prepare_request(
1227
+ inputs=text,
1228
+ parameters={"targets": targets, "top_k": top_k},
1229
+ headers=self.headers,
1230
+ model=model or self.model,
1231
+ api_key=self.token,
1232
+ )
1233
+ response = await self._inner_post(request_parameters)
1162
1234
  return FillMaskOutputElement.parse_obj_as_list(response)
1163
1235
 
1164
1236
  async def image_classification(
@@ -1200,9 +1272,15 @@ class AsyncInferenceClient:
1200
1272
  [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
1201
1273
  ```
1202
1274
  """
1203
- parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
1204
- payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1205
- response = await self.post(**payload, model=model, task="image-classification")
1275
+ provider_helper = get_provider_helper(self.provider, task="image-classification")
1276
+ request_parameters = provider_helper.prepare_request(
1277
+ inputs=image,
1278
+ parameters={"function_to_apply": function_to_apply, "top_k": top_k},
1279
+ headers=self.headers,
1280
+ model=model or self.model,
1281
+ api_key=self.token,
1282
+ )
1283
+ response = await self._inner_post(request_parameters)
1206
1284
  return ImageClassificationOutputElement.parse_obj_as_list(response)
1207
1285
 
1208
1286
  async def image_segmentation(
@@ -1256,14 +1334,20 @@ class AsyncInferenceClient:
1256
1334
  [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
1257
1335
  ```
1258
1336
  """
1259
- parameters = {
1260
- "mask_threshold": mask_threshold,
1261
- "overlap_mask_area_threshold": overlap_mask_area_threshold,
1262
- "subtask": subtask,
1263
- "threshold": threshold,
1264
- }
1265
- payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1266
- response = await self.post(**payload, model=model, task="image-segmentation")
1337
+ provider_helper = get_provider_helper(self.provider, task="audio-classification")
1338
+ request_parameters = provider_helper.prepare_request(
1339
+ inputs=image,
1340
+ parameters={
1341
+ "mask_threshold": mask_threshold,
1342
+ "overlap_mask_area_threshold": overlap_mask_area_threshold,
1343
+ "subtask": subtask,
1344
+ "threshold": threshold,
1345
+ },
1346
+ headers=self.headers,
1347
+ model=model or self.model,
1348
+ api_key=self.token,
1349
+ )
1350
+ response = await self._inner_post(request_parameters)
1267
1351
  output = ImageSegmentationOutputElement.parse_obj_as_list(response)
1268
1352
  for item in output:
1269
1353
  item.mask = _b64_to_image(item.mask) # type: ignore [assignment]
@@ -1274,7 +1358,7 @@ class AsyncInferenceClient:
1274
1358
  image: ContentT,
1275
1359
  prompt: Optional[str] = None,
1276
1360
  *,
1277
- negative_prompt: Optional[List[str]] = None,
1361
+ negative_prompt: Optional[str] = None,
1278
1362
  num_inference_steps: Optional[int] = None,
1279
1363
  guidance_scale: Optional[float] = None,
1280
1364
  model: Optional[str] = None,
@@ -1295,8 +1379,8 @@ class AsyncInferenceClient:
1295
1379
  The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
1296
1380
  prompt (`str`, *optional*):
1297
1381
  The text prompt to guide the image generation.
1298
- negative_prompt (`List[str]`, *optional*):
1299
- One or several prompt to guide what NOT to include in image generation.
1382
+ negative_prompt (`str`, *optional*):
1383
+ One prompt to guide what NOT to include in image generation.
1300
1384
  num_inference_steps (`int`, *optional*):
1301
1385
  For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
1302
1386
  quality image at the expense of slower inference.
@@ -1327,16 +1411,22 @@ class AsyncInferenceClient:
1327
1411
  >>> image.save("tiger.jpg")
1328
1412
  ```
1329
1413
  """
1330
- parameters = {
1331
- "prompt": prompt,
1332
- "negative_prompt": negative_prompt,
1333
- "target_size": target_size,
1334
- "num_inference_steps": num_inference_steps,
1335
- "guidance_scale": guidance_scale,
1336
- **kwargs,
1337
- }
1338
- payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1339
- response = await self.post(**payload, model=model, task="image-to-image")
1414
+ provider_helper = get_provider_helper(self.provider, task="image-to-image")
1415
+ request_parameters = provider_helper.prepare_request(
1416
+ inputs=image,
1417
+ parameters={
1418
+ "prompt": prompt,
1419
+ "negative_prompt": negative_prompt,
1420
+ "target_size": target_size,
1421
+ "num_inference_steps": num_inference_steps,
1422
+ "guidance_scale": guidance_scale,
1423
+ **kwargs,
1424
+ },
1425
+ headers=self.headers,
1426
+ model=model or self.model,
1427
+ api_key=self.token,
1428
+ )
1429
+ response = await self._inner_post(request_parameters)
1340
1430
  return _bytes_to_image(response)
1341
1431
 
1342
1432
  async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
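One user-visible change in `image_to_image` above: `negative_prompt` is now a single string instead of a list. A short sketch mirroring the docstring example (file names are illustrative):

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient()
image = await client.image_to_image(
    "cat.jpg",
    prompt="turn the cat into a tiger",
    negative_prompt="blurry, low quality",  # was Optional[List[str]], now Optional[str]
)
image.save("tiger.jpg")
```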
@@ -1373,99 +1463,18 @@ class AsyncInferenceClient:
1373
1463
  'a dog laying on the grass next to a flower pot '
1374
1464
  ```
1375
1465
  """
1376
- response = await self.post(data=image, model=model, task="image-to-text")
1466
+ provider_helper = get_provider_helper(self.provider, task="image-to-text")
1467
+ request_parameters = provider_helper.prepare_request(
1468
+ inputs=image,
1469
+ parameters={},
1470
+ headers=self.headers,
1471
+ model=model or self.model,
1472
+ api_key=self.token,
1473
+ )
1474
+ response = await self._inner_post(request_parameters)
1377
1475
  output = ImageToTextOutput.parse_obj(response)
1378
1476
  return output[0] if isinstance(output, list) else output
1379
1477
 
1380
- async def list_deployed_models(
1381
- self, frameworks: Union[None, str, Literal["all"], List[str]] = None
1382
- ) -> Dict[str, List[str]]:
1383
- """
1384
- List models deployed on the Serverless Inference API service.
1385
-
1386
- This helper checks deployed models framework by framework. By default, it will check the 4 main frameworks that
1387
- are supported and account for 95% of the hosted models. However, if you want a complete list of models you can
1388
- specify `frameworks="all"` as input. Alternatively, if you know before-hand which framework you are interested
1389
- in, you can also restrict to search to this one (e.g. `frameworks="text-generation-inference"`). The more
1390
- frameworks are checked, the more time it will take.
1391
-
1392
- <Tip warning={true}>
1393
-
1394
- This endpoint method does not return a live list of all models available for the Serverless Inference API service.
1395
- It searches over a cached list of models that were recently available and the list may not be up to date.
1396
- If you want to know the live status of a specific model, use [`~InferenceClient.get_model_status`].
1397
-
1398
- </Tip>
1399
-
1400
- <Tip>
1401
-
1402
- This endpoint method is mostly useful for discoverability. If you already know which model you want to use and want to
1403
- check its availability, you can directly use [`~InferenceClient.get_model_status`].
1404
-
1405
- </Tip>
1406
-
1407
- Args:
1408
- frameworks (`Literal["all"]` or `List[str]` or `str`, *optional*):
1409
- The frameworks to filter on. By default only a subset of the available frameworks are tested. If set to
1410
- "all", all available frameworks will be tested. It is also possible to provide a single framework or a
1411
- custom set of frameworks to check.
1412
-
1413
- Returns:
1414
- `Dict[str, List[str]]`: A dictionary mapping task names to a sorted list of model IDs.
1415
-
1416
- Example:
1417
- ```py
1418
- # Must be run in an async context
1419
- >>> from huggingface_hub import AsyncInferenceClient
1420
- >>> client = AsyncInferenceClient()
1421
-
1422
- # Discover zero-shot-classification models currently deployed
1423
- >>> models = await client.list_deployed_models()
1424
- >>> models["zero-shot-classification"]
1425
- ['Narsil/deberta-large-mnli-zero-cls', 'facebook/bart-large-mnli', ...]
1426
-
1427
- # List from only 1 framework
1428
- >>> await client.list_deployed_models("text-generation-inference")
1429
- {'text-generation': ['bigcode/starcoder', 'meta-llama/Llama-2-70b-chat-hf', ...], ...}
1430
- ```
1431
- """
1432
- # Resolve which frameworks to check
1433
- if frameworks is None:
1434
- frameworks = MAIN_INFERENCE_API_FRAMEWORKS
1435
- elif frameworks == "all":
1436
- frameworks = ALL_INFERENCE_API_FRAMEWORKS
1437
- elif isinstance(frameworks, str):
1438
- frameworks = [frameworks]
1439
- frameworks = list(set(frameworks))
1440
-
1441
- # Fetch them iteratively
1442
- models_by_task: Dict[str, List[str]] = {}
1443
-
1444
- def _unpack_response(framework: str, items: List[Dict]) -> None:
1445
- for model in items:
1446
- if framework == "sentence-transformers":
1447
- # Model running with the `sentence-transformers` framework can work with both tasks even if not
1448
- # branded as such in the API response
1449
- models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
1450
- models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
1451
- else:
1452
- models_by_task.setdefault(model["task"], []).append(model["model_id"])
1453
-
1454
- async def _fetch_framework(framework: str) -> None:
1455
- async with self._get_client_session() as client:
1456
- response = await client.get(f"{INFERENCE_ENDPOINT}/framework/{framework}", proxy=self.proxies)
1457
- response.raise_for_status()
1458
- _unpack_response(framework, await response.json())
1459
-
1460
- import asyncio
1461
-
1462
- await asyncio.gather(*[_fetch_framework(framework) for framework in frameworks])
1463
-
1464
- # Sort alphabetically for discoverability and return
1465
- for task, models in models_by_task.items():
1466
- models_by_task[task] = sorted(set(models), key=lambda x: x.lower())
1467
- return models_by_task
1468
-
1469
1478
  async def object_detection(
1470
1479
  self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
1471
1480
  ) -> List[ObjectDetectionOutputElement]:
@@ -1506,11 +1515,15 @@ class AsyncInferenceClient:
1506
1515
  [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
1507
1516
  ```
1508
1517
  """
1509
- parameters = {
1510
- "threshold": threshold,
1511
- }
1512
- payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1513
- response = await self.post(**payload, model=model, task="object-detection")
1518
+ provider_helper = get_provider_helper(self.provider, task="object-detection")
1519
+ request_parameters = provider_helper.prepare_request(
1520
+ inputs=image,
1521
+ parameters={"threshold": threshold},
1522
+ headers=self.headers,
1523
+ model=model or self.model,
1524
+ api_key=self.token,
1525
+ )
1526
+ response = await self._inner_post(request_parameters)
1514
1527
  return ObjectDetectionOutputElement.parse_obj_as_list(response)
1515
1528
 
1516
1529
  async def question_answering(
@@ -1576,22 +1589,24 @@ class AsyncInferenceClient:
1576
1589
  QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
1577
1590
  ```
1578
1591
  """
1579
- parameters = {
1580
- "align_to_words": align_to_words,
1581
- "doc_stride": doc_stride,
1582
- "handle_impossible_answer": handle_impossible_answer,
1583
- "max_answer_len": max_answer_len,
1584
- "max_question_len": max_question_len,
1585
- "max_seq_len": max_seq_len,
1586
- "top_k": top_k,
1587
- }
1588
- inputs: Dict[str, Any] = {"question": question, "context": context}
1589
- payload = _prepare_payload(inputs, parameters=parameters)
1590
- response = await self.post(
1591
- **payload,
1592
- model=model,
1593
- task="question-answering",
1592
+ provider_helper = get_provider_helper(self.provider, task="question-answering")
1593
+ request_parameters = provider_helper.prepare_request(
1594
+ inputs=None,
1595
+ parameters={
1596
+ "align_to_words": align_to_words,
1597
+ "doc_stride": doc_stride,
1598
+ "handle_impossible_answer": handle_impossible_answer,
1599
+ "max_answer_len": max_answer_len,
1600
+ "max_question_len": max_question_len,
1601
+ "max_seq_len": max_seq_len,
1602
+ "top_k": top_k,
1603
+ },
1604
+ extra_payload={"question": question, "context": context},
1605
+ headers=self.headers,
1606
+ model=model or self.model,
1607
+ api_key=self.token,
1594
1608
  )
1609
+ response = await self._inner_post(request_parameters)
1595
1610
  # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
1596
1611
  output = QuestionAnsweringOutputElement.parse_obj(response)
1597
1612
  return output
@@ -1637,11 +1652,16 @@ class AsyncInferenceClient:
1637
1652
  [0.7785726189613342, 0.45876261591911316, 0.2906220555305481]
1638
1653
  ```
1639
1654
  """
1640
- response = await self.post(
1641
- json={"inputs": {"source_sentence": sentence, "sentences": other_sentences}},
1642
- model=model,
1643
- task="sentence-similarity",
1655
+ provider_helper = get_provider_helper(self.provider, task="sentence-similarity")
1656
+ request_parameters = provider_helper.prepare_request(
1657
+ inputs=None,
1658
+ parameters={},
1659
+ extra_payload={"source_sentence": sentence, "sentences": other_sentences},
1660
+ headers=self.headers,
1661
+ model=model or self.model,
1662
+ api_key=self.token,
1644
1663
  )
1664
+ response = await self._inner_post(request_parameters)
1645
1665
  return _bytes_to_list(response)
1646
1666
 
1647
1667
  @_deprecate_arguments(
@@ -1704,8 +1724,15 @@ class AsyncInferenceClient:
1704
1724
  "generate_parameters": generate_parameters,
1705
1725
  "truncation": truncation,
1706
1726
  }
1707
- payload = _prepare_payload(text, parameters=parameters)
1708
- response = await self.post(**payload, model=model, task="summarization")
1727
+ provider_helper = get_provider_helper(self.provider, task="summarization")
1728
+ request_parameters = provider_helper.prepare_request(
1729
+ inputs=text,
1730
+ parameters=parameters,
1731
+ headers=self.headers,
1732
+ model=model or self.model,
1733
+ api_key=self.token,
1734
+ )
1735
+ response = await self._inner_post(request_parameters)
1709
1736
  return SummarizationOutput.parse_obj_as_list(response)[0]
1710
1737
 
1711
1738
  async def table_question_answering(
@@ -1759,21 +1786,16 @@ class AsyncInferenceClient:
1759
1786
  TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
1760
1787
  ```
1761
1788
  """
1762
- parameters = {
1763
- "padding": padding,
1764
- "sequential": sequential,
1765
- "truncation": truncation,
1766
- }
1767
- inputs = {
1768
- "query": query,
1769
- "table": table,
1770
- }
1771
- payload = _prepare_payload(inputs, parameters=parameters)
1772
- response = await self.post(
1773
- **payload,
1774
- model=model,
1775
- task="table-question-answering",
1789
+ provider_helper = get_provider_helper(self.provider, task="table-question-answering")
1790
+ request_parameters = provider_helper.prepare_request(
1791
+ inputs=None,
1792
+ parameters={"model": model, "padding": padding, "sequential": sequential, "truncation": truncation},
1793
+ extra_payload={"query": query, "table": table},
1794
+ headers=self.headers,
1795
+ model=model or self.model,
1796
+ api_key=self.token,
1776
1797
  )
1798
+ response = await self._inner_post(request_parameters)
1777
1799
  return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)
1778
1800
 
1779
1801
  async def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
@@ -1819,11 +1841,16 @@ class AsyncInferenceClient:
1819
1841
  ["5", "5", "5"]
1820
1842
  ```
1821
1843
  """
1822
- response = await self.post(
1823
- json={"table": table},
1824
- model=model,
1825
- task="tabular-classification",
1844
+ provider_helper = get_provider_helper(self.provider, task="tabular-classification")
1845
+ request_parameters = provider_helper.prepare_request(
1846
+ inputs=None,
1847
+ extra_payload={"table": table},
1848
+ parameters={},
1849
+ headers=self.headers,
1850
+ model=model or self.model,
1851
+ api_key=self.token,
1826
1852
  )
1853
+ response = await self._inner_post(request_parameters)
1827
1854
  return _bytes_to_list(response)
1828
1855
 
1829
1856
  async def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
@@ -1864,7 +1891,16 @@ class AsyncInferenceClient:
1864
1891
  [110, 120, 130]
1865
1892
  ```
1866
1893
  """
1867
- response = await self.post(json={"table": table}, model=model, task="tabular-regression")
1894
+ provider_helper = get_provider_helper(self.provider, task="tabular-regression")
1895
+ request_parameters = provider_helper.prepare_request(
1896
+ inputs=None,
1897
+ parameters={},
1898
+ extra_payload={"table": table},
1899
+ headers=self.headers,
1900
+ model=model or self.model,
1901
+ api_key=self.token,
1902
+ )
1903
+ response = await self._inner_post(request_parameters)
1868
1904
  return _bytes_to_list(response)
1869
1905
 
1870
1906
  async def text_classification(
@@ -1911,16 +1947,18 @@ class AsyncInferenceClient:
1911
1947
  ]
1912
1948
  ```
1913
1949
  """
1914
- parameters = {
1915
- "function_to_apply": function_to_apply,
1916
- "top_k": top_k,
1917
- }
1918
- payload = _prepare_payload(text, parameters=parameters)
1919
- response = await self.post(
1920
- **payload,
1921
- model=model,
1922
- task="text-classification",
1950
+ provider_helper = get_provider_helper(self.provider, task="text-classification")
1951
+ request_parameters = provider_helper.prepare_request(
1952
+ inputs=text,
1953
+ parameters={
1954
+ "function_to_apply": function_to_apply,
1955
+ "top_k": top_k,
1956
+ },
1957
+ headers=self.headers,
1958
+ model=model or self.model,
1959
+ api_key=self.token,
1923
1960
  )
1961
+ response = await self._inner_post(request_parameters)
1924
1962
  return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]
1925
1963
 
1926
1964
  @overload
@@ -2104,15 +2142,6 @@ class AsyncInferenceClient:
2104
2142
  """
2105
2143
  Given a prompt, generate the following text.
2106
2144
 
2107
- API endpoint is supposed to run with the `text-generation-inference` backend (TGI). This backend is the
2108
- go-to solution to run large language models at scale. However, for some smaller models (e.g. "gpt2") the
2109
- default `transformers` + `api-inference` solution is still in use. Both approaches have very similar APIs, but
2110
- not exactly the same. This method is compatible with both approaches but some parameters are only available for
2111
- `text-generation-inference`. If some parameters are ignored, a warning message is triggered but the process
2112
- continues correctly.
2113
-
2114
- To learn more about the TGI project, please refer to https://github.com/huggingface/text-generation-inference.
2115
-
2116
2145
  <Tip>
2117
2146
 
2118
2147
  If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
@@ -2336,12 +2365,6 @@ class AsyncInferenceClient:
2336
2365
  "typical_p": typical_p,
2337
2366
  "watermark": watermark,
2338
2367
  }
2339
- parameters = {k: v for k, v in parameters.items() if v is not None}
2340
- payload = {
2341
- "inputs": prompt,
2342
- "parameters": parameters,
2343
- "stream": stream,
2344
- }
2345
2368
 
2346
2369
  # Remove some parameters if not a TGI server
2347
2370
  unsupported_kwargs = _get_unsupported_text_generation_kwargs(model)
@@ -2374,9 +2397,19 @@ class AsyncInferenceClient:
2374
2397
  " Please pass `stream=False` as input."
2375
2398
  )
2376
2399
 
2400
+ provider_helper = get_provider_helper(self.provider, task="text-generation")
2401
+ request_parameters = provider_helper.prepare_request(
2402
+ inputs=prompt,
2403
+ parameters=parameters,
2404
+ extra_payload={"stream": stream},
2405
+ headers=self.headers,
2406
+ model=model or self.model,
2407
+ api_key=self.token,
2408
+ )
2409
+
2377
2410
  # Handle errors separately for more precise error messages
2378
2411
  try:
2379
- bytes_output = await self.post(json=payload, model=model, task="text-generation", stream=stream) # type: ignore
2412
+ bytes_output = await self._inner_post(request_parameters, stream=stream)
2380
2413
  except _import_aiohttp().ClientResponseError as e:
2381
2414
  match = MODEL_KWARGS_NOT_USED_REGEX.search(e.response_error_payload["error"])
2382
2415
  if e.status == 400 and match:
@@ -2386,7 +2419,7 @@ class AsyncInferenceClient:
2386
2419
  prompt=prompt,
2387
2420
  details=details,
2388
2421
  stream=stream,
2389
- model=model,
2422
+ model=model or self.model,
2390
2423
  adapter_id=adapter_id,
2391
2424
  best_of=best_of,
2392
2425
  decoder_input_details=decoder_input_details,
@@ -2424,7 +2457,7 @@ class AsyncInferenceClient:
2424
2457
  self,
2425
2458
  prompt: str,
2426
2459
  *,
2427
- negative_prompt: Optional[List[str]] = None,
2460
+ negative_prompt: Optional[str] = None,
2428
2461
  height: Optional[float] = None,
2429
2462
  width: Optional[float] = None,
2430
2463
  num_inference_steps: Optional[int] = None,
@@ -2447,8 +2480,8 @@ class AsyncInferenceClient:
2447
2480
  Args:
2448
2481
  prompt (`str`):
2449
2482
  The prompt to generate an image from.
2450
- negative_prompt (`List[str`, *optional*):
2451
- One or several prompt to guide what NOT to include in image generation.
2483
+ negative_prompt (`str`, *optional*):
2484
+ One prompt to guide what NOT to include in image generation.
2452
2485
  height (`float`, *optional*):
2453
2486
  The height in pixels of the image to generate.
2454
2487
  width (`float`, *optional*):
@@ -2495,23 +2528,143 @@ class AsyncInferenceClient:
2495
2528
  ... )
2496
2529
  >>> image.save("better_astronaut.png")
2497
2530
  ```
2498
- """
2531
+ Example using a third-party provider directly. Usage will be billed on your fal.ai account.
2532
+ ```py
2533
+ >>> from huggingface_hub import InferenceClient
2534
+ >>> client = InferenceClient(
2535
+ ... provider="fal-ai", # Use fal.ai provider
2536
+ ... api_key="fal-ai-api-key", # Pass your fal.ai API key
2537
+ ... )
2538
+ >>> image = client.text_to_image(
2539
+ ... "A majestic lion in a fantasy forest",
2540
+ ... model="black-forest-labs/FLUX.1-schnell",
2541
+ ... )
2542
+ >>> image.save("lion.png")
2543
+ ```
2499
2544
 
2500
- parameters = {
2501
- "negative_prompt": negative_prompt,
2502
- "height": height,
2503
- "width": width,
2504
- "num_inference_steps": num_inference_steps,
2505
- "guidance_scale": guidance_scale,
2506
- "scheduler": scheduler,
2507
- "target_size": target_size,
2508
- "seed": seed,
2509
- **kwargs,
2510
- }
2511
- payload = _prepare_payload(prompt, parameters=parameters)
2512
- response = await self.post(**payload, model=model, task="text-to-image")
2545
+ Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
2546
+ ```py
2547
+ >>> from huggingface_hub import InferenceClient
2548
+ >>> client = InferenceClient(
2549
+ ... provider="replicate", # Use replicate provider
2550
+ ... api_key="hf_...", # Pass your HF token
2551
+ ... )
2552
+ >>> image = client.text_to_image(
2553
+ ... "An astronaut riding a horse on the moon.",
2554
+ ... model="black-forest-labs/FLUX.1-dev",
2555
+ ... )
2556
+ >>> image.save("astronaut.png")
2557
+ ```
2558
+ """
2559
+ provider_helper = get_provider_helper(self.provider, task="text-to-image")
2560
+ request_parameters = provider_helper.prepare_request(
2561
+ inputs=prompt,
2562
+ parameters={
2563
+ "negative_prompt": negative_prompt,
2564
+ "height": height,
2565
+ "width": width,
2566
+ "num_inference_steps": num_inference_steps,
2567
+ "guidance_scale": guidance_scale,
2568
+ "scheduler": scheduler,
2569
+ "target_size": target_size,
2570
+ "seed": seed,
2571
+ **kwargs,
2572
+ },
2573
+ headers=self.headers,
2574
+ model=model or self.model,
2575
+ api_key=self.token,
2576
+ )
2577
+ response = await self._inner_post(request_parameters)
2578
+ response = provider_helper.get_response(response)
2513
2579
  return _bytes_to_image(response)
2514
2580
 
2581
+ async def text_to_video(
2582
+ self,
2583
+ prompt: str,
2584
+ *,
2585
+ model: Optional[str] = None,
2586
+ guidance_scale: Optional[float] = None,
2587
+ negative_prompt: Optional[List[str]] = None,
2588
+ num_frames: Optional[float] = None,
2589
+ num_inference_steps: Optional[int] = None,
2590
+ seed: Optional[int] = None,
2591
+ ) -> bytes:
2592
+ """
2593
+ Generate a video based on a given text.
2594
+
2595
+ Args:
2596
+ prompt (`str`):
2597
+ The prompt to generate a video from.
2598
+ model (`str`, *optional*):
2599
+ The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2600
+ Inference Endpoint. If not provided, the default recommended text-to-video model will be used.
2601
+ Defaults to None.
2602
+ guidance_scale (`float`, *optional*):
2603
+ A higher guidance scale value encourages the model to generate videos closely linked to the text
2604
+ prompt, but values too high may cause saturation and other artifacts.
2605
+ negative_prompt (`List[str]`, *optional*):
2606
+ One or several prompts to guide what NOT to include in video generation.
2607
+ num_frames (`float`, *optional*):
2608
+ The num_frames parameter determines how many video frames are generated.
2609
+ num_inference_steps (`int`, *optional*):
2610
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
2611
+ expense of slower inference.
2612
+ seed (`int`, *optional*):
2613
+ Seed for the random number generator.
2614
+
2615
+ Returns:
2616
+ `bytes`: The generated video.
2617
+
2618
+ Example:
2619
+
2620
+ Example using a third-party provider directly. Usage will be billed on your fal.ai account.
2621
+ ```py
2622
+ >>> from huggingface_hub import InferenceClient
2623
+ >>> client = InferenceClient(
2624
+ ... provider="fal-ai", # Using fal.ai provider
2625
+ ... api_key="fal-ai-api-key", # Pass your fal.ai API key
2626
+ ... )
2627
+ >>> video = client.text_to_video(
2628
+ ... "A majestic lion running in a fantasy forest",
2629
+ ... model="tencent/HunyuanVideo",
2630
+ ... )
2631
+ >>> with open("lion.mp4", "wb") as file:
2632
+ ... file.write(video)
2633
+ ```
2634
+
2635
+ Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
2636
+ ```py
2637
+ >>> from huggingface_hub import InferenceClient
2638
+ >>> client = InferenceClient(
2639
+ ... provider="replicate", # Using replicate provider
2640
+ ... api_key="hf_...", # Pass your HF token
2641
+ ... )
2642
+ >>> video = client.text_to_video(
2643
+ ... "A cat running in a park",
2644
+ ... model="genmo/mochi-1-preview",
2645
+ ... )
2646
+ >>> with open("cat.mp4", "wb") as file:
2647
+ ... file.write(video)
2648
+ ```
2649
+ """
2650
+ provider_helper = get_provider_helper(self.provider, task="text-to-video")
2651
+ request_parameters = provider_helper.prepare_request(
2652
+ inputs=prompt,
2653
+ parameters={
2654
+ "guidance_scale": guidance_scale,
2655
+ "negative_prompt": negative_prompt,
2656
+ "num_frames": num_frames,
2657
+ "num_inference_steps": num_inference_steps,
2658
+ "seed": seed,
2659
+ },
2660
+ headers=self.headers,
2661
+ model=model or self.model,
2662
+ api_key=self.token,
2663
+ )
2664
+ response = await self._inner_post(request_parameters)
2665
+ response = provider_helper.get_response(response)
2666
+ return response
2667
+
2515
2668
  async def text_to_speech(
2516
2669
  self,
2517
2670
  text: str,
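The `text_to_video` examples above use the synchronous `InferenceClient`; an equivalent sketch for this async client, reusing the same provider, model and prompt:

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient(provider="fal-ai", api_key="fal-ai-api-key")
video = await client.text_to_video(
    "A majestic lion running in a fantasy forest",
    model="tencent/HunyuanVideo",
)
with open("lion.mp4", "wb") as f:
    f.write(video)
```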
@@ -2610,27 +2763,62 @@ class AsyncInferenceClient:
2610
2763
  >>> audio = await client.text_to_speech("Hello world")
2611
2764
  >>> Path("hello_world.flac").write_bytes(audio)
2612
2765
  ```
2766
+
2767
+ Example using a third-party provider directly. Usage will be billed on your Replicate account.
2768
+ ```py
2769
+ >>> from huggingface_hub import InferenceClient
2770
+ >>> client = InferenceClient(
2771
+ ... provider="replicate",
2772
+ ... api_key="your-replicate-api-key", # Pass your Replicate API key directly
2773
+ ... )
2774
+ >>> audio = client.text_to_speech(
2775
+ ... text="Hello world",
2776
+ ... model="OuteAI/OuteTTS-0.3-500M",
2777
+ ... )
2778
+ >>> Path("hello_world.flac").write_bytes(audio)
2779
+ ```
2780
+
2781
+ Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
2782
+ ```py
2783
+ >>> from huggingface_hub import InferenceClient
2784
+ >>> client = InferenceClient(
2785
+ ... provider="replicate",
2786
+ ... api_key="hf_...", # Pass your HF token
2787
+ ... )
2788
+ >>> audio = client.text_to_speech(
2789
+ ... text="Hello world",
2790
+ ... model="OuteAI/OuteTTS-0.3-500M",
2791
+ ... )
2792
+ >>> Path("hello_world.flac").write_bytes(audio)
2793
+ ```
2613
2794
  """
2614
- parameters = {
2615
- "do_sample": do_sample,
2616
- "early_stopping": early_stopping,
2617
- "epsilon_cutoff": epsilon_cutoff,
2618
- "eta_cutoff": eta_cutoff,
2619
- "max_length": max_length,
2620
- "max_new_tokens": max_new_tokens,
2621
- "min_length": min_length,
2622
- "min_new_tokens": min_new_tokens,
2623
- "num_beam_groups": num_beam_groups,
2624
- "num_beams": num_beams,
2625
- "penalty_alpha": penalty_alpha,
2626
- "temperature": temperature,
2627
- "top_k": top_k,
2628
- "top_p": top_p,
2629
- "typical_p": typical_p,
2630
- "use_cache": use_cache,
2631
- }
2632
- payload = _prepare_payload(text, parameters=parameters)
2633
- response = await self.post(**payload, model=model, task="text-to-speech")
2795
+ provider_helper = get_provider_helper(self.provider, task="text-to-speech")
2796
+ request_parameters = provider_helper.prepare_request(
2797
+ inputs=text,
2798
+ parameters={
2799
+ "do_sample": do_sample,
2800
+ "early_stopping": early_stopping,
2801
+ "epsilon_cutoff": epsilon_cutoff,
2802
+ "eta_cutoff": eta_cutoff,
2803
+ "max_length": max_length,
2804
+ "max_new_tokens": max_new_tokens,
2805
+ "min_length": min_length,
2806
+ "min_new_tokens": min_new_tokens,
2807
+ "num_beam_groups": num_beam_groups,
2808
+ "num_beams": num_beams,
2809
+ "penalty_alpha": penalty_alpha,
2810
+ "temperature": temperature,
2811
+ "top_k": top_k,
2812
+ "top_p": top_p,
2813
+ "typical_p": typical_p,
2814
+ "use_cache": use_cache,
2815
+ },
2816
+ headers=self.headers,
2817
+ model=model or self.model,
2818
+ api_key=self.token,
2819
+ )
2820
+ response = await self._inner_post(request_parameters)
2821
+ response = provider_helper.get_response(response)
2634
2822
  return response
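The body above follows the same provider-helper pattern used for every refactored task method in this diff: resolve a helper for the provider/task pair, let it build the request, send it with `_inner_post`, and let the helper post-process the raw response. A minimal sketch of that call sequence follows; the import path for `get_provider_helper` is assumed, and the helper object is treated as opaque.

```py
# Illustrative sketch (not library code) of the shared call sequence introduced
# by this diff for bytes-returning tasks such as text-to-speech.
from typing import Any, Dict, Optional

# Import path assumed from the new `_providers` package referenced in this diff.
from huggingface_hub.inference._providers import get_provider_helper


async def run_task_sketch(client, task: str, inputs: Any, parameters: Dict[str, Optional[Any]]) -> bytes:
    provider_helper = get_provider_helper(client.provider, task=task)
    request_parameters = provider_helper.prepare_request(
        inputs=inputs,
        parameters=parameters,      # unset (None) values are assumed to be dropped by the helper
        headers=client.headers,
        model=client.model,
        api_key=client.token,
    )
    raw = await client._inner_post(request_parameters)  # private method used by the diffed code
    return provider_helper.get_response(raw)
```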
2635
2823
 
2636
2824
  async def token_classification(
@@ -2693,18 +2881,19 @@ class AsyncInferenceClient:
2693
2881
  ]
2694
2882
  ```
2695
2883
  """
2696
-
2697
- parameters = {
2698
- "aggregation_strategy": aggregation_strategy,
2699
- "ignore_labels": ignore_labels,
2700
- "stride": stride,
2701
- }
2702
- payload = _prepare_payload(text, parameters=parameters)
2703
- response = await self.post(
2704
- **payload,
2705
- model=model,
2706
- task="token-classification",
2884
+ provider_helper = get_provider_helper(self.provider, task="token-classification")
2885
+ request_parameters = provider_helper.prepare_request(
2886
+ inputs=text,
2887
+ parameters={
2888
+ "aggregation_strategy": aggregation_strategy,
2889
+ "ignore_labels": ignore_labels,
2890
+ "stride": stride,
2891
+ },
2892
+ headers=self.headers,
2893
+ model=model or self.model,
2894
+ api_key=self.token,
2707
2895
  )
2896
+ response = await self._inner_post(request_parameters)
2708
2897
  return TokenClassificationOutputElement.parse_obj_as_list(response)
2709
2898
 
2710
2899
  async def translation(
@@ -2778,15 +2967,22 @@ class AsyncInferenceClient:
2778
2967
 
2779
2968
  if src_lang is None and tgt_lang is not None:
2780
2969
  raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
2781
- parameters = {
2782
- "src_lang": src_lang,
2783
- "tgt_lang": tgt_lang,
2784
- "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
2785
- "truncation": truncation,
2786
- "generate_parameters": generate_parameters,
2787
- }
2788
- payload = _prepare_payload(text, parameters=parameters)
2789
- response = await self.post(**payload, model=model, task="translation")
2970
+
2971
+ provider_helper = get_provider_helper(self.provider, task="translation")
2972
+ request_parameters = provider_helper.prepare_request(
2973
+ inputs=text,
2974
+ parameters={
2975
+ "src_lang": src_lang,
2976
+ "tgt_lang": tgt_lang,
2977
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
2978
+ "truncation": truncation,
2979
+ "generate_parameters": generate_parameters,
2980
+ },
2981
+ headers=self.headers,
2982
+ model=model or self.model,
2983
+ api_key=self.token,
2984
+ )
2985
+ response = await self._inner_post(request_parameters)
2790
2986
  return TranslationOutput.parse_obj_as_list(response)[0]
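A hedged async usage sketch for the refactored `translation` method; the model ID and language codes are assumptions chosen for illustration. Recall from the check above that `tgt_lang` cannot be passed without `src_lang`.

```py
# Hypothetical async translation call; model ID and FLORES-style language codes
# are placeholders, not recommendations.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    result = await client.translation(
        "My name is Wolfgang and I live in Berlin.",
        model="facebook/nllb-200-distilled-600M",  # illustrative model ID
        src_lang="eng_Latn",
        tgt_lang="fra_Latn",
    )
    print(result.translation_text)


asyncio.run(main())
```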
2791
2987
 
2792
2988
  async def visual_question_answering(
@@ -2836,10 +3032,16 @@ class AsyncInferenceClient:
2836
3032
  ]
2837
3033
  ```
2838
3034
  """
2839
- payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
2840
- if top_k is not None:
2841
- payload.setdefault("parameters", {})["top_k"] = top_k
2842
- response = await self.post(json=payload, model=model, task="visual-question-answering")
3035
+ provider_helper = get_provider_helper(self.provider, task="visual-question-answering")
3036
+ request_parameters = provider_helper.prepare_request(
3037
+ inputs=image,
3038
+ parameters={"top_k": top_k},
3039
+ headers=self.headers,
3040
+ model=model or self.model,
3041
+ api_key=self.token,
3042
+ extra_payload={"question": question, "image": _b64_encode(image)},
3043
+ )
3044
+ response = await self._inner_post(request_parameters)
2843
3045
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
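A hedged async sketch of the refactored `visual_question_answering` call. Note that, per the code above, the question and base64-encoded image are sent via `extra_payload` rather than `parameters`; the image path and model ID below are placeholders.

```py
# Hypothetical async VQA call; the local file path and model ID are placeholders.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    answers = await client.visual_question_answering(
        image="invoice.png",                      # local path, URL, or raw bytes
        question="What is the total amount due?",
        model="dandelin/vilt-b32-finetuned-vqa",  # illustrative model ID
    )
    for answer in answers:
        print(answer)


asyncio.run(main())
```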
2844
3046
 
2845
3047
  @_deprecate_arguments(
@@ -2947,17 +3149,20 @@ class AsyncInferenceClient:
2947
3149
  candidate_labels = labels
2948
3150
  elif candidate_labels is None:
2949
3151
  raise ValueError("Must specify `candidate_labels`")
2950
- parameters = {
2951
- "candidate_labels": candidate_labels,
2952
- "multi_label": multi_label,
2953
- "hypothesis_template": hypothesis_template,
2954
- }
2955
- payload = _prepare_payload(text, parameters=parameters)
2956
- response = await self.post(
2957
- **payload,
2958
- task="zero-shot-classification",
2959
- model=model,
3152
+
3153
+ provider_helper = get_provider_helper(self.provider, task="zero-shot-classification")
3154
+ request_parameters = provider_helper.prepare_request(
3155
+ inputs=text,
3156
+ parameters={
3157
+ "candidate_labels": candidate_labels,
3158
+ "multi_label": multi_label,
3159
+ "hypothesis_template": hypothesis_template,
3160
+ },
3161
+ headers=self.headers,
3162
+ model=model or self.model,
3163
+ api_key=self.token,
2960
3164
  )
3165
+ response = await self._inner_post(request_parameters)
2961
3166
  output = _bytes_to_dict(response)
2962
3167
  return [
2963
3168
  ZeroShotClassificationOutputElement.parse_obj_as_instance({"label": label, "score": score})
@@ -3031,18 +3236,110 @@ class AsyncInferenceClient:
3031
3236
  # Raise ValueError if input is less than 2 labels
3032
3237
  if len(candidate_labels) < 2:
3033
3238
  raise ValueError("You must specify at least 2 classes to compare.")
3034
- parameters = {
3035
- "candidate_labels": candidate_labels,
3036
- "hypothesis_template": hypothesis_template,
3037
- }
3038
- payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
3039
- response = await self.post(
3040
- **payload,
3041
- model=model,
3042
- task="zero-shot-image-classification",
3239
+
3240
+ provider_helper = get_provider_helper(self.provider, task="zero-shot-image-classification")
3241
+ request_parameters = provider_helper.prepare_request(
3242
+ inputs=image,
3243
+ parameters={
3244
+ "candidate_labels": candidate_labels,
3245
+ "hypothesis_template": hypothesis_template,
3246
+ },
3247
+ headers=self.headers,
3248
+ model=model or self.model,
3249
+ api_key=self.token,
3043
3250
  )
3251
+ response = await self._inner_post(request_parameters)
3044
3252
  return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
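A hedged async sketch of the refactored `zero_shot_image_classification` call; as enforced above, at least two candidate labels are required. The image URL and model ID are placeholders.

```py
# Hypothetical async zero-shot image classification call; URL and model ID
# are placeholders. At least two candidate labels are required.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    results = await client.zero_shot_image_classification(
        image="https://example.com/cat.jpg",
        candidate_labels=["cat", "dog", "bird"],
        model="openai/clip-vit-base-patch32",  # illustrative model ID
    )
    for item in results:
        print(item.label, item.score)


asyncio.run(main())
```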
3045
3253
 
3254
+ async def list_deployed_models(
3255
+ self, frameworks: Union[None, str, Literal["all"], List[str]] = None
3256
+ ) -> Dict[str, List[str]]:
3257
+ """
3258
+ List models deployed on the Serverless Inference API service.
3259
+
3260
+ This helper checks deployed models framework by framework. By default, it will check the 4 main frameworks that
3261
+ are supported and account for 95% of the hosted models. However, if you want a complete list of models you can
3262
+ specify `frameworks="all"` as input. Alternatively, if you know beforehand which framework you are interested
3263
+ in, you can also restrict the search to this one (e.g. `frameworks="text-generation-inference"`). The more
3264
+ frameworks are checked, the more time it will take.
3265
+
3266
+ <Tip warning={true}>
3267
+
3268
+ This endpoint method does not return a live list of all models available for the Serverless Inference API service.
3269
+ It searches over a cached list of models that were recently available and the list may not be up to date.
3270
+ If you want to know the live status of a specific model, use [`~InferenceClient.get_model_status`].
3271
+
3272
+ </Tip>
3273
+
3274
+ <Tip>
3275
+
3276
+ This endpoint method is mostly useful for discoverability. If you already know which model you want to use and want to
3277
+ check its availability, you can directly use [`~InferenceClient.get_model_status`].
3278
+
3279
+ </Tip>
3280
+
3281
+ Args:
3282
+ frameworks (`Literal["all"]` or `List[str]` or `str`, *optional*):
3283
+ The frameworks to filter on. By default only a subset of the available frameworks are tested. If set to
3284
+ "all", all available frameworks will be tested. It is also possible to provide a single framework or a
3285
+ custom set of frameworks to check.
3286
+
3287
+ Returns:
3288
+ `Dict[str, List[str]]`: A dictionary mapping task names to a sorted list of model IDs.
3289
+
3290
+ Example:
3291
+ ```py
3292
+ # Must be run in an async context
3293
+ >>> from huggingface_hub import AsyncInferenceClient
3294
+ >>> client = AsyncInferenceClient()
3295
+
3296
+ # Discover zero-shot-classification models currently deployed
3297
+ >>> models = await client.list_deployed_models()
3298
+ >>> models["zero-shot-classification"]
3299
+ ['Narsil/deberta-large-mnli-zero-cls', 'facebook/bart-large-mnli', ...]
3300
+
3301
+ # List from only 1 framework
3302
+ >>> await client.list_deployed_models("text-generation-inference")
3303
+ {'text-generation': ['bigcode/starcoder', 'meta-llama/Llama-2-70b-chat-hf', ...], ...}
3304
+ ```
3305
+ """
3306
+ if self.provider != "hf-inference":
3307
+ raise ValueError(f"Listing deployed models is not supported on '{self.provider}'.")
3308
+
3309
+ # Resolve which frameworks to check
3310
+ if frameworks is None:
3311
+ frameworks = MAIN_INFERENCE_API_FRAMEWORKS
3312
+ elif frameworks == "all":
3313
+ frameworks = ALL_INFERENCE_API_FRAMEWORKS
3314
+ elif isinstance(frameworks, str):
3315
+ frameworks = [frameworks]
3316
+ frameworks = list(set(frameworks))
3317
+
3318
+ # Fetch them iteratively
3319
+ models_by_task: Dict[str, List[str]] = {}
3320
+
3321
+ def _unpack_response(framework: str, items: List[Dict]) -> None:
3322
+ for model in items:
3323
+ if framework == "sentence-transformers":
3324
+ # Models running with the `sentence-transformers` framework can work with both tasks even if not
3325
+ # branded as such in the API response
3326
+ models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
3327
+ models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
3328
+ else:
3329
+ models_by_task.setdefault(model["task"], []).append(model["model_id"])
3330
+
3331
+ for framework in frameworks:
3332
+ response = get_session().get(
3333
+ f"{INFERENCE_ENDPOINT}/framework/{framework}", headers=build_hf_headers(token=self.token)
3334
+ )
3335
+ hf_raise_for_status(response)
3336
+ _unpack_response(framework, response.json())
3337
+
3338
+ # Sort alphabetically for discoverability and return
3339
+ for task, models in models_by_task.items():
3340
+ models_by_task[task] = sorted(set(models), key=lambda x: x.lower())
3341
+ return models_by_task
3342
+
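The grouping performed by `_unpack_response` above can be illustrated in isolation. The payload below is fabricated for the example; only the `model_id` and `task` keys matter, since those are the only fields the diffed code reads.

```py
# Standalone illustration of the grouping logic in `_unpack_response` above.
# The items are invented sample data, not real API responses.
from typing import Dict, List

models_by_task: Dict[str, List[str]] = {}


def _unpack_response(framework: str, items: List[Dict]) -> None:
    for model in items:
        if framework == "sentence-transformers":
            # sentence-transformers models are registered under both tasks.
            models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
            models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
        else:
            models_by_task.setdefault(model["task"], []).append(model["model_id"])


_unpack_response("text-generation-inference", [{"model_id": "bigcode/starcoder", "task": "text-generation"}])
_unpack_response("sentence-transformers", [{"model_id": "sentence-transformers/all-MiniLM-L6-v2", "task": "feature-extraction"}])
print(models_by_task)
# {'text-generation': ['bigcode/starcoder'],
#  'feature-extraction': ['sentence-transformers/all-MiniLM-L6-v2'],
#  'sentence-similarity': ['sentence-transformers/all-MiniLM-L6-v2']}
```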
3046
3343
  def _get_client_session(self, headers: Optional[Dict] = None) -> "ClientSession":
3047
3344
  aiohttp = _import_aiohttp()
3048
3345
  client_headers = self.headers.copy()
@@ -3084,60 +3381,6 @@ class AsyncInferenceClient:
3084
3381
  session.close = close_session
3085
3382
  return session
3086
3383
 
3087
- def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
3088
- model = model or self.model or self.base_url
3089
-
3090
- # If model is already a URL, ignore `task` and return directly
3091
- if model is not None and (model.startswith("http://") or model.startswith("https://")):
3092
- return model
3093
-
3094
- # # If no model but task is set => fetch the recommended one for this task
3095
- if model is None:
3096
- if task is None:
3097
- raise ValueError(
3098
- "You must specify at least a model (repo_id or URL) or a task, either when instantiating"
3099
- " `InferenceClient` or when making a request."
3100
- )
3101
- model = self.get_recommended_model(task)
3102
- logger.info(
3103
- f"Using recommended model {model} for task {task}. Note that it is"
3104
- f" encouraged to explicitly set `model='{model}'` as the recommended"
3105
- " models list might get updated without prior notice."
3106
- )
3107
-
3108
- # Compute InferenceAPI url
3109
- return (
3110
- # Feature-extraction and sentence-similarity are the only cases where we handle models with several tasks.
3111
- f"{INFERENCE_ENDPOINT}/pipeline/{task}/{model}"
3112
- if task in ("feature-extraction", "sentence-similarity")
3113
- # Otherwise, we use the default endpoint
3114
- else f"{INFERENCE_ENDPOINT}/models/{model}"
3115
- )
3116
-
3117
- @staticmethod
3118
- def get_recommended_model(task: str) -> str:
3119
- """
3120
- Get the model Hugging Face recommends for the input task.
3121
-
3122
- Args:
3123
- task (`str`):
3124
- The Hugging Face task to get which model Hugging Face recommends.
3125
- All available tasks can be found [here](https://huggingface.co/tasks).
3126
-
3127
- Returns:
3128
- `str`: Name of the model recommended for the input task.
3129
-
3130
- Raises:
3131
- `ValueError`: If Hugging Face has no recommendation for the input task.
3132
- """
3133
- model = _fetch_recommended_models().get(task)
3134
- if model is None:
3135
- raise ValueError(
3136
- f"Task {task} has no recommended model. Please specify a model"
3137
- " explicitly. Visit https://huggingface.co/tasks for more info."
3138
- )
3139
- return model
3140
-
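The removed `_resolve_url` and `get_recommended_model` helpers presumably now live behind the provider helpers used throughout this file. For reference, a minimal sketch of the routing rule the removed code encoded is shown below; the standalone function name is made up for illustration.

```py
# Minimal sketch of the URL rule the removed `_resolve_url` implemented:
# feature-extraction and sentence-similarity use the /pipeline/ route, every
# other task uses /models/. The function name is illustrative, not library code.
from huggingface_hub.constants import INFERENCE_ENDPOINT


def resolve_hf_inference_url(model: str, task: str) -> str:
    if model.startswith(("http://", "https://")):
        return model  # already a full Inference Endpoint URL
    if task in ("feature-extraction", "sentence-similarity"):
        return f"{INFERENCE_ENDPOINT}/pipeline/{task}/{model}"
    return f"{INFERENCE_ENDPOINT}/models/{model}"


print(resolve_hf_inference_url("sentence-transformers/all-MiniLM-L6-v2", "feature-extraction"))
print(resolve_hf_inference_url("gpt2", "text-generation"))
```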
3141
3384
  async def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
3142
3385
  """
3143
3386
  Get information about the deployed endpoint.
@@ -3182,6 +3425,9 @@ class AsyncInferenceClient:
3182
3425
  }
3183
3426
  ```
3184
3427
  """
3428
+ if self.provider != "hf-inference":
3429
+ raise ValueError(f"Getting endpoint info is not supported on '{self.provider}'.")
3430
+
3185
3431
  model = model or self.model
3186
3432
  if model is None:
3187
3433
  raise ValueError("Model id not provided.")
@@ -3190,7 +3436,7 @@ class AsyncInferenceClient:
3190
3436
  else:
3191
3437
  url = f"{INFERENCE_ENDPOINT}/models/{model}/info"
3192
3438
 
3193
- async with self._get_client_session() as client:
3439
+ async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
3194
3440
  response = await client.get(url, proxy=self.proxies)
3195
3441
  response.raise_for_status()
3196
3442
  return await response.json()
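The introspection helpers touched in this part of the diff (`get_endpoint_info`, `health_check`, `get_model_status`) now refuse to run unless the client is configured with the `hf-inference` provider, and they authenticate their session with the user's token. A hedged sketch of what a caller sees with a third-party provider:

```py
# Hypothetical guard behaviour: with a non-"hf-inference" provider, the
# introspection helpers raise ValueError before any request is made.
# The token string and model ID are placeholders.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient(provider="replicate", api_key="hf_...")
    try:
        info = await client.get_endpoint_info(model="meta-llama/Llama-2-70b-chat-hf")
        print(info)
    except ValueError as err:
        print(f"Not supported: {err}")


asyncio.run(main())
```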
@@ -3218,6 +3464,9 @@ class AsyncInferenceClient:
3218
3464
  True
3219
3465
  ```
3220
3466
  """
3467
+ if self.provider != "hf-inference":
3468
+ raise ValueError(f"Health check is not supported on '{self.provider}'.")
3469
+
3221
3470
  model = model or self.model
3222
3471
  if model is None:
3223
3472
  raise ValueError("Model id not provided.")
@@ -3227,7 +3476,7 @@ class AsyncInferenceClient:
3227
3476
  )
3228
3477
  url = model.rstrip("/") + "/health"
3229
3478
 
3230
- async with self._get_client_session() as client:
3479
+ async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
3231
3480
  response = await client.get(url, proxy=self.proxies)
3232
3481
  return response.status == 200
3233
3482
 
@@ -3262,6 +3511,9 @@ class AsyncInferenceClient:
3262
3511
  ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
3263
3512
  ```
3264
3513
  """
3514
+ if self.provider != "hf-inference":
3515
+ raise ValueError(f"Getting model status is not supported on '{self.provider}'.")
3516
+
3265
3517
  model = model or self.model
3266
3518
  if model is None:
3267
3519
  raise ValueError("Model id not provided.")
@@ -3269,7 +3521,7 @@ class AsyncInferenceClient:
3269
3521
  raise NotImplementedError("Model status is only available for Inference API endpoints.")
3270
3522
  url = f"{INFERENCE_ENDPOINT}/status/{model}"
3271
3523
 
3272
- async with self._get_client_session() as client:
3524
+ async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
3273
3525
  response = await client.get(url, proxy=self.proxies)
3274
3526
  response.raise_for_status()
3275
3527
  response_data = await response.json()