huggingface-hub 0.23.5__py3-none-any.whl → 0.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of huggingface-hub might be problematic.
Files changed (42)
  1. huggingface_hub/__init__.py +47 -15
  2. huggingface_hub/_commit_api.py +38 -8
  3. huggingface_hub/_inference_endpoints.py +11 -4
  4. huggingface_hub/_local_folder.py +22 -13
  5. huggingface_hub/_snapshot_download.py +12 -7
  6. huggingface_hub/_webhooks_server.py +3 -1
  7. huggingface_hub/commands/huggingface_cli.py +4 -3
  8. huggingface_hub/commands/repo_files.py +128 -0
  9. huggingface_hub/constants.py +12 -0
  10. huggingface_hub/file_download.py +127 -91
  11. huggingface_hub/hf_api.py +976 -341
  12. huggingface_hub/hf_file_system.py +30 -3
  13. huggingface_hub/inference/_client.py +408 -147
  14. huggingface_hub/inference/_common.py +25 -63
  15. huggingface_hub/inference/_generated/_async_client.py +425 -153
  16. huggingface_hub/inference/_generated/types/__init__.py +4 -1
  17. huggingface_hub/inference/_generated/types/chat_completion.py +41 -21
  18. huggingface_hub/inference/_generated/types/feature_extraction.py +23 -5
  19. huggingface_hub/inference/_generated/types/text_generation.py +29 -0
  20. huggingface_hub/lfs.py +11 -6
  21. huggingface_hub/repocard_data.py +3 -3
  22. huggingface_hub/repository.py +6 -6
  23. huggingface_hub/serialization/__init__.py +8 -3
  24. huggingface_hub/serialization/_base.py +13 -16
  25. huggingface_hub/serialization/_tensorflow.py +4 -3
  26. huggingface_hub/serialization/_torch.py +399 -22
  27. huggingface_hub/utils/__init__.py +0 -1
  28. huggingface_hub/utils/_errors.py +1 -1
  29. huggingface_hub/utils/_fixes.py +14 -3
  30. huggingface_hub/utils/_paths.py +17 -6
  31. huggingface_hub/utils/_subprocess.py +0 -1
  32. huggingface_hub/utils/_telemetry.py +9 -1
  33. huggingface_hub/utils/endpoint_helpers.py +2 -186
  34. huggingface_hub/utils/sha.py +36 -1
  35. huggingface_hub/utils/tqdm.py +0 -1
  36. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/METADATA +12 -9
  37. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/RECORD +41 -41
  38. huggingface_hub/serialization/_numpy.py +0 -68
  39. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/LICENSE +0 -0
  40. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/WHEEL +0 -0
  41. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/entry_points.txt +0 -0
  42. {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.1.dist-info}/top_level.txt +0 -0
@@ -44,7 +44,7 @@ from huggingface_hub.inference._common import (
  TASKS_EXPECTING_IMAGES,
  ContentT,
  ModelStatus,
- _async_stream_chat_completion_response_from_bytes,
+ _async_stream_chat_completion_response,
  _async_stream_text_generation_response,
  _b64_encode,
  _b64_to_image,
@@ -54,9 +54,7 @@ from huggingface_hub.inference._common import (
  _fetch_recommended_models,
  _get_unsupported_text_generation_kwargs,
  _import_numpy,
- _is_chat_completion_server,
  _open_as_binary,
- _set_as_non_chat_completion_server,
  _set_unsupported_text_generation_kwargs,
  raise_text_generation_error,
  )
@@ -64,11 +62,10 @@ from huggingface_hub.inference._generated.types import (
  AudioClassificationOutputElement,
  AudioToAudioOutputElement,
  AutomaticSpeechRecognitionOutput,
+ ChatCompletionInputGrammarType,
  ChatCompletionInputTool,
  ChatCompletionInputToolTypeClass,
  ChatCompletionOutput,
- ChatCompletionOutputComplete,
- ChatCompletionOutputMessage,
  ChatCompletionStreamOutput,
  DocumentQuestionAnsweringOutputElement,
  FillMaskOutputElement,
@@ -89,13 +86,13 @@ from huggingface_hub.inference._generated.types import (
  ZeroShotClassificationOutputElement,
  ZeroShotImageClassificationOutputElement,
  )
- from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum
  from huggingface_hub.inference._types import (
  ConversationalOutput, # soon to be removed
  )
  from huggingface_hub.utils import (
  build_hf_headers,
  )
+ from huggingface_hub.utils._deprecation import _deprecate_positional_args

  from .._common import _async_yield_from, _import_aiohttp

@@ -119,12 +116,16 @@ class AsyncInferenceClient:

  Args:
  model (`str`, `optional`):
- The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
+ The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
  or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
  automatically selected for the task.
+ Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+ arguments are mutually exclusive and have the exact same behavior.
  token (`str` or `bool`, *optional*):
  Hugging Face token. Will default to the locally saved token if not provided.
  Pass `token=False` if you don't want to send your token to the server.
+ Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
+ arguments are mutually exclusive and have the exact same behavior.
  timeout (`float`, `optional`):
  The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
  API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
@@ -133,23 +134,52 @@ class AsyncInferenceClient:
  Values in this dictionary will override the default values.
  cookies (`Dict[str, str]`, `optional`):
  Additional cookies to send to the server.
+ base_url (`str`, `optional`):
+ Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
+ follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
+ api_key (`str`, `optional`):
+ Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`]
+ follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
  """

+ @_deprecate_positional_args(version="0.26")
  def __init__(
  self,
  model: Optional[str] = None,
+ *,
  token: Union[str, bool, None] = None,
  timeout: Optional[float] = None,
  headers: Optional[Dict[str, str]] = None,
  cookies: Optional[Dict[str, str]] = None,
+ proxies: Optional[Any] = None,
+ # OpenAI compatibility
+ base_url: Optional[str] = None,
+ api_key: Optional[str] = None,
  ) -> None:
+ if model is not None and base_url is not None:
+ raise ValueError(
+ "Received both `model` and `base_url` arguments. Please provide only one of them."
+ " `base_url` is an alias for `model` to make the API compatible with OpenAI's client."
+ " It has the exact same behavior as `model`."
+ )
+ if token is not None and api_key is not None:
+ raise ValueError(
+ "Received both `token` and `api_key` arguments. Please provide only one of them."
+ " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
+ " It has the exact same behavior as `token`."
+ )
+
  self.model: Optional[str] = model
- self.token: Union[str, bool, None] = token
- self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent'
+ self.token: Union[str, bool, None] = token if token is not None else api_key
+ self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent'
  if headers is not None:
  self.headers.update(headers)
  self.cookies = cookies
  self.timeout = timeout
+ self.proxies = proxies
+
+ # OpenAI compatibility
+ self.base_url = base_url

  def __repr__(self):
  return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"
@@ -250,7 +280,7 @@ class AsyncInferenceClient:
  )

  try:
- response = await client.post(url, json=json, data=data_as_binary)
+ response = await client.post(url, json=json, data=data_as_binary, proxy=self.proxies)
  response_error_payload = None
  if response.status != 200:
  try:
@@ -284,11 +314,16 @@ class AsyncInferenceClient:
  ) from error
  # ...or wait 1s and retry
  logger.info(f"Waiting for model to be loaded on the server: {error}")
+ if "X-wait-for-model" not in headers and url.startswith(INFERENCE_ENDPOINT):
+ headers["X-wait-for-model"] = "1"
  time.sleep(1)
  if timeout is not None:
  timeout = max(self.timeout - (time.time() - t0), 1) # type: ignore
  continue
  raise error
+ except Exception:
+ await client.close()
+ raise

  async def audio_classification(
  self,
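The new `proxies` value stored on the client is forwarded to each underlying `aiohttp` request via its `proxy` argument, as the first hunk above shows. A short sketch of how it might be used; the proxy URL is a hypothetical example:

```py
from huggingface_hub import AsyncInferenceClient

# `proxies` is passed through as `proxy=` to aiohttp's request call,
# so a single proxy URL string is the expected shape.
client = AsyncInferenceClient(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    proxies="http://localhost:3128",  # hypothetical HTTP proxy
)
```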
@@ -427,10 +462,11 @@ class AsyncInferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -450,10 +486,11 @@ class AsyncInferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -473,10 +510,11 @@ class AsyncInferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -496,10 +534,11 @@ class AsyncInferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -510,11 +549,10 @@ class AsyncInferenceClient:

  <Tip>

- If the model is served by a server supporting chat-completion, the method will directly call the server's
- `/v1/chat/completions` endpoint. If the server does not support chat-completion, the method will render the
- chat template client-side based on the information fetched from the Hub API. In this case, you will need to
- have `minijinja` template engine installed. Run `pip install "huggingface_hub[inference]"` or `pip install minijinja`
- to install it.
+ The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+ Inputs and outputs are strictly the same and using either syntax will yield the same results.
+ Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+ for more details about OpenAI's compatibility.

  </Tip>

@@ -525,6 +563,9 @@ class AsyncInferenceClient:
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
  See https://huggingface.co/tasks/text-generation for more details.
+
+ If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
+ custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
  frequency_penalty (`float`, *optional*):
  Penalizes new tokens based on their existing frequency
  in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
@@ -544,6 +585,8 @@ class AsyncInferenceClient:
  presence_penalty (`float`, *optional*):
  Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
  text so far, increasing the model's likelihood to talk about new topics.
+ response_format ([`ChatCompletionInputGrammarType`], *optional*):
+ Grammar constraints. Can be either a JSONSchema or a regex.
  seed (Optional[`int`], *optional*):
  Seed for reproducible control flow. Defaults to None.
  stop (Optional[`str`], *optional*):
@@ -561,7 +604,7 @@ class AsyncInferenceClient:
  top_p (`float`, *optional*):
  Fraction of the most likely next words to sample from.
  Must be between 0 and 1. Defaults to 1.0.
- tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*):
+ tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
  The tool to use for the completion. Defaults to "auto".
  tool_prompt (`str`, *optional*):
  A prompt to be appended before the tools.
@@ -570,7 +613,7 @@ class AsyncInferenceClient:
  provide a list of functions the model may generate JSON inputs for.

  Returns:
- [`ChatCompletionOutput] or Iterable of [`ChatCompletionStreamOutput`]:
+ [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]:
  Generated text returned from the server:
  - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
  - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
@@ -585,10 +628,9 @@ class AsyncInferenceClient:

  ```py
  # Must be run in an async context
- # Chat example
  >>> from huggingface_hub import AsyncInferenceClient
  >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
- >>> client = AsyncInferenceClient("HuggingFaceH4/zephyr-7b-beta")
+ >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
  >>> await client.chat_completion(messages, max_tokens=100)
  ChatCompletionOutput(
  choices=[
@@ -596,26 +638,75 @@ class AsyncInferenceClient:
  finish_reason='eos_token',
  index=0,
  message=ChatCompletionOutputMessage(
- content='The capital of France is Paris. The official name of the city is Ville de Paris (City of Paris) and the name of the country governing body, which is located in Paris, is La République française (The French Republic). \nI hope that helps! Let me know if you need any further information.'
- )
+ role='assistant',
+ content='The capital of France is Paris.',
+ name=None,
+ tool_calls=None
+ ),
+ logprobs=None
  )
  ],
- created=1710498360
+ created=1719907176,
+ id='',
+ model='meta-llama/Meta-Llama-3-8B-Instruct',
+ object='text_completion',
+ system_fingerprint='2.0.4-sha-f426a33',
+ usage=ChatCompletionOutputUsage(
+ completion_tokens=8,
+ prompt_tokens=17,
+ total_tokens=25
+ )
  )
+ ```

+ Example (stream=True):
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+ >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
  >>> async for token in await client.chat_completion(messages, max_tokens=10, stream=True):
  ... print(token)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
  (...)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+ ```
+
+ Example using OpenAI's syntax:
+ ```py
+ # Must be run in an async context
+ # instead of `from openai import OpenAI`
+ from huggingface_hub import AsyncInferenceClient

- # Chat example with tools
+ # instead of `client = OpenAI(...)`
+ client = AsyncInferenceClient(
+ base_url=...,
+ api_key=...,
+ )
+
+ output = await client.chat.completions.create(
+ model="meta-llama/Meta-Llama-3-8B-Instruct",
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "Count to 10"},
+ ],
+ stream=True,
+ max_tokens=1024,
+ )
+
+ for chunk in output:
+ print(chunk.choices[0].delta.content)
+ ```
+
+ Example using tools:
+ ```py
+ # Must be run in an async context
  >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
  >>> messages = [
  ... {
  ... "role": "system",
- ... "content": "Don't make assumptions about what values to plug into functions. Ask async for clarification if a user request is ambiguous.",
+ ... "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
  ... },
  ... {
  ... "role": "user",
@@ -691,113 +782,90 @@ class AsyncInferenceClient:
  description=None
  )
  ```
- """
- # determine model
- model = model or self.model or self.get_recommended_model("text-generation")
-
- if _is_chat_completion_server(model):
- # First, let's consider the server has a `/v1/chat/completions` endpoint.
- # If that's the case, we don't have to render the chat template client-side.
- model_url = self._resolve_url(model)
- if not model_url.endswith("/chat/completions"):
- model_url += "/v1/chat/completions"
-
- try:
- data = await self.post(
- model=model_url,
- json=dict(
- model="tgi", # random string
- messages=messages,
- frequency_penalty=frequency_penalty,
- logit_bias=logit_bias,
- logprobs=logprobs,
- max_tokens=max_tokens,
- n=n,
- presence_penalty=presence_penalty,
- seed=seed,
- stop=stop,
- temperature=temperature,
- tool_choice=tool_choice,
- tool_prompt=tool_prompt,
- tools=tools,
- top_logprobs=top_logprobs,
- top_p=top_p,
- stream=stream,
- ),
- stream=stream,
- )
- except _import_aiohttp().ClientResponseError as e:
- if e.status in (400, 404, 500):
- # Let's consider the server is not a chat completion server.
- # Then we call again `chat_completion` which will render the chat template client side.
- # (can be HTTP 500, HTTP 400, HTTP 404 depending on the server)
- _set_as_non_chat_completion_server(model)
- logger.warning(
- f"Server {model_url} does not seem to support chat completion. Falling back to text generation. Error: {e}"
- )
- return await self.chat_completion(
- messages=messages,
- model=model,
- stream=stream,
- max_tokens=max_tokens,
- seed=seed,
- stop=stop,
- temperature=temperature,
- top_p=top_p,
- )
- raise

- if stream:
- return _async_stream_chat_completion_response_from_bytes(data) # type: ignore[arg-type]
-
- return ChatCompletionOutput.parse_obj_as_instance(data) # type: ignore[arg-type]
+ Example using response_format:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+ >>> messages = [
+ ... {
+ ... "role": "user",
+ ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
+ ... },
+ ... ]
+ >>> response_format = {
+ ... "type": "json",
+ ... "value": {
+ ... "properties": {
+ ... "location": {"type": "string"},
+ ... "activity": {"type": "string"},
+ ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+ ... "animals": {"type": "array", "items": {"type": "string"}},
+ ... },
+ ... "required": ["location", "activity", "animals_seen", "animals"],
+ ... },
+ ... }
+ >>> response = await client.chat_completion(
+ ... messages=messages,
+ ... response_format=response_format,
+ ... max_tokens=500,
+ )
+ >>> response.choices[0].message.content
+ '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
+ ```
+ """
+ # Determine model
+ # `self.xxx` takes precedence over the method argument only in `chat_completion`
+ # since `chat_completion(..., model=xxx)` is also a payload parameter for the
+ # server, we need to handle it differently
+ model = self.base_url or self.model or model or self.get_recommended_model("text-generation")
+ is_url = model.startswith(("http://", "https://"))
+
+ # First, resolve the model chat completions URL
+ if model == self.base_url:
+ # base_url passed => add server route
+ model_url = model + "/v1/chat/completions"
+ elif is_url:
+ # model is a URL => use it directly
+ model_url = model
+ else:
+ # model is a model ID => resolve it + add server route
+ model_url = self._resolve_url(model) + "/v1/chat/completions"
+
+ # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing.
+ # If it's a ID on the Hub => use it. Otherwise, we use a random string.
+ model_id = model if not is_url and model.count("/") == 1 else "tgi"
+
+ data = await self.post(
+ model=model_url,
+ json=dict(
+ model=model_id,
+ messages=messages,
+ frequency_penalty=frequency_penalty,
+ logit_bias=logit_bias,
+ logprobs=logprobs,
+ max_tokens=max_tokens,
+ n=n,
+ presence_penalty=presence_penalty,
+ response_format=response_format,
+ seed=seed,
+ stop=stop,
+ temperature=temperature,
+ tool_choice=tool_choice,
+ tool_prompt=tool_prompt,
+ tools=tools,
+ top_logprobs=top_logprobs,
+ top_p=top_p,
+ stream=stream,
+ ),
+ stream=stream,
+ )

- # At this point, we know the server is not a chat completion server.
- # It means it's a transformers-backed server for which we can send a list of messages directly to the
- # `text-generation` pipeline. We won't receive a detailed response but only the generated text.
  if stream:
- raise ValueError(
- "Streaming token is not supported by the model. This is due to the model not been served by a "
- "Text-Generation-Inference server. Please pass `stream=False` as input."
- )
- if tool_choice is not None or tool_prompt is not None or tools is not None:
- warnings.warn(
- "Tools are not supported by the model. This is due to the model not been served by a "
- "Text-Generation-Inference server. The provided tool parameters will be ignored."
- )
-
- # generate response
- text_generation_output = await self.text_generation(
- prompt=messages, # type: ignore # Not correct type but works implicitly
- model=model,
- stream=False,
- details=False,
- max_new_tokens=max_tokens,
- seed=seed,
- stop_sequences=stop,
- temperature=temperature,
- top_p=top_p,
- )
+ return _async_stream_chat_completion_response(data) # type: ignore[arg-type]

- # Format as a ChatCompletionOutput with dummy values for fields we can't provide
- return ChatCompletionOutput(
- id="dummy",
- model="dummy",
- object="dummy",
- system_fingerprint="dummy",
- usage=None, # type: ignore # set to `None` as we don't want to provide false information
- created=int(time.time()),
- choices=[
- ChatCompletionOutputComplete(
- finish_reason="unk", # type: ignore # set to `unk` as we don't want to provide false information
- index=0,
- message=ChatCompletionOutputMessage(
- content=text_generation_output,
- role="assistant",
- ),
- )
- ],
- )
+ return ChatCompletionOutput.parse_obj_as_instance(data) # type: ignore[arg-type]

  async def conversational(
  self,
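The rewritten body above replaces the old fall-back-to-text-generation path with a single URL-resolution step. A small standalone sketch that mirrors that branching for illustration only (not the library's code; the serverless base URL is assumed from the `INFERENCE_ENDPOINT` usage elsewhere in this diff):

```py
from typing import Optional, Tuple

def resolve_chat_url(model: str, base_url: Optional[str]) -> Tuple[str, str]:
    """Mirror of the branching above: returns (request_url, payload_model_id)."""
    is_url = model.startswith(("http://", "https://"))
    if base_url is not None and model == base_url:
        # base_url passed => append the chat-completion route
        url = model + "/v1/chat/completions"
    elif is_url:
        # explicit URL => use it as-is
        url = model
    else:
        # model ID => assumed serverless Inference API base, then the route
        url = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
    # Hub-style IDs (exactly one "/") are forwarded in the payload; otherwise a "tgi" placeholder is sent
    model_id = model if not is_url and model.count("/") == 1 else "tgi"
    return url, model_id
```

Note that the instance-level `base_url`/`model` take precedence over the per-call `model` argument here, which (as the inline comment in the hunk says) is the opposite of the other task methods.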
@@ -850,7 +918,7 @@ class AsyncInferenceClient:
  >>> client = AsyncInferenceClient()
  >>> output = await client.conversational("Hi, who are you?")
  >>> output
- {'generated_text': 'I am the one who knocks.', 'conversation': {'generated_responses': ['I am the one who knocks.'], 'past_user_inputs': ['Hi, who are you?']}, 'warnings': ['Setting `pad_token_id` to `eos_token_id`:50256 async for open-end generation.']}
+ {'generated_text': 'I am the one who knocks.', 'conversation': {'generated_responses': ['I am the one who knocks.'], 'past_user_inputs': ['Hi, who are you?']}, 'warnings': ['Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.']}
  >>> await client.conversational(
  ... "Wow, that's scary!",
  ... generated_responses=output["conversation"]["generated_responses"],
@@ -915,7 +983,16 @@ class AsyncInferenceClient:
  response = await self.post(json=payload, model=model, task="document-question-answering")
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

- async def feature_extraction(self, text: str, *, model: Optional[str] = None) -> "np.ndarray":
+ async def feature_extraction(
+ self,
+ text: str,
+ *,
+ normalize: Optional[bool] = None,
+ prompt_name: Optional[str] = None,
+ truncate: Optional[bool] = None,
+ truncation_direction: Optional[Literal["Left", "Right"]] = None,
+ model: Optional[str] = None,
+ ) -> "np.ndarray":
  """
  Generate embeddings for a given text.

@@ -926,6 +1003,20 @@ class AsyncInferenceClient:
  The model to use for the conversational task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
  Defaults to None.
+ normalize (`bool`, *optional*):
+ Whether to normalize the embeddings or not. Defaults to None.
+ Only available on server powered by Text-Embedding-Inference.
+ prompt_name (`str`, *optional*):
+ The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
+ Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+ For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",...},
+ then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
+ because the prompt text will be prepended before any text to encode.
+ truncate (`bool`, *optional*):
+ Whether to truncate the embeddings or not. Defaults to None.
+ Only available on server powered by Text-Embedding-Inference.
+ truncation_direction (`Literal["Left", "Right"]`, *optional*):
+ Which side of the input should be truncated when `truncate=True` is passed.

  Returns:
  `np.ndarray`: The embedding representing the input text as a float32 numpy array.
@@ -948,7 +1039,16 @@ class AsyncInferenceClient:
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
  ```
  """
- response = await self.post(json={"inputs": text}, model=model, task="feature-extraction")
+ payload: Dict = {"inputs": text}
+ if normalize is not None:
+ payload["normalize"] = normalize
+ if prompt_name is not None:
+ payload["prompt_name"] = prompt_name
+ if truncate is not None:
+ payload["truncate"] = truncate
+ if truncation_direction is not None:
+ payload["truncation_direction"] = truncation_direction
+ response = await self.post(json=payload, model=model, task="feature-extraction")
  np = _import_numpy()
  return np.array(_bytes_to_dict(response), dtype="float32")

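For reference, a minimal sketch of calling `feature_extraction` with the new Text-Embedding-Inference parameters added above. The model id is just an example of an embedding model and is assumed to be served by a TEI backend:

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient()
embedding = await client.feature_extraction(
    "What is the capital of France?",
    model="BAAI/bge-base-en-v1.5",  # example embedding model, assumed TEI-served
    normalize=True,                 # new parameters are only honored by TEI backends
    truncate=True,
    truncation_direction="Right",
)
print(embedding.shape)
```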
@@ -1192,7 +1292,8 @@ class AsyncInferenceClient:
  ```
  """
  response = await self.post(data=image, model=model, task="image-to-text")
- return ImageToTextOutput.parse_obj_as_instance(response)
+ output = ImageToTextOutput.parse_obj(response)
+ return output[0] if isinstance(output, list) else output

  async def list_deployed_models(
  self, frameworks: Union[None, str, Literal["all"], List[str]] = None
@@ -1643,6 +1744,7 @@ class AsyncInferenceClient:
  stream: Literal[False] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1671,6 +1773,7 @@ class AsyncInferenceClient:
  stream: Literal[False] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1699,6 +1802,7 @@ class AsyncInferenceClient:
  stream: Literal[True] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1727,6 +1831,7 @@ class AsyncInferenceClient:
  stream: Literal[True] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1755,6 +1860,7 @@ class AsyncInferenceClient:
  stream: bool = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1782,6 +1888,7 @@ class AsyncInferenceClient:
  stream: bool = False,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1812,6 +1919,13 @@ class AsyncInferenceClient:

  To learn more about the TGI project, please refer to https://github.com/huggingface/text-generation-inference.

+ <Tip>
+
+ If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
+ It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
+
+ </Tip>
+
  Args:
  prompt (`str`):
  Input text.
@@ -1826,6 +1940,8 @@ class AsyncInferenceClient:
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+ adapter_id (`str`, *optional*):
+ Lora adapter id.
  best_of (`int`, *optional*):
  Generate best_of sequences and return the one if the highest token logprobs.
  decoder_input_details (`bool`, *optional*):
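Each `text_generation` overload now accepts `adapter_id`, which Text-Generation-Inference uses to select a LoRA adapter at request time. A minimal sketch with a hypothetical adapter repo id (the adapter must already be loaded by the TGI server):

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
output = await client.text_generation(
    "Summarize: the Eiffel Tower is ...",
    max_new_tokens=64,
    adapter_id="my-username/my-lora-adapter",  # hypothetical LoRA adapter id
)
print(output)
```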
@@ -1893,7 +2009,7 @@ class AsyncInferenceClient:
  >>> await client.text_generation("The huggingface_hub library is ", max_new_tokens=12)
  '100% open source and built to be easy to use.'

- # Case 2: iterate over the generated tokens. Useful async for large generation.
+ # Case 2: iterate over the generated tokens. Useful for large generation.
  >>> async for token in await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, stream=True):
  ... print(token)
  100
@@ -1995,6 +2111,7 @@ class AsyncInferenceClient:

  # Build payload
  parameters = {
+ "adapter_id": adapter_id,
  "best_of": best_of,
  "decoder_input_details": decoder_input_details,
  "details": details,
@@ -2065,6 +2182,7 @@ class AsyncInferenceClient:
  details=details,
  stream=stream,
  model=model,
+ adapter_id=adapter_id,
  best_of=best_of,
  decoder_input_details=decoder_input_details,
  do_sample=do_sample,
@@ -2089,7 +2207,12 @@ class AsyncInferenceClient:
  if stream:
  return _async_stream_text_generation_response(bytes_output, details) # type: ignore

- data = _bytes_to_dict(bytes_output)[0] # type: ignore[arg-type]
+ data = _bytes_to_dict(bytes_output) # type: ignore[arg-type]
+
+ # Data can be a single element (dict) or an iterable of dicts where we select the first element of.
+ if isinstance(data, list):
+ data = data[0]
+
  return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]

  async def text_to_image(
@@ -2377,7 +2500,13 @@ class AsyncInferenceClient:
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

  async def zero_shot_classification(
- self, text: str, labels: List[str], *, multi_label: bool = False, model: Optional[str] = None
+ self,
+ text: str,
+ labels: List[str],
+ *,
+ multi_label: bool = False,
+ hypothesis_template: Optional[str] = None,
+ model: Optional[str] = None,
  ) -> List[ZeroShotClassificationOutputElement]:
  """
  Provide as input a text and a set of candidate labels to classify the input text.
@@ -2386,9 +2515,15 @@ class AsyncInferenceClient:
  text (`str`):
  The input text to classify.
  labels (`List[str]`):
- List of string possible labels. There must be at least 2 labels.
+ List of strings. Each string is the verbalization of a possible label for the input text.
  multi_label (`bool`):
- Boolean that is set to True if classes can overlap.
+ Boolean. If True, the probability for each label is evaluated independently and multiple labels can have a probability close to 1 simultaneously or all probabilities can be close to 0.
+ If False, the labels are considered mutually exclusive and the probability over all labels always sums to 1. Defaults to False.
+ hypothesis_template (`str`, *optional*):
+ A template sentence string with curly brackets to which the label strings are added. The label strings are added at the position of the curly brackets "{}".
+ Zero-shot classifiers are based on NLI models, which evaluate if a hypothesis is entailed in another text or not.
+ For example, with hypothesis_template="This text is about {}." and labels=["economics", "politics"], the system internally creates the two hypotheses "This text is about economics." and "This text is about politics.".
+ The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -2402,15 +2537,15 @@ class AsyncInferenceClient:
  `aiohttp.ClientResponseError`:
  If the request fails with an HTTP error status code other than HTTP 503.

- Example:
+ Example with `multi_label=False`:
  ```py
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
  >>> text = (
- ... "A new model offers an explanation async for how the Galilean satellites formed around the solar system's"
+ ... "A new model offers an explanation for how the Galilean satellites formed around the solar system's"
  ... "largest world. Konstantin Batygin did not set out to solve one of the solar system's most puzzling"
- ... " mysteries when he went async for a run up a hill in Nice, France."
+ ... " mysteries when he went for a run up a hill in Nice, France."
  ... )
  >>> labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
  >>> await client.zero_shot_classification(text, labels)
@@ -2430,21 +2565,38 @@ class AsyncInferenceClient:
  ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
  ]
  ```
+
+ Example with `multi_label=True` and a custom `hypothesis_template`:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> client = AsyncInferenceClient()
+ >>> await client.zero_shot_classification(
+ ... text="I really like our dinner and I'm very happy. I don't like the weather though.",
+ ... labels=["positive", "negative", "pessimistic", "optimistic"],
+ ... multi_label=True,
+ ... hypothesis_template="This text is {} towards the weather"
+ ... )
+ [
+ ZeroShotClassificationOutputElement(label='negative', score=0.9231801629066467),
+ ZeroShotClassificationOutputElement(label='pessimistic', score=0.8760990500450134),
+ ZeroShotClassificationOutputElement(label='optimistic', score=0.0008674879791215062),
+ ZeroShotClassificationOutputElement(label='positive', score=0.0005250611575320363)
+ ]
+ ```
  """
- # Raise ValueError if input is less than 2 labels
- if len(labels) < 2:
- raise ValueError("You must specify at least 2 classes to compare.")
+
+ parameters = {"candidate_labels": labels, "multi_label": multi_label}
+ if hypothesis_template is not None:
+ parameters["hypothesis_template"] = hypothesis_template

  response = await self.post(
  json={
  "inputs": text,
- "parameters": {
- "candidate_labels": ",".join(labels),
- "multi_label": multi_label,
- },
+ "parameters": parameters,
  },
- model=model,
  task="zero-shot-classification",
+ model=model,
  )
  output = _bytes_to_dict(response)
  return [
@@ -2501,7 +2653,7 @@ class AsyncInferenceClient:
  return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)

  def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
- model = model or self.model
+ model = model or self.model or self.base_url

  # If model is already a URL, ignore `task` and return directly
  if model is not None and (model.startswith("http://") or model.startswith("https://")):
@@ -2554,6 +2706,99 @@ class AsyncInferenceClient:
  )
  return model

+ async def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+ """
+ Get information about the deployed endpoint.
+
+ This endpoint is only available on endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+ Endpoints powered by `transformers` return an empty payload.
+
+ Args:
+ model (`str`, *optional*):
+ The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+ Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+ Returns:
+ `Dict[str, Any]`: Information about the endpoint.
+
+ Example:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+ >>> await client.get_endpoint_info()
+ {
+ 'model_id': 'meta-llama/Meta-Llama-3-70B-Instruct',
+ 'model_sha': None,
+ 'model_dtype': 'torch.float16',
+ 'model_device_type': 'cuda',
+ 'model_pipeline_tag': None,
+ 'max_concurrent_requests': 128,
+ 'max_best_of': 2,
+ 'max_stop_sequences': 4,
+ 'max_input_length': 8191,
+ 'max_total_tokens': 8192,
+ 'waiting_served_ratio': 0.3,
+ 'max_batch_total_tokens': 1259392,
+ 'max_waiting_tokens': 20,
+ 'max_batch_size': None,
+ 'validation_workers': 32,
+ 'max_client_batch_size': 4,
+ 'version': '2.0.2',
+ 'sha': 'dccab72549635c7eb5ddb17f43f0b7cdff07c214',
+ 'docker_label': 'sha-dccab72'
+ }
+ ```
+ """
+ model = model or self.model
+ if model is None:
+ raise ValueError("Model id not provided.")
+ if model.startswith(("http://", "https://")):
+ url = model.rstrip("/") + "/info"
+ else:
+ url = f"{INFERENCE_ENDPOINT}/models/{model}/info"
+
+ async with _import_aiohttp().ClientSession(headers=self.headers) as client:
+ response = await client.get(url)
+ response.raise_for_status()
+ return await response.json()
+
+ async def health_check(self, model: Optional[str] = None) -> bool:
+ """
+ Check the health of the deployed endpoint.
+
+ Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+ For Inference API, please use [`InferenceClient.get_model_status`] instead.
+
+ Args:
+ model (`str`, *optional*):
+ URL of the Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+ Returns:
+ `bool`: True if everything is working fine.
+
+ Example:
+ ```py
+ # Must be run in an async context
+ >>> from huggingface_hub import AsyncInferenceClient
+ >>> client = AsyncInferenceClient("https://jzgu0buei5.us-east-1.aws.endpoints.huggingface.cloud")
+ >>> await client.health_check()
+ True
+ ```
+ """
+ model = model or self.model
+ if model is None:
+ raise ValueError("Model id not provided.")
+ if not model.startswith(("http://", "https://")):
+ raise ValueError(
+ "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
+ )
+ url = model.rstrip("/") + "/health"
+
+ async with _import_aiohttp().ClientSession(headers=self.headers) as client:
+ response = await client.get(url)
+ return response.status == 200
+
  async def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
  """
  Get the status of a model hosted on the Inference API.
@@ -2581,7 +2826,7 @@ class AsyncInferenceClient:
  # Must be run in an async context
  >>> from huggingface_hub import AsyncInferenceClient
  >>> client = AsyncInferenceClient()
- >>> await client.get_model_status("bigcode/starcoder")
+ >>> await client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
  ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
  ```
  """
@@ -2606,3 +2851,30 @@ class AsyncInferenceClient:
  compute_type=response_data["compute_type"],
  framework=response_data["framework"],
  )
+
+ @property
+ def chat(self) -> "ProxyClientChat":
+ return ProxyClientChat(self)
+
+
+ class _ProxyClient:
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ def __init__(self, client: AsyncInferenceClient):
+ self._client = client
+
+
+ class ProxyClientChat(_ProxyClient):
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ @property
+ def completions(self) -> "ProxyClientChatCompletions":
+ return ProxyClientChatCompletions(self._client)
+
+
+ class ProxyClientChatCompletions(_ProxyClient):
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ @property
+ def create(self):
+ return self._client.chat_completion
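
The three proxy classes added at the end exist only so that the attribute chain `client.chat.completions.create` resolves to the bound `chat_completion` method, matching OpenAI's client surface. A short illustration of the resulting equivalence; the messages list is just an example:

```py
# Must be run in an async context
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [{"role": "user", "content": "What is the capital of France?"}]

# `client.chat.completions.create` is the same bound method as `client.chat_completion`,
# so the two spellings are interchangeable.
assert client.chat.completions.create == client.chat_completion
output = await client.chat.completions.create(messages=messages, max_tokens=16)
print(output.choices[0].message.content)
```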