huggingface-hub 0.23.3__py3-none-any.whl → 0.24.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (44)
  1. huggingface_hub/__init__.py +47 -15
  2. huggingface_hub/_commit_api.py +38 -8
  3. huggingface_hub/_inference_endpoints.py +11 -4
  4. huggingface_hub/_local_folder.py +22 -13
  5. huggingface_hub/_snapshot_download.py +12 -7
  6. huggingface_hub/_webhooks_server.py +3 -1
  7. huggingface_hub/commands/huggingface_cli.py +4 -3
  8. huggingface_hub/commands/repo_files.py +128 -0
  9. huggingface_hub/constants.py +12 -0
  10. huggingface_hub/file_download.py +127 -91
  11. huggingface_hub/hf_api.py +979 -341
  12. huggingface_hub/hf_file_system.py +30 -3
  13. huggingface_hub/hub_mixin.py +103 -41
  14. huggingface_hub/inference/_client.py +373 -42
  15. huggingface_hub/inference/_common.py +0 -2
  16. huggingface_hub/inference/_generated/_async_client.py +390 -48
  17. huggingface_hub/inference/_generated/types/__init__.py +4 -1
  18. huggingface_hub/inference/_generated/types/chat_completion.py +41 -21
  19. huggingface_hub/inference/_generated/types/feature_extraction.py +23 -5
  20. huggingface_hub/inference/_generated/types/text_generation.py +29 -0
  21. huggingface_hub/lfs.py +11 -6
  22. huggingface_hub/repocard_data.py +41 -29
  23. huggingface_hub/repository.py +6 -6
  24. huggingface_hub/serialization/__init__.py +8 -3
  25. huggingface_hub/serialization/_base.py +13 -16
  26. huggingface_hub/serialization/_tensorflow.py +4 -3
  27. huggingface_hub/serialization/_torch.py +399 -22
  28. huggingface_hub/utils/__init__.py +1 -2
  29. huggingface_hub/utils/_errors.py +1 -1
  30. huggingface_hub/utils/_fixes.py +14 -3
  31. huggingface_hub/utils/_paths.py +17 -6
  32. huggingface_hub/utils/_subprocess.py +0 -1
  33. huggingface_hub/utils/_telemetry.py +9 -1
  34. huggingface_hub/utils/_typing.py +26 -1
  35. huggingface_hub/utils/endpoint_helpers.py +2 -186
  36. huggingface_hub/utils/sha.py +36 -1
  37. huggingface_hub/utils/tqdm.py +0 -1
  38. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/METADATA +12 -9
  39. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/RECORD +43 -43
  40. huggingface_hub/serialization/_numpy.py +0 -68
  41. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/LICENSE +0 -0
  42. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/WHEEL +0 -0
  43. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/entry_points.txt +0 -0
  44. {huggingface_hub-0.23.3.dist-info → huggingface_hub-0.24.0rc0.dist-info}/top_level.txt +0 -0
huggingface_hub/inference/_client.py

@@ -78,6 +78,7 @@ from huggingface_hub.inference._generated.types import (
  AudioClassificationOutputElement,
  AudioToAudioOutputElement,
  AutomaticSpeechRecognitionOutput,
+ ChatCompletionInputGrammarType,
  ChatCompletionInputTool,
  ChatCompletionInputToolTypeClass,
  ChatCompletionOutput,
@@ -103,7 +104,6 @@ from huggingface_hub.inference._generated.types import (
  ZeroShotClassificationOutputElement,
  ZeroShotImageClassificationOutputElement,
  )
- from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum
  from huggingface_hub.inference._types import (
  ConversationalOutput, # soon to be removed
  )
@@ -113,6 +113,7 @@ from huggingface_hub.utils import (
  get_session,
  hf_raise_for_status,
  )
+ from huggingface_hub.utils._deprecation import _deprecate_positional_args


  if TYPE_CHECKING:
@@ -134,12 +135,16 @@ class InferenceClient:

  Args:
  model (`str`, `optional`):
- The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
+ The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
  or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
  automatically selected for the task.
+ Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+ arguments are mutually exclusive and have the exact same behavior.
  token (`str` or `bool`, *optional*):
  Hugging Face token. Will default to the locally saved token if not provided.
  Pass `token=False` if you don't want to send your token to the server.
+ Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
+ arguments are mutually exclusive and have the exact same behavior.
  timeout (`float`, `optional`):
  The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
  API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
@@ -148,23 +153,52 @@ class InferenceClient:
  Values in this dictionary will override the default values.
  cookies (`Dict[str, str]`, `optional`):
  Additional cookies to send to the server.
+ base_url (`str`, `optional`):
+ Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
+ follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
+ api_key (`str`, `optional`):
+ Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`]
+ follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
  """

+ @_deprecate_positional_args(version="0.26")
  def __init__(
  self,
  model: Optional[str] = None,
+ *,
  token: Union[str, bool, None] = None,
  timeout: Optional[float] = None,
  headers: Optional[Dict[str, str]] = None,
  cookies: Optional[Dict[str, str]] = None,
+ proxies: Optional[Any] = None,
+ # OpenAI compatibility
+ base_url: Optional[str] = None,
+ api_key: Optional[str] = None,
  ) -> None:
+ if model is not None and base_url is not None:
+ raise ValueError(
+ "Received both `model` and `base_url` arguments. Please provide only one of them."
+ " `base_url` is an alias for `model` to make the API compatible with OpenAI's client."
+ " It has the exact same behavior as `model`."
+ )
+ if token is not None and api_key is not None:
+ raise ValueError(
+ "Received both `token` and `api_key` arguments. Please provide only one of them."
+ " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
+ " It has the exact same behavior as `token`."
+ )
+
  self.model: Optional[str] = model
- self.token: Union[str, bool, None] = token
- self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent'
+ self.token: Union[str, bool, None] = token or api_key
+ self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent'
  if headers is not None:
  self.headers.update(headers)
  self.cookies = cookies
  self.timeout = timeout
+ self.proxies = proxies
+
+ # OpenAI compatibility
+ self.base_url = base_url

  def __repr__(self):
  return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"
@@ -264,6 +298,7 @@ class InferenceClient:
  cookies=self.cookies,
  timeout=self.timeout,
  stream=stream,
+ proxies=self.proxies,
  )
  except TimeoutError as error:
  # Convert any `TimeoutError` to a `InferenceTimeoutError`
@@ -289,6 +324,8 @@ class InferenceClient:
  # ...or wait 1s and retry
  logger.info(f"Waiting for model to be loaded on the server: {error}")
  time.sleep(1)
+ if "X-wait-for-model" not in headers and url.startswith(INFERENCE_ENDPOINT):
+ headers["X-wait-for-model"] = "1"
  if timeout is not None:
  timeout = max(self.timeout - (time.time() - t0), 1) # type: ignore
  continue
@@ -428,10 +465,11 @@ class InferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -451,10 +489,11 @@ class InferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -474,10 +513,11 @@ class InferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -497,10 +537,11 @@ class InferenceClient:
  max_tokens: Optional[int] = None,
  n: Optional[int] = None,
  presence_penalty: Optional[float] = None,
+ response_format: Optional[ChatCompletionInputGrammarType] = None,
  seed: Optional[int] = None,
  stop: Optional[List[str]] = None,
  temperature: Optional[float] = None,
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
+ tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
  tool_prompt: Optional[str] = None,
  tools: Optional[List[ChatCompletionInputTool]] = None,
  top_logprobs: Optional[int] = None,
@@ -511,11 +552,10 @@ class InferenceClient:

  <Tip>

- If the model is served by a server supporting chat-completion, the method will directly call the server's
- `/v1/chat/completions` endpoint. If the server does not support chat-completion, the method will render the
- chat template client-side based on the information fetched from the Hub API. In this case, you will need to
- have `minijinja` template engine installed. Run `pip install "huggingface_hub[inference]"` or `pip install minijinja`
- to install it.
+ The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+ Inputs and outputs are strictly the same and using either syntax will yield the same results.
+ Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+ for more details about OpenAI's compatibility.

  </Tip>

@@ -526,6 +566,9 @@ class InferenceClient:
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
  See https://huggingface.co/tasks/text-generation for more details.
+
+ If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
+ custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
  frequency_penalty (`float`, *optional*):
  Penalizes new tokens based on their existing frequency
  in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
@@ -545,6 +588,8 @@ class InferenceClient:
  presence_penalty (`float`, *optional*):
  Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
  text so far, increasing the model's likelihood to talk about new topics.
+ response_format ([`ChatCompletionInputGrammarType`], *optional*):
+ Grammar constraints. Can be either a JSONSchema or a regex.
  seed (Optional[`int`], *optional*):
  Seed for reproducible control flow. Defaults to None.
  stop (Optional[`str`], *optional*):
@@ -562,7 +607,7 @@ class InferenceClient:
  top_p (`float`, *optional*):
  Fraction of the most likely next words to sample from.
  Must be between 0 and 1. Defaults to 1.0.
- tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*):
+ tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
  The tool to use for the completion. Defaults to "auto".
  tool_prompt (`str`, *optional*):
  A prompt to be appended before the tools.
@@ -571,7 +616,7 @@ class InferenceClient:
  provide a list of functions the model may generate JSON inputs for.

  Returns:
- [`ChatCompletionOutput] or Iterable of [`ChatCompletionStreamOutput`]:
+ [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]:
  Generated text returned from the server:
  - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
  - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
@@ -585,10 +630,9 @@ class InferenceClient:
  Example:

  ```py
- # Chat example
  >>> from huggingface_hub import InferenceClient
  >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
- >>> client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+ >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
  >>> client.chat_completion(messages, max_tokens=100)
  ChatCompletionOutput(
  choices=[
@@ -596,21 +640,67 @@ class InferenceClient:
  finish_reason='eos_token',
  index=0,
  message=ChatCompletionOutputMessage(
- content='The capital of France is Paris. The official name of the city is Ville de Paris (City of Paris) and the name of the country governing body, which is located in Paris, is La République française (The French Republic). \nI hope that helps! Let me know if you need any further information.'
- )
+ role='assistant',
+ content='The capital of France is Paris.',
+ name=None,
+ tool_calls=None
+ ),
+ logprobs=None
  )
  ],
- created=1710498360
+ created=1719907176,
+ id='',
+ model='meta-llama/Meta-Llama-3-8B-Instruct',
+ object='text_completion',
+ system_fingerprint='2.0.4-sha-f426a33',
+ usage=ChatCompletionOutputUsage(
+ completion_tokens=8,
+ prompt_tokens=17,
+ total_tokens=25
+ )
  )
+ ```

+ Example (stream=True):
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+ >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
  >>> for token in client.chat_completion(messages, max_tokens=10, stream=True):
  ... print(token)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
  (...)
  ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+ ```

- # Chat example with tools
+ Example using OpenAI's syntax:
+ ```py
+ # instead of `from openai import OpenAI`
+ from huggingface_hub import InferenceClient
+
+ # instead of `client = OpenAI(...)`
+ client = InferenceClient(
+ base_url=...,
+ api_key=...,
+ )
+
+ output = client.chat.completions.create(
+ model="meta-llama/Meta-Llama-3-8B-Instruct",
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "Count to 10"},
+ ],
+ stream=True,
+ max_tokens=1024,
+ )
+
+ for chunk in output:
+ print(chunk.choices[0].delta.content)
+ ```
+
+ Example using tools:
+ ```py
  >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
  >>> messages = [
  ... {
@@ -691,9 +781,43 @@ class InferenceClient:
  description=None
  )
  ```
+
+ Example using response_format:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+ >>> messages = [
+ ... {
+ ... "role": "user",
+ ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
+ ... },
+ ... ]
+ >>> response_format = {
+ ... "type": "json",
+ ... "value": {
+ ... "properties": {
+ ... "location": {"type": "string"},
+ ... "activity": {"type": "string"},
+ ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+ ... "animals": {"type": "array", "items": {"type": "string"}},
+ ... },
+ ... "required": ["location", "activity", "animals_seen", "animals"],
+ ... },
+ ... }
+ >>> response = client.chat_completion(
+ ... messages=messages,
+ ... response_format=response_format,
+ ... max_tokens=500,
+ )
+ >>> response.choices[0].message.content
+ '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
+ ```
  """
- # determine model
- model = model or self.model or self.get_recommended_model("text-generation")
+ # Determine model
+ # `self.xxx` takes precedence over the method argument only in `chat_completion`
+ # since `chat_completion(..., model=xxx)` is also a payload parameter for the
+ # server, we need to handle it differently
+ model = self.base_url or self.model or model or self.get_recommended_model("text-generation")

  if _is_chat_completion_server(model):
  # First, let's consider the server has a `/v1/chat/completions` endpoint.
@@ -702,11 +826,19 @@ class InferenceClient:
  if not model_url.endswith("/chat/completions"):
  model_url += "/v1/chat/completions"

+ # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing.
+ if not model.startswith("http") and model.count("/") == 1:
+ # If it's a ID on the Hub => use it
+ model_id = model
+ else:
+ # Otherwise, we use a random string
+ model_id = "tgi"
+
  try:
  data = self.post(
  model=model_url,
  json=dict(
- model="tgi", # random string
+ model=model_id,
  messages=messages,
  frequency_penalty=frequency_penalty,
  logit_bias=logit_bias,
@@ -714,6 +846,7 @@ class InferenceClient:
  max_tokens=max_tokens,
  n=n,
  presence_penalty=presence_penalty,
+ response_format=response_format,
  seed=seed,
  stop=stop,
  temperature=temperature,
@@ -765,6 +898,11 @@ class InferenceClient:
  "Tools are not supported by the model. This is due to the model not been served by a "
  "Text-Generation-Inference server. The provided tool parameters will be ignored."
  )
+ if response_format is not None:
+ warnings.warn(
+ "Response format is not supported by the model. This is due to the model not been served by a "
+ "Text-Generation-Inference server. The provided response format will be ignored."
+ )

  # generate response
  text_generation_output = self.text_generation(
@@ -783,7 +921,6 @@ class InferenceClient:
  return ChatCompletionOutput(
  id="dummy",
  model="dummy",
- object="dummy",
  system_fingerprint="dummy",
  usage=None, # type: ignore # set to `None` as we don't want to provide false information
  created=int(time.time()),
@@ -913,7 +1050,16 @@ class InferenceClient:
  response = self.post(json=payload, model=model, task="document-question-answering")
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

- def feature_extraction(self, text: str, *, model: Optional[str] = None) -> "np.ndarray":
+ def feature_extraction(
+ self,
+ text: str,
+ *,
+ normalize: Optional[bool] = None,
+ prompt_name: Optional[str] = None,
+ truncate: Optional[bool] = None,
+ truncation_direction: Optional[Literal["Left", "Right"]] = None,
+ model: Optional[str] = None,
+ ) -> "np.ndarray":
  """
  Generate embeddings for a given text.

@@ -924,6 +1070,20 @@ class InferenceClient:
  The model to use for the conversational task. Can be a model ID hosted on the Hugging Face Hub or a URL to
  a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
  Defaults to None.
+ normalize (`bool`, *optional*):
+ Whether to normalize the embeddings or not. Defaults to None.
+ Only available on server powered by Text-Embedding-Inference.
+ prompt_name (`str`, *optional*):
+ The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
+ Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+ For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",...},
+ then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
+ because the prompt text will be prepended before any text to encode.
+ truncate (`bool`, *optional*):
+ Whether to truncate the embeddings or not. Defaults to None.
+ Only available on server powered by Text-Embedding-Inference.
+ truncation_direction (`Literal["Left", "Right"]`, *optional*):
+ Which side of the input should be truncated when `truncate=True` is passed.

  Returns:
  `np.ndarray`: The embedding representing the input text as a float32 numpy array.
@@ -945,7 +1105,16 @@ class InferenceClient:
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
  ```
  """
- response = self.post(json={"inputs": text}, model=model, task="feature-extraction")
+ payload: Dict = {"inputs": text}
+ if normalize is not None:
+ payload["normalize"] = normalize
+ if prompt_name is not None:
+ payload["prompt_name"] = prompt_name
+ if truncate is not None:
+ payload["truncate"] = truncate
+ if truncation_direction is not None:
+ payload["truncation_direction"] = truncation_direction
+ response = self.post(json=payload, model=model, task="feature-extraction")
  np = _import_numpy()
  return np.array(_bytes_to_dict(response), dtype="float32")
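The new embedding parameters above are only added to the payload when explicitly set, so older servers keep working unchanged. A short usage sketch, assuming the target model is served by Text-Embeddings-Inference (the model id below is only an example, not taken from this diff):

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
embedding = client.feature_extraction(
    "What is the capital of France?",
    model="BAAI/bge-base-en-v1.5",  # example model id, assumed to be TEI-served
    normalize=True,                  # forwarded only because it is not None
    truncate=True,
    truncation_direction="Right",
)
print(embedding.shape, embedding.dtype)  # float32 numpy array
```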
 
@@ -1184,7 +1353,8 @@ class InferenceClient:
  ```
  """
  response = self.post(data=image, model=model, task="image-to-text")
- return ImageToTextOutput.parse_obj_as_instance(response)
+ output = ImageToTextOutput.parse_obj(response)
+ return output[0] if isinstance(output, list) else output

  def list_deployed_models(
  self, frameworks: Union[None, str, Literal["all"], List[str]] = None
@@ -1619,6 +1789,7 @@ class InferenceClient:
  stream: Literal[False] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1647,6 +1818,7 @@ class InferenceClient:
  stream: Literal[False] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1675,6 +1847,7 @@ class InferenceClient:
  stream: Literal[True] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1703,6 +1876,7 @@ class InferenceClient:
  stream: Literal[True] = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1731,6 +1905,7 @@ class InferenceClient:
  stream: bool = ...,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1758,6 +1933,7 @@ class InferenceClient:
  stream: bool = False,
  model: Optional[str] = None,
  # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+ adapter_id: Optional[str] = None,
  best_of: Optional[int] = None,
  decoder_input_details: Optional[bool] = None,
  do_sample: Optional[bool] = False, # Manual default value
@@ -1788,6 +1964,13 @@ class InferenceClient:

  To learn more about the TGI project, please refer to https://github.com/huggingface/text-generation-inference.

+ <Tip>
+
+ If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
+ It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
+
+ </Tip>
+
  Args:
  prompt (`str`):
  Input text.
@@ -1802,6 +1985,8 @@ class InferenceClient:
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+ adapter_id (`str`, *optional*):
+ Lora adapter id.
  best_of (`int`, *optional*):
  Generate best_of sequences and return the one if the highest token logprobs.
  decoder_input_details (`bool`, *optional*):
@@ -1970,6 +2155,7 @@ class InferenceClient:

  # Build payload
  parameters = {
+ "adapter_id": adapter_id,
  "best_of": best_of,
  "decoder_input_details": decoder_input_details,
  "details": details,
@@ -2040,6 +2226,7 @@ class InferenceClient:
  details=details,
  stream=stream,
  model=model,
+ adapter_id=adapter_id,
  best_of=best_of,
  decoder_input_details=decoder_input_details,
  do_sample=do_sample,
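As the hunks above show, `adapter_id` is forwarded as just another entry of the payload's `parameters`, so selecting a LoRA adapter from `text_generation` is a single extra keyword. A hedged sketch, assuming a TGI server that already has the adapter loaded (the adapter id below is a placeholder):

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
output = client.text_generation(
    "What is Deep Learning?",
    max_new_tokens=64,
    adapter_id="my-username/my-lora-adapter",  # placeholder LoRA adapter id, must be available on the TGI server
)
print(output)
```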
@@ -2347,7 +2534,13 @@ class InferenceClient:
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

  def zero_shot_classification(
- self, text: str, labels: List[str], *, multi_label: bool = False, model: Optional[str] = None
+ self,
+ text: str,
+ labels: List[str],
+ *,
+ multi_label: bool = False,
+ hypothesis_template: Optional[str] = None,
+ model: Optional[str] = None,
  ) -> List[ZeroShotClassificationOutputElement]:
  """
  Provide as input a text and a set of candidate labels to classify the input text.
@@ -2356,9 +2549,15 @@ class InferenceClient:
  text (`str`):
  The input text to classify.
  labels (`List[str]`):
- List of string possible labels. There must be at least 2 labels.
+ List of strings. Each string is the verbalization of a possible label for the input text.
  multi_label (`bool`):
- Boolean that is set to True if classes can overlap.
+ Boolean. If True, the probability for each label is evaluated independently and multiple labels can have a probability close to 1 simultaneously or all probabilities can be close to 0.
+ If False, the labels are considered mutually exclusive and the probability over all labels always sums to 1. Defaults to False.
+ hypothesis_template (`str`, *optional*):
+ A template sentence string with curly brackets to which the label strings are added. The label strings are added at the position of the curly brackets "{}".
+ Zero-shot classifiers are based on NLI models, which evaluate if a hypothesis is entailed in another text or not.
+ For example, with hypothesis_template="This text is about {}." and labels=["economics", "politics"], the system internally creates the two hypotheses "This text is about economics." and "This text is about politics.".
+ The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
  model (`str`, *optional*):
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
  Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -2372,7 +2571,7 @@ class InferenceClient:
  `HTTPError`:
  If the request fails with an HTTP error status code other than HTTP 503.

- Example:
+ Example with `multi_label=False`:
  ```py
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
@@ -2399,21 +2598,37 @@ class InferenceClient:
  ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
  ]
  ```
+
+ Example with `multi_label=True` and a custom `hypothesis_template`:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> client = InferenceClient()
+ >>> client.zero_shot_classification(
+ ... text="I really like our dinner and I'm very happy. I don't like the weather though.",
+ ... labels=["positive", "negative", "pessimistic", "optimistic"],
+ ... multi_label=True,
+ ... hypothesis_template="This text is {} towards the weather"
+ ... )
+ [
+ ZeroShotClassificationOutputElement(label='negative', score=0.9231801629066467),
+ ZeroShotClassificationOutputElement(label='pessimistic', score=0.8760990500450134),
+ ZeroShotClassificationOutputElement(label='optimistic', score=0.0008674879791215062),
+ ZeroShotClassificationOutputElement(label='positive', score=0.0005250611575320363)
+ ]
+ ```
  """
- # Raise ValueError if input is less than 2 labels
- if len(labels) < 2:
- raise ValueError("You must specify at least 2 classes to compare.")
+
+ parameters = {"candidate_labels": labels, "multi_label": multi_label}
+ if hypothesis_template is not None:
+ parameters["hypothesis_template"] = hypothesis_template

  response = self.post(
  json={
  "inputs": text,
- "parameters": {
- "candidate_labels": ",".join(labels),
- "multi_label": multi_label,
- },
+ "parameters": parameters,
  },
- model=model,
  task="zero-shot-classification",
+ model=model,
  )
  output = _bytes_to_dict(response)
  return [
@@ -2469,7 +2684,7 @@ class InferenceClient:
  return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)

  def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
- model = model or self.model
+ model = model or self.model or self.base_url

  # If model is already a URL, ignore `task` and return directly
  if model is not None and (model.startswith("http://") or model.startswith("https://")):
@@ -2522,6 +2737,95 @@ class InferenceClient:
  )
  return model

+ def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+ """
+ Get information about the deployed endpoint.
+
+ This endpoint is only available on endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+ Endpoints powered by `transformers` return an empty payload.
+
+ Args:
+ model (`str`, *optional*):
+ The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+ Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+ Returns:
+ `Dict[str, Any]`: Information about the endpoint.
+
+ Example:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+ >>> client.get_endpoint_info()
+ {
+ 'model_id': 'meta-llama/Meta-Llama-3-70B-Instruct',
+ 'model_sha': None,
+ 'model_dtype': 'torch.float16',
+ 'model_device_type': 'cuda',
+ 'model_pipeline_tag': None,
+ 'max_concurrent_requests': 128,
+ 'max_best_of': 2,
+ 'max_stop_sequences': 4,
+ 'max_input_length': 8191,
+ 'max_total_tokens': 8192,
+ 'waiting_served_ratio': 0.3,
+ 'max_batch_total_tokens': 1259392,
+ 'max_waiting_tokens': 20,
+ 'max_batch_size': None,
+ 'validation_workers': 32,
+ 'max_client_batch_size': 4,
+ 'version': '2.0.2',
+ 'sha': 'dccab72549635c7eb5ddb17f43f0b7cdff07c214',
+ 'docker_label': 'sha-dccab72'
+ }
+ ```
+ """
+ model = model or self.model
+ if model is None:
+ raise ValueError("Model id not provided.")
+ if model.startswith(("http://", "https://")):
+ url = model.rstrip("/") + "/info"
+ else:
+ url = f"{INFERENCE_ENDPOINT}/models/{model}/info"
+
+ response = get_session().get(url, headers=self.headers)
+ hf_raise_for_status(response)
+ return response.json()
+
+ def health_check(self, model: Optional[str] = None) -> bool:
+ """
+ Check the health of the deployed endpoint.
+
+ Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+ For Inference API, please use [`InferenceClient.get_model_status`] instead.
+
+ Args:
+ model (`str`, *optional*):
+ URL of the Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+ Returns:
+ `bool`: True if everything is working fine.
+
+ Example:
+ ```py
+ >>> from huggingface_hub import InferenceClient
+ >>> client = InferenceClient("https://jzgu0buei5.us-east-1.aws.endpoints.huggingface.cloud")
+ >>> client.health_check()
+ True
+ ```
+ """
+ model = model or self.model
+ if model is None:
+ raise ValueError("Model id not provided.")
+ if not model.startswith(("http://", "https://")):
+ raise ValueError(
+ "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
+ )
+ url = model.rstrip("/") + "/health"
+
+ response = get_session().get(url, headers=self.headers)
+ return response.status_code == 200
+
  def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
  """
  Get the status of a model hosted on the Inference API.
@@ -2548,7 +2852,7 @@ class InferenceClient:
  ```py
  >>> from huggingface_hub import InferenceClient
  >>> client = InferenceClient()
- >>> client.get_model_status("bigcode/starcoder")
+ >>> client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
  ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
  ```
  """
@@ -2572,3 +2876,30 @@ class InferenceClient:
  compute_type=response_data["compute_type"],
  framework=response_data["framework"],
  )
+
+ @property
+ def chat(self) -> "ProxyClientChat":
+ return ProxyClientChat(self)
+
+
+ class _ProxyClient:
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ def __init__(self, client: InferenceClient):
+ self._client = client
+
+
+ class ProxyClientChat(_ProxyClient):
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ @property
+ def completions(self) -> "ProxyClientChatCompletions":
+ return ProxyClientChatCompletions(self._client)
+
+
+ class ProxyClientChatCompletions(_ProxyClient):
+ """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+ @property
+ def create(self):
+ return self._client.chat_completion
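These proxy classes resolve the OpenAI-style attribute chain lazily: `client.chat` returns a `ProxyClientChat`, `.completions` returns a `ProxyClientChatCompletions`, and `.create` returns the client's own bound `chat_completion` method. A rough sketch of the resulting equivalence, assuming network access and a chat-capable model (both calls below hit the same endpoint):

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [{"role": "user", "content": "What is the capital of France?"}]

# OpenAI-style spelling: `create` is just the bound `chat_completion` method...
openai_style = client.chat.completions.create(messages=messages, max_tokens=32)

# ...so this is the exact same call through the native API.
native = client.chat_completion(messages=messages, max_tokens=32)

print(openai_style.choices[0].message.content)
print(native.choices[0].message.content)
```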