huggingface-hub 0.23.5__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of huggingface-hub might be problematic.
- huggingface_hub/__init__.py +47 -15
- huggingface_hub/_commit_api.py +38 -8
- huggingface_hub/_inference_endpoints.py +11 -4
- huggingface_hub/_local_folder.py +22 -13
- huggingface_hub/_snapshot_download.py +12 -7
- huggingface_hub/_webhooks_server.py +3 -1
- huggingface_hub/commands/huggingface_cli.py +4 -3
- huggingface_hub/commands/repo_files.py +128 -0
- huggingface_hub/constants.py +12 -0
- huggingface_hub/file_download.py +127 -91
- huggingface_hub/hf_api.py +976 -341
- huggingface_hub/hf_file_system.py +30 -3
- huggingface_hub/inference/_client.py +379 -43
- huggingface_hub/inference/_common.py +0 -2
- huggingface_hub/inference/_generated/_async_client.py +396 -49
- huggingface_hub/inference/_generated/types/__init__.py +4 -1
- huggingface_hub/inference/_generated/types/chat_completion.py +41 -21
- huggingface_hub/inference/_generated/types/feature_extraction.py +23 -5
- huggingface_hub/inference/_generated/types/text_generation.py +29 -0
- huggingface_hub/lfs.py +11 -6
- huggingface_hub/repocard_data.py +3 -3
- huggingface_hub/repository.py +6 -6
- huggingface_hub/serialization/__init__.py +8 -3
- huggingface_hub/serialization/_base.py +13 -16
- huggingface_hub/serialization/_tensorflow.py +4 -3
- huggingface_hub/serialization/_torch.py +399 -22
- huggingface_hub/utils/__init__.py +0 -1
- huggingface_hub/utils/_errors.py +1 -1
- huggingface_hub/utils/_fixes.py +14 -3
- huggingface_hub/utils/_paths.py +17 -6
- huggingface_hub/utils/_subprocess.py +0 -1
- huggingface_hub/utils/_telemetry.py +9 -1
- huggingface_hub/utils/endpoint_helpers.py +2 -186
- huggingface_hub/utils/sha.py +36 -1
- huggingface_hub/utils/tqdm.py +0 -1
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/METADATA +12 -9
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/RECORD +41 -41
- huggingface_hub/serialization/_numpy.py +0 -68
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.23.5.dist-info → huggingface_hub-0.24.0.dist-info}/top_level.txt +0 -0
huggingface_hub/inference/_client.py

@@ -78,6 +78,7 @@ from huggingface_hub.inference._generated.types import (
     AudioClassificationOutputElement,
     AudioToAudioOutputElement,
     AutomaticSpeechRecognitionOutput,
+    ChatCompletionInputGrammarType,
     ChatCompletionInputTool,
     ChatCompletionInputToolTypeClass,
     ChatCompletionOutput,
@@ -103,7 +104,6 @@ from huggingface_hub.inference._generated.types import (
     ZeroShotClassificationOutputElement,
     ZeroShotImageClassificationOutputElement,
 )
-from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum
 from huggingface_hub.inference._types import (
     ConversationalOutput,  # soon to be removed
 )
@@ -113,6 +113,7 @@ from huggingface_hub.utils import (
     get_session,
     hf_raise_for_status,
 )
+from huggingface_hub.utils._deprecation import _deprecate_positional_args


 if TYPE_CHECKING:
@@ -134,12 +135,16 @@ class InferenceClient:

     Args:
         model (`str`, `optional`):
-            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `
+            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
             or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
             automatically selected for the task.
+            Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
         token (`str` or `bool`, *optional*):
             Hugging Face token. Will default to the locally saved token if not provided.
             Pass `token=False` if you don't want to send your token to the server.
+            Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
         timeout (`float`, `optional`):
             The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
             API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
@@ -148,23 +153,52 @@ class InferenceClient:
             Values in this dictionary will override the default values.
         cookies (`Dict[str, str]`, `optional`):
             Additional cookies to send to the server.
+        base_url (`str`, `optional`):
+            Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
+        api_key (`str`, `optional`):
+            Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
     """

+    @_deprecate_positional_args(version="0.26")
     def __init__(
         self,
         model: Optional[str] = None,
+        *,
         token: Union[str, bool, None] = None,
         timeout: Optional[float] = None,
         headers: Optional[Dict[str, str]] = None,
         cookies: Optional[Dict[str, str]] = None,
+        proxies: Optional[Any] = None,
+        # OpenAI compatibility
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
     ) -> None:
+        if model is not None and base_url is not None:
+            raise ValueError(
+                "Received both `model` and `base_url` arguments. Please provide only one of them."
+                " `base_url` is an alias for `model` to make the API compatible with OpenAI's client."
+                " It has the exact same behavior as `model`."
+            )
+        if token is not None and api_key is not None:
+            raise ValueError(
+                "Received both `token` and `api_key` arguments. Please provide only one of them."
+                " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
+                " It has the exact same behavior as `token`."
+            )
+
         self.model: Optional[str] = model
-        self.token: Union[str, bool, None] = token
-        self.headers = CaseInsensitiveDict(build_hf_headers(token=token))  #
+        self.token: Union[str, bool, None] = token or api_key
+        self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token))  # 'authorization' + 'user-agent'
         if headers is not None:
             self.headers.update(headers)
         self.cookies = cookies
         self.timeout = timeout
+        self.proxies = proxies
+
+        # OpenAI compatibility
+        self.base_url = base_url

     def __repr__(self):
         return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"
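Note: the two constructor styles introduced in this hunk are interchangeable; below is a minimal sketch (the endpoint URL and token values are placeholders):

```py
from huggingface_hub import InferenceClient

# "Hugging Face" style: model id (or Inference Endpoint URL) + token
client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", token="hf_***")

# "OpenAI" style: same behavior through the new aliases added in 0.24.0
client = InferenceClient(base_url="https://my-endpoint.example", api_key="hf_***")

# Passing both `model` and `base_url` (or both `token` and `api_key`) raises a ValueError.
```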
@@ -264,6 +298,7 @@ class InferenceClient:
                         cookies=self.cookies,
                         timeout=self.timeout,
                         stream=stream,
+                        proxies=self.proxies,
                     )
                 except TimeoutError as error:
                     # Convert any `TimeoutError` to a `InferenceTimeoutError`
@@ -289,6 +324,8 @@ class InferenceClient:
                         # ...or wait 1s and retry
                         logger.info(f"Waiting for model to be loaded on the server: {error}")
                         time.sleep(1)
+                        if "X-wait-for-model" not in headers and url.startswith(INFERENCE_ENDPOINT):
+                            headers["X-wait-for-model"] = "1"
                         if timeout is not None:
                             timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
                         continue
@@ -428,10 +465,11 @@ class InferenceClient:
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass,
+        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
         tool_prompt: Optional[str] = None,
         tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
@@ -451,10 +489,11 @@ class InferenceClient:
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass,
+        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
         tool_prompt: Optional[str] = None,
         tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
@@ -474,10 +513,11 @@ class InferenceClient:
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass,
+        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
         tool_prompt: Optional[str] = None,
         tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
@@ -497,10 +537,11 @@ class InferenceClient:
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass,
+        tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
         tool_prompt: Optional[str] = None,
         tools: Optional[List[ChatCompletionInputTool]] = None,
         top_logprobs: Optional[int] = None,
@@ -511,11 +552,10 @@ class InferenceClient:

         <Tip>

-
-
-
-
-        to install it.
+        The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+        Inputs and outputs are strictly the same and using either syntax will yield the same results.
+        Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+        for more details about OpenAI's compatibility.

         </Tip>

@@ -526,6 +566,9 @@ class InferenceClient:
                 The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
                 See https://huggingface.co/tasks/text-generation for more details.
+
+                If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
+                custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
             frequency_penalty (`float`, *optional*):
                 Penalizes new tokens based on their existing frequency
                 in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
@@ -545,6 +588,8 @@ class InferenceClient:
             presence_penalty (`float`, *optional*):
                 Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
                 text so far, increasing the model's likelihood to talk about new topics.
+            response_format ([`ChatCompletionInputGrammarType`], *optional*):
+                Grammar constraints. Can be either a JSONSchema or a regex.
             seed (Optional[`int`], *optional*):
                 Seed for reproducible control flow. Defaults to None.
             stop (Optional[`str`], *optional*):
@@ -562,7 +607,7 @@ class InferenceClient:
             top_p (`float`, *optional*):
                 Fraction of the most likely next words to sample from.
                 Must be between 0 and 1. Defaults to 1.0.
-            tool_choice ([`ChatCompletionInputToolTypeClass`] or
+            tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
                 The tool to use for the completion. Defaults to "auto".
             tool_prompt (`str`, *optional*):
                 A prompt to be appended before the tools.
@@ -571,7 +616,7 @@ class InferenceClient:
                 provide a list of functions the model may generate JSON inputs for.

         Returns:
-            [`ChatCompletionOutput] or Iterable of [`ChatCompletionStreamOutput`]:
+            [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]:
             Generated text returned from the server:
             - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
             - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
@@ -585,10 +630,9 @@ class InferenceClient:
         Example:

         ```py
-        # Chat example
         >>> from huggingface_hub import InferenceClient
         >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
-        >>> client = InferenceClient("
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
         >>> client.chat_completion(messages, max_tokens=100)
         ChatCompletionOutput(
             choices=[
@@ -596,21 +640,67 @@ class InferenceClient:
                     finish_reason='eos_token',
                     index=0,
                     message=ChatCompletionOutputMessage(
-
-
+                        role='assistant',
+                        content='The capital of France is Paris.',
+                        name=None,
+                        tool_calls=None
+                    ),
+                    logprobs=None
                 )
             ],
-            created=
+            created=1719907176,
+            id='',
+            model='meta-llama/Meta-Llama-3-8B-Instruct',
+            object='text_completion',
+            system_fingerprint='2.0.4-sha-f426a33',
+            usage=ChatCompletionOutputUsage(
+                completion_tokens=8,
+                prompt_tokens=17,
+                total_tokens=25
+            )
         )
+        ```

+        Example (stream=True):
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
         >>> for token in client.chat_completion(messages, max_tokens=10, stream=True):
         ...     print(token)
         ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
         ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
         (...)
         ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ```

-
+        Example using OpenAI's syntax:
+        ```py
+        # instead of `from openai import OpenAI`
+        from huggingface_hub import InferenceClient
+
+        # instead of `client = OpenAI(...)`
+        client = InferenceClient(
+            base_url=...,
+            api_key=...,
+        )
+
+        output = client.chat.completions.create(
+            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count to 10"},
+            ],
+            stream=True,
+            max_tokens=1024,
+        )
+
+        for chunk in output:
+            print(chunk.choices[0].delta.content)
+        ```
+
+        Example using tools:
+        ```py
         >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
         >>> messages = [
         ...     {
@@ -691,9 +781,43 @@ class InferenceClient:
                 description=None
             )
         ```
+
+        Example using response_format:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
+        ...     },
+        ... ]
+        >>> response_format = {
+        ...     "type": "json",
+        ...     "value": {
+        ...         "properties": {
+        ...             "location": {"type": "string"},
+        ...             "activity": {"type": "string"},
+        ...             "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+        ...             "animals": {"type": "array", "items": {"type": "string"}},
+        ...         },
+        ...         "required": ["location", "activity", "animals_seen", "animals"],
+        ...     },
+        ... }
+        >>> response = client.chat_completion(
+        ...     messages=messages,
+        ...     response_format=response_format,
+        ...     max_tokens=500,
+        )
+        >>> response.choices[0].message.content
+        '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
+        ```
         """
-        #
-
+        # Determine model
+        # `self.xxx` takes precedence over the method argument only in `chat_completion`
+        # since `chat_completion(..., model=xxx)` is also a payload parameter for the
+        # server, we need to handle it differently
+        model = self.base_url or self.model or model or self.get_recommended_model("text-generation")

         if _is_chat_completion_server(model):
             # First, let's consider the server has a `/v1/chat/completions` endpoint.
@@ -702,11 +826,19 @@ class InferenceClient:
             if not model_url.endswith("/chat/completions"):
                 model_url += "/v1/chat/completions"

+            # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing.
+            if not model.startswith("http") and model.count("/") == 1:
+                # If it's a ID on the Hub => use it
+                model_id = model
+            else:
+                # Otherwise, we use a random string
+                model_id = "tgi"
+
             try:
                 data = self.post(
                     model=model_url,
                     json=dict(
-                        model=
+                        model=model_id,
                         messages=messages,
                         frequency_penalty=frequency_penalty,
                         logit_bias=logit_bias,
@@ -714,6 +846,7 @@ class InferenceClient:
                         max_tokens=max_tokens,
                         n=n,
                         presence_penalty=presence_penalty,
+                        response_format=response_format,
                         seed=seed,
                         stop=stop,
                         temperature=temperature,
@@ -765,6 +898,11 @@ class InferenceClient:
                 "Tools are not supported by the model. This is due to the model not been served by a "
                 "Text-Generation-Inference server. The provided tool parameters will be ignored."
             )
+        if response_format is not None:
+            warnings.warn(
+                "Response format is not supported by the model. This is due to the model not been served by a "
+                "Text-Generation-Inference server. The provided response format will be ignored."
+            )

         # generate response
         text_generation_output = self.text_generation(
@@ -783,7 +921,6 @@ class InferenceClient:
         return ChatCompletionOutput(
             id="dummy",
             model="dummy",
-            object="dummy",
             system_fingerprint="dummy",
             usage=None,  # type: ignore # set to `None` as we don't want to provide false information
             created=int(time.time()),
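Note: the payload `model` selection added in this hunk boils down to a small rule; the helper name below is ours (for illustration only), not part of the library:

```py
def _payload_model_id(model: str) -> str:
    # A Hub id such as "org/model" is forwarded as-is in the request payload;
    # anything else (typically an endpoint URL) is replaced by the placeholder "tgi".
    if not model.startswith("http") and model.count("/") == 1:
        return model
    return "tgi"


assert _payload_model_id("meta-llama/Meta-Llama-3-8B-Instruct") == "meta-llama/Meta-Llama-3-8B-Instruct"
assert _payload_model_id("https://my-endpoint.example") == "tgi"
```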
@@ -913,7 +1050,16 @@ class InferenceClient:
         response = self.post(json=payload, model=model, task="document-question-answering")
         return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

-    def feature_extraction(
+    def feature_extraction(
+        self,
+        text: str,
+        *,
+        normalize: Optional[bool] = None,
+        prompt_name: Optional[str] = None,
+        truncate: Optional[bool] = None,
+        truncation_direction: Optional[Literal["Left", "Right"]] = None,
+        model: Optional[str] = None,
+    ) -> "np.ndarray":
         """
         Generate embeddings for a given text.

@@ -924,6 +1070,20 @@ class InferenceClient:
                 The model to use for the conversational task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
                 Defaults to None.
+            normalize (`bool`, *optional*):
+                Whether to normalize the embeddings or not. Defaults to None.
+                Only available on server powered by Text-Embedding-Inference.
+            prompt_name (`str`, *optional*):
+                The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
+                Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+                For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",...},
+                then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
+                because the prompt text will be prepended before any text to encode.
+            truncate (`bool`, *optional*):
+                Whether to truncate the embeddings or not. Defaults to None.
+                Only available on server powered by Text-Embedding-Inference.
+            truncation_direction (`Literal["Left", "Right"]`, *optional*):
+                Which side of the input should be truncated when `truncate=True` is passed.

         Returns:
             `np.ndarray`: The embedding representing the input text as a float32 numpy array.
@@ -945,7 +1105,16 @@ class InferenceClient:
         [ 0.28552425, -0.928395  , -1.2077185 , ...,  0.76810825, -2.1069427 ,  0.6236161 ]], dtype=float32)
         ```
         """
-
+        payload: Dict = {"inputs": text}
+        if normalize is not None:
+            payload["normalize"] = normalize
+        if prompt_name is not None:
+            payload["prompt_name"] = prompt_name
+        if truncate is not None:
+            payload["truncate"] = truncate
+        if truncation_direction is not None:
+            payload["truncation_direction"] = truncation_direction
+        response = self.post(json=payload, model=model, task="feature-extraction")
         np = _import_numpy()
         return np.array(_bytes_to_dict(response), dtype="float32")

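Note: a short usage sketch of the new `feature_extraction` parameters; the model id and prompt key below are placeholders and assume a Text-Embedding-Inference backend, as stated in the docstring above:

```py
from huggingface_hub import InferenceClient

client = InferenceClient(model="intfloat/multilingual-e5-large")  # placeholder TEI-served model

embedding = client.feature_extraction(
    "What is the capital of France?",
    normalize=True,               # TEI only: normalize the returned embedding
    prompt_name="query",          # must be a key in the model's Sentence Transformers `prompts` config
    truncate=True,                # TEI only: truncate inputs that exceed the maximum length
    truncation_direction="Right",
)
print(type(embedding), embedding.dtype)  # <class 'numpy.ndarray'> float32
```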
@@ -1184,7 +1353,8 @@ class InferenceClient:
         ```
         """
         response = self.post(data=image, model=model, task="image-to-text")
-
+        output = ImageToTextOutput.parse_obj(response)
+        return output[0] if isinstance(output, list) else output

     def list_deployed_models(
         self, frameworks: Union[None, str, Literal["all"], List[str]] = None
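Note: the change above makes `image_to_text` tolerant of servers that return either a single object or a list; usage is unchanged (the file name below is a placeholder):

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
caption = client.image_to_text("path/to/image.jpg")  # placeholder local file; URLs and raw bytes also work
print(caption.generated_text)
```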
@@ -1619,6 +1789,7 @@ class InferenceClient:
         stream: Literal[False] = ...,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1647,6 +1818,7 @@ class InferenceClient:
         stream: Literal[False] = ...,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1675,6 +1847,7 @@ class InferenceClient:
         stream: Literal[True] = ...,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1703,6 +1876,7 @@ class InferenceClient:
         stream: Literal[True] = ...,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1731,6 +1905,7 @@ class InferenceClient:
         stream: bool = ...,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1758,6 +1933,7 @@ class InferenceClient:
         stream: bool = False,
         model: Optional[str] = None,
         # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
         best_of: Optional[int] = None,
         decoder_input_details: Optional[bool] = None,
         do_sample: Optional[bool] = False,  # Manual default value
@@ -1788,6 +1964,13 @@ class InferenceClient:

         To learn more about the TGI project, please refer to https://github.com/huggingface/text-generation-inference.

+        <Tip>
+
+        If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
+        It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
+
+        </Tip>
+
         Args:
             prompt (`str`):
                 Input text.
@@ -1802,6 +1985,8 @@ class InferenceClient:
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            adapter_id (`str`, *optional*):
+                Lora adapter id.
             best_of (`int`, *optional*):
                 Generate best_of sequences and return the one if the highest token logprobs.
             decoder_input_details (`bool`, *optional*):
@@ -1970,6 +2155,7 @@ class InferenceClient:

         # Build payload
         parameters = {
+            "adapter_id": adapter_id,
             "best_of": best_of,
             "decoder_input_details": decoder_input_details,
             "details": details,
@@ -2040,6 +2226,7 @@ class InferenceClient:
                 details=details,
                 stream=stream,
                 model=model,
+                adapter_id=adapter_id,
                 best_of=best_of,
                 decoder_input_details=decoder_input_details,
                 do_sample=do_sample,
@@ -2064,7 +2251,12 @@ class InferenceClient:
         if stream:
             return _stream_text_generation_response(bytes_output, details)  # type: ignore

-        data = _bytes_to_dict(bytes_output)
+        data = _bytes_to_dict(bytes_output)  # type: ignore[arg-type]
+
+        # Data can be a single element (dict) or an iterable of dicts where we select the first element of.
+        if isinstance(data, list):
+            data = data[0]
+
         return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]

     def text_to_image(
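Note: a brief usage sketch of the new `adapter_id` parameter; the endpoint URL and adapter id below are placeholders and assume a TGI server with the LoRA adapter already loaded:

```py
from huggingface_hub import InferenceClient

client = InferenceClient("https://my-tgi-endpoint.example")  # placeholder TGI endpoint

output = client.text_generation(
    "Explain gravity to a five-year-old.",
    adapter_id="my-org/my-lora-adapter",  # placeholder LoRA adapter id
    max_new_tokens=100,
)
print(output)
```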
@@ -2347,7 +2539,13 @@ class InferenceClient:
         return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

     def zero_shot_classification(
-        self,
+        self,
+        text: str,
+        labels: List[str],
+        *,
+        multi_label: bool = False,
+        hypothesis_template: Optional[str] = None,
+        model: Optional[str] = None,
     ) -> List[ZeroShotClassificationOutputElement]:
         """
         Provide as input a text and a set of candidate labels to classify the input text.
@@ -2356,9 +2554,15 @@ class InferenceClient:
             text (`str`):
                 The input text to classify.
             labels (`List[str]`):
-                List of string
+                List of strings. Each string is the verbalization of a possible label for the input text.
             multi_label (`bool`):
-                Boolean
+                Boolean. If True, the probability for each label is evaluated independently and multiple labels can have a probability close to 1 simultaneously or all probabilities can be close to 0.
+                If False, the labels are considered mutually exclusive and the probability over all labels always sums to 1. Defaults to False.
+            hypothesis_template (`str`, *optional*):
+                A template sentence string with curly brackets to which the label strings are added. The label strings are added at the position of the curly brackets "{}".
+                Zero-shot classifiers are based on NLI models, which evaluate if a hypothesis is entailed in another text or not.
+                For example, with hypothesis_template="This text is about {}." and labels=["economics", "politics"], the system internally creates the two hypotheses "This text is about economics." and "This text is about politics.".
+                The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -2372,7 +2576,7 @@ class InferenceClient:
             `HTTPError`:
                 If the request fails with an HTTP error status code other than HTTP 503.

-        Example
+        Example with `multi_label=False`:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
@@ -2399,21 +2603,37 @@ class InferenceClient:
             ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
         ]
         ```
+
+        Example with `multi_label=True` and a custom `hypothesis_template`:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.zero_shot_classification(
+        ...    text="I really like our dinner and I'm very happy. I don't like the weather though.",
+        ...    labels=["positive", "negative", "pessimistic", "optimistic"],
+        ...    multi_label=True,
+        ...    hypothesis_template="This text is {} towards the weather"
+        ... )
+        [
+            ZeroShotClassificationOutputElement(label='negative', score=0.9231801629066467),
+            ZeroShotClassificationOutputElement(label='pessimistic', score=0.8760990500450134),
+            ZeroShotClassificationOutputElement(label='optimistic', score=0.0008674879791215062),
+            ZeroShotClassificationOutputElement(label='positive', score=0.0005250611575320363)
+        ]
+        ```
         """
-
-
-
+
+        parameters = {"candidate_labels": labels, "multi_label": multi_label}
+        if hypothesis_template is not None:
+            parameters["hypothesis_template"] = hypothesis_template

         response = self.post(
             json={
                 "inputs": text,
-                "parameters":
-                    "candidate_labels": ",".join(labels),
-                    "multi_label": multi_label,
-                },
+                "parameters": parameters,
             },
-            model=model,
             task="zero-shot-classification",
+            model=model,
         )
         output = _bytes_to_dict(response)
         return [
@@ -2469,7 +2689,7 @@ class InferenceClient:
         return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)

     def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str:
-        model = model or self.model
+        model = model or self.model or self.base_url

         # If model is already a URL, ignore `task` and return directly
         if model is not None and (model.startswith("http://") or model.startswith("https://")):
@@ -2522,6 +2742,95 @@ class InferenceClient:
             )
         return model

+    def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Get information about the deployed endpoint.
+
+        This endpoint is only available on endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        Endpoints powered by `transformers` return an empty payload.
+
+        Args:
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `Dict[str, Any]`: Information about the endpoint.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> client.get_endpoint_info()
+        {
+            'model_id': 'meta-llama/Meta-Llama-3-70B-Instruct',
+            'model_sha': None,
+            'model_dtype': 'torch.float16',
+            'model_device_type': 'cuda',
+            'model_pipeline_tag': None,
+            'max_concurrent_requests': 128,
+            'max_best_of': 2,
+            'max_stop_sequences': 4,
+            'max_input_length': 8191,
+            'max_total_tokens': 8192,
+            'waiting_served_ratio': 0.3,
+            'max_batch_total_tokens': 1259392,
+            'max_waiting_tokens': 20,
+            'max_batch_size': None,
+            'validation_workers': 32,
+            'max_client_batch_size': 4,
+            'version': '2.0.2',
+            'sha': 'dccab72549635c7eb5ddb17f43f0b7cdff07c214',
+            'docker_label': 'sha-dccab72'
+        }
+        ```
+        """
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if model.startswith(("http://", "https://")):
+            url = model.rstrip("/") + "/info"
+        else:
+            url = f"{INFERENCE_ENDPOINT}/models/{model}/info"
+
+        response = get_session().get(url, headers=self.headers)
+        hf_raise_for_status(response)
+        return response.json()
+
+    def health_check(self, model: Optional[str] = None) -> bool:
+        """
+        Check the health of the deployed endpoint.
+
+        Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        For Inference API, please use [`InferenceClient.get_model_status`] instead.
+
+        Args:
+            model (`str`, *optional*):
+                URL of the Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `bool`: True if everything is working fine.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("https://jzgu0buei5.us-east-1.aws.endpoints.huggingface.cloud")
+        >>> client.health_check()
+        True
+        ```
+        """
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if not model.startswith(("http://", "https://")):
+            raise ValueError(
+                "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
+            )
+        url = model.rstrip("/") + "/health"
+
+        response = get_session().get(url, headers=self.headers)
+        return response.status_code == 200
+
     def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
         """
         Get the status of a model hosted on the Inference API.
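Note: a small sketch combining the two new introspection helpers with the existing `get_model_status`; the endpoint URL is a placeholder:

```py
from huggingface_hub import InferenceClient

# Dedicated Inference Endpoint (TGI/TEI): use health_check() and get_endpoint_info()
endpoint_client = InferenceClient("https://my-endpoint.example")  # placeholder URL
if endpoint_client.health_check():
    info = endpoint_client.get_endpoint_info()
    print(info.get("model_id"), info.get("version"))

# Serverless Inference API: use get_model_status() instead
serverless_client = InferenceClient()
print(serverless_client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct"))
```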
@@ -2548,7 +2857,7 @@ class InferenceClient:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
-        >>> client.get_model_status("
+        >>> client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
         ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
         ```
         """
@@ -2572,3 +2881,30 @@ class InferenceClient:
             compute_type=response_data["compute_type"],
             framework=response_data["framework"],
         )
+
+    @property
+    def chat(self) -> "ProxyClientChat":
+        return ProxyClientChat(self)
+
+
+class _ProxyClient:
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    def __init__(self, client: InferenceClient):
+        self._client = client
+
+
+class ProxyClientChat(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def completions(self) -> "ProxyClientChatCompletions":
+        return ProxyClientChatCompletions(self._client)
+
+
+class ProxyClientChatCompletions(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def create(self):
+        return self._client.chat_completion