huggingface-hub 0.25.2__py3-none-any.whl → 0.26.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of huggingface-hub might be problematic.
- huggingface_hub/__init__.py +45 -11
- huggingface_hub/_login.py +172 -33
- huggingface_hub/commands/user.py +125 -9
- huggingface_hub/constants.py +1 -1
- huggingface_hub/errors.py +6 -9
- huggingface_hub/file_download.py +2 -372
- huggingface_hub/hf_api.py +170 -13
- huggingface_hub/hf_file_system.py +3 -3
- huggingface_hub/hub_mixin.py +2 -1
- huggingface_hub/inference/_client.py +500 -145
- huggingface_hub/inference/_common.py +42 -4
- huggingface_hub/inference/_generated/_async_client.py +499 -144
- huggingface_hub/inference/_generated/types/__init__.py +37 -7
- huggingface_hub/inference/_generated/types/audio_classification.py +8 -5
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +9 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +23 -4
- huggingface_hub/inference/_generated/types/image_classification.py +8 -5
- huggingface_hub/inference/_generated/types/image_segmentation.py +9 -7
- huggingface_hub/inference/_generated/types/image_to_image.py +7 -5
- huggingface_hub/inference/_generated/types/image_to_text.py +4 -4
- huggingface_hub/inference/_generated/types/object_detection.py +11 -5
- huggingface_hub/inference/_generated/types/summarization.py +11 -13
- huggingface_hub/inference/_generated/types/text_classification.py +10 -5
- huggingface_hub/inference/_generated/types/text_generation.py +1 -0
- huggingface_hub/inference/_generated/types/text_to_audio.py +2 -2
- huggingface_hub/inference/_generated/types/text_to_image.py +9 -7
- huggingface_hub/inference/_generated/types/text_to_speech.py +107 -0
- huggingface_hub/inference/_generated/types/translation.py +17 -11
- huggingface_hub/inference/_generated/types/video_classification.py +2 -2
- huggingface_hub/repocard.py +2 -1
- huggingface_hub/repocard_data.py +10 -2
- huggingface_hub/serialization/_torch.py +7 -4
- huggingface_hub/utils/__init__.py +4 -20
- huggingface_hub/utils/{_token.py → _auth.py} +86 -3
- huggingface_hub/utils/_headers.py +1 -1
- huggingface_hub/utils/_hf_folder.py +1 -1
- huggingface_hub/utils/_http.py +10 -4
- huggingface_hub/utils/_runtime.py +1 -10
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/METADATA +12 -12
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/RECORD +44 -44
- huggingface_hub/inference/_templating.py +0 -102
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/top_level.txt +0 -0

huggingface_hub/inference/_client.py

@@ -37,17 +37,7 @@ import logging
 import re
 import time
 import warnings
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Literal,
-    Optional,
-    Union,
-    overload,
-)
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union, overload

 from requests import HTTPError
 from requests.structures import CaseInsensitiveDict

@@ -67,6 +57,7 @@ from huggingface_hub.inference._common import (
     _get_unsupported_text_generation_kwargs,
     _import_numpy,
     _open_as_binary,
+    _prepare_payload,
     _set_unsupported_text_generation_kwargs,
     _stream_chat_completion_response,
     _stream_text_generation_response,

@@ -74,11 +65,12 @@ from huggingface_hub.inference._common import (
 )
 from huggingface_hub.inference._generated.types import (
     AudioClassificationOutputElement,
+    AudioClassificationOutputTransform,
     AudioToAudioOutputElement,
     AutomaticSpeechRecognitionOutput,
     ChatCompletionInputGrammarType,
-
-
+    ChatCompletionInputStreamOptions,
+    ChatCompletionInputToolType,
     ChatCompletionOutput,
     ChatCompletionStreamOutput,
     DocumentQuestionAnsweringOutputElement,

@@ -91,21 +83,21 @@ from huggingface_hub.inference._generated.types import (
     SummarizationOutput,
     TableQuestionAnsweringOutputElement,
     TextClassificationOutputElement,
+    TextClassificationOutputTransform,
     TextGenerationInputGrammarType,
     TextGenerationOutput,
     TextGenerationStreamOutput,
+    TextToImageTargetSize,
+    TextToSpeechEarlyStoppingEnum,
     TokenClassificationOutputElement,
+    ToolElement,
     TranslationOutput,
     VisualQuestionAnsweringOutputElement,
     ZeroShotClassificationOutputElement,
     ZeroShotImageClassificationOutputElement,
 )
-from huggingface_hub.utils import (
-
-    get_session,
-    hf_raise_for_status,
-)
-from huggingface_hub.utils._deprecation import _deprecate_positional_args
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
+from huggingface_hub.utils._deprecation import _deprecate_arguments


 if TYPE_CHECKING:

@@ -157,7 +149,6 @@ class InferenceClient:
             follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
     """

-    @_deprecate_positional_args(version="0.26")
     def __init__(
         self,
         model: Optional[str] = None,

@@ -333,6 +324,8 @@ class InferenceClient:
         audio: ContentT,
         *,
         model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
     ) -> List[AudioClassificationOutputElement]:
         """
         Perform audio classification on the provided audio content.

@@ -345,6 +338,10 @@ class InferenceClient:
                 The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
                 or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
                 audio classification will be used.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
+                The function to apply to the output.

         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.

@@ -367,7 +364,9 @@ class InferenceClient:
         ]
         ```
         """
-
+        parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
+        payload = _prepare_payload(audio, parameters=parameters, expect_binary=True)
+        response = self.post(**payload, model=model, task="audio-classification")
         return AudioClassificationOutputElement.parse_obj_as_list(response)

     def audio_to_audio(
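A minimal sketch of the new `top_k` / `function_to_apply` arguments added to `audio_classification` above; the audio file name is illustrative and the server-side default model is assumed.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
# Keep only the 3 most probable classes and apply a softmax to the raw scores.
results = client.audio_classification(
    "sample.flac",  # local path, URL, or raw bytes
    top_k=3,
    function_to_apply="softmax",
)
for item in results:
    print(item.label, item.score)
```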
@@ -452,7 +451,7 @@ class InferenceClient:
     @overload
     def chat_completion(  # type: ignore
         self,
-        messages: List[Dict
+        messages: List[Dict],
         *,
         model: Optional[str] = None,
         stream: Literal[False] = False,

@@ -465,10 +464,11 @@ class InferenceClient:
         response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ToolElement]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> ChatCompletionOutput: ...

@@ -476,7 +476,7 @@ class InferenceClient:
     @overload
     def chat_completion(  # type: ignore
         self,
-        messages: List[Dict
+        messages: List[Dict],
         *,
         model: Optional[str] = None,
         stream: Literal[True] = True,

@@ -489,10 +489,11 @@ class InferenceClient:
         response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ToolElement]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> Iterable[ChatCompletionStreamOutput]: ...

@@ -500,7 +501,7 @@ class InferenceClient:
     @overload
     def chat_completion(
         self,
-        messages: List[Dict
+        messages: List[Dict],
         *,
         model: Optional[str] = None,
         stream: bool = False,

@@ -513,17 +514,18 @@ class InferenceClient:
         response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ToolElement]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ...

     def chat_completion(
         self,
-        messages: List[Dict
+        messages: List[Dict],
         *,
         model: Optional[str] = None,
         stream: bool = False,

@@ -537,10 +539,11 @@ class InferenceClient:
         response_format: Optional[ChatCompletionInputGrammarType] = None,
         seed: Optional[int] = None,
         stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
         temperature: Optional[float] = None,
-        tool_choice: Optional[Union[
+        tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
         tool_prompt: Optional[str] = None,
-        tools: Optional[List[
+        tools: Optional[List[ToolElement]] = None,
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
     ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:

@@ -557,7 +560,7 @@ class InferenceClient:
         </Tip>

         Args:
-            messages (List
+            messages (List of [`ChatCompletionInputMessage`]):
                 Conversation history consisting of roles and content pairs.
             model (`str`, *optional*):
                 The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed

@@ -594,6 +597,8 @@ class InferenceClient:
                 Defaults to None.
             stream (`bool`, *optional*):
                 Enable realtime streaming of responses. Defaults to False.
+            stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
+                Options for streaming completions.
             temperature (`float`, *optional*):
                 Controls randomness of the generations. Lower values ensure
                 less random completions. Range: [0, 2]. Defaults to 1.0.

@@ -604,11 +609,11 @@ class InferenceClient:
             top_p (`float`, *optional*):
                 Fraction of the most likely next words to sample from.
                 Must be between 0 and 1. Defaults to 1.0.
-            tool_choice ([`
+            tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*):
                 The tool to use for the completion. Defaults to "auto".
             tool_prompt (`str`, *optional*):
                 A prompt to be appended before the tools.
-            tools (List of [`
+            tools (List of [`ToolElement`], *optional*):
                 A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
                 provide a list of functions the model may generate JSON inputs for.

@@ -658,7 +663,7 @@ class InferenceClient:
         )
         ```

-        Example
+        Example using streaming:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> messages = [{"role": "user", "content": "What is the capital of France?"}]

@@ -696,6 +701,40 @@ class InferenceClient:
             print(chunk.choices[0].delta.content)
         ```

+        Example using Image + Text as input:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+
+        # provide a remote URL
+        >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        # or a base64-encoded image
+        >>> image_path = "/path/to/image.jpeg"
+        >>> with open(image_path, "rb") as f:
+        ...     base64_image = base64.b64encode(f.read()).decode("utf-8")
+        >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        >>> client = InferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
+        >>> output = client.chat.completions.create(
+        ...     messages=[
+        ...         {
+        ...             "role": "user",
+        ...             "content": [
+        ...                 {
+        ...                     "type": "image_url",
+        ...                     "image_url": {"url": image_url},
+        ...                 },
+        ...                 {
+        ...                     "type": "text",
+        ...                     "text": "Describe this image in one sentence.",
+        ...                 },
+        ...             ],
+        ...         },
+        ...     ],
+        ... )
+        >>> output
+        The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.
+        ```
+
         Example using tools:
         ```py
         >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")

@@ -837,6 +876,7 @@ class InferenceClient:
             top_logprobs=top_logprobs,
             top_p=top_p,
             stream=stream,
+            stream_options=stream_options,
         )
         payload = {key: value for key, value in payload.items() if value is not None}
         data = self.post(model=model_url, json=payload, stream=stream)
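A hedged sketch of streaming with the new `stream_options` field; passing a plain dict instead of a `ChatCompletionInputStreamOptions` instance and the `include_usage` key are assumptions, and the model ID is illustrative.

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [{"role": "user", "content": "What is the capital of France?"}]

# stream_options is forwarded in the request payload alongside stream=True.
for chunk in client.chat_completion(
    messages,
    stream=True,
    stream_options={"include_usage": True},
    max_tokens=50,
):
    if chunk.choices:
        print(chunk.choices[0].delta.content, end="")
```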
@@ -877,6 +917,14 @@ class InferenceClient:
         question: str,
         *,
         model: Optional[str] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        lang: Optional[str] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+        word_boxes: Optional[List[Union[List[float], str]]] = None,
     ) -> List[DocumentQuestionAnsweringOutputElement]:
         """
         Answer questions on document images.

@@ -890,7 +938,29 @@ class InferenceClient:
                 The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
                 Defaults to None.
-
+            doc_stride (`int`, *optional*):
+                If the words in the document are too long to fit with the question for the model, it will
+                be split in several chunks with some overlap. This argument controls the size of that
+                overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            lang (`str`, *optional*):
+                Language to use while running OCR.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are
+                considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk
+                passed to the model. The context will be split in several chunks (using doc_stride as
+                overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Can return less
+                than top_k answers if there are not enough options available within the context.
+            word_boxes (`List[Union[List[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+                skip the OCR step and use the provided bounding boxes instead.
         Returns:
             `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.

@@ -900,16 +970,28 @@ class InferenceClient:
             `HTTPError`:
                 If the request fails with an HTTP error status code other than HTTP 503.

+
         Example:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
         >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
         ```
         """
-
-
+        inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+        parameters = {
+            "doc_stride": doc_stride,
+            "handle_impossible_answer": handle_impossible_answer,
+            "lang": lang,
+            "max_answer_len": max_answer_len,
+            "max_question_len": max_question_len,
+            "max_seq_len": max_seq_len,
+            "top_k": top_k,
+            "word_boxes": word_boxes,
+        }
+        payload = _prepare_payload(inputs, parameters=parameters)
+        response = self.post(**payload, model=model, task="document-question-answering")
         return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)

     def feature_extraction(
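A short sketch of `document_question_answering` with the newly exposed parameters shown above; the invoice image path is illustrative.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
# Ask for the three most likely answers and cap the answer length.
answers = client.document_question_answering(
    image="invoice.png",  # local path, URL, or raw bytes
    question="What is the invoice number?",
    top_k=3,
    max_answer_len=20,
    handle_impossible_answer=True,
)
for answer in answers:
    print(answer.answer, answer.score)
```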
@@ -933,7 +1015,7 @@ class InferenceClient:
                 a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
                 Defaults to None.
             normalize (`bool`, *optional*):
-                Whether to normalize the embeddings or not.
+                Whether to normalize the embeddings or not.
                 Only available on server powered by Text-Embedding-Inference.
             prompt_name (`str`, *optional*):
                 The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.

@@ -942,7 +1024,7 @@ class InferenceClient:
                 then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
                 because the prompt text will be prepended before any text to encode.
             truncate (`bool`, *optional*):
-                Whether to truncate the embeddings or not.
+                Whether to truncate the embeddings or not.
                 Only available on server powered by Text-Embedding-Inference.
             truncation_direction (`Literal["Left", "Right"]`, *optional*):
                 Which side of the input should be truncated when `truncate=True` is passed.

@@ -967,20 +1049,25 @@ class InferenceClient:
         [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
         ```
         """
-
-
-
-
-
-
-
-
-        payload["truncation_direction"] = truncation_direction
-        response = self.post(json=payload, model=model, task="feature-extraction")
+        parameters = {
+            "normalize": normalize,
+            "prompt_name": prompt_name,
+            "truncate": truncate,
+            "truncation_direction": truncation_direction,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(**payload, model=model, task="feature-extraction")
         np = _import_numpy()
         return np.array(_bytes_to_dict(response), dtype="float32")

-    def fill_mask(
+    def fill_mask(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        targets: Optional[List[str]] = None,
+        top_k: Optional[int] = None,
+    ) -> List[FillMaskOutputElement]:
         """
         Fill in a hole with a missing word (token to be precise).

@@ -990,8 +1077,13 @@ class InferenceClient:
             model (`str`, *optional*):
                 The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
-
-
+            targets (`List[str]`, *optional*):
+                When passed, the model will limit the scores to the passed targets instead of looking up
+                in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+                tokenized and the first resulting token will be used (with a warning, and that might be
+                slower).
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
         Returns:
             `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
             probability, token reference, and completed text.

@@ -1013,7 +1105,9 @@ class InferenceClient:
         ]
         ```
         """
-
+        parameters = {"targets": targets, "top_k": top_k}
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(**payload, model=model, task="fill-mask")
         return FillMaskOutputElement.parse_obj_as_list(response)

     def image_classification(
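A sketch of `fill_mask` using the new `targets` and `top_k` arguments; the model ID is illustrative and any fill-mask model with a `[MASK]` token would behave similarly.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
# Restrict scoring to two candidate tokens and return both predictions.
outputs = client.fill_mask(
    "The goal of life is [MASK].",
    model="google-bert/bert-base-uncased",  # illustrative model ID
    targets=["happiness", "money"],
    top_k=2,
)
for output in outputs:
    print(output.sequence, output.score)
```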
@@ -1021,6 +1115,8 @@ class InferenceClient:
         image: ContentT,
         *,
         model: Optional[str] = None,
+        function_to_apply: Optional[Literal["sigmoid", "softmax", "none"]] = None,
+        top_k: Optional[int] = None,
     ) -> List[ImageClassificationOutputElement]:
         """
         Perform image classification on the given image using the specified model.

@@ -1031,7 +1127,10 @@ class InferenceClient:
             model (`str`, *optional*):
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
-
+            function_to_apply (`Literal["sigmoid", "softmax", "none"]`, *optional*):
+                The function to apply to the output scores.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
         Returns:
             `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.

@@ -1046,10 +1145,12 @@ class InferenceClient:
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
         >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
-        [ImageClassificationOutputElement(
+        [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
         ```
         """
-
+        parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
+        payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+        response = self.post(**payload, model=model, task="image-classification")
         return ImageClassificationOutputElement.parse_obj_as_list(response)

     def image_segmentation(

@@ -1057,6 +1158,10 @@ class InferenceClient:
         image: ContentT,
         *,
         model: Optional[str] = None,
+        mask_threshold: Optional[float] = None,
+        overlap_mask_area_threshold: Optional[float] = None,
+        subtask: Optional[Literal["instance", "panoptic", "semantic"]] = None,
+        threshold: Optional[float] = None,
     ) -> List[ImageSegmentationOutputElement]:
         """
         Perform image segmentation on the given image using the specified model.

@@ -1073,7 +1178,14 @@ class InferenceClient:
             model (`str`, *optional*):
                 The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
-
+            mask_threshold (`float`, *optional*):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*):
+                Mask overlap threshold to eliminate small, disconnected segments.
+            subtask (`Literal["instance", "panoptic", "semantic"]`, *optional*):
+                Segmentation task to be performed, depending on model capabilities.
+            threshold (`float`, *optional*):
+                Probability threshold to filter out predicted masks.
         Returns:
             `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.

@@ -1087,14 +1199,21 @@ class InferenceClient:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
-        >>> client.image_segmentation("cat.jpg")
+        >>> client.image_segmentation("cat.jpg")
         [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
         ```
         """
-
+        parameters = {
+            "mask_threshold": mask_threshold,
+            "overlap_mask_area_threshold": overlap_mask_area_threshold,
+            "subtask": subtask,
+            "threshold": threshold,
+        }
+        payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+        response = self.post(**payload, model=model, task="image-segmentation")
         output = ImageSegmentationOutputElement.parse_obj_as_list(response)
         for item in output:
-            item.mask = _b64_to_image(item.mask)
+            item.mask = _b64_to_image(item.mask)  # type: ignore [assignment]
         return output

     def image_to_image(
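A sketch of `image_segmentation` with the new subtask and threshold controls; the image path is illustrative and the default recommended model is assumed.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
# Request semantic segmentation and drop low-confidence masks.
masks = client.image_segmentation(
    "cat.jpg",  # local path, URL, or raw bytes
    subtask="semantic",
    threshold=0.9,
    overlap_mask_area_threshold=0.5,
)
for item in masks:
    print(item.label, item.score)
```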
@@ -1166,19 +1285,8 @@ class InferenceClient:
             "guidance_scale": guidance_scale,
             **kwargs,
         }
-
-
-            data = image
-            payload: Optional[Dict[str, Any]] = None
-        else:
-            # Or an image + some parameters => use base64 encoding
-            data = None
-            payload = {"inputs": _b64_encode(image)}
-            for key, value in parameters.items():
-                if value is not None:
-                    payload.setdefault("parameters", {})[key] = value
-
-        response = self.post(json=payload, data=data, model=model, task="image-to-image")
+        payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+        response = self.post(**payload, model=model, task="image-to-image")
         return _bytes_to_image(response)

     def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:

@@ -1302,10 +1410,7 @@ class InferenceClient:
         return models_by_task

     def object_detection(
-        self,
-        image: ContentT,
-        *,
-        model: Optional[str] = None,
+        self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
     ) -> List[ObjectDetectionOutputElement]:
         """
         Perform object detection on the given image using the specified model.
@@ -1322,7 +1427,8 @@ class InferenceClient:
             model (`str`, *optional*):
                 The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
-
+            threshold (`float`, *optional*):
+                The probability necessary to make a prediction.
         Returns:
             `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.

@@ -1338,17 +1444,31 @@ class InferenceClient:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
-        >>> client.object_detection("people.jpg")
+        >>> client.object_detection("people.jpg")
         [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
         ```
         """
-
-
+        parameters = {
+            "threshold": threshold,
+        }
+        payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
+        response = self.post(**payload, model=model, task="object-detection")
         return ObjectDetectionOutputElement.parse_obj_as_list(response)

     def question_answering(
-        self,
-
+        self,
+        question: str,
+        context: str,
+        *,
+        model: Optional[str] = None,
+        align_to_words: Optional[bool] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+    ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
         """
         Retrieve the answer to a question from a given text.
@@ -1360,10 +1480,31 @@ class InferenceClient:
             model (`str`):
                 The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint.
-
+            align_to_words (`bool`, *optional*):
+                Attempts to align the answer to real words. Improves quality on space separated
+                languages. Might hurt on non-space-separated languages (like Japanese or Chinese).
+            doc_stride (`int`, *optional*):
+                If the context is too long to fit with the question for the model, it will be split in
+                several chunks with some overlap. This argument controls the size of that overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are
+                considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk
+                passed to the model. The context will be split in several chunks (using docStride as
+                overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Note that we
+                return less than topk answers if there are not enough options available within the
+                context.
         Returns:
-            [`QuestionAnsweringOutputElement`]:
-
+            Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
+                When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
+                When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
         Raises:
             [`InferenceTimeoutError`]:
                 If the model is unavailable or the request times out.

@@ -1375,17 +1516,28 @@ class InferenceClient:
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
         >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
-        QuestionAnsweringOutputElement(
+        QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
         ```
         """
-
-
+        parameters = {
+            "align_to_words": align_to_words,
+            "doc_stride": doc_stride,
+            "handle_impossible_answer": handle_impossible_answer,
+            "max_answer_len": max_answer_len,
+            "max_question_len": max_question_len,
+            "max_seq_len": max_seq_len,
+            "top_k": top_k,
+        }
+        inputs: Dict[str, Any] = {"question": question, "context": context}
+        payload = _prepare_payload(inputs, parameters=parameters)
         response = self.post(
-
+            **payload,
             model=model,
             task="question-answering",
         )
-
+        # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
+        output = QuestionAnsweringOutputElement.parse_obj(response)
+        return output

     def sentence_similarity(
         self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
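A sketch of the new `top_k` behaviour of `question_answering`: with `top_k` greater than 1 the call is documented above to return a list rather than a single element.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
answers = client.question_answering(
    question="What's my name?",
    context="My name is Clara and I live in Berkeley.",
    top_k=2,
    align_to_words=True,
)
for answer in answers:
    print(answer.answer, answer.score)
```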
@@ -1434,12 +1586,23 @@ class InferenceClient:
         )
         return _bytes_to_list(response)

+    @_deprecate_arguments(
+        version="0.29",
+        deprecated_args=["parameters"],
+        custom_message=(
+            "The `parameters` argument is deprecated and will be removed in a future version. "
+            "Provide individual parameters instead: `clean_up_tokenization_spaces`, `generate_parameters`, and `truncation`."
+        ),
+    )
     def summarization(
         self,
         text: str,
         *,
         parameters: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
+        truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
     ) -> SummarizationOutput:
         """
         Generate a summary of a given text using a specified model.

@@ -1452,8 +1615,13 @@ class InferenceClient:
                 for more details.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
-                Inference Endpoint.
-
+                Inference Endpoint. If not provided, the default recommended model for summarization will be used.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
+            truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
+                The truncation strategy to use.
         Returns:
             [`SummarizationOutput`]: The generated summary text.

@@ -1471,14 +1639,23 @@ class InferenceClient:
         SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
         ```
         """
-
-
-
-
+        if parameters is None:
+            parameters = {
+                "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+                "generate_parameters": generate_parameters,
+                "truncation": truncation,
+            }
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(**payload, model=model, task="summarization")
         return SummarizationOutput.parse_obj_as_list(response)[0]

     def table_question_answering(
-        self,
+        self,
+        table: Dict[str, Any],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        parameters: Optional[Dict[str, Any]] = None,
     ) -> TableQuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from information given in a table.
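A sketch of calling `summarization` with the individual arguments that replace the deprecated `parameters` dict.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
summary = client.summarization(
    "The Eiffel tower is one of the most famous landmarks in the world...",
    clean_up_tokenization_spaces=True,
    truncation="longest_first",
)
print(summary.generated_text)
```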
@@ -1492,6 +1669,8 @@ class InferenceClient:
             model (`str`):
                 The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
                 Hub or a URL to a deployed Inference Endpoint.
+            parameters (`Dict[str, Any]`, *optional*):
+                Additional inference parameters. Defaults to None.

         Returns:
             [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.

@@ -1512,11 +1691,13 @@ class InferenceClient:
         TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
         ```
         """
+        inputs = {
+            "query": query,
+            "table": table,
+        }
+        payload = _prepare_payload(inputs, parameters=parameters)
         response = self.post(
-
-                "query": query,
-                "table": table,
-            },
+            **payload,
             model=model,
             task="table-question-answering",
         )

@@ -1564,7 +1745,11 @@ class InferenceClient:
         ["5", "5", "5"]
         ```
         """
-        response = self.post(
+        response = self.post(
+            json={"table": table},
+            model=model,
+            task="tabular-classification",
+        )
         return _bytes_to_list(response)

     def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
@@ -1607,7 +1792,14 @@ class InferenceClient:
         response = self.post(json={"table": table}, model=model, task="tabular-regression")
         return _bytes_to_list(response)

-    def text_classification(
+    def text_classification(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["TextClassificationOutputTransform"] = None,
+    ) -> List[TextClassificationOutputElement]:
         """
         Perform text classification (e.g. sentiment-analysis) on the given text.

@@ -1618,6 +1810,10 @@ class InferenceClient:
                 The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
                 Defaults to None.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
+                The function to apply to the output.

         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.

@@ -1639,7 +1835,16 @@ class InferenceClient:
         ]
         ```
         """
-
+        parameters = {
+            "function_to_apply": function_to_apply,
+            "top_k": top_k,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(
+            **payload,
+            model=model,
+            task="text-classification",
+        )
         return TextClassificationOutputElement.parse_obj_as_list(response)[0]  # type: ignore [return-value]

     @overload
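A sketch of `text_classification` with the new `top_k` and `function_to_apply` arguments; the default recommended model is assumed.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
scores = client.text_classification(
    "I like you. I love you.",
    top_k=2,
    function_to_apply="softmax",
)
print(scores)
```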
@@ -2148,6 +2353,9 @@ class InferenceClient:
         num_inference_steps: Optional[float] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
+        scheduler: Optional[str] = None,
+        target_size: Optional[TextToImageTargetSize] = None,
+        seed: Optional[int] = None,
         **kwargs,
     ) -> "Image":
         """

@@ -2176,7 +2384,14 @@ class InferenceClient:
                 usually at the expense of lower image quality.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
-                Inference Endpoint.
+                Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
+                Defaults to None.
+            scheduler (`str`, *optional*):
+                Override the scheduler with a compatible one.
+            target_size (`TextToImageTargetSize`, *optional*):
+                The size in pixel of the output image
+            seed (`int`, *optional*):
+                Seed for the random number generator.

         Returns:
             `Image`: The generated image.

@@ -2203,22 +2418,44 @@ class InferenceClient:
         >>> image.save("better_astronaut.png")
         ```
         """
-
+
         parameters = {
             "negative_prompt": negative_prompt,
             "height": height,
             "width": width,
             "num_inference_steps": num_inference_steps,
             "guidance_scale": guidance_scale,
+            "scheduler": scheduler,
+            "target_size": target_size,
+            "seed": seed,
             **kwargs,
         }
-
-
-                payload.setdefault("parameters", {})[key] = value  # type: ignore
-        response = self.post(json=payload, model=model, task="text-to-image")
+        payload = _prepare_payload(prompt, parameters=parameters)
+        response = self.post(**payload, model=model, task="text-to-image")
         return _bytes_to_image(response)

-    def text_to_speech(
+    def text_to_speech(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+        epsilon_cutoff: Optional[float] = None,
+        eta_cutoff: Optional[float] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        min_length: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        num_beam_groups: Optional[int] = None,
+        num_beams: Optional[int] = None,
+        penalty_alpha: Optional[float] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        use_cache: Optional[bool] = None,
+    ) -> bytes:
         """
         Synthesize an audio of a voice pronouncing a given text.
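A hedged sketch of the new `seed`, `scheduler` and `target_size` arguments of `text_to_image`; the scheduler name and the use of a plain dict for `TextToImageTargetSize` are assumptions, and the default recommended model is assumed.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
# A fixed seed makes the generation reproducible; scheduler and target_size are optional overrides.
image = client.text_to_image(
    "An astronaut riding a horse on the moon.",
    seed=42,
    scheduler="EulerDiscreteScheduler",          # illustrative scheduler name
    target_size={"width": 768, "height": 768},   # assumes a dict is accepted here
)
image.save("astronaut.png")
```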
@@ -2227,7 +2464,56 @@ class InferenceClient:
                 The text to synthesize.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
-                Inference Endpoint.
+                Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
+                Defaults to None.
+            do_sample (`bool`, *optional*):
+                Whether to use sampling instead of greedy decoding when generating new tokens.
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"`, *optional*):
+                Controls the stopping condition for beam-based methods.
+            epsilon_cutoff (`float`, *optional*):
+                If set to float strictly between 0 and 1, only tokens with a conditional probability
+                greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+                3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+                Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            eta_cutoff (`float`, *optional*):
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+                float strictly between 0 and 1, a token is only considered if it is greater than either
+                eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+                term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+                the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+                See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+                for more details.
+            max_length (`int`, *optional*):
+                The maximum length (in tokens) of the generated text, including the input.
+            max_new_tokens (`int`, *optional*):
+                The maximum number of tokens to generate. Takes precedence over maxLength.
+            min_length (`int`, *optional*):
+                The minimum length (in tokens) of the generated text, including the input.
+            min_new_tokens (`int`, *optional*):
+                The minimum number of tokens to generate. Takes precedence over maxLength.
+            num_beam_groups (`int`, *optional*):
+                Number of groups to divide num_beams into in order to ensure diversity among different
+                groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+            num_beams (`int`, *optional*):
+                Number of beams to use for beam search.
+            penalty_alpha (`float`, *optional*):
+                The value balances the model confidence and the degeneration penalty in contrastive
+                search decoding.
+            temperature (`float`, *optional*):
+                The value used to modulate the next token probabilities.
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*):
+                If set to float < 1, only the smallest set of most probable tokens with probabilities
+                that add up to top_p or higher are kept for generation.
+            typical_p (`float`, *optional*):
+                Local typicality measures how similar the conditional probability of predicting a target token next is
+                to the expected conditional probability of predicting a random token next, given the partial text
+                already generated. If set to float < 1, the smallest set of the most locally typical tokens with
+                probabilities that add up to typical_p or higher are kept for generation. See [this
+                paper](https://hf.co/papers/2202.00666) for more details.
+            use_cache (`bool`, *optional*):
+                Whether the model should use the past last key/values attentions to speed up decoding

         Returns:
             `bytes`: The generated audio.

@@ -2248,10 +2534,36 @@ class InferenceClient:
         >>> Path("hello_world.flac").write_bytes(audio)
         ```
         """
-
+        parameters = {
+            "do_sample": do_sample,
+            "early_stopping": early_stopping,
+            "epsilon_cutoff": epsilon_cutoff,
+            "eta_cutoff": eta_cutoff,
+            "max_length": max_length,
+            "max_new_tokens": max_new_tokens,
+            "min_length": min_length,
+            "min_new_tokens": min_new_tokens,
+            "num_beam_groups": num_beam_groups,
+            "num_beams": num_beams,
+            "penalty_alpha": penalty_alpha,
+            "temperature": temperature,
+            "top_k": top_k,
+            "top_p": top_p,
+            "typical_p": typical_p,
+            "use_cache": use_cache,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(**payload, model=model, task="text-to-speech")
+        return response

     def token_classification(
-        self,
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        aggregation_strategy: Optional[Literal["none", "simple", "first", "average", "max"]] = None,
+        ignore_labels: Optional[List[str]] = None,
+        stride: Optional[int] = None,
     ) -> List[TokenClassificationOutputElement]:
         """
         Perform token classification on the given text.
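A sketch of `text_to_speech` with a few of the newly exposed generation parameters; the default recommended model is assumed.

```py
from pathlib import Path

from huggingface_hub import InferenceClient

client = InferenceClient()
# The new keyword arguments are forwarded as generation parameters in the payload.
audio = client.text_to_speech(
    "Hello world",
    do_sample=True,
    temperature=0.7,
    max_new_tokens=256,
)
Path("hello_world.flac").write_bytes(audio)
```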
@@ -2264,6 +2576,12 @@ class InferenceClient:
                 The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
                 Defaults to None.
+            aggregation_strategy (`Literal["none", "simple", "first", "average", "max"]`, *optional*):
+                The strategy used to fuse tokens based on model predictions.
+            ignore_labels (`List[str]`, *optional*):
+                A list of labels to ignore.
+            stride (`int`, *optional*):
+                The number of overlapping tokens between chunks when splitting the input text.

         Returns:
             `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.

@@ -2297,16 +2615,30 @@ class InferenceClient:
         ]
         ```
         """
-
+
+        parameters = {
+            "aggregation_strategy": aggregation_strategy,
+            "ignore_labels": ignore_labels,
+            "stride": stride,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
         response = self.post(
-
+            **payload,
             model=model,
             task="token-classification",
         )
         return TokenClassificationOutputElement.parse_obj_as_list(response)

     def translation(
-        self,
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
     ) -> TranslationOutput:
         """
         Convert text from one language to another.
@@ -2315,7 +2647,6 @@ class InferenceClient:
         your specific use case. Source and target languages usually depend on the model.
         However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
         you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
-        You can find this information in the model card.

         Args:
             text (`str`):

@@ -2325,9 +2656,15 @@ class InferenceClient:
                 a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
                 Defaults to None.
             src_lang (`str`, *optional*):
-
+                The source language of the text. Required for models that can translate from multiple languages.
             tgt_lang (`str`, *optional*):
-                Target language
+                Target language to translate to. Required for models that can translate to multiple languages.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
+                The truncation strategy to use.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.

         Returns:
             [`TranslationOutput`]: The generated translated text.

@@ -2362,12 +2699,15 @@ class InferenceClient:

         if src_lang is None and tgt_lang is not None:
             raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
-
-
-
-
-
-
+        parameters = {
+            "src_lang": src_lang,
+            "tgt_lang": tgt_lang,
+            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+            "truncation": truncation,
+            "generate_parameters": generate_parameters,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
+        response = self.post(**payload, model=model, task="translation")
         return TranslationOutput.parse_obj_as_list(response)[0]

     def visual_question_answering(
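A sketch of `translation` with explicit source and target languages; the model ID and language codes are illustrative.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
result = client.translation(
    "My name is Wolfgang and I live in Berlin",
    model="facebook/mbart-large-50-many-to-many-mmt",  # illustrative multilingual model
    src_lang="en_XX",
    tgt_lang="fr_XX",
)
print(result.translation_text)
```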
@@ -2376,6 +2716,7 @@ class InferenceClient:
         question: str,
         *,
         model: Optional[str] = None,
+        top_k: Optional[int] = None,
     ) -> List[VisualQuestionAnsweringOutputElement]:
         """
         Answering open-ended questions based on an image.

@@ -2389,7 +2730,10 @@ class InferenceClient:
                 The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
                 Defaults to None.
-
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Note that we
+                return less than topk answers if there are not enough options available within the
+                context.
         Returns:
             `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.

@@ -2414,6 +2758,8 @@ class InferenceClient:
         ```
         """
         payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+        if top_k is not None:
+            payload.setdefault("parameters", {})["top_k"] = top_k
         response = self.post(json=payload, model=model, task="visual-question-answering")
         return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)

@@ -2444,7 +2790,7 @@ class InferenceClient:
                 The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
-                Inference Endpoint. This parameter overrides the model defined at the instance level.
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.

         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -2502,15 +2848,14 @@ class InferenceClient:
         ```
         """

-        parameters = {
-
-
-
+        parameters = {
+            "candidate_labels": labels,
+            "multi_label": multi_label,
+            "hypothesis_template": hypothesis_template,
+        }
+        payload = _prepare_payload(text, parameters=parameters)
         response = self.post(
-
-                "inputs": text,
-                "parameters": parameters,
-            },
+            **payload,
             task="zero-shot-classification",
             model=model,
         )

@@ -2521,7 +2866,12 @@ class InferenceClient:
         ]

     def zero_shot_image_classification(
-        self,
+        self,
+        image: ContentT,
+        labels: List[str],
+        *,
+        model: Optional[str] = None,
+        hypothesis_template: Optional[str] = None,
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.

@@ -2533,8 +2883,10 @@ class InferenceClient:
                 List of string possible labels. There must be at least 2 labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
-                Inference Endpoint. This parameter overrides the model defined at the instance level.
-
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `labels` to attempt the text classification by replacing the
+                placeholder with the candidate labels.

         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

@@ -2560,8 +2912,11 @@ class InferenceClient:
         if len(labels) < 2:
             raise ValueError("You must specify at least 2 classes to compare.")

+        inputs = {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}
+        parameters = {"hypothesis_template": hypothesis_template}
+        payload = _prepare_payload(inputs, parameters=parameters)
         response = self.post(
-
+            **payload,
             model=model,
             task="zero-shot-image-classification",
         )
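A sketch of `zero_shot_image_classification` with the new `hypothesis_template` argument; the image path and labels are illustrative and the default recommended model is assumed.

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
results = client.zero_shot_image_classification(
    "cat.jpg",  # local path, URL, or raw bytes
    labels=["cat", "dog", "bird"],
    hypothesis_template="This is a photo of a {}.",
)
for item in results:
    print(item.label, item.score)
```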