huggingface-hub 0.25.2__py3-none-any.whl → 0.26.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of huggingface-hub might be problematic.

Files changed (45)
  1. huggingface_hub/__init__.py +45 -11
  2. huggingface_hub/_login.py +172 -33
  3. huggingface_hub/commands/user.py +125 -9
  4. huggingface_hub/constants.py +1 -1
  5. huggingface_hub/errors.py +6 -9
  6. huggingface_hub/file_download.py +2 -372
  7. huggingface_hub/hf_api.py +170 -13
  8. huggingface_hub/hf_file_system.py +3 -3
  9. huggingface_hub/hub_mixin.py +2 -1
  10. huggingface_hub/inference/_client.py +500 -145
  11. huggingface_hub/inference/_common.py +42 -4
  12. huggingface_hub/inference/_generated/_async_client.py +499 -144
  13. huggingface_hub/inference/_generated/types/__init__.py +37 -7
  14. huggingface_hub/inference/_generated/types/audio_classification.py +8 -5
  15. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +9 -7
  16. huggingface_hub/inference/_generated/types/chat_completion.py +23 -4
  17. huggingface_hub/inference/_generated/types/image_classification.py +8 -5
  18. huggingface_hub/inference/_generated/types/image_segmentation.py +9 -7
  19. huggingface_hub/inference/_generated/types/image_to_image.py +7 -5
  20. huggingface_hub/inference/_generated/types/image_to_text.py +4 -4
  21. huggingface_hub/inference/_generated/types/object_detection.py +11 -5
  22. huggingface_hub/inference/_generated/types/summarization.py +11 -13
  23. huggingface_hub/inference/_generated/types/text_classification.py +10 -5
  24. huggingface_hub/inference/_generated/types/text_generation.py +1 -0
  25. huggingface_hub/inference/_generated/types/text_to_audio.py +2 -2
  26. huggingface_hub/inference/_generated/types/text_to_image.py +9 -7
  27. huggingface_hub/inference/_generated/types/text_to_speech.py +107 -0
  28. huggingface_hub/inference/_generated/types/translation.py +17 -11
  29. huggingface_hub/inference/_generated/types/video_classification.py +2 -2
  30. huggingface_hub/repocard.py +2 -1
  31. huggingface_hub/repocard_data.py +10 -2
  32. huggingface_hub/serialization/_torch.py +7 -4
  33. huggingface_hub/utils/__init__.py +4 -20
  34. huggingface_hub/utils/{_token.py → _auth.py} +86 -3
  35. huggingface_hub/utils/_headers.py +1 -1
  36. huggingface_hub/utils/_hf_folder.py +1 -1
  37. huggingface_hub/utils/_http.py +10 -4
  38. huggingface_hub/utils/_runtime.py +1 -10
  39. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/METADATA +12 -12
  40. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/RECORD +44 -44
  41. huggingface_hub/inference/_templating.py +0 -102
  42. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/LICENSE +0 -0
  43. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/WHEEL +0 -0
  44. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/entry_points.txt +0 -0
  45. {huggingface_hub-0.25.2.dist-info → huggingface_hub-0.26.0rc0.dist-info}/top_level.txt +0 -0
@@ -37,17 +37,7 @@ import logging
37
37
  import re
38
38
  import time
39
39
  import warnings
40
- from typing import (
41
- TYPE_CHECKING,
42
- Any,
43
- Dict,
44
- Iterable,
45
- List,
46
- Literal,
47
- Optional,
48
- Union,
49
- overload,
50
- )
40
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union, overload
51
41
 
52
42
  from requests import HTTPError
53
43
  from requests.structures import CaseInsensitiveDict
@@ -67,6 +57,7 @@ from huggingface_hub.inference._common import (
67
57
  _get_unsupported_text_generation_kwargs,
68
58
  _import_numpy,
69
59
  _open_as_binary,
60
+ _prepare_payload,
70
61
  _set_unsupported_text_generation_kwargs,
71
62
  _stream_chat_completion_response,
72
63
  _stream_text_generation_response,
@@ -74,11 +65,12 @@ from huggingface_hub.inference._common import (
74
65
  )
75
66
  from huggingface_hub.inference._generated.types import (
76
67
  AudioClassificationOutputElement,
68
+ AudioClassificationOutputTransform,
77
69
  AudioToAudioOutputElement,
78
70
  AutomaticSpeechRecognitionOutput,
79
71
  ChatCompletionInputGrammarType,
80
- ChatCompletionInputTool,
81
- ChatCompletionInputToolTypeClass,
72
+ ChatCompletionInputStreamOptions,
73
+ ChatCompletionInputToolType,
82
74
  ChatCompletionOutput,
83
75
  ChatCompletionStreamOutput,
84
76
  DocumentQuestionAnsweringOutputElement,
@@ -91,21 +83,21 @@ from huggingface_hub.inference._generated.types import (
91
83
  SummarizationOutput,
92
84
  TableQuestionAnsweringOutputElement,
93
85
  TextClassificationOutputElement,
86
+ TextClassificationOutputTransform,
94
87
  TextGenerationInputGrammarType,
95
88
  TextGenerationOutput,
96
89
  TextGenerationStreamOutput,
90
+ TextToImageTargetSize,
91
+ TextToSpeechEarlyStoppingEnum,
97
92
  TokenClassificationOutputElement,
93
+ ToolElement,
98
94
  TranslationOutput,
99
95
  VisualQuestionAnsweringOutputElement,
100
96
  ZeroShotClassificationOutputElement,
101
97
  ZeroShotImageClassificationOutputElement,
102
98
  )
103
- from huggingface_hub.utils import (
104
- build_hf_headers,
105
- get_session,
106
- hf_raise_for_status,
107
- )
108
- from huggingface_hub.utils._deprecation import _deprecate_positional_args
99
+ from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
100
+ from huggingface_hub.utils._deprecation import _deprecate_arguments
109
101
 
110
102
 
111
103
  if TYPE_CHECKING:
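Reading the import hunks above together: `_prepare_payload` becomes a shared helper, `_deprecate_positional_args` is dropped in favour of `_deprecate_arguments`, and several generated chat-completion types are renamed. A minimal sketch of the renamed imports, using only names shown verbatim in this diff (whether they are also re-exported at the top-level `huggingface_hub` package is not shown here):

```py
# Names taken verbatim from the import block in this diff.
from huggingface_hub.inference._generated.types import (
    ChatCompletionInputStreamOptions,  # new streaming options type
    ChatCompletionInputToolType,       # replaces ChatCompletionInputToolTypeClass
    ToolElement,                       # replaces ChatCompletionInputTool
)
```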
@@ -157,7 +149,6 @@ class InferenceClient:
157
149
  follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
158
150
  """
159
151
 
160
- @_deprecate_positional_args(version="0.26")
161
152
  def __init__(
162
153
  self,
163
154
  model: Optional[str] = None,
@@ -333,6 +324,8 @@ class InferenceClient:
333
324
  audio: ContentT,
334
325
  *,
335
326
  model: Optional[str] = None,
327
+ top_k: Optional[int] = None,
328
+ function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
336
329
  ) -> List[AudioClassificationOutputElement]:
337
330
  """
338
331
  Perform audio classification on the provided audio content.
@@ -345,6 +338,10 @@ class InferenceClient:
345
338
  The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
346
339
  or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
347
340
  audio classification will be used.
341
+ top_k (`int`, *optional*):
342
+ When specified, limits the output to the top K most probable classes.
343
+ function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
344
+ The function to apply to the output.
348
345
 
349
346
  Returns:
350
347
  `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -367,7 +364,9 @@ class InferenceClient:
367
364
  ]
368
365
  ```
369
366
  """
370
- response = self.post(data=audio, model=model, task="audio-classification")
367
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
368
+ payload = _prepare_payload(audio, parameters=parameters, expect_binary=True)
369
+ response = self.post(**payload, model=model, task="audio-classification")
371
370
  return AudioClassificationOutputElement.parse_obj_as_list(response)
372
371
 
373
372
  def audio_to_audio(
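The three hunks above thread the new `top_k` and `function_to_apply` arguments through `audio_classification` via the shared `_prepare_payload` helper. A minimal usage sketch, assuming a local audio file and treating `"softmax"` as one of the allowed `AudioClassificationOutputTransform` values:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
results = client.audio_classification(
    "sample.flac",                # placeholder path; raw bytes or a URL also work (ContentT)
    top_k=3,                      # keep only the 3 most probable classes
    function_to_apply="softmax",  # assumed member of AudioClassificationOutputTransform
)
for item in results:
    print(item.label, item.score)
```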
@@ -452,7 +451,7 @@ class InferenceClient:
452
451
  @overload
453
452
  def chat_completion( # type: ignore
454
453
  self,
455
- messages: List[Dict[str, str]],
454
+ messages: List[Dict],
456
455
  *,
457
456
  model: Optional[str] = None,
458
457
  stream: Literal[False] = False,
@@ -465,10 +464,11 @@ class InferenceClient:
465
464
  response_format: Optional[ChatCompletionInputGrammarType] = None,
466
465
  seed: Optional[int] = None,
467
466
  stop: Optional[List[str]] = None,
467
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
468
468
  temperature: Optional[float] = None,
469
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
469
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
470
470
  tool_prompt: Optional[str] = None,
471
- tools: Optional[List[ChatCompletionInputTool]] = None,
471
+ tools: Optional[List[ToolElement]] = None,
472
472
  top_logprobs: Optional[int] = None,
473
473
  top_p: Optional[float] = None,
474
474
  ) -> ChatCompletionOutput: ...
@@ -476,7 +476,7 @@ class InferenceClient:
476
476
  @overload
477
477
  def chat_completion( # type: ignore
478
478
  self,
479
- messages: List[Dict[str, str]],
479
+ messages: List[Dict],
480
480
  *,
481
481
  model: Optional[str] = None,
482
482
  stream: Literal[True] = True,
@@ -489,10 +489,11 @@ class InferenceClient:
489
489
  response_format: Optional[ChatCompletionInputGrammarType] = None,
490
490
  seed: Optional[int] = None,
491
491
  stop: Optional[List[str]] = None,
492
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
492
493
  temperature: Optional[float] = None,
493
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
494
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
494
495
  tool_prompt: Optional[str] = None,
495
- tools: Optional[List[ChatCompletionInputTool]] = None,
496
+ tools: Optional[List[ToolElement]] = None,
496
497
  top_logprobs: Optional[int] = None,
497
498
  top_p: Optional[float] = None,
498
499
  ) -> Iterable[ChatCompletionStreamOutput]: ...
@@ -500,7 +501,7 @@ class InferenceClient:
500
501
  @overload
501
502
  def chat_completion(
502
503
  self,
503
- messages: List[Dict[str, str]],
504
+ messages: List[Dict],
504
505
  *,
505
506
  model: Optional[str] = None,
506
507
  stream: bool = False,
@@ -513,17 +514,18 @@ class InferenceClient:
513
514
  response_format: Optional[ChatCompletionInputGrammarType] = None,
514
515
  seed: Optional[int] = None,
515
516
  stop: Optional[List[str]] = None,
517
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
516
518
  temperature: Optional[float] = None,
517
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
519
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
518
520
  tool_prompt: Optional[str] = None,
519
- tools: Optional[List[ChatCompletionInputTool]] = None,
521
+ tools: Optional[List[ToolElement]] = None,
520
522
  top_logprobs: Optional[int] = None,
521
523
  top_p: Optional[float] = None,
522
524
  ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ...
523
525
 
524
526
  def chat_completion(
525
527
  self,
526
- messages: List[Dict[str, str]],
528
+ messages: List[Dict],
527
529
  *,
528
530
  model: Optional[str] = None,
529
531
  stream: bool = False,
@@ -537,10 +539,11 @@ class InferenceClient:
537
539
  response_format: Optional[ChatCompletionInputGrammarType] = None,
538
540
  seed: Optional[int] = None,
539
541
  stop: Optional[List[str]] = None,
542
+ stream_options: Optional[ChatCompletionInputStreamOptions] = None,
540
543
  temperature: Optional[float] = None,
541
- tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
544
+ tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None,
542
545
  tool_prompt: Optional[str] = None,
543
- tools: Optional[List[ChatCompletionInputTool]] = None,
546
+ tools: Optional[List[ToolElement]] = None,
544
547
  top_logprobs: Optional[int] = None,
545
548
  top_p: Optional[float] = None,
546
549
  ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
@@ -557,7 +560,7 @@ class InferenceClient:
557
560
  </Tip>
558
561
 
559
562
  Args:
560
- messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]):
563
+ messages (List of [`ChatCompletionInputMessage`]):
561
564
  Conversation history consisting of roles and content pairs.
562
565
  model (`str`, *optional*):
563
566
  The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
@@ -594,6 +597,8 @@ class InferenceClient:
594
597
  Defaults to None.
595
598
  stream (`bool`, *optional*):
596
599
  Enable realtime streaming of responses. Defaults to False.
600
+ stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
601
+ Options for streaming completions.
597
602
  temperature (`float`, *optional*):
598
603
  Controls randomness of the generations. Lower values ensure
599
604
  less random completions. Range: [0, 2]. Defaults to 1.0.
@@ -604,11 +609,11 @@ class InferenceClient:
604
609
  top_p (`float`, *optional*):
605
610
  Fraction of the most likely next words to sample from.
606
611
  Must be between 0 and 1. Defaults to 1.0.
607
- tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
612
+ tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*):
608
613
  The tool to use for the completion. Defaults to "auto".
609
614
  tool_prompt (`str`, *optional*):
610
615
  A prompt to be appended before the tools.
611
- tools (List of [`ChatCompletionInputTool`], *optional*):
616
+ tools (List of [`ToolElement`], *optional*):
612
617
  A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
613
618
  provide a list of functions the model may generate JSON inputs for.
614
619
 
@@ -658,7 +663,7 @@ class InferenceClient:
658
663
  )
659
664
  ```
660
665
 
661
- Example (stream=True):
666
+ Example using streaming:
662
667
  ```py
663
668
  >>> from huggingface_hub import InferenceClient
664
669
  >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -696,6 +701,40 @@ class InferenceClient:
696
701
  print(chunk.choices[0].delta.content)
697
702
  ```
698
703
 
704
+ Example using Image + Text as input:
705
+ ```py
706
+ >>> from huggingface_hub import InferenceClient
707
+
708
+ # provide a remote URL
709
+ >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
710
+ # or a base64-encoded image
711
+ >>> image_path = "/path/to/image.jpeg"
712
+ >>> with open(image_path, "rb") as f:
713
+ ... base64_image = base64.b64encode(f.read()).decode("utf-8")
714
+ >>> image_url = f"data:image/jpeg;base64,{base64_image}"
715
+
716
+ >>> client = InferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
717
+ >>> output = client.chat.completions.create(
718
+ ... messages=[
719
+ ... {
720
+ ... "role": "user",
721
+ ... "content": [
722
+ ... {
723
+ ... "type": "image_url",
724
+ ... "image_url": {"url": image_url},
725
+ ... },
726
+ ... {
727
+ ... "type": "text",
728
+ ... "text": "Describe this image in one sentence.",
729
+ ... },
730
+ ... ],
731
+ ... },
732
+ ... ],
733
+ ... )
734
+ >>> output
735
+ The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.
736
+ ```
737
+
699
738
  Example using tools:
700
739
  ```py
701
740
  >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
@@ -837,6 +876,7 @@ class InferenceClient:
837
876
  top_logprobs=top_logprobs,
838
877
  top_p=top_p,
839
878
  stream=stream,
879
+ stream_options=stream_options,
840
880
  )
841
881
  payload = {key: value for key, value in payload.items() if value is not None}
842
882
  data = self.post(model=model_url, json=payload, stream=stream)
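The chat-completion hunks above add a `stream_options` argument to every overload and forward it into the request payload. A hedged sketch; `include_usage` is assumed to be the relevant field of `ChatCompletionInputStreamOptions` (mirroring the OpenAI-style API) and is not shown in this diff:

```py
from huggingface_hub import InferenceClient
from huggingface_hub.inference._generated.types import ChatCompletionInputStreamOptions

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
stream = client.chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    stream=True,
    stream_options=ChatCompletionInputStreamOptions(include_usage=True),  # assumed field
    max_tokens=20,
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content, end="")
```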
@@ -877,6 +917,14 @@ class InferenceClient:
877
917
  question: str,
878
918
  *,
879
919
  model: Optional[str] = None,
920
+ doc_stride: Optional[int] = None,
921
+ handle_impossible_answer: Optional[bool] = None,
922
+ lang: Optional[str] = None,
923
+ max_answer_len: Optional[int] = None,
924
+ max_question_len: Optional[int] = None,
925
+ max_seq_len: Optional[int] = None,
926
+ top_k: Optional[int] = None,
927
+ word_boxes: Optional[List[Union[List[float], str]]] = None,
880
928
  ) -> List[DocumentQuestionAnsweringOutputElement]:
881
929
  """
882
930
  Answer questions on document images.
@@ -890,7 +938,29 @@ class InferenceClient:
890
938
  The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
891
939
  a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
892
940
  Defaults to None.
893
-
941
+ doc_stride (`int`, *optional*):
942
+ If the words in the document are too long to fit with the question for the model, it will
943
+ be split in several chunks with some overlap. This argument controls the size of that
944
+ overlap.
945
+ handle_impossible_answer (`bool`, *optional*):
946
+ Whether to accept impossible as an answer.
947
+ lang (`str`, *optional*):
948
+ Language to use while running OCR.
949
+ max_answer_len (`int`, *optional*):
950
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
951
+ considered).
952
+ max_question_len (`int`, *optional*):
953
+ The maximum length of the question after tokenization. It will be truncated if needed.
954
+ max_seq_len (`int`, *optional*):
955
+ The maximum length of the total sentence (context + question) in tokens of each chunk
956
+ passed to the model. The context will be split in several chunks (using doc_stride as
957
+ overlap) if needed.
958
+ top_k (`int`, *optional*):
959
+ The number of answers to return (will be chosen by order of likelihood). Can return less
960
+ than top_k answers if there are not enough options available within the context.
961
+ word_boxes (`List[Union[List[float], str]]`, *optional*):
962
+ A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
963
+ skip the OCR step and use the provided bounding boxes instead.
894
964
  Returns:
895
965
  `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
896
966
 
@@ -900,16 +970,28 @@ class InferenceClient:
900
970
  `HTTPError`:
901
971
  If the request fails with an HTTP error status code other than HTTP 503.
902
972
 
973
+
903
974
  Example:
904
975
  ```py
905
976
  >>> from huggingface_hub import InferenceClient
906
977
  >>> client = InferenceClient()
907
978
  >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
908
- [DocumentQuestionAnsweringOutputElement(score=0.42515629529953003, answer='us-001', start=16, end=16)]
979
+ [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
909
980
  ```
910
981
  """
911
- payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
912
- response = self.post(json=payload, model=model, task="document-question-answering")
982
+ inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
983
+ parameters = {
984
+ "doc_stride": doc_stride,
985
+ "handle_impossible_answer": handle_impossible_answer,
986
+ "lang": lang,
987
+ "max_answer_len": max_answer_len,
988
+ "max_question_len": max_question_len,
989
+ "max_seq_len": max_seq_len,
990
+ "top_k": top_k,
991
+ "word_boxes": word_boxes,
992
+ }
993
+ payload = _prepare_payload(inputs, parameters=parameters)
994
+ response = self.post(**payload, model=model, task="document-question-answering")
913
995
  return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
914
996
 
915
997
  def feature_extraction(
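The hunk above wires eight new OCR/QA tuning arguments into `document_question_answering` through `_prepare_payload`. A small sketch reusing the invoice example from the docstring; the parameter values are illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
answers = client.document_question_answering(
    image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
    question="What is the invoice number?",
    top_k=2,            # return up to two candidate answers
    max_answer_len=15,  # discard overly long spans
)
for answer in answers:
    print(answer.answer, answer.score)
```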
@@ -933,7 +1015,7 @@ class InferenceClient:
933
1015
  a deployed Inference Endpoint. If not provided, the default recommended conversational model will be used.
934
1016
  Defaults to None.
935
1017
  normalize (`bool`, *optional*):
936
- Whether to normalize the embeddings or not. Defaults to None.
1018
+ Whether to normalize the embeddings or not.
937
1019
  Only available on server powered by Text-Embedding-Inference.
938
1020
  prompt_name (`str`, *optional*):
939
1021
  The name of the prompt that should be used by for encoding. If not set, no prompt will be applied.
@@ -942,7 +1024,7 @@ class InferenceClient:
942
1024
  then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
943
1025
  because the prompt text will be prepended before any text to encode.
944
1026
  truncate (`bool`, *optional*):
945
- Whether to truncate the embeddings or not. Defaults to None.
1027
+ Whether to truncate the embeddings or not.
946
1028
  Only available on server powered by Text-Embedding-Inference.
947
1029
  truncation_direction (`Literal["Left", "Right"]`, *optional*):
948
1030
  Which side of the input should be truncated when `truncate=True` is passed.
@@ -967,20 +1049,25 @@ class InferenceClient:
967
1049
  [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32)
968
1050
  ```
969
1051
  """
970
- payload: Dict = {"inputs": text}
971
- if normalize is not None:
972
- payload["normalize"] = normalize
973
- if prompt_name is not None:
974
- payload["prompt_name"] = prompt_name
975
- if truncate is not None:
976
- payload["truncate"] = truncate
977
- if truncation_direction is not None:
978
- payload["truncation_direction"] = truncation_direction
979
- response = self.post(json=payload, model=model, task="feature-extraction")
1052
+ parameters = {
1053
+ "normalize": normalize,
1054
+ "prompt_name": prompt_name,
1055
+ "truncate": truncate,
1056
+ "truncation_direction": truncation_direction,
1057
+ }
1058
+ payload = _prepare_payload(text, parameters=parameters)
1059
+ response = self.post(**payload, model=model, task="feature-extraction")
980
1060
  np = _import_numpy()
981
1061
  return np.array(_bytes_to_dict(response), dtype="float32")
982
1062
 
983
- def fill_mask(self, text: str, *, model: Optional[str] = None) -> List[FillMaskOutputElement]:
1063
+ def fill_mask(
1064
+ self,
1065
+ text: str,
1066
+ *,
1067
+ model: Optional[str] = None,
1068
+ targets: Optional[List[str]] = None,
1069
+ top_k: Optional[int] = None,
1070
+ ) -> List[FillMaskOutputElement]:
984
1071
  """
985
1072
  Fill in a hole with a missing word (token to be precise).
986
1073
 
@@ -990,8 +1077,13 @@ class InferenceClient:
990
1077
  model (`str`, *optional*):
991
1078
  The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
992
1079
  a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
993
- Defaults to None.
994
-
1080
+ targets (`List[str]`, *optional*):
1081
+ When passed, the model will limit the scores to the passed targets instead of looking up
1082
+ in the whole vocabulary. If the provided targets are not in the model vocab, they will be
1083
+ tokenized and the first resulting token will be used (with a warning, and that might be
1084
+ slower).
1085
+ top_k (`int`, *optional*):
1086
+ When passed, overrides the number of predictions to return.
995
1087
  Returns:
996
1088
  `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
997
1089
  probability, token reference, and completed text.
@@ -1013,7 +1105,9 @@ class InferenceClient:
1013
1105
  ]
1014
1106
  ```
1015
1107
  """
1016
- response = self.post(json={"inputs": text}, model=model, task="fill-mask")
1108
+ parameters = {"targets": targets, "top_k": top_k}
1109
+ payload = _prepare_payload(text, parameters=parameters)
1110
+ response = self.post(**payload, model=model, task="fill-mask")
1017
1111
  return FillMaskOutputElement.parse_obj_as_list(response)
1018
1112
 
1019
1113
  def image_classification(
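`fill_mask` gains `targets` and `top_k` above. A minimal sketch, assuming a BERT-style model whose mask token is `[MASK]`:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
predictions = client.fill_mask(
    "The goal of life is [MASK].",      # mask token depends on the model's tokenizer
    targets=["happiness", "survival"],  # restrict scoring to these candidates
    top_k=2,
)
for prediction in predictions:
    print(prediction.sequence, prediction.score)
```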
@@ -1021,6 +1115,8 @@ class InferenceClient:
1021
1115
  image: ContentT,
1022
1116
  *,
1023
1117
  model: Optional[str] = None,
1118
+ function_to_apply: Optional[Literal["sigmoid", "softmax", "none"]] = None,
1119
+ top_k: Optional[int] = None,
1024
1120
  ) -> List[ImageClassificationOutputElement]:
1025
1121
  """
1026
1122
  Perform image classification on the given image using the specified model.
@@ -1031,7 +1127,10 @@ class InferenceClient:
1031
1127
  model (`str`, *optional*):
1032
1128
  The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1033
1129
  deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
1034
-
1130
+ function_to_apply (`Literal["sigmoid", "softmax", "none"]`, *optional*):
1131
+ The function to apply to the output scores.
1132
+ top_k (`int`, *optional*):
1133
+ When specified, limits the output to the top K most probable classes.
1035
1134
  Returns:
1036
1135
  `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
1037
1136
 
@@ -1046,10 +1145,12 @@ class InferenceClient:
1046
1145
  >>> from huggingface_hub import InferenceClient
1047
1146
  >>> client = InferenceClient()
1048
1147
  >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
1049
- [ImageClassificationOutputElement(score=0.9779096841812134, label='Blenheim spaniel'), ...]
1148
+ [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
1050
1149
  ```
1051
1150
  """
1052
- response = self.post(data=image, model=model, task="image-classification")
1151
+ parameters = {"function_to_apply": function_to_apply, "top_k": top_k}
1152
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1153
+ response = self.post(**payload, model=model, task="image-classification")
1053
1154
  return ImageClassificationOutputElement.parse_obj_as_list(response)
1054
1155
 
1055
1156
  def image_segmentation(
@@ -1057,6 +1158,10 @@ class InferenceClient:
1057
1158
  image: ContentT,
1058
1159
  *,
1059
1160
  model: Optional[str] = None,
1161
+ mask_threshold: Optional[float] = None,
1162
+ overlap_mask_area_threshold: Optional[float] = None,
1163
+ subtask: Optional[Literal["instance", "panoptic", "semantic"]] = None,
1164
+ threshold: Optional[float] = None,
1060
1165
  ) -> List[ImageSegmentationOutputElement]:
1061
1166
  """
1062
1167
  Perform image segmentation on the given image using the specified model.
@@ -1073,7 +1178,14 @@ class InferenceClient:
1073
1178
  model (`str`, *optional*):
1074
1179
  The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1075
1180
  deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
1076
-
1181
+ mask_threshold (`float`, *optional*):
1182
+ Threshold to use when turning the predicted masks into binary values.
1183
+ overlap_mask_area_threshold (`float`, *optional*):
1184
+ Mask overlap threshold to eliminate small, disconnected segments.
1185
+ subtask (`Literal["instance", "panoptic", "semantic"]`, *optional*):
1186
+ Segmentation task to be performed, depending on model capabilities.
1187
+ threshold (`float`, *optional*):
1188
+ Probability threshold to filter out predicted masks.
1077
1189
  Returns:
1078
1190
  `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
1079
1191
 
@@ -1087,14 +1199,21 @@ class InferenceClient:
1087
1199
  ```py
1088
1200
  >>> from huggingface_hub import InferenceClient
1089
1201
  >>> client = InferenceClient()
1090
- >>> client.image_segmentation("cat.jpg"):
1202
+ >>> client.image_segmentation("cat.jpg")
1091
1203
  [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
1092
1204
  ```
1093
1205
  """
1094
- response = self.post(data=image, model=model, task="image-segmentation")
1206
+ parameters = {
1207
+ "mask_threshold": mask_threshold,
1208
+ "overlap_mask_area_threshold": overlap_mask_area_threshold,
1209
+ "subtask": subtask,
1210
+ "threshold": threshold,
1211
+ }
1212
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1213
+ response = self.post(**payload, model=model, task="image-segmentation")
1095
1214
  output = ImageSegmentationOutputElement.parse_obj_as_list(response)
1096
1215
  for item in output:
1097
- item.mask = _b64_to_image(item.mask)
1216
+ item.mask = _b64_to_image(item.mask) # type: ignore [assignment]
1098
1217
  return output
1099
1218
 
1100
1219
  def image_to_image(
@@ -1166,19 +1285,8 @@ class InferenceClient:
1166
1285
  "guidance_scale": guidance_scale,
1167
1286
  **kwargs,
1168
1287
  }
1169
- if all(parameter is None for parameter in parameters.values()):
1170
- # Either only an image to send => send as raw bytes
1171
- data = image
1172
- payload: Optional[Dict[str, Any]] = None
1173
- else:
1174
- # Or an image + some parameters => use base64 encoding
1175
- data = None
1176
- payload = {"inputs": _b64_encode(image)}
1177
- for key, value in parameters.items():
1178
- if value is not None:
1179
- payload.setdefault("parameters", {})[key] = value
1180
-
1181
- response = self.post(json=payload, data=data, model=model, task="image-to-image")
1288
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1289
+ response = self.post(**payload, model=model, task="image-to-image")
1182
1290
  return _bytes_to_image(response)
1183
1291
 
1184
1292
  def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
@@ -1302,10 +1410,7 @@ class InferenceClient:
1302
1410
  return models_by_task
1303
1411
 
1304
1412
  def object_detection(
1305
- self,
1306
- image: ContentT,
1307
- *,
1308
- model: Optional[str] = None,
1413
+ self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
1309
1414
  ) -> List[ObjectDetectionOutputElement]:
1310
1415
  """
1311
1416
  Perform object detection on the given image using the specified model.
@@ -1322,7 +1427,8 @@ class InferenceClient:
1322
1427
  model (`str`, *optional*):
1323
1428
  The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
1324
1429
  deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
1325
-
1430
+ threshold (`float`, *optional*):
1431
+ The probability necessary to make a prediction.
1326
1432
  Returns:
1327
1433
  `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
1328
1434
 
@@ -1338,17 +1444,31 @@ class InferenceClient:
1338
1444
  ```py
1339
1445
  >>> from huggingface_hub import InferenceClient
1340
1446
  >>> client = InferenceClient()
1341
- >>> client.object_detection("people.jpg"):
1447
+ >>> client.object_detection("people.jpg")
1342
1448
  [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
1343
1449
  ```
1344
1450
  """
1345
- # detect objects
1346
- response = self.post(data=image, model=model, task="object-detection")
1451
+ parameters = {
1452
+ "threshold": threshold,
1453
+ }
1454
+ payload = _prepare_payload(image, parameters=parameters, expect_binary=True)
1455
+ response = self.post(**payload, model=model, task="object-detection")
1347
1456
  return ObjectDetectionOutputElement.parse_obj_as_list(response)
1348
1457
 
1349
1458
  def question_answering(
1350
- self, question: str, context: str, *, model: Optional[str] = None
1351
- ) -> QuestionAnsweringOutputElement:
1459
+ self,
1460
+ question: str,
1461
+ context: str,
1462
+ *,
1463
+ model: Optional[str] = None,
1464
+ align_to_words: Optional[bool] = None,
1465
+ doc_stride: Optional[int] = None,
1466
+ handle_impossible_answer: Optional[bool] = None,
1467
+ max_answer_len: Optional[int] = None,
1468
+ max_question_len: Optional[int] = None,
1469
+ max_seq_len: Optional[int] = None,
1470
+ top_k: Optional[int] = None,
1471
+ ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
1352
1472
  """
1353
1473
  Retrieve the answer to a question from a given text.
1354
1474
 
@@ -1360,10 +1480,31 @@ class InferenceClient:
1360
1480
  model (`str`):
1361
1481
  The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1362
1482
  a deployed Inference Endpoint.
1363
-
1483
+ align_to_words (`bool`, *optional*):
1484
+ Attempts to align the answer to real words. Improves quality on space separated
1485
+ languages. Might hurt on non-space-separated languages (like Japanese or Chinese).
1486
+ doc_stride (`int`, *optional*):
1487
+ If the context is too long to fit with the question for the model, it will be split in
1488
+ several chunks with some overlap. This argument controls the size of that overlap.
1489
+ handle_impossible_answer (`bool`, *optional*):
1490
+ Whether to accept impossible as an answer.
1491
+ max_answer_len (`int`, *optional*):
1492
+ The maximum length of predicted answers (e.g., only answers with a shorter length are
1493
+ considered).
1494
+ max_question_len (`int`, *optional*):
1495
+ The maximum length of the question after tokenization. It will be truncated if needed.
1496
+ max_seq_len (`int`, *optional*):
1497
+ The maximum length of the total sentence (context + question) in tokens of each chunk
1498
+ passed to the model. The context will be split in several chunks (using docStride as
1499
+ overlap) if needed.
1500
+ top_k (`int`, *optional*):
1501
+ The number of answers to return (will be chosen by order of likelihood). Note that we
1502
+ return less than topk answers if there are not enough options available within the
1503
+ context.
1364
1504
  Returns:
1365
- [`QuestionAnsweringOutputElement`]: an question answering output containing the score, start index, end index, and answer.
1366
-
1505
+ Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
1506
+ When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
1507
+ When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
1367
1508
  Raises:
1368
1509
  [`InferenceTimeoutError`]:
1369
1510
  If the model is unavailable or the request times out.
@@ -1375,17 +1516,28 @@ class InferenceClient:
1375
1516
  >>> from huggingface_hub import InferenceClient
1376
1517
  >>> client = InferenceClient()
1377
1518
  >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
1378
- QuestionAnsweringOutputElement(score=0.9326562285423279, start=11, end=16, answer='Clara')
1519
+ QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
1379
1520
  ```
1380
1521
  """
1381
-
1382
- payload: Dict[str, Any] = {"question": question, "context": context}
1522
+ parameters = {
1523
+ "align_to_words": align_to_words,
1524
+ "doc_stride": doc_stride,
1525
+ "handle_impossible_answer": handle_impossible_answer,
1526
+ "max_answer_len": max_answer_len,
1527
+ "max_question_len": max_question_len,
1528
+ "max_seq_len": max_seq_len,
1529
+ "top_k": top_k,
1530
+ }
1531
+ inputs: Dict[str, Any] = {"question": question, "context": context}
1532
+ payload = _prepare_payload(inputs, parameters=parameters)
1383
1533
  response = self.post(
1384
- json=payload,
1534
+ **payload,
1385
1535
  model=model,
1386
1536
  task="question-answering",
1387
1537
  )
1388
- return QuestionAnsweringOutputElement.parse_obj_as_instance(response)
1538
+ # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
1539
+ output = QuestionAnsweringOutputElement.parse_obj(response)
1540
+ return output
1389
1541
 
1390
1542
  def sentence_similarity(
1391
1543
  self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
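The `question_answering` hunks above add the transformers-style QA parameters and change the return annotation: a single element when `top_k` is 1 or unset, a list when `top_k` is greater than 1. A sketch of both shapes, reusing the docstring example:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()

# Default: a single QuestionAnsweringOutputElement, as before.
single = client.question_answering(
    question="What's my name?",
    context="My name is Clara and I live in Berkeley.",
)
print(single.answer)

# With top_k > 1: a list of QuestionAnsweringOutputElement.
several = client.question_answering(
    question="What's my name?",
    context="My name is Clara and I live in Berkeley.",
    top_k=3,
)
print([candidate.answer for candidate in several])
```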
@@ -1434,12 +1586,23 @@ class InferenceClient:
1434
1586
  )
1435
1587
  return _bytes_to_list(response)
1436
1588
 
1589
+ @_deprecate_arguments(
1590
+ version="0.29",
1591
+ deprecated_args=["parameters"],
1592
+ custom_message=(
1593
+ "The `parameters` argument is deprecated and will be removed in a future version. "
1594
+ "Provide individual parameters instead: `clean_up_tokenization_spaces`, `generate_parameters`, and `truncation`."
1595
+ ),
1596
+ )
1437
1597
  def summarization(
1438
1598
  self,
1439
1599
  text: str,
1440
1600
  *,
1441
1601
  parameters: Optional[Dict[str, Any]] = None,
1442
1602
  model: Optional[str] = None,
1603
+ clean_up_tokenization_spaces: Optional[bool] = None,
1604
+ generate_parameters: Optional[Dict[str, Any]] = None,
1605
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
1443
1606
  ) -> SummarizationOutput:
1444
1607
  """
1445
1608
  Generate a summary of a given text using a specified model.
@@ -1452,8 +1615,13 @@ class InferenceClient:
1452
1615
  for more details.
1453
1616
  model (`str`, *optional*):
1454
1617
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
1455
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
1456
-
1618
+ Inference Endpoint. If not provided, the default recommended model for summarization will be used.
1619
+ clean_up_tokenization_spaces (`bool`, *optional*):
1620
+ Whether to clean up the potential extra spaces in the text output.
1621
+ generate_parameters (`Dict[str, Any]`, *optional*):
1622
+ Additional parametrization of the text generation algorithm.
1623
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
1624
+ The truncation strategy to use.
1457
1625
  Returns:
1458
1626
  [`SummarizationOutput`]: The generated summary text.
1459
1627
 
@@ -1471,14 +1639,23 @@ class InferenceClient:
1471
1639
  SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
1472
1640
  ```
1473
1641
  """
1474
- payload: Dict[str, Any] = {"inputs": text}
1475
- if parameters is not None:
1476
- payload["parameters"] = parameters
1477
- response = self.post(json=payload, model=model, task="summarization")
1642
+ if parameters is None:
1643
+ parameters = {
1644
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
1645
+ "generate_parameters": generate_parameters,
1646
+ "truncation": truncation,
1647
+ }
1648
+ payload = _prepare_payload(text, parameters=parameters)
1649
+ response = self.post(**payload, model=model, task="summarization")
1478
1650
  return SummarizationOutput.parse_obj_as_list(response)[0]
1479
1651
 
1480
1652
  def table_question_answering(
1481
- self, table: Dict[str, Any], query: str, *, model: Optional[str] = None
1653
+ self,
1654
+ table: Dict[str, Any],
1655
+ query: str,
1656
+ *,
1657
+ model: Optional[str] = None,
1658
+ parameters: Optional[Dict[str, Any]] = None,
1482
1659
  ) -> TableQuestionAnsweringOutputElement:
1483
1660
  """
1484
1661
  Retrieve the answer to a question from information given in a table.
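The summarization hunks above deprecate the bulk `parameters` dict (slated for removal in 0.29, per the `_deprecate_arguments` decorator) in favour of explicit keyword arguments, and `table_question_answering` picks up an optional `parameters` dict of its own. A migration sketch for the summarization call; the input text is illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
text = "The Eiffel tower is one of the most famous landmarks in the world."

# Deprecated style (still accepted, but warns until removal in 0.29):
summary = client.summarization(text, parameters={"truncation": "longest_first"})

# New style with explicit keyword arguments:
summary = client.summarization(
    text,
    truncation="longest_first",
    clean_up_tokenization_spaces=True,
)
print(summary.generated_text)
```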
@@ -1492,6 +1669,8 @@ class InferenceClient:
1492
1669
  model (`str`):
1493
1670
  The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
1494
1671
  Hub or a URL to a deployed Inference Endpoint.
1672
+ parameters (`Dict[str, Any]`, *optional*):
1673
+ Additional inference parameters. Defaults to None.
1495
1674
 
1496
1675
  Returns:
1497
1676
  [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1512,11 +1691,13 @@ class InferenceClient:
1512
1691
  TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
1513
1692
  ```
1514
1693
  """
1694
+ inputs = {
1695
+ "query": query,
1696
+ "table": table,
1697
+ }
1698
+ payload = _prepare_payload(inputs, parameters=parameters)
1515
1699
  response = self.post(
1516
- json={
1517
- "query": query,
1518
- "table": table,
1519
- },
1700
+ **payload,
1520
1701
  model=model,
1521
1702
  task="table-question-answering",
1522
1703
  )
@@ -1564,7 +1745,11 @@ class InferenceClient:
1564
1745
  ["5", "5", "5"]
1565
1746
  ```
1566
1747
  """
1567
- response = self.post(json={"table": table}, model=model, task="tabular-classification")
1748
+ response = self.post(
1749
+ json={"table": table},
1750
+ model=model,
1751
+ task="tabular-classification",
1752
+ )
1568
1753
  return _bytes_to_list(response)
1569
1754
 
1570
1755
  def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
@@ -1607,7 +1792,14 @@ class InferenceClient:
1607
1792
  response = self.post(json={"table": table}, model=model, task="tabular-regression")
1608
1793
  return _bytes_to_list(response)
1609
1794
 
1610
- def text_classification(self, text: str, *, model: Optional[str] = None) -> List[TextClassificationOutputElement]:
1795
+ def text_classification(
1796
+ self,
1797
+ text: str,
1798
+ *,
1799
+ model: Optional[str] = None,
1800
+ top_k: Optional[int] = None,
1801
+ function_to_apply: Optional["TextClassificationOutputTransform"] = None,
1802
+ ) -> List[TextClassificationOutputElement]:
1611
1803
  """
1612
1804
  Perform text classification (e.g. sentiment-analysis) on the given text.
1613
1805
 
@@ -1618,6 +1810,10 @@ class InferenceClient:
1618
1810
  The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
1619
1811
  a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
1620
1812
  Defaults to None.
1813
+ top_k (`int`, *optional*):
1814
+ When specified, limits the output to the top K most probable classes.
1815
+ function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
1816
+ The function to apply to the output.
1621
1817
 
1622
1818
  Returns:
1623
1819
  `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -1639,7 +1835,16 @@ class InferenceClient:
1639
1835
  ]
1640
1836
  ```
1641
1837
  """
1642
- response = self.post(json={"inputs": text}, model=model, task="text-classification")
1838
+ parameters = {
1839
+ "function_to_apply": function_to_apply,
1840
+ "top_k": top_k,
1841
+ }
1842
+ payload = _prepare_payload(text, parameters=parameters)
1843
+ response = self.post(
1844
+ **payload,
1845
+ model=model,
1846
+ task="text-classification",
1847
+ )
1643
1848
  return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value]
1644
1849
 
1645
1850
  @overload
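`text_classification` follows the same pattern, adding `top_k` and `function_to_apply`. A minimal sketch; the input sentence is illustrative and `"none"` is assumed to be a valid `TextClassificationOutputTransform` value:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
result = client.text_classification(
    "I like you. I love you.",
    top_k=2,
    function_to_apply="none",  # assumed member of TextClassificationOutputTransform
)
print(result)
```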
@@ -2148,6 +2353,9 @@ class InferenceClient:
2148
2353
  num_inference_steps: Optional[float] = None,
2149
2354
  guidance_scale: Optional[float] = None,
2150
2355
  model: Optional[str] = None,
2356
+ scheduler: Optional[str] = None,
2357
+ target_size: Optional[TextToImageTargetSize] = None,
2358
+ seed: Optional[int] = None,
2151
2359
  **kwargs,
2152
2360
  ) -> "Image":
2153
2361
  """
@@ -2176,7 +2384,14 @@ class InferenceClient:
2176
2384
  usually at the expense of lower image quality.
2177
2385
  model (`str`, *optional*):
2178
2386
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2179
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2387
+ Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
2388
+ Defaults to None.
2389
+ scheduler (`str`, *optional*):
2390
+ Override the scheduler with a compatible one.
2391
+ target_size (`TextToImageTargetSize`, *optional*):
2392
+ The size in pixel of the output image
2393
+ seed (`int`, *optional*):
2394
+ Seed for the random number generator.
2180
2395
 
2181
2396
  Returns:
2182
2397
  `Image`: The generated image.
@@ -2203,22 +2418,44 @@ class InferenceClient:
2203
2418
  >>> image.save("better_astronaut.png")
2204
2419
  ```
2205
2420
  """
2206
- payload = {"inputs": prompt}
2421
+
2207
2422
  parameters = {
2208
2423
  "negative_prompt": negative_prompt,
2209
2424
  "height": height,
2210
2425
  "width": width,
2211
2426
  "num_inference_steps": num_inference_steps,
2212
2427
  "guidance_scale": guidance_scale,
2428
+ "scheduler": scheduler,
2429
+ "target_size": target_size,
2430
+ "seed": seed,
2213
2431
  **kwargs,
2214
2432
  }
2215
- for key, value in parameters.items():
2216
- if value is not None:
2217
- payload.setdefault("parameters", {})[key] = value # type: ignore
2218
- response = self.post(json=payload, model=model, task="text-to-image")
2433
+ payload = _prepare_payload(prompt, parameters=parameters)
2434
+ response = self.post(**payload, model=model, task="text-to-image")
2219
2435
  return _bytes_to_image(response)
2220
2436
 
2221
- def text_to_speech(self, text: str, *, model: Optional[str] = None) -> bytes:
2437
+ def text_to_speech(
2438
+ self,
2439
+ text: str,
2440
+ *,
2441
+ model: Optional[str] = None,
2442
+ do_sample: Optional[bool] = None,
2443
+ early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
2444
+ epsilon_cutoff: Optional[float] = None,
2445
+ eta_cutoff: Optional[float] = None,
2446
+ max_length: Optional[int] = None,
2447
+ max_new_tokens: Optional[int] = None,
2448
+ min_length: Optional[int] = None,
2449
+ min_new_tokens: Optional[int] = None,
2450
+ num_beam_groups: Optional[int] = None,
2451
+ num_beams: Optional[int] = None,
2452
+ penalty_alpha: Optional[float] = None,
2453
+ temperature: Optional[float] = None,
2454
+ top_k: Optional[int] = None,
2455
+ top_p: Optional[float] = None,
2456
+ typical_p: Optional[float] = None,
2457
+ use_cache: Optional[bool] = None,
2458
+ ) -> bytes:
2222
2459
  """
2223
2460
  Synthesize an audio of a voice pronouncing a given text.
2224
2461
 
@@ -2227,7 +2464,56 @@ class InferenceClient:
2227
2464
  The text to synthesize.
2228
2465
  model (`str`, *optional*):
2229
2466
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2230
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2467
+ Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
2468
+ Defaults to None.
2469
+ do_sample (`bool`, *optional*):
2470
+ Whether to use sampling instead of greedy decoding when generating new tokens.
2471
+ early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
2472
+ Controls the stopping condition for beam-based methods.
2473
+ epsilon_cutoff (`float`, *optional*):
2474
+ If set to float strictly between 0 and 1, only tokens with a conditional probability
2475
+ greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
2476
+ 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
2477
+ Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
2478
+ eta_cutoff (`float`, *optional*):
2479
+ Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
2480
+ float strictly between 0 and 1, a token is only considered if it is greater than either
2481
+ eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
2482
+ term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
2483
+ the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
2484
+ See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
2485
+ for more details.
2486
+ max_length (`int`, *optional*):
2487
+ The maximum length (in tokens) of the generated text, including the input.
2488
+ max_new_tokens (`int`, *optional*):
2489
+ The maximum number of tokens to generate. Takes precedence over maxLength.
2490
+ min_length (`int`, *optional*):
2491
+ The minimum length (in tokens) of the generated text, including the input.
2492
+ min_new_tokens (`int`, *optional*):
2493
+ The minimum number of tokens to generate. Takes precedence over maxLength.
2494
+ num_beam_groups (`int`, *optional*):
2495
+ Number of groups to divide num_beams into in order to ensure diversity among different
2496
+ groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
2497
+ num_beams (`int`, *optional*):
2498
+ Number of beams to use for beam search.
2499
+ penalty_alpha (`float`, *optional*):
2500
+ The value balances the model confidence and the degeneration penalty in contrastive
2501
+ search decoding.
2502
+ temperature (`float`, *optional*):
2503
+ The value used to modulate the next token probabilities.
2504
+ top_k (`int`, *optional*):
2505
+ The number of highest probability vocabulary tokens to keep for top-k-filtering.
2506
+ top_p (`float`, *optional*):
2507
+ If set to float < 1, only the smallest set of most probable tokens with probabilities
2508
+ that add up to top_p or higher are kept for generation.
2509
+ typical_p (`float`, *optional*):
2510
+ Local typicality measures how similar the conditional probability of predicting a target token next is
2511
+ to the expected conditional probability of predicting a random token next, given the partial text
2512
+ already generated. If set to float < 1, the smallest set of the most locally typical tokens with
2513
+ probabilities that add up to typical_p or higher are kept for generation. See [this
2514
+ paper](https://hf.co/papers/2202.00666) for more details.
2515
+ use_cache (`bool`, *optional*):
2516
+ Whether the model should use the past last key/values attentions to speed up decoding
2231
2517
 
2232
2518
  Returns:
2233
2519
  `bytes`: The generated audio.
@@ -2248,10 +2534,36 @@ class InferenceClient:
2248
2534
  >>> Path("hello_world.flac").write_bytes(audio)
2249
2535
  ```
2250
2536
  """
2251
- return self.post(json={"inputs": text}, model=model, task="text-to-speech")
2537
+ parameters = {
2538
+ "do_sample": do_sample,
2539
+ "early_stopping": early_stopping,
2540
+ "epsilon_cutoff": epsilon_cutoff,
2541
+ "eta_cutoff": eta_cutoff,
2542
+ "max_length": max_length,
2543
+ "max_new_tokens": max_new_tokens,
2544
+ "min_length": min_length,
2545
+ "min_new_tokens": min_new_tokens,
2546
+ "num_beam_groups": num_beam_groups,
2547
+ "num_beams": num_beams,
2548
+ "penalty_alpha": penalty_alpha,
2549
+ "temperature": temperature,
2550
+ "top_k": top_k,
2551
+ "top_p": top_p,
2552
+ "typical_p": typical_p,
2553
+ "use_cache": use_cache,
2554
+ }
2555
+ payload = _prepare_payload(text, parameters=parameters)
2556
+ response = self.post(**payload, model=model, task="text-to-speech")
2557
+ return response
2252
2558
 
2253
2559
  def token_classification(
2254
- self, text: str, *, model: Optional[str] = None
2560
+ self,
2561
+ text: str,
2562
+ *,
2563
+ model: Optional[str] = None,
2564
+ aggregation_strategy: Optional[Literal["none", "simple", "first", "average", "max"]] = None,
2565
+ ignore_labels: Optional[List[str]] = None,
2566
+ stride: Optional[int] = None,
2255
2567
  ) -> List[TokenClassificationOutputElement]:
2256
2568
  """
2257
2569
  Perform token classification on the given text.
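`text_to_speech` grows a long list of optional generation parameters above, all forwarded through `_prepare_payload`. A short sketch exercising a few of them; the `hello_world.flac` output path mirrors the docstring example and assumes the model returns FLAC audio:

```py
from pathlib import Path

from huggingface_hub import InferenceClient

client = InferenceClient()
audio = client.text_to_speech(
    "Hello world",
    do_sample=True,      # sample instead of greedy decoding
    temperature=0.7,     # soften the next-token distribution
    max_new_tokens=256,  # cap the number of generated tokens
)
Path("hello_world.flac").write_bytes(audio)
```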
@@ -2264,6 +2576,12 @@ class InferenceClient:
2264
2576
  The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
2265
2577
  a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
2266
2578
  Defaults to None.
2579
+ aggregation_strategy (`Literal["none", "simple", "first", "average", "max"]`, *optional*):
2580
+ The strategy used to fuse tokens based on model predictions.
2581
+ ignore_labels (`List[str]`, *optional*):
2582
+ A list of labels to ignore.
2583
+ stride (`int`, *optional*):
2584
+ The number of overlapping tokens between chunks when splitting the input text.
2267
2585
 
2268
2586
  Returns:
2269
2587
  `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
@@ -2297,16 +2615,30 @@ class InferenceClient:
2297
2615
  ]
2298
2616
  ```
2299
2617
  """
2300
- payload: Dict[str, Any] = {"inputs": text}
2618
+
2619
+ parameters = {
2620
+ "aggregation_strategy": aggregation_strategy,
2621
+ "ignore_labels": ignore_labels,
2622
+ "stride": stride,
2623
+ }
2624
+ payload = _prepare_payload(text, parameters=parameters)
2301
2625
  response = self.post(
2302
- json=payload,
2626
+ **payload,
2303
2627
  model=model,
2304
2628
  task="token-classification",
2305
2629
  )
2306
2630
  return TokenClassificationOutputElement.parse_obj_as_list(response)
2307
2631
 
2308
2632
  def translation(
2309
- self, text: str, *, model: Optional[str] = None, src_lang: Optional[str] = None, tgt_lang: Optional[str] = None
2633
+ self,
2634
+ text: str,
2635
+ *,
2636
+ model: Optional[str] = None,
2637
+ src_lang: Optional[str] = None,
2638
+ tgt_lang: Optional[str] = None,
2639
+ clean_up_tokenization_spaces: Optional[bool] = None,
2640
+ truncation: Optional[Literal["do_not_truncate", "longest_first", "only_first", "only_second"]] = None,
2641
+ generate_parameters: Optional[Dict[str, Any]] = None,
2310
2642
  ) -> TranslationOutput:
2311
2643
  """
2312
2644
  Convert text from one language to another.
@@ -2315,7 +2647,6 @@ class InferenceClient:
2315
2647
  your specific use case. Source and target languages usually depend on the model.
2316
2648
  However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
2317
2649
  you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
2318
- You can find this information in the model card.
2319
2650
 
2320
2651
  Args:
2321
2652
  text (`str`):
@@ -2325,9 +2656,15 @@ class InferenceClient:
2325
2656
  a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
2326
2657
  Defaults to None.
2327
2658
  src_lang (`str`, *optional*):
2328
- Source language of the translation task, i.e. input language. Cannot be passed without `tgt_lang`.
2659
+ The source language of the text. Required for models that can translate from multiple languages.
2329
2660
  tgt_lang (`str`, *optional*):
2330
- Target language of the translation task, i.e. output language. Cannot be passed without `src_lang`.
2661
+ Target language to translate to. Required for models that can translate to multiple languages.
2662
+ clean_up_tokenization_spaces (`bool`, *optional*):
2663
+ Whether to clean up the potential extra spaces in the text output.
2664
+ truncation (`Literal["do_not_truncate", "longest_first", "only_first", "only_second"]`, *optional*):
2665
+ The truncation strategy to use.
2666
+ generate_parameters (`Dict[str, Any]`, *optional*):
2667
+ Additional parametrization of the text generation algorithm.
2331
2668
 
2332
2669
  Returns:
2333
2670
  [`TranslationOutput`]: The generated translated text.
@@ -2362,12 +2699,15 @@ class InferenceClient:
2362
2699
 
2363
2700
  if src_lang is None and tgt_lang is not None:
2364
2701
  raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
2365
-
2366
- # If both `src_lang` and `tgt_lang` are given, pass them to the request body
2367
- payload: Dict = {"inputs": text}
2368
- if src_lang and tgt_lang:
2369
- payload["parameters"] = {"src_lang": src_lang, "tgt_lang": tgt_lang}
2370
- response = self.post(json=payload, model=model, task="translation")
2702
+ parameters = {
2703
+ "src_lang": src_lang,
2704
+ "tgt_lang": tgt_lang,
2705
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
2706
+ "truncation": truncation,
2707
+ "generate_parameters": generate_parameters,
2708
+ }
2709
+ payload = _prepare_payload(text, parameters=parameters)
2710
+ response = self.post(**payload, model=model, task="translation")
2371
2711
  return TranslationOutput.parse_obj_as_list(response)[0]
2372
2712
 
2373
2713
  def visual_question_answering(
@@ -2376,6 +2716,7 @@ class InferenceClient:
2376
2716
  question: str,
2377
2717
  *,
2378
2718
  model: Optional[str] = None,
2719
+ top_k: Optional[int] = None,
2379
2720
  ) -> List[VisualQuestionAnsweringOutputElement]:
2380
2721
  """
2381
2722
  Answering open-ended questions based on an image.
@@ -2389,7 +2730,10 @@ class InferenceClient:
2389
2730
  The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
2390
2731
  a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
2391
2732
  Defaults to None.
2392
-
2733
+ top_k (`int`, *optional*):
2734
+ The number of answers to return (will be chosen by order of likelihood). Note that we
2735
+ return less than topk answers if there are not enough options available within the
2736
+ context.
2393
2737
  Returns:
2394
2738
  `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
2395
2739
 
@@ -2414,6 +2758,8 @@ class InferenceClient:
2414
2758
  ```
2415
2759
  """
2416
2760
  payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
2761
+ if top_k is not None:
2762
+ payload.setdefault("parameters", {})["top_k"] = top_k
2417
2763
  response = self.post(json=payload, model=model, task="visual-question-answering")
2418
2764
  return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
2419
2765
 
@@ -2444,7 +2790,7 @@ class InferenceClient:
2444
2790
  The model then evaluates for both hypotheses if they are entailed in the provided `text` or not.
2445
2791
  model (`str`, *optional*):
2446
2792
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2447
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2793
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
2448
2794
 
2449
2795
  Returns:
2450
2796
  `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -2502,15 +2848,14 @@ class InferenceClient:
2502
2848
  ```
2503
2849
  """
2504
2850
 
2505
- parameters = {"candidate_labels": labels, "multi_label": multi_label}
2506
- if hypothesis_template is not None:
2507
- parameters["hypothesis_template"] = hypothesis_template
2508
-
2851
+ parameters = {
2852
+ "candidate_labels": labels,
2853
+ "multi_label": multi_label,
2854
+ "hypothesis_template": hypothesis_template,
2855
+ }
2856
+ payload = _prepare_payload(text, parameters=parameters)
2509
2857
  response = self.post(
2510
- json={
2511
- "inputs": text,
2512
- "parameters": parameters,
2513
- },
2858
+ **payload,
2514
2859
  task="zero-shot-classification",
2515
2860
  model=model,
2516
2861
  )
@@ -2521,7 +2866,12 @@ class InferenceClient:
2521
2866
  ]
2522
2867
 
2523
2868
  def zero_shot_image_classification(
2524
- self, image: ContentT, labels: List[str], *, model: Optional[str] = None
2869
+ self,
2870
+ image: ContentT,
2871
+ labels: List[str],
2872
+ *,
2873
+ model: Optional[str] = None,
2874
+ hypothesis_template: Optional[str] = None,
2525
2875
  ) -> List[ZeroShotImageClassificationOutputElement]:
2526
2876
  """
2527
2877
  Provide input image and text labels to predict text labels for the image.
@@ -2533,8 +2883,10 @@ class InferenceClient:
2533
2883
  List of string possible labels. There must be at least 2 labels.
2534
2884
  model (`str`, *optional*):
2535
2885
  The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
2536
- Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
2537
-
2886
+ Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
2887
+ hypothesis_template (`str`, *optional*):
2888
+ The sentence used in conjunction with `labels` to attempt the text classification by replacing the
2889
+ placeholder with the candidate labels.
2538
2890
  Returns:
2539
2891
  `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
2540
2892
 
@@ -2560,8 +2912,11 @@ class InferenceClient:
2560
2912
  if len(labels) < 2:
2561
2913
  raise ValueError("You must specify at least 2 classes to compare.")
2562
2914
 
2915
+ inputs = {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}
2916
+ parameters = {"hypothesis_template": hypothesis_template}
2917
+ payload = _prepare_payload(inputs, parameters=parameters)
2563
2918
  response = self.post(
2564
- json={"image": _b64_encode(image), "parameters": {"candidate_labels": ",".join(labels)}},
2919
+ **payload,
2565
2920
  model=model,
2566
2921
  task="zero-shot-image-classification",
2567
2922
  )
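Finally, zero-shot image classification accepts a `hypothesis_template` and now sends the candidate labels under `candidateLabels` inside the inputs dict. A closing sketch; the image URL is reused from the image-classification example earlier in this diff and the template wording is illustrative:

```py
from huggingface_hub import InferenceClient

client = InferenceClient()
results = client.zero_shot_image_classification(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
    labels=["dog", "cat", "horse"],
    hypothesis_template="This is a photo of {}.",  # placeholder replaced by each label
)
for result in results:
    print(result.label, result.score)
```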