together 1.5.34__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. together/__init__.py +101 -114
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +65 -81
  30. together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
  33. together/{cli → lib/cli}/api/models.py +34 -27
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +16 -26
  36. together/{constants.py → lib/constants.py} +11 -24
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +397 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/{utils → lib/utils}/files.py +90 -288
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -39
  50. together/resources/audio/__init__.py +72 -48
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -128
  53. together/resources/audio/transcriptions.py +247 -261
  54. together/resources/audio/translations.py +221 -241
  55. together/resources/audio/voices.py +111 -41
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +589 -477
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -129
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +258 -104
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +223 -193
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +286 -214
  75. together/types/__init__.py +66 -167
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_create_response.py +10 -0
  157. together/types/video_job.py +57 -0
  158. together-2.0.0a6.dist-info/METADATA +729 -0
  159. together-2.0.0a6.dist-info/RECORD +165 -0
  160. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
  161. together-2.0.0a6.dist-info/entry_points.txt +2 -0
  162. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
  163. together/abstract/api_requestor.py +0 -770
  164. together/cli/api/chat.py +0 -298
  165. together/cli/api/completions.py +0 -119
  166. together/cli/api/images.py +0 -93
  167. together/cli/api/utils.py +0 -139
  168. together/client.py +0 -186
  169. together/error.py +0 -194
  170. together/filemanager.py +0 -635
  171. together/legacy/__init__.py +0 -0
  172. together/legacy/base.py +0 -27
  173. together/legacy/complete.py +0 -93
  174. together/legacy/embeddings.py +0 -27
  175. together/legacy/files.py +0 -146
  176. together/legacy/finetune.py +0 -177
  177. together/legacy/images.py +0 -27
  178. together/legacy/models.py +0 -44
  179. together/resources/batch.py +0 -165
  180. together/resources/code_interpreter.py +0 -82
  181. together/resources/evaluation.py +0 -808
  182. together/resources/finetune.py +0 -1388
  183. together/together_response.py +0 -50
  184. together/types/abstract.py +0 -26
  185. together/types/audio_speech.py +0 -311
  186. together/types/batch.py +0 -54
  187. together/types/chat_completions.py +0 -210
  188. together/types/code_interpreter.py +0 -57
  189. together/types/common.py +0 -67
  190. together/types/completions.py +0 -107
  191. together/types/embeddings.py +0 -35
  192. together/types/endpoints.py +0 -123
  193. together/types/error.py +0 -16
  194. together/types/evaluation.py +0 -93
  195. together/types/files.py +0 -93
  196. together/types/finetune.py +0 -464
  197. together/types/images.py +0 -42
  198. together/types/models.py +0 -96
  199. together/types/rerank.py +0 -43
  200. together/types/videos.py +0 -69
  201. together/utils/api_helpers.py +0 -124
  202. together/version.py +0 -6
  203. together-1.5.34.dist-info/METADATA +0 -583
  204. together-1.5.34.dist-info/RECORD +0 -77
  205. together-1.5.34.dist-info/entry_points.txt +0 -3
  206. /together/{abstract → lib/cli}/__init__.py +0 -0
  207. /together/{cli → lib/cli/api}/__init__.py +0 -0
  208. /together/{cli/api/__init__.py → py.typed} +0 -0
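The hunk below appears to be together/resources/chat/completions.py (its +1063/-263 counts match entry 59 above): the hand-written ChatCompletions/AsyncChatCompletions classes are replaced by Stainless-generated CompletionsResource/AsyncCompletionsResource classes with typed, overloaded create() methods for streaming and non-streaming calls. As a rough orientation, here is a minimal usage sketch of the regenerated surface. It assumes the top-level client is still exported as Together from together/__init__.py and that response field names follow the OpenAI-style models under together/types/chat/ — neither is confirmed by this diff, so treat the sketch as illustrative only.

    # Hypothetical sketch against the 2.0.0a6 resource layout shown below.
    # Assumes `Together` is still the exported client class and that
    # TOGETHER_API_KEY is set in the environment.
    from together import Together

    client = Together()

    # Non-streaming call: returns a ChatCompletion model.
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(completion.choices[0].message.content)  # field names assumed from the generated types

    # Streaming call: stream=True returns a Stream[ChatCompletionChunk] of Server-Sent Events.
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Count to three."}],
        stream=True,
    ):
        print(chunk.choices[0].delta.content or "", end="")
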
@@ -1,297 +1,1097 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
1
3
  from __future__ import annotations
2
4
 
3
- from typing import Any, AsyncGenerator, Dict, Iterator, List
5
+ from typing import Dict, Union, Iterable
6
+ from typing_extensions import Literal, overload
7
+
8
+ import httpx
4
9
 
5
- from together.abstract import api_requestor
6
- from together.together_response import TogetherResponse
7
- from together.types import (
8
- ChatCompletionChunk,
9
- ChatCompletionRequest,
10
- ChatCompletionResponse,
11
- TogetherClient,
12
- TogetherRequest,
10
+ from ..._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
11
+ from ..._utils import required_args, maybe_transform, async_maybe_transform
12
+ from ..._compat import cached_property
13
+ from ..._resource import SyncAPIResource, AsyncAPIResource
14
+ from ..._response import (
15
+ to_raw_response_wrapper,
16
+ to_streamed_response_wrapper,
17
+ async_to_raw_response_wrapper,
18
+ async_to_streamed_response_wrapper,
13
19
  )
20
+ from ..._streaming import Stream, AsyncStream
21
+ from ...types.chat import completion_create_params
22
+ from ..._base_client import make_request_options
23
+ from ...types.tools_param import ToolsParam
24
+ from ...types.chat.chat_completion import ChatCompletion
25
+ from ...types.chat.chat_completion_chunk import ChatCompletionChunk
26
+
27
+ __all__ = ["CompletionsResource", "AsyncCompletionsResource"]
28
+
14
29
 
30
+ class CompletionsResource(SyncAPIResource):
31
+ @cached_property
32
+ def with_raw_response(self) -> CompletionsResourceWithRawResponse:
33
+ """
34
+ This property can be used as a prefix for any HTTP method call to return
35
+ the raw response object instead of the parsed content.
36
+
37
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
38
+ """
39
+ return CompletionsResourceWithRawResponse(self)
15
40
 
16
- class ChatCompletions:
17
- def __init__(self, client: TogetherClient) -> None:
18
- self._client = client
41
+ @cached_property
42
+ def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
43
+ """
44
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
19
45
 
46
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
47
+ """
48
+ return CompletionsResourceWithStreamingResponse(self)
49
+
50
+ @overload
20
51
  def create(
21
52
  self,
22
53
  *,
23
- messages: List[Dict[str, Any]],
24
- model: str,
25
- max_tokens: int | None = None,
26
- stop: List[str] | None = None,
27
- temperature: float | None = None,
28
- top_p: float | None = None,
29
- top_k: int | None = None,
30
- repetition_penalty: float | None = None,
31
- presence_penalty: float | None = None,
32
- frequency_penalty: float | None = None,
33
- min_p: float | None = None,
34
- logit_bias: Dict[str, float] | None = None,
35
- seed: int | None = None,
36
- stream: bool = False,
37
- logprobs: int | None = None,
38
- echo: bool | None = None,
39
- n: int | None = None,
40
- safety_model: str | None = None,
41
- response_format: Dict[str, Any] | None = None,
42
- tools: List[Dict[str, Any]] | None = None,
43
- tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
44
- **kwargs: Any,
45
- ) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]:
54
+ messages: Iterable[completion_create_params.Message],
55
+ model: Union[
56
+ Literal[
57
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
58
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
59
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
60
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
61
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
62
+ ],
63
+ str,
64
+ ],
65
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
66
+ echo: bool | Omit = omit,
67
+ frequency_penalty: float | Omit = omit,
68
+ function_call: completion_create_params.FunctionCall | Omit = omit,
69
+ logit_bias: Dict[str, float] | Omit = omit,
70
+ logprobs: int | Omit = omit,
71
+ max_tokens: int | Omit = omit,
72
+ min_p: float | Omit = omit,
73
+ n: int | Omit = omit,
74
+ presence_penalty: float | Omit = omit,
75
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
76
+ repetition_penalty: float | Omit = omit,
77
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
78
+ safety_model: str | Omit = omit,
79
+ seed: int | Omit = omit,
80
+ stop: SequenceNotStr[str] | Omit = omit,
81
+ stream: Literal[False] | Omit = omit,
82
+ temperature: float | Omit = omit,
83
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
84
+ tools: Iterable[ToolsParam] | Omit = omit,
85
+ top_k: int | Omit = omit,
86
+ top_p: float | Omit = omit,
87
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
88
+ # The extra values given here take precedence over values defined on the client or passed to this method.
89
+ extra_headers: Headers | None = None,
90
+ extra_query: Query | None = None,
91
+ extra_body: Body | None = None,
92
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
93
+ ) -> ChatCompletion:
46
94
  """
47
- Method to generate completions based on a given prompt using a specified model.
95
+ Query a chat model.
48
96
 
49
97
  Args:
50
- messages (List[Dict[str, str]]): A list of messages in the format
51
- `[{"role": together.types.chat_completions.MessageRole, "content": TEXT}, ...]`
52
- model (str): The name of the model to query.
53
- max_tokens (int, optional): The maximum number of tokens to generate.
54
- Defaults to 512.
55
- stop (List[str], optional): List of strings at which to stop generation.
56
- Defaults to None.
57
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
58
- Defaults to None.
59
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
60
- of choices for each predicted token based on the cumulative probabilities.
61
- Defaults to None.
62
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
63
- next predicted word or token.
64
- Defaults to None.
65
- repetition_penalty (float, optional): A number that controls the diversity of generated text
66
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
67
- Defaults to None.
68
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
69
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
70
- Must be in the range [-2, 2].
71
- Defaults to None.
72
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
73
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
74
- Must be in the range [-2, 2].
75
- Defaults to None.
76
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
77
- be considered during sampling.
78
- Must be in the range [0, 1].
79
- Defaults to None.
80
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
81
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
82
- Defaults to None.
83
- seed (int, optional): A seed value to use for reproducibility.
84
- stream (bool, optional): Flag indicating whether to stream the generated completions.
85
- Defaults to False.
86
- logprobs (int, optional): Number of top-k logprobs to return
87
- Defaults to None.
88
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
89
- Defaults to None.
90
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
91
- Defaults to None.
92
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
93
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
94
- Defaults to None.
95
- response_format (Dict[str, Any], optional): An object specifying the format that the model must output.
96
- Defaults to None.
97
- tools (Dict[str, str | Dict[str, str | Dict[str, Any]]], optional): A list of tools the model may call.
98
- Currently, only functions are supported as a tool.
99
- Use this to provide a list of functions the model may generate JSON inputs for.
100
- Defaults to None
101
- tool_choice: Controls which (if any) function is called by the model. auto means the model can pick
102
- between generating a message or calling a function. Specifying a particular function
103
- via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
104
- Sets to `auto` if None.
105
- Defaults to None.
106
-
107
- Returns:
108
- ChatCompletionResponse | Iterator[ChatCompletionChunk]: Object containing the completions
109
- or an iterator over completion chunks.
98
+ messages: A list of messages comprising the conversation so far.
99
+
100
+ model: The name of the model to query.
101
+
102
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
103
+
104
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
105
+ length of the model. When set to 'error', API will return 400 with appropriate
106
+ error message. When set to 'truncate', override the max_tokens with maximum
107
+ context length of the model.
108
+
109
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
110
+ return prompt logprobs.
111
+
112
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
113
+ repeating tokens that have already been mentioned.
114
+
115
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
116
+
117
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
118
+ at each generation step, instead of just the sampled token. Log probabilities
119
+ help assess model confidence in token predictions.
120
+
121
+ max_tokens: The maximum number of tokens to generate.
122
+
123
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
124
+
125
+ n: The number of completions to generate for each prompt.
126
+
127
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
128
+ a model talking about new topics.
129
+
130
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
131
+ responses. Higher values may result in more thoughtful and detailed responses
132
+ but may take longer to generate.
133
+
134
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
135
+ likelihood of repeated sequences. Higher values decrease repetition.
136
+
137
+ response_format: An object specifying the format that the model must output.
138
+
139
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
140
+ available moderation models found
141
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
142
+
143
+ seed: Seed value for reproducibility.
144
+
145
+ stop: A list of string sequences that will truncate (stop) inference text output. For
146
+ example, "</s>" will stop generation as soon as the model generates the given
147
+ token.
148
+
149
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
150
+ of waiting for the full model response. The stream terminates with
151
+ `data: [DONE]`. If false, return a single JSON object containing the results.
152
+
153
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
154
+ response. A temperature less than 1 favors more correctness and is appropriate
155
+ for question answering or summarization. A value closer to 1 introduces more
156
+ randomness in the output.
157
+
158
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
159
+ which lets the model pick between generating a message or calling a function.
160
+
161
+ tools: A list of tools the model may call. Currently, only functions are supported as a
162
+ tool. Use this to provide a list of functions the model may generate JSON inputs
163
+ for.
164
+
165
+ top_k: An integer that's used to limit the number of choices for the next predicted
166
+ word or token. It specifies the maximum number of tokens to consider at each
167
+ step, based on their probability of occurrence. This technique helps to speed up
168
+ the generation process and can improve the quality of the generated text by
169
+ focusing on the most likely options.
170
+
171
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
172
+ adjust the number of choices for each predicted token based on the cumulative
173
+ probabilities. It specifies a probability threshold below which all less likely
174
+ tokens are filtered out. This technique helps maintain diversity and generate
175
+ more fluent and natural-sounding text.
176
+
177
+ extra_headers: Send extra headers
178
+
179
+ extra_query: Add additional query parameters to the request
180
+
181
+ extra_body: Add additional JSON properties to the request
182
+
183
+ timeout: Override the client-level default timeout for this request, in seconds
110
184
  """
185
+ ...
111
186
 
112
- requestor = api_requestor.APIRequestor(
113
- client=self._client,
114
- )
187
+ @overload
188
+ def create(
189
+ self,
190
+ *,
191
+ messages: Iterable[completion_create_params.Message],
192
+ model: Union[
193
+ Literal[
194
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
195
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
196
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
197
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
198
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
199
+ ],
200
+ str,
201
+ ],
202
+ stream: Literal[True],
203
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
204
+ echo: bool | Omit = omit,
205
+ frequency_penalty: float | Omit = omit,
206
+ function_call: completion_create_params.FunctionCall | Omit = omit,
207
+ logit_bias: Dict[str, float] | Omit = omit,
208
+ logprobs: int | Omit = omit,
209
+ max_tokens: int | Omit = omit,
210
+ min_p: float | Omit = omit,
211
+ n: int | Omit = omit,
212
+ presence_penalty: float | Omit = omit,
213
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
214
+ repetition_penalty: float | Omit = omit,
215
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
216
+ safety_model: str | Omit = omit,
217
+ seed: int | Omit = omit,
218
+ stop: SequenceNotStr[str] | Omit = omit,
219
+ temperature: float | Omit = omit,
220
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
221
+ tools: Iterable[ToolsParam] | Omit = omit,
222
+ top_k: int | Omit = omit,
223
+ top_p: float | Omit = omit,
224
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
225
+ # The extra values given here take precedence over values defined on the client or passed to this method.
226
+ extra_headers: Headers | None = None,
227
+ extra_query: Query | None = None,
228
+ extra_body: Body | None = None,
229
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
230
+ ) -> Stream[ChatCompletionChunk]:
231
+ """
232
+ Query a chat model.
233
+
234
+ Args:
235
+ messages: A list of messages comprising the conversation so far.
236
+
237
+ model: The name of the model to query.
238
+
239
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
240
+
241
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
242
+ of waiting for the full model response. The stream terminates with
243
+ `data: [DONE]`. If false, return a single JSON object containing the results.
244
+
245
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
246
+ length of the model. When set to 'error', API will return 400 with appropriate
247
+ error message. When set to 'truncate', override the max_tokens with maximum
248
+ context length of the model.
249
+
250
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
251
+ return prompt logprobs.
252
+
253
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
254
+ repeating tokens that have already been mentioned.
255
+
256
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
257
+
258
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
259
+ at each generation step, instead of just the sampled token. Log probabilities
260
+ help assess model confidence in token predictions.
261
+
262
+ max_tokens: The maximum number of tokens to generate.
263
+
264
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
265
+
266
+ n: The number of completions to generate for each prompt.
267
+
268
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
269
+ a model talking about new topics.
270
+
271
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
272
+ responses. Higher values may result in more thoughtful and detailed responses
273
+ but may take longer to generate.
274
+
275
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
276
+ likelihood of repeated sequences. Higher values decrease repetition.
277
+
278
+ response_format: An object specifying the format that the model must output.
115
279
 
116
- parameter_payload = ChatCompletionRequest(
117
- model=model,
118
- messages=messages,
119
- top_p=top_p,
120
- top_k=top_k,
121
- temperature=temperature,
122
- max_tokens=max_tokens,
123
- stop=stop,
124
- repetition_penalty=repetition_penalty,
125
- presence_penalty=presence_penalty,
126
- frequency_penalty=frequency_penalty,
127
- min_p=min_p,
128
- logit_bias=logit_bias,
129
- seed=seed,
130
- stream=stream,
131
- logprobs=logprobs,
132
- echo=echo,
133
- n=n,
134
- safety_model=safety_model,
135
- response_format=response_format,
136
- tools=tools,
137
- tool_choice=tool_choice,
138
- **kwargs,
139
- ).model_dump(exclude_none=True)
140
-
141
- response, _, _ = requestor.request(
142
- options=TogetherRequest(
143
- method="POST",
144
- url="chat/completions",
145
- params=parameter_payload,
280
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
281
+ available moderation models found
282
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
283
+
284
+ seed: Seed value for reproducibility.
285
+
286
+ stop: A list of string sequences that will truncate (stop) inference text output. For
287
+ example, "</s>" will stop generation as soon as the model generates the given
288
+ token.
289
+
290
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
291
+ response. A temperature less than 1 favors more correctness and is appropriate
292
+ for question answering or summarization. A value closer to 1 introduces more
293
+ randomness in the output.
294
+
295
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
296
+ which lets the model pick between generating a message or calling a function.
297
+
298
+ tools: A list of tools the model may call. Currently, only functions are supported as a
299
+ tool. Use this to provide a list of functions the model may generate JSON inputs
300
+ for.
301
+
302
+ top_k: An integer that's used to limit the number of choices for the next predicted
303
+ word or token. It specifies the maximum number of tokens to consider at each
304
+ step, based on their probability of occurrence. This technique helps to speed up
305
+ the generation process and can improve the quality of the generated text by
306
+ focusing on the most likely options.
307
+
308
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
309
+ adjust the number of choices for each predicted token based on the cumulative
310
+ probabilities. It specifies a probability threshold below which all less likely
311
+ tokens are filtered out. This technique helps maintain diversity and generate
312
+ more fluent and natural-sounding text.
313
+
314
+ extra_headers: Send extra headers
315
+
316
+ extra_query: Add additional query parameters to the request
317
+
318
+ extra_body: Add additional JSON properties to the request
319
+
320
+ timeout: Override the client-level default timeout for this request, in seconds
321
+ """
322
+ ...
323
+
324
+ @overload
325
+ def create(
326
+ self,
327
+ *,
328
+ messages: Iterable[completion_create_params.Message],
329
+ model: Union[
330
+ Literal[
331
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
332
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
333
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
334
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
335
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
336
+ ],
337
+ str,
338
+ ],
339
+ stream: bool,
340
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
341
+ echo: bool | Omit = omit,
342
+ frequency_penalty: float | Omit = omit,
343
+ function_call: completion_create_params.FunctionCall | Omit = omit,
344
+ logit_bias: Dict[str, float] | Omit = omit,
345
+ logprobs: int | Omit = omit,
346
+ max_tokens: int | Omit = omit,
347
+ min_p: float | Omit = omit,
348
+ n: int | Omit = omit,
349
+ presence_penalty: float | Omit = omit,
350
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
351
+ repetition_penalty: float | Omit = omit,
352
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
353
+ safety_model: str | Omit = omit,
354
+ seed: int | Omit = omit,
355
+ stop: SequenceNotStr[str] | Omit = omit,
356
+ temperature: float | Omit = omit,
357
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
358
+ tools: Iterable[ToolsParam] | Omit = omit,
359
+ top_k: int | Omit = omit,
360
+ top_p: float | Omit = omit,
361
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
362
+ # The extra values given here take precedence over values defined on the client or passed to this method.
363
+ extra_headers: Headers | None = None,
364
+ extra_query: Query | None = None,
365
+ extra_body: Body | None = None,
366
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
367
+ ) -> ChatCompletion | Stream[ChatCompletionChunk]:
368
+ """
369
+ Query a chat model.
370
+
371
+ Args:
372
+ messages: A list of messages comprising the conversation so far.
373
+
374
+ model: The name of the model to query.
375
+
376
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
377
+
378
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
379
+ of waiting for the full model response. The stream terminates with
380
+ `data: [DONE]`. If false, return a single JSON object containing the results.
381
+
382
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
383
+ length of the model. When set to 'error', API will return 400 with appropriate
384
+ error message. When set to 'truncate', override the max_tokens with maximum
385
+ context length of the model.
386
+
387
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
388
+ return prompt logprobs.
389
+
390
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
391
+ repeating tokens that have already been mentioned.
392
+
393
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
394
+
395
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
396
+ at each generation step, instead of just the sampled token. Log probabilities
397
+ help assess model confidence in token predictions.
398
+
399
+ max_tokens: The maximum number of tokens to generate.
400
+
401
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
402
+
403
+ n: The number of completions to generate for each prompt.
404
+
405
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
406
+ a model talking about new topics.
407
+
408
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
409
+ responses. Higher values may result in more thoughtful and detailed responses
410
+ but may take longer to generate.
411
+
412
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
413
+ likelihood of repeated sequences. Higher values decrease repetition.
414
+
415
+ response_format: An object specifying the format that the model must output.
416
+
417
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
418
+ available moderation models found
419
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
420
+
421
+ seed: Seed value for reproducibility.
422
+
423
+ stop: A list of string sequences that will truncate (stop) inference text output. For
424
+ example, "</s>" will stop generation as soon as the model generates the given
425
+ token.
426
+
427
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
428
+ response. A temperature less than 1 favors more correctness and is appropriate
429
+ for question answering or summarization. A value closer to 1 introduces more
430
+ randomness in the output.
431
+
432
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
433
+ which lets the model pick between generating a message or calling a function.
434
+
435
+ tools: A list of tools the model may call. Currently, only functions are supported as a
436
+ tool. Use this to provide a list of functions the model may generate JSON inputs
437
+ for.
438
+
439
+ top_k: An integer that's used to limit the number of choices for the next predicted
440
+ word or token. It specifies the maximum number of tokens to consider at each
441
+ step, based on their probability of occurrence. This technique helps to speed up
442
+ the generation process and can improve the quality of the generated text by
443
+ focusing on the most likely options.
444
+
445
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
446
+ adjust the number of choices for each predicted token based on the cumulative
447
+ probabilities. It specifies a probability threshold below which all less likely
448
+ tokens are filtered out. This technique helps maintain diversity and generate
449
+ more fluent and natural-sounding text.
450
+
451
+ extra_headers: Send extra headers
452
+
453
+ extra_query: Add additional query parameters to the request
454
+
455
+ extra_body: Add additional JSON properties to the request
456
+
457
+ timeout: Override the client-level default timeout for this request, in seconds
458
+ """
459
+ ...
460
+
461
+ @required_args(["messages", "model"], ["messages", "model", "stream"])
462
+ def create(
463
+ self,
464
+ *,
465
+ messages: Iterable[completion_create_params.Message],
466
+ model: Union[
467
+ Literal[
468
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
469
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
470
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
471
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
472
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
473
+ ],
474
+ str,
475
+ ],
476
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
477
+ echo: bool | Omit = omit,
478
+ frequency_penalty: float | Omit = omit,
479
+ function_call: completion_create_params.FunctionCall | Omit = omit,
480
+ logit_bias: Dict[str, float] | Omit = omit,
481
+ logprobs: int | Omit = omit,
482
+ max_tokens: int | Omit = omit,
483
+ min_p: float | Omit = omit,
484
+ n: int | Omit = omit,
485
+ presence_penalty: float | Omit = omit,
486
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
487
+ repetition_penalty: float | Omit = omit,
488
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
489
+ safety_model: str | Omit = omit,
490
+ seed: int | Omit = omit,
491
+ stop: SequenceNotStr[str] | Omit = omit,
492
+ stream: Literal[False] | Literal[True] | Omit = omit,
493
+ temperature: float | Omit = omit,
494
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
495
+ tools: Iterable[ToolsParam] | Omit = omit,
496
+ top_k: int | Omit = omit,
497
+ top_p: float | Omit = omit,
498
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
499
+ # The extra values given here take precedence over values defined on the client or passed to this method.
500
+ extra_headers: Headers | None = None,
501
+ extra_query: Query | None = None,
502
+ extra_body: Body | None = None,
503
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
504
+ ) -> ChatCompletion | Stream[ChatCompletionChunk]:
505
+ return self._post(
506
+ "/chat/completions",
507
+ body=maybe_transform(
508
+ {
509
+ "messages": messages,
510
+ "model": model,
511
+ "context_length_exceeded_behavior": context_length_exceeded_behavior,
512
+ "echo": echo,
513
+ "frequency_penalty": frequency_penalty,
514
+ "function_call": function_call,
515
+ "logit_bias": logit_bias,
516
+ "logprobs": logprobs,
517
+ "max_tokens": max_tokens,
518
+ "min_p": min_p,
519
+ "n": n,
520
+ "presence_penalty": presence_penalty,
521
+ "reasoning_effort": reasoning_effort,
522
+ "repetition_penalty": repetition_penalty,
523
+ "response_format": response_format,
524
+ "safety_model": safety_model,
525
+ "seed": seed,
526
+ "stop": stop,
527
+ "stream": stream,
528
+ "temperature": temperature,
529
+ "tool_choice": tool_choice,
530
+ "tools": tools,
531
+ "top_k": top_k,
532
+ "top_p": top_p,
533
+ },
534
+ completion_create_params.CompletionCreateParamsStreaming
535
+ if stream
536
+ else completion_create_params.CompletionCreateParamsNonStreaming,
146
537
  ),
147
- stream=stream,
538
+ options=make_request_options(
539
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
540
+ ),
541
+ cast_to=ChatCompletion,
542
+ stream=stream or False,
543
+ stream_cls=Stream[ChatCompletionChunk],
148
544
  )
149
545
 
150
- if stream:
151
- # must be an iterator
152
- assert not isinstance(response, TogetherResponse)
153
- return (ChatCompletionChunk(**line.data) for line in response)
154
- assert isinstance(response, TogetherResponse)
155
- return ChatCompletionResponse(**response.data)
156
546
 
547
+ class AsyncCompletionsResource(AsyncAPIResource):
548
+ @cached_property
549
+ def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
550
+ """
551
+ This property can be used as a prefix for any HTTP method call to return
552
+ the raw response object instead of the parsed content.
553
+
554
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
555
+ """
556
+ return AsyncCompletionsResourceWithRawResponse(self)
157
557
 
158
- class AsyncChatCompletions:
159
- def __init__(self, client: TogetherClient) -> None:
160
- self._client = client
558
+ @cached_property
559
+ def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
560
+ """
561
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
562
+
563
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
564
+ """
565
+ return AsyncCompletionsResourceWithStreamingResponse(self)
161
566
 
567
+ @overload
162
568
  async def create(
163
569
  self,
164
570
  *,
165
- messages: List[Dict[str, str]],
166
- model: str,
167
- max_tokens: int | None = None,
168
- stop: List[str] | None = None,
169
- temperature: float | None = None,
170
- top_p: float | None = None,
171
- top_k: int | None = None,
172
- repetition_penalty: float | None = None,
173
- presence_penalty: float | None = None,
174
- frequency_penalty: float | None = None,
175
- min_p: float | None = None,
176
- logit_bias: Dict[str, float] | None = None,
177
- seed: int | None = None,
178
- stream: bool = False,
179
- logprobs: int | None = None,
180
- echo: bool | None = None,
181
- n: int | None = None,
182
- safety_model: str | None = None,
183
- response_format: Dict[str, Any] | None = None,
184
- tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None,
185
- tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
186
- **kwargs: Any,
187
- ) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse:
571
+ messages: Iterable[completion_create_params.Message],
572
+ model: Union[
573
+ Literal[
574
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
575
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
576
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
577
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
578
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
579
+ ],
580
+ str,
581
+ ],
582
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
583
+ echo: bool | Omit = omit,
584
+ frequency_penalty: float | Omit = omit,
585
+ function_call: completion_create_params.FunctionCall | Omit = omit,
586
+ logit_bias: Dict[str, float] | Omit = omit,
587
+ logprobs: int | Omit = omit,
588
+ max_tokens: int | Omit = omit,
589
+ min_p: float | Omit = omit,
590
+ n: int | Omit = omit,
591
+ presence_penalty: float | Omit = omit,
592
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
593
+ repetition_penalty: float | Omit = omit,
594
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
595
+ safety_model: str | Omit = omit,
596
+ seed: int | Omit = omit,
597
+ stop: SequenceNotStr[str] | Omit = omit,
598
+ stream: Literal[False] | Omit = omit,
599
+ temperature: float | Omit = omit,
600
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
601
+ tools: Iterable[ToolsParam] | Omit = omit,
602
+ top_k: int | Omit = omit,
603
+ top_p: float | Omit = omit,
604
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
605
+ # The extra values given here take precedence over values defined on the client or passed to this method.
606
+ extra_headers: Headers | None = None,
607
+ extra_query: Query | None = None,
608
+ extra_body: Body | None = None,
609
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
610
+ ) -> ChatCompletion:
188
611
  """
189
- Async method to generate completions based on a given prompt using a specified model.
612
+ Query a chat model.
190
613
 
191
614
  Args:
192
- messages (List[Dict[str, str]]): A list of messages in the format
193
- `[{"role": together.types.chat_completions.MessageRole, "content": TEXT}, ...]`
194
- model (str): The name of the model to query.
195
- max_tokens (int, optional): The maximum number of tokens to generate.
196
- Defaults to 512.
197
- stop (List[str], optional): List of strings at which to stop generation.
198
- Defaults to None.
199
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
200
- Defaults to None.
201
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
202
- of choices for each predicted token based on the cumulative probabilities.
203
- Defaults to None.
204
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
205
- next predicted word or token.
206
- Defaults to None.
207
- repetition_penalty (float, optional): A number that controls the diversity of generated text
208
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
209
- Defaults to None.
210
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
211
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
212
- Must be in the range [-2, 2].
213
- Defaults to None.
214
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
215
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
216
- Must be in the range [-2, 2].
217
- Defaults to None.
218
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
219
- be considered during sampling.
220
- Must be in the range [0, 1].
221
- Defaults to None.
222
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
223
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
224
- Defaults to None.
225
- seed (int, optional): A seed value to use for reproducibility.
226
- stream (bool, optional): Flag indicating whether to stream the generated completions.
227
- Defaults to False.
228
- logprobs (int, optional): Number of top-k logprobs to return
229
- Defaults to None.
230
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
231
- Defaults to None.
232
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
233
- Defaults to None.
234
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
235
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
236
- Defaults to None.
237
- response_format (Dict[str, Any], optional): An object specifying the format that the model must output.
238
- Defaults to None.
239
- tools (Dict[str, str | Dict[str, str | Dict[str, Any]]], optional): A list of tools the model may call.
240
- Currently, only functions are supported as a tool.
241
- Use this to provide a list of functions the model may generate JSON inputs for.
242
- Defaults to None
243
- tool_choice: Controls which (if any) function is called by the model. auto means the model can pick
244
- between generating a message or calling a function. Specifying a particular function
245
- via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
246
- Sets to `auto` if None.
247
- Defaults to None.
248
-
249
- Returns:
250
- AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: Object containing the completions
251
- or an iterator over completion chunks.
615
+ messages: A list of messages comprising the conversation so far.
616
+
617
+ model: The name of the model to query.
618
+
619
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
620
+
621
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
622
+ length of the model. When set to 'error', API will return 400 with appropriate
623
+ error message. When set to 'truncate', override the max_tokens with maximum
624
+ context length of the model.
625
+
626
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
627
+ return prompt logprobs.
628
+
629
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
630
+ repeating tokens that have already been mentioned.
631
+
632
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
633
+
634
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
635
+ at each generation step, instead of just the sampled token. Log probabilities
636
+ help assess model confidence in token predictions.
637
+
638
+ max_tokens: The maximum number of tokens to generate.
639
+
640
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
641
+
642
+ n: The number of completions to generate for each prompt.
643
+
644
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
645
+ a model talking about new topics.
646
+
647
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
648
+ responses. Higher values may result in more thoughtful and detailed responses
649
+ but may take longer to generate.
650
+
651
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
652
+ likelihood of repeated sequences. Higher values decrease repetition.
653
+
654
+ response_format: An object specifying the format that the model must output.
655
+
656
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
657
+ available moderation models found
658
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
659
+
660
+ seed: Seed value for reproducibility.
661
+
662
+ stop: A list of string sequences that will truncate (stop) inference text output. For
663
+ example, "</s>" will stop generation as soon as the model generates the given
664
+ token.
665
+
666
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
667
+ of waiting for the full model response. The stream terminates with
668
+ `data: [DONE]`. If false, return a single JSON object containing the results.
669
+
670
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
671
+ response. A temperature less than 1 favors more correctness and is appropriate
672
+ for question answering or summarization. A value closer to 1 introduces more
673
+ randomness in the output.
674
+
675
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
676
+ which lets the model pick between generating a message or calling a function.
677
+
678
+ tools: A list of tools the model may call. Currently, only functions are supported as a
679
+ tool. Use this to provide a list of functions the model may generate JSON inputs
680
+ for.
681
+
682
+ top_k: An integer that's used to limit the number of choices for the next predicted
683
+ word or token. It specifies the maximum number of tokens to consider at each
684
+ step, based on their probability of occurrence. This technique helps to speed up
685
+ the generation process and can improve the quality of the generated text by
686
+ focusing on the most likely options.
687
+
688
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
689
+ adjust the number of choices for each predicted token based on the cumulative
690
+ probabilities. It specifies a probability threshold below which all less likely
691
+ tokens are filtered out. This technique helps maintain diversity and generate
692
+ more fluent and natural-sounding text.
693
+
694
+ extra_headers: Send extra headers
695
+
696
+ extra_query: Add additional query parameters to the request
697
+
698
+ extra_body: Add additional JSON properties to the request
699
+
700
+ timeout: Override the client-level default timeout for this request, in seconds
252
701
  """
702
+ ...
253
703
 
254
- requestor = api_requestor.APIRequestor(
255
- client=self._client,
256
- )
+ @overload
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ stream: Literal[True],
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> AsyncStream[ChatCompletionChunk]:
+ """
+ Query a chat model.
+
+ Args:
+ messages: A list of messages comprising the conversation so far.
+
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
+ length of the model. When set to 'error', API will return 400 with appropriate
+ error message. When set to 'truncate', override the max_tokens with maximum
+ context length of the model.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
+ responses. Higher values may result in more thoughtful and detailed responses
+ but may take longer to generate.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ response_format: An object specifying the format that the model must output.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
+ which lets the model pick between generating a message or calling a function.
+
+ tools: A list of tools the model may call. Currently, only functions are supported as a
+ tool. Use this to provide a list of functions the model may generate JSON inputs
+ for.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers

- parameter_payload = ChatCompletionRequest(
- model=model,
- messages=messages,
- top_p=top_p,
- top_k=top_k,
- temperature=temperature,
- max_tokens=max_tokens,
- stop=stop,
- repetition_penalty=repetition_penalty,
- presence_penalty=presence_penalty,
- frequency_penalty=frequency_penalty,
- min_p=min_p,
- logit_bias=logit_bias,
- seed=seed,
- stream=stream,
- logprobs=logprobs,
- echo=echo,
- n=n,
- safety_model=safety_model,
- response_format=response_format,
- tools=tools,
- tool_choice=tool_choice,
- **kwargs,
- ).model_dump(exclude_none=True)
-
- response, _, _ = await requestor.arequest(
- options=TogetherRequest(
- method="POST",
- url="chat/completions",
- params=parameter_payload,
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
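
Editor's note: this overload is selected when stream=True is passed literally, so the declared return type is AsyncStream[ChatCompletionChunk] and consumption is plain async iteration. A hedged sketch follows; the choices[0].delta.content access is the conventional chunk shape, not something shown in this hunk.

import asyncio

from together import AsyncTogether  # assumed client entry point


async def main() -> None:
    client = AsyncTogether()

    stream = await client.chat.completions.create(
        model="Qwen/Qwen2.5-7B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Write a haiku about code review."}],
        stream=True,  # selects the AsyncStream[ChatCompletionChunk] overload
    )

    async for chunk in stream:
        # Assumed chunk shape: each SSE event carries a partial delta; the stream
        # ends when the server sends `data: [DONE]`.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()


asyncio.run(main())
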
+
+ @overload
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ stream: bool,
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+ """
+ Query a chat model.
+
+ Args:
+ messages: A list of messages comprising the conversation so far.
+
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
+ length of the model. When set to 'error', API will return 400 with appropriate
+ error message. When set to 'truncate', override the max_tokens with maximum
+ context length of the model.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
+ responses. Higher values may result in more thoughtful and detailed responses
+ but may take longer to generate.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ response_format: An object specifying the format that the model must output.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
+ which lets the model pick between generating a message or calling a function.
+
+ tools: A list of tools the model may call. Currently, only functions are supported as a
+ tool. Use this to provide a list of functions the model may generate JSON inputs
+ for.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
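
Editor's note: when stream is only known at runtime, this bool overload applies and the caller receives the union ChatCompletion | AsyncStream[ChatCompletionChunk]. One way to narrow it is an isinstance check against the unparameterized AsyncStream. The import path and chunk shape below are assumptions, not taken from this diff.

from together import AsyncTogether  # assumed client entry point
from together._streaming import AsyncStream  # module exists in this release; exact import path is an assumption


async def ask(client: AsyncTogether, prompt: str, stream: bool) -> str:
    result = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=stream,
    )

    if isinstance(result, AsyncStream):
        # Streaming branch: join the partial deltas into one string (assumed chunk shape).
        parts: list[str] = []
        async for chunk in result:
            if chunk.choices and chunk.choices[0].delta.content:
                parts.append(chunk.choices[0].delta.content)
        return "".join(parts)

    # Non-streaming branch: a single parsed ChatCompletion.
    return result.choices[0].message.content or ""
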
+
+ @required_args(["messages", "model"], ["messages", "model", "stream"])
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Literal[True] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+ return await self._post(
+ "/chat/completions",
+ body=await async_maybe_transform(
+ {
+ "messages": messages,
+ "model": model,
+ "context_length_exceeded_behavior": context_length_exceeded_behavior,
+ "echo": echo,
+ "frequency_penalty": frequency_penalty,
+ "function_call": function_call,
+ "logit_bias": logit_bias,
+ "logprobs": logprobs,
+ "max_tokens": max_tokens,
+ "min_p": min_p,
+ "n": n,
+ "presence_penalty": presence_penalty,
+ "reasoning_effort": reasoning_effort,
+ "repetition_penalty": repetition_penalty,
+ "response_format": response_format,
+ "safety_model": safety_model,
+ "seed": seed,
+ "stop": stop,
+ "stream": stream,
+ "temperature": temperature,
+ "tool_choice": tool_choice,
+ "tools": tools,
+ "top_k": top_k,
+ "top_p": top_p,
+ },
+ completion_create_params.CompletionCreateParamsStreaming
+ if stream
+ else completion_create_params.CompletionCreateParamsNonStreaming,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
  ),
- stream=stream,
+ cast_to=ChatCompletion,
+ stream=stream or False,
+ stream_cls=AsyncStream[ChatCompletionChunk],
  )

- if stream:
- # must be an iterator
- assert not isinstance(response, TogetherResponse)
- return (ChatCompletionChunk(**line.data) async for line in response)
- assert isinstance(response, TogetherResponse)
- return ChatCompletionResponse(**response.data)
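
Editor's note: the removed lines above are the 1.5.x request path: a ChatCompletionRequest dumped with exclude_none=True, a hand-built APIRequestor call, and a bare async generator of ChatCompletionChunk for streaming. The added implementation routes the same payload through _post() and hands streaming to AsyncStream via stream_cls, so the visible change for callers is mainly the return types. A rough sketch of the 2.x non-streaming call under those assumptions (the 1.5.x behavior is inferred from the removed code, not restated from its docs):

from together import AsyncTogether  # assumed client entry point


async def non_streaming_call(client: AsyncTogether) -> None:
    completion = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Hello"}],
        stream=False,  # `stream or False` in the new implementation keeps this a plain POST
    )
    # 1.5.x returned a ChatCompletionResponse built directly from the raw payload;
    # 2.x returns a ChatCompletion parsed by the generated client machinery.
    print(type(completion).__name__)
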
+
+ class CompletionsResourceWithRawResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_raw_response_wrapper(
+ completions.create,
+ )
+
+
+ class AsyncCompletionsResourceWithRawResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_raw_response_wrapper(
+ completions.create,
+ )
+
+
+ class CompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_streamed_response_wrapper(
+ completions.create,
+ )
+
+
+ class AsyncCompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_streamed_response_wrapper(
+ completions.create,
+ )
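
Editor's note: the four wrapper classes above re-expose create through to_raw_response_wrapper / to_streamed_response_wrapper and their async counterparts. Assuming the resource surfaces them through with_raw_response and with_streaming_response properties, as is conventional for clients generated in this style (the property names are not shown in this hunk), usage would look roughly like the sketch below; the .headers and .parse() accessors are also assumptions based on the wrapper pattern, not lines from this diff.

from together import Together  # assumed synchronous client entry point

client = Together()

# Raw-response access: get the HTTP-level response first, then the parsed model on demand.
response = client.chat.completions.with_raw_response.create(
    model="Qwen/Qwen2.5-72B-Instruct-Turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.headers.get("x-request-id"))  # inspect response headers (assumed accessor)
completion = response.parse()                # parsed ChatCompletion model (assumed accessor)
print(completion.choices[0].message.content)
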