together 1.2.11-py3-none-any.whl → 2.0.0a8-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (201)
  1. together/__init__.py +101 -63
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/lib/cli/api/endpoints.py +467 -0
  30. together/lib/cli/api/evals.py +588 -0
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/lib/cli/api/fine_tuning.py +566 -0
  33. together/lib/cli/api/models.py +140 -0
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +17 -23
  36. together/lib/constants.py +61 -0
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +455 -0
  44. together/{utils → lib/utils}/__init__.py +7 -10
  45. together/{utils → lib/utils}/_log.py +18 -13
  46. together/lib/utils/files.py +628 -0
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +17 -2
  49. together/resources/__init__.py +225 -24
  50. together/resources/audio/__init__.py +75 -0
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +605 -0
  53. together/resources/audio/transcriptions.py +282 -0
  54. together/resources/audio/translations.py +256 -0
  55. together/resources/audio/voices.py +135 -0
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -257
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +890 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +711 -0
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -120
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +256 -108
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +251 -44
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +374 -0
  75. together/types/__init__.py +66 -73
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_job.py +57 -0
  157. together-2.0.0a8.dist-info/METADATA +680 -0
  158. together-2.0.0a8.dist-info/RECORD +164 -0
  159. {together-1.2.11.dist-info → together-2.0.0a8.dist-info}/WHEEL +1 -1
  160. together-2.0.0a8.dist-info/entry_points.txt +2 -0
  161. {together-1.2.11.dist-info → together-2.0.0a8.dist-info/licenses}/LICENSE +1 -1
  162. together/abstract/api_requestor.py +0 -723
  163. together/cli/api/chat.py +0 -276
  164. together/cli/api/completions.py +0 -119
  165. together/cli/api/finetune.py +0 -272
  166. together/cli/api/images.py +0 -82
  167. together/cli/api/models.py +0 -42
  168. together/client.py +0 -157
  169. together/constants.py +0 -31
  170. together/error.py +0 -191
  171. together/filemanager.py +0 -388
  172. together/legacy/__init__.py +0 -0
  173. together/legacy/base.py +0 -27
  174. together/legacy/complete.py +0 -93
  175. together/legacy/embeddings.py +0 -27
  176. together/legacy/files.py +0 -146
  177. together/legacy/finetune.py +0 -177
  178. together/legacy/images.py +0 -27
  179. together/legacy/models.py +0 -44
  180. together/resources/finetune.py +0 -489
  181. together/together_response.py +0 -50
  182. together/types/abstract.py +0 -26
  183. together/types/chat_completions.py +0 -171
  184. together/types/common.py +0 -65
  185. together/types/completions.py +0 -104
  186. together/types/embeddings.py +0 -35
  187. together/types/error.py +0 -16
  188. together/types/files.py +0 -89
  189. together/types/finetune.py +0 -265
  190. together/types/images.py +0 -42
  191. together/types/models.py +0 -44
  192. together/types/rerank.py +0 -43
  193. together/utils/api_helpers.py +0 -84
  194. together/utils/files.py +0 -204
  195. together/version.py +0 -6
  196. together-1.2.11.dist-info/METADATA +0 -408
  197. together-1.2.11.dist-info/RECORD +0 -58
  198. together-1.2.11.dist-info/entry_points.txt +0 -3
  199. /together/{abstract → lib/cli}/__init__.py +0 -0
  200. /together/{cli → lib/cli/api}/__init__.py +0 -0
  201. /together/{cli/api/__init__.py → py.typed} +0 -0
together/resources/completions.py
@@ -1,255 +1,920 @@
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
  from __future__ import annotations

- from typing import AsyncGenerator, Dict, Iterator, List, Any
+ from typing import Dict, Union
+ from typing_extensions import Literal, overload
+
+ import httpx

- from together.abstract import api_requestor
- from together.together_response import TogetherResponse
- from together.types import (
- CompletionChunk,
- CompletionRequest,
- CompletionResponse,
- TogetherClient,
- TogetherRequest,
+ from ..types import completion_create_params
+ from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+ from .._utils import required_args, maybe_transform, async_maybe_transform
+ from .._compat import cached_property
+ from .._resource import SyncAPIResource, AsyncAPIResource
+ from .._response import (
+ to_raw_response_wrapper,
+ to_streamed_response_wrapper,
+ async_to_raw_response_wrapper,
+ async_to_streamed_response_wrapper,
  )
+ from .._streaming import Stream, AsyncStream
+ from .._base_client import make_request_options
+ from ..types.completion import Completion
+ from ..types.completion_chunk import CompletionChunk
+
+ __all__ = ["CompletionsResource", "AsyncCompletionsResource"]
+
+
+ class CompletionsResource(SyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> CompletionsResourceWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+ """
+ return CompletionsResourceWithRawResponse(self)

+ @cached_property
+ def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.

- class Completions:
- def __init__(self, client: TogetherClient) -> None:
- self._client = client
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+ """
+ return CompletionsResourceWithStreamingResponse(self)

+ @overload
  def create(
  self,
  *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
  prompt: str,
- model: str,
- max_tokens: int | None = 512,
- stop: List[str] | None = None,
- temperature: float | None = None,
- top_p: float | None = None,
- top_k: int | None = None,
- repetition_penalty: float | None = None,
- presence_penalty: float | None = None,
- frequency_penalty: float | None = None,
- min_p: float | None = None,
- logit_bias: Dict[str, float] | None = None,
- stream: bool = False,
- logprobs: int | None = None,
- echo: bool | None = None,
- n: int | None = None,
- safety_model: str | None = None,
- **kwargs: Any,
- ) -> CompletionResponse | Iterator[CompletionChunk]:
- """
- Method to generate completions based on a given prompt using a specified model.
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion:
+ """
+ Query a language, code, or image model.

  Args:
- prompt (str): A string providing context for the model to complete.
- model (str): The name of the model to query.
- max_tokens (int, optional): The maximum number of tokens to generate.
- Defaults to 512.
- stop (List[str], optional): List of strings at which to stop generation.
- Defaults to None.
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
- Defaults to None.
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
- of choices for each predicted token based on the cumulative probabilities.
- Defaults to None.
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
- next predicted word or token.
- Defaults to None.
- repetition_penalty (float, optional): A number that controls the diversity of generated text
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
- Defaults to None.
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
- Must be in the range [-2, 2].
- Defaults to None.
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
- Must be in the range [-2, 2].
- Defaults to None.
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
- be considered during sampling.
- Must be in the range [0, 1].
- Defaults to None.
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
- Defaults to None.
- stream (bool, optional): Flag indicating whether to stream the generated completions.
- Defaults to False.
- logprobs (int, optional): Number of top-k logprobs to return
- Defaults to None.
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
- Defaults to None.
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
- Defaults to None.
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
- Defaults to None.
-
- Returns:
- CompletionResponse | Iterator[CompletionChunk]: Object containing the completions
- or an iterator over completion chunks.
- """
-
- requestor = api_requestor.APIRequestor(
- client=self._client,
- )
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
+ def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ stream: Literal[True],
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Stream[CompletionChunk]:
+ """
+ Query a language, code, or image model.
+
+ Args:
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.

- parameter_payload = CompletionRequest(
- model=model,
- prompt=prompt,
- top_p=top_p,
- top_k=top_k,
- temperature=temperature,
- max_tokens=max_tokens,
- stop=stop,
- repetition_penalty=repetition_penalty,
- presence_penalty=presence_penalty,
- frequency_penalty=frequency_penalty,
- min_p=min_p,
- logit_bias=logit_bias,
- stream=stream,
- logprobs=logprobs,
- echo=echo,
- n=n,
- safety_model=safety_model,
- **kwargs,
- ).model_dump(exclude_none=True)
-
- response, _, _ = requestor.request(
- options=TogetherRequest(
- method="POST",
- url="completions",
- params=parameter_payload,
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
+ def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ stream: bool,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion | Stream[CompletionChunk]:
+ """
+ Query a language, code, or image model.
+
+ Args:
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+ def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Literal[True] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion | Stream[CompletionChunk]:
+ return self._post(
+ "/completions",
+ body=maybe_transform(
+ {
+ "model": model,
+ "prompt": prompt,
+ "echo": echo,
+ "frequency_penalty": frequency_penalty,
+ "logit_bias": logit_bias,
+ "logprobs": logprobs,
+ "max_tokens": max_tokens,
+ "min_p": min_p,
+ "n": n,
+ "presence_penalty": presence_penalty,
+ "repetition_penalty": repetition_penalty,
+ "safety_model": safety_model,
+ "seed": seed,
+ "stop": stop,
+ "stream": stream,
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ },
+ completion_create_params.CompletionCreateParamsStreaming
+ if stream
+ else completion_create_params.CompletionCreateParamsNonStreaming,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
  ),
- stream=stream,
+ cast_to=Completion,
+ stream=stream or False,
+ stream_cls=Stream[CompletionChunk],
  )

- if stream:
- # must be an iterator
- assert not isinstance(response, TogetherResponse)
- return (CompletionChunk(**line.data) for line in response)
- assert isinstance(response, TogetherResponse)
- return CompletionResponse(**response.data)

+ class AsyncCompletionsResource(AsyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+ """
+ return AsyncCompletionsResourceWithRawResponse(self)

- class AsyncCompletions:
- def __init__(self, client: TogetherClient) -> None:
- self._client = client
+ @cached_property
+ def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+ """
+ return AsyncCompletionsResourceWithStreamingResponse(self)

+ @overload
  async def create(
  self,
  *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
  prompt: str,
- model: str,
- max_tokens: int | None = 512,
- stop: List[str] | None = None,
- temperature: float | None = None,
- top_p: float | None = None,
- top_k: int | None = None,
- repetition_penalty: float | None = None,
- presence_penalty: float | None = None,
- frequency_penalty: float | None = None,
- min_p: float | None = None,
- logit_bias: Dict[str, float] | None = None,
- stream: bool = False,
- logprobs: int | None = None,
- echo: bool | None = None,
- n: int | None = None,
- safety_model: str | None = None,
- **kwargs: Any,
- ) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse:
- """
- Async method to generate completions based on a given prompt using a specified model.
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion:
+ """
+ Query a language, code, or image model.

  Args:
- prompt (str): A string providing context for the model to complete.
- model (str): The name of the model to query.
- max_tokens (int, optional): The maximum number of tokens to generate.
- Defaults to 512.
- stop (List[str], optional): List of strings at which to stop generation.
- Defaults to None.
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
- Defaults to None.
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
- of choices for each predicted token based on the cumulative probabilities.
- Defaults to None.
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
- next predicted word or token.
- Defaults to None.
- repetition_penalty (float, optional): A number that controls the diversity of generated text
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
- Defaults to None.
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
- Must be in the range [-2, 2].
- Defaults to None.
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
- Must be in the range [-2, 2].
- Defaults to None.
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
- be considered during sampling.
- Must be in the range [0, 1].
- Defaults to None.
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
- Defaults to None.
- stream (bool, optional): Flag indicating whether to stream the generated completions.
- Defaults to False.
- logprobs (int, optional): Number of top-k logprobs to return
- Defaults to None.
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
- Defaults to None.
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
- Defaults to None.
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
- Defaults to None.
-
- Returns:
- AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions
- or an iterator over completion chunks.
- """
-
- requestor = api_requestor.APIRequestor(
- client=self._client,
- )
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
+ async def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ stream: Literal[True],
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> AsyncStream[CompletionChunk]:
+ """
+ Query a language, code, or image model.
+
+ Args:
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request

- parameter_payload = CompletionRequest(
- model=model,
- prompt=prompt,
- top_p=top_p,
- top_k=top_k,
- temperature=temperature,
- max_tokens=max_tokens,
- stop=stop,
- repetition_penalty=repetition_penalty,
- presence_penalty=presence_penalty,
- frequency_penalty=frequency_penalty,
- min_p=min_p,
- logit_bias=logit_bias,
- stream=stream,
- logprobs=logprobs,
- echo=echo,
- n=n,
- safety_model=safety_model,
- **kwargs,
- ).model_dump(exclude_none=True)
-
- response, _, _ = await requestor.arequest(
- options=TogetherRequest(
- method="POST",
- url="completions",
- params=parameter_payload,
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
+ async def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ stream: bool,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion | AsyncStream[CompletionChunk]:
+ """
+ Query a language, code, or image model.
+
+ Args:
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ prompt: A string providing context for the model to complete.
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+ async def create(
+ self,
+ *,
+ model: Union[
+ Literal[
+ "meta-llama/Llama-2-70b-hf",
+ "mistralai/Mistral-7B-v0.1",
+ "mistralai/Mixtral-8x7B-v0.1",
+ "Meta-Llama/Llama-Guard-7b",
+ ],
+ str,
+ ],
+ prompt: str,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Literal[True] | Omit = omit,
+ temperature: float | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> Completion | AsyncStream[CompletionChunk]:
+ return await self._post(
+ "/completions",
+ body=await async_maybe_transform(
+ {
+ "model": model,
+ "prompt": prompt,
+ "echo": echo,
+ "frequency_penalty": frequency_penalty,
+ "logit_bias": logit_bias,
+ "logprobs": logprobs,
+ "max_tokens": max_tokens,
+ "min_p": min_p,
+ "n": n,
+ "presence_penalty": presence_penalty,
+ "repetition_penalty": repetition_penalty,
+ "safety_model": safety_model,
+ "seed": seed,
+ "stop": stop,
+ "stream": stream,
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ },
+ completion_create_params.CompletionCreateParamsStreaming
+ if stream
+ else completion_create_params.CompletionCreateParamsNonStreaming,
  ),
- stream=stream,
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ ),
+ cast_to=Completion,
+ stream=stream or False,
+ stream_cls=AsyncStream[CompletionChunk],
+ )
+
+
+ class CompletionsResourceWithRawResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_raw_response_wrapper(
+ completions.create,
  )

- if stream:
- # must be an iterator
- assert not isinstance(response, TogetherResponse)
- return (CompletionChunk(**line.data) async for line in response)
- assert isinstance(response, TogetherResponse)
- return CompletionResponse(**response.data)
+
+ class AsyncCompletionsResourceWithRawResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_raw_response_wrapper(
+ completions.create,
+ )
+
+
+ class CompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_streamed_response_wrapper(
+ completions.create,
+ )
+
+
+ class AsyncCompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_streamed_response_wrapper(
+ completions.create,
+ )
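
For orientation, the rewritten resource keeps the `completions.create(...)` call shape of 1.2.11 but moves streaming into typed overloads: non-streaming calls return `Completion`, while `stream=True` returns `Stream[CompletionChunk]` (or `AsyncStream[CompletionChunk]` on the async resource). A minimal usage sketch follows; it assumes the top-level client added in `together/_client.py` is exported as `together.Together` and picks up `TOGETHER_API_KEY` from the environment, neither of which is shown in this hunk.

# Minimal sketch against the CompletionsResource shown above.
# Assumptions (not confirmed by this diff): the client class is `together.Together`
# and the API key is read from the TOGETHER_API_KEY environment variable.
from together import Together

client = Together()

# Non-streaming: matches the `stream: Literal[False] | Omit = omit` overload -> Completion
completion = client.completions.create(
    model="mistralai/Mixtral-8x7B-v0.1",
    prompt="The capital of France is",
    max_tokens=32,
)
print(completion)

# Streaming: matches the `stream: Literal[True]` overload -> Stream[CompletionChunk]
for chunk in client.completions.create(
    model="mistralai/Mixtral-8x7B-v0.1",
    prompt="Write one sentence about diffs.",
    stream=True,
):
    print(chunk)  # each item is a CompletionChunk parsed from a Server-Sent Event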