together 1.5.34__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. together/__init__.py +101 -114
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +65 -81
  30. together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
  33. together/{cli → lib/cli}/api/models.py +34 -27
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +16 -26
  36. together/{constants.py → lib/constants.py} +11 -24
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +397 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/{utils → lib/utils}/files.py +90 -288
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -39
  50. together/resources/audio/__init__.py +72 -48
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -128
  53. together/resources/audio/transcriptions.py +247 -261
  54. together/resources/audio/translations.py +221 -241
  55. together/resources/audio/voices.py +111 -41
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +589 -477
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -129
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +258 -104
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +223 -193
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +286 -214
  75. together/types/__init__.py +66 -167
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_create_response.py +10 -0
  157. together/types/video_job.py +57 -0
  158. together-2.0.0a6.dist-info/METADATA +729 -0
  159. together-2.0.0a6.dist-info/RECORD +165 -0
  160. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
  161. together-2.0.0a6.dist-info/entry_points.txt +2 -0
  162. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
  163. together/abstract/api_requestor.py +0 -770
  164. together/cli/api/chat.py +0 -298
  165. together/cli/api/completions.py +0 -119
  166. together/cli/api/images.py +0 -93
  167. together/cli/api/utils.py +0 -139
  168. together/client.py +0 -186
  169. together/error.py +0 -194
  170. together/filemanager.py +0 -635
  171. together/legacy/__init__.py +0 -0
  172. together/legacy/base.py +0 -27
  173. together/legacy/complete.py +0 -93
  174. together/legacy/embeddings.py +0 -27
  175. together/legacy/files.py +0 -146
  176. together/legacy/finetune.py +0 -177
  177. together/legacy/images.py +0 -27
  178. together/legacy/models.py +0 -44
  179. together/resources/batch.py +0 -165
  180. together/resources/code_interpreter.py +0 -82
  181. together/resources/evaluation.py +0 -808
  182. together/resources/finetune.py +0 -1388
  183. together/together_response.py +0 -50
  184. together/types/abstract.py +0 -26
  185. together/types/audio_speech.py +0 -311
  186. together/types/batch.py +0 -54
  187. together/types/chat_completions.py +0 -210
  188. together/types/code_interpreter.py +0 -57
  189. together/types/common.py +0 -67
  190. together/types/completions.py +0 -107
  191. together/types/embeddings.py +0 -35
  192. together/types/endpoints.py +0 -123
  193. together/types/error.py +0 -16
  194. together/types/evaluation.py +0 -93
  195. together/types/files.py +0 -93
  196. together/types/finetune.py +0 -464
  197. together/types/images.py +0 -42
  198. together/types/models.py +0 -96
  199. together/types/rerank.py +0 -43
  200. together/types/videos.py +0 -69
  201. together/utils/api_helpers.py +0 -124
  202. together/version.py +0 -6
  203. together-1.5.34.dist-info/METADATA +0 -583
  204. together-1.5.34.dist-info/RECORD +0 -77
  205. together-1.5.34.dist-info/entry_points.txt +0 -3
  206. /together/{abstract → lib/cli}/__init__.py +0 -0
  207. /together/{cli → lib/cli/api}/__init__.py +0 -0
  208. /together/{cli/api/__init__.py → py.typed} +0 -0
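
Many of the renames above move modules under a new together.lib package (for example together/utils/* → together/lib/utils/*). A small compatibility sketch under that assumption follows; whether these modules remain importable as public API in 2.0.0a6 is not established by the file list alone.

# Hedged compatibility sketch for the module moves listed above: the 2.0.0a6
# wheel relocates together/utils/* to together/lib/utils/*, so version-agnostic
# code can try the new path first. Importing these internal modules directly is
# an assumption; they may not be part of the supported public API.
try:
    from together.lib.utils import files as file_utils  # 2.0.0a6 layout
except ImportError:
    from together.utils import files as file_utils  # 1.5.34 layout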
together/resources/completions.py
@@ -1,261 +1,920 @@
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
  from __future__ import annotations

- from typing import AsyncGenerator, Dict, Iterator, List, Any
+ from typing import Dict, Union
+ from typing_extensions import Literal, overload
+
+ import httpx

- from together.abstract import api_requestor
- from together.together_response import TogetherResponse
- from together.types import (
-     CompletionChunk,
-     CompletionRequest,
-     CompletionResponse,
-     TogetherClient,
-     TogetherRequest,
+ from ..types import completion_create_params
+ from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+ from .._utils import required_args, maybe_transform, async_maybe_transform
+ from .._compat import cached_property
+ from .._resource import SyncAPIResource, AsyncAPIResource
+ from .._response import (
+     to_raw_response_wrapper,
+     to_streamed_response_wrapper,
+     async_to_raw_response_wrapper,
+     async_to_streamed_response_wrapper,
  )
+ from .._streaming import Stream, AsyncStream
+ from .._base_client import make_request_options
+ from ..types.completion import Completion
+ from ..types.completion_chunk import CompletionChunk
+
+ __all__ = ["CompletionsResource", "AsyncCompletionsResource"]
+
+
+ class CompletionsResource(SyncAPIResource):
+     @cached_property
+     def with_raw_response(self) -> CompletionsResourceWithRawResponse:
+         """
+         This property can be used as a prefix for any HTTP method call to return
+         the raw response object instead of the parsed content.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+         """
+         return CompletionsResourceWithRawResponse(self)

+     @cached_property
+     def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
+         """
+         An alternative to `.with_raw_response` that doesn't eagerly read the response body.

- class Completions:
-     def __init__(self, client: TogetherClient) -> None:
-         self._client = client
+         For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+         """
+         return CompletionsResourceWithStreamingResponse(self)

+     @overload
      def create(
          self,
          *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
          prompt: str,
-         model: str,
-         max_tokens: int | None = 512,
-         stop: List[str] | None = None,
-         temperature: float | None = None,
-         top_p: float | None = None,
-         top_k: int | None = None,
-         repetition_penalty: float | None = None,
-         presence_penalty: float | None = None,
-         frequency_penalty: float | None = None,
-         min_p: float | None = None,
-         logit_bias: Dict[str, float] | None = None,
-         seed: int | None = None,
-         stream: bool = False,
-         logprobs: int | None = None,
-         echo: bool | None = None,
-         n: int | None = None,
-         safety_model: str | None = None,
-         **kwargs: Any,
-     ) -> CompletionResponse | Iterator[CompletionChunk]:
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion:
          """
-         Method to generate completions based on a given prompt using a specified model.
+         Query a language, code, or image model.

          Args:
-             prompt (str): A string providing context for the model to complete.
-             model (str): The name of the model to query.
-             max_tokens (int, optional): The maximum number of tokens to generate.
-                 Defaults to 512.
-             stop (List[str], optional): List of strings at which to stop generation.
-                 Defaults to None.
-             temperature (float, optional): A decimal number that determines the degree of randomness in the response.
-                 Defaults to None.
-             top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
-                 of choices for each predicted token based on the cumulative probabilities.
-                 Defaults to None.
-             top_k (int, optional): The top_k parameter is used to limit the number of choices for the
-                 next predicted word or token.
-                 Defaults to None.
-             repetition_penalty (float, optional): A number that controls the diversity of generated text
-                 by reducing the likelihood of repeated sequences. Higher values decrease repetition.
-                 Defaults to None.
-             presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
-                 appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
-                 of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
-                 be considered during sampling.
-                 Must be in the range [0, 1].
-                 Defaults to None.
-             logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
-                 likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
-                 Defaults to None.
-             seed (int, optional): Seed value for reproducibility.
-             stream (bool, optional): Flag indicating whether to stream the generated completions.
-                 Defaults to False.
-             logprobs (int, optional): Number of top-k logprobs to return
-                 Defaults to None.
-             echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
-                 Defaults to None.
-             n (int, optional): Number of completions to generate. Setting to None will return a single generation.
-                 Defaults to None.
-             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
-                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
-                 Defaults to None.
-
-         Returns:
-             CompletionResponse | Iterator[CompletionChunk]: Object containing the completions
-                 or an iterator over completion chunks.
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
          """
+         ...

-         requestor = api_requestor.APIRequestor(
-             client=self._client,
-         )
+     @overload
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: Literal[True],
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Stream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).

-         parameter_payload = CompletionRequest(
-             model=model,
-             prompt=prompt,
-             top_p=top_p,
-             top_k=top_k,
-             temperature=temperature,
-             max_tokens=max_tokens,
-             stop=stop,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             frequency_penalty=frequency_penalty,
-             seed=seed,
-             min_p=min_p,
-             logit_bias=logit_bias,
-             stream=stream,
-             logprobs=logprobs,
-             echo=echo,
-             n=n,
-             safety_model=safety_model,
-             **kwargs,
-         ).model_dump(exclude_none=True)
-
-         response, _, _ = requestor.request(
-             options=TogetherRequest(
-                 method="POST",
-                 url="completions",
-                 params=parameter_payload,
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @overload
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: bool,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | Stream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Literal[True] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | Stream[CompletionChunk]:
+         return self._post(
+             "/completions",
+             body=maybe_transform(
+                 {
+                     "model": model,
+                     "prompt": prompt,
+                     "echo": echo,
+                     "frequency_penalty": frequency_penalty,
+                     "logit_bias": logit_bias,
+                     "logprobs": logprobs,
+                     "max_tokens": max_tokens,
+                     "min_p": min_p,
+                     "n": n,
+                     "presence_penalty": presence_penalty,
+                     "repetition_penalty": repetition_penalty,
+                     "safety_model": safety_model,
+                     "seed": seed,
+                     "stop": stop,
+                     "stream": stream,
+                     "temperature": temperature,
+                     "top_k": top_k,
+                     "top_p": top_p,
+                 },
+                 completion_create_params.CompletionCreateParamsStreaming
+                 if stream
+                 else completion_create_params.CompletionCreateParamsNonStreaming,
+             ),
+             options=make_request_options(
+                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
              ),
-             stream=stream,
+             cast_to=Completion,
+             stream=stream or False,
+             stream_cls=Stream[CompletionChunk],
          )

-         if stream:
-             # must be an iterator
-             assert not isinstance(response, TogetherResponse)
-             return (CompletionChunk(**line.data) for line in response)
-         assert isinstance(response, TogetherResponse)
-         return CompletionResponse(**response.data)

+ class AsyncCompletionsResource(AsyncAPIResource):
+     @cached_property
+     def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
+         """
+         This property can be used as a prefix for any HTTP method call to return
+         the raw response object instead of the parsed content.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+         """
+         return AsyncCompletionsResourceWithRawResponse(self)

- class AsyncCompletions:
-     def __init__(self, client: TogetherClient) -> None:
-         self._client = client
+     @cached_property
+     def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
+         """
+         An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+         """
+         return AsyncCompletionsResourceWithStreamingResponse(self)

+     @overload
      async def create(
          self,
          *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
          prompt: str,
-         model: str,
-         max_tokens: int | None = 512,
-         stop: List[str] | None = None,
-         temperature: float | None = None,
-         top_p: float | None = None,
-         top_k: int | None = None,
-         repetition_penalty: float | None = None,
-         presence_penalty: float | None = None,
-         frequency_penalty: float | None = None,
-         min_p: float | None = None,
-         logit_bias: Dict[str, float] | None = None,
-         seed: int | None = None,
-         stream: bool = False,
-         logprobs: int | None = None,
-         echo: bool | None = None,
-         n: int | None = None,
-         safety_model: str | None = None,
-         **kwargs: Any,
-     ) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse:
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion:
          """
-         Async method to generate completions based on a given prompt using a specified model.
+         Query a language, code, or image model.

          Args:
-             prompt (str): A string providing context for the model to complete.
-             model (str): The name of the model to query.
-             max_tokens (int, optional): The maximum number of tokens to generate.
-                 Defaults to 512.
-             stop (List[str], optional): List of strings at which to stop generation.
-                 Defaults to None.
-             temperature (float, optional): A decimal number that determines the degree of randomness in the response.
-                 Defaults to None.
-             top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
-                 of choices for each predicted token based on the cumulative probabilities.
-                 Defaults to None.
-             top_k (int, optional): The top_k parameter is used to limit the number of choices for the
-                 next predicted word or token.
-                 Defaults to None.
-             repetition_penalty (float, optional): A number that controls the diversity of generated text
-                 by reducing the likelihood of repeated sequences. Higher values decrease repetition.
-                 Defaults to None.
-             presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
-                 appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
-                 of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
-                 be considered during sampling.
-                 Must be in the range [0, 1].
-                 Defaults to None.
-             logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
-                 likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
-                 Defaults to None.
-             seed (int, optional): Seed value for reproducibility.
-             stream (bool, optional): Flag indicating whether to stream the generated completions.
-                 Defaults to False.
-             logprobs (int, optional): Number of top-k logprobs to return
-                 Defaults to None.
-             echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
-                 Defaults to None.
-             n (int, optional): Number of completions to generate. Setting to None will return a single generation.
-                 Defaults to None.
-             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
-                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
-                 Defaults to None.
-
-         Returns:
-             AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions
-                 or an iterator over completion chunks.
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
          """
+         ...

-         requestor = api_requestor.APIRequestor(
-             client=self._client,
-         )
+     @overload
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: Literal[True],
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> AsyncStream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @overload
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: bool,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | AsyncStream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.

-         parameter_payload = CompletionRequest(
-             model=model,
-             prompt=prompt,
-             top_p=top_p,
-             top_k=top_k,
-             temperature=temperature,
-             max_tokens=max_tokens,
-             stop=stop,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             frequency_penalty=frequency_penalty,
-             min_p=min_p,
-             logit_bias=logit_bias,
-             seed=seed,
-             stream=stream,
-             logprobs=logprobs,
-             echo=echo,
-             n=n,
-             safety_model=safety_model,
-             **kwargs,
-         ).model_dump(exclude_none=True)
-
-         response, _, _ = await requestor.arequest(
-             options=TogetherRequest(
-                 method="POST",
-                 url="completions",
-                 params=parameter_payload,
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Literal[True] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | AsyncStream[CompletionChunk]:
+         return await self._post(
+             "/completions",
+             body=await async_maybe_transform(
+                 {
+                     "model": model,
+                     "prompt": prompt,
+                     "echo": echo,
+                     "frequency_penalty": frequency_penalty,
+                     "logit_bias": logit_bias,
+                     "logprobs": logprobs,
+                     "max_tokens": max_tokens,
+                     "min_p": min_p,
+                     "n": n,
+                     "presence_penalty": presence_penalty,
+                     "repetition_penalty": repetition_penalty,
+                     "safety_model": safety_model,
+                     "seed": seed,
+                     "stop": stop,
+                     "stream": stream,
+                     "temperature": temperature,
+                     "top_k": top_k,
+                     "top_p": top_p,
+                 },
+                 completion_create_params.CompletionCreateParamsStreaming
+                 if stream
+                 else completion_create_params.CompletionCreateParamsNonStreaming,
              ),
-             stream=stream,
+             options=make_request_options(
+                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+             ),
+             cast_to=Completion,
+             stream=stream or False,
+             stream_cls=AsyncStream[CompletionChunk],
+         )
+
+
+ class CompletionsResourceWithRawResponse:
+     def __init__(self, completions: CompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = to_raw_response_wrapper(
+             completions.create,
          )

-         if stream:
-             # must be an iterator
-             assert not isinstance(response, TogetherResponse)
-             return (CompletionChunk(**line.data) async for line in response)
-         assert isinstance(response, TogetherResponse)
-         return CompletionResponse(**response.data)
+
+ class AsyncCompletionsResourceWithRawResponse:
+     def __init__(self, completions: AsyncCompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = async_to_raw_response_wrapper(
+             completions.create,
+         )
+
+
+ class CompletionsResourceWithStreamingResponse:
+     def __init__(self, completions: CompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = to_streamed_response_wrapper(
+             completions.create,
+         )
+
+
+ class AsyncCompletionsResourceWithStreamingResponse:
+     def __init__(self, completions: AsyncCompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = async_to_streamed_response_wrapper(
+             completions.create,
+         )