together 1.2.11__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/__init__.py +101 -63
- together/_base_client.py +1995 -0
- together/_client.py +1033 -0
- together/_compat.py +219 -0
- together/_constants.py +14 -0
- together/_exceptions.py +108 -0
- together/_files.py +123 -0
- together/_models.py +857 -0
- together/_qs.py +150 -0
- together/_resource.py +43 -0
- together/_response.py +830 -0
- together/_streaming.py +370 -0
- together/_types.py +260 -0
- together/_utils/__init__.py +64 -0
- together/_utils/_compat.py +45 -0
- together/_utils/_datetime_parse.py +136 -0
- together/_utils/_logs.py +25 -0
- together/_utils/_proxy.py +65 -0
- together/_utils/_reflection.py +42 -0
- together/_utils/_resources_proxy.py +24 -0
- together/_utils/_streams.py +12 -0
- together/_utils/_sync.py +58 -0
- together/_utils/_transform.py +457 -0
- together/_utils/_typing.py +156 -0
- together/_utils/_utils.py +421 -0
- together/_version.py +4 -0
- together/lib/.keep +4 -0
- together/lib/__init__.py +23 -0
- together/lib/cli/api/endpoints.py +467 -0
- together/lib/cli/api/evals.py +588 -0
- together/{cli → lib/cli}/api/files.py +20 -17
- together/lib/cli/api/fine_tuning.py +566 -0
- together/lib/cli/api/models.py +140 -0
- together/lib/cli/api/utils.py +50 -0
- together/{cli → lib/cli}/cli.py +17 -23
- together/lib/constants.py +61 -0
- together/lib/resources/__init__.py +11 -0
- together/lib/resources/files.py +999 -0
- together/lib/resources/fine_tuning.py +280 -0
- together/lib/resources/models.py +35 -0
- together/lib/types/__init__.py +13 -0
- together/lib/types/error.py +9 -0
- together/lib/types/fine_tuning.py +455 -0
- together/{utils → lib/utils}/__init__.py +7 -10
- together/{utils → lib/utils}/_log.py +18 -13
- together/lib/utils/files.py +628 -0
- together/lib/utils/serializer.py +10 -0
- together/{utils → lib/utils}/tools.py +17 -2
- together/resources/__init__.py +225 -24
- together/resources/audio/__init__.py +75 -0
- together/resources/audio/audio.py +198 -0
- together/resources/audio/speech.py +605 -0
- together/resources/audio/transcriptions.py +282 -0
- together/resources/audio/translations.py +256 -0
- together/resources/audio/voices.py +135 -0
- together/resources/batches.py +417 -0
- together/resources/chat/__init__.py +30 -21
- together/resources/chat/chat.py +102 -0
- together/resources/chat/completions.py +1063 -257
- together/resources/code_interpreter/__init__.py +33 -0
- together/resources/code_interpreter/code_interpreter.py +258 -0
- together/resources/code_interpreter/sessions.py +135 -0
- together/resources/completions.py +890 -225
- together/resources/embeddings.py +172 -68
- together/resources/endpoints.py +711 -0
- together/resources/evals.py +452 -0
- together/resources/files.py +397 -120
- together/resources/fine_tuning.py +1033 -0
- together/resources/hardware.py +181 -0
- together/resources/images.py +256 -108
- together/resources/jobs.py +214 -0
- together/resources/models.py +251 -44
- together/resources/rerank.py +190 -92
- together/resources/videos.py +374 -0
- together/types/__init__.py +66 -73
- together/types/audio/__init__.py +10 -0
- together/types/audio/speech_create_params.py +75 -0
- together/types/audio/transcription_create_params.py +54 -0
- together/types/audio/transcription_create_response.py +111 -0
- together/types/audio/translation_create_params.py +40 -0
- together/types/audio/translation_create_response.py +70 -0
- together/types/audio/voice_list_response.py +23 -0
- together/types/audio_speech_stream_chunk.py +16 -0
- together/types/autoscaling.py +13 -0
- together/types/autoscaling_param.py +15 -0
- together/types/batch_create_params.py +24 -0
- together/types/batch_create_response.py +14 -0
- together/types/batch_job.py +45 -0
- together/types/batch_list_response.py +10 -0
- together/types/chat/__init__.py +18 -0
- together/types/chat/chat_completion.py +60 -0
- together/types/chat/chat_completion_chunk.py +61 -0
- together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
- together/types/chat/chat_completion_structured_message_text_param.py +13 -0
- together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
- together/types/chat/chat_completion_usage.py +13 -0
- together/types/chat/chat_completion_warning.py +9 -0
- together/types/chat/completion_create_params.py +329 -0
- together/types/code_interpreter/__init__.py +5 -0
- together/types/code_interpreter/session_list_response.py +31 -0
- together/types/code_interpreter_execute_params.py +45 -0
- together/types/completion.py +42 -0
- together/types/completion_chunk.py +66 -0
- together/types/completion_create_params.py +138 -0
- together/types/dedicated_endpoint.py +44 -0
- together/types/embedding.py +24 -0
- together/types/embedding_create_params.py +31 -0
- together/types/endpoint_create_params.py +43 -0
- together/types/endpoint_list_avzones_response.py +11 -0
- together/types/endpoint_list_params.py +18 -0
- together/types/endpoint_list_response.py +41 -0
- together/types/endpoint_update_params.py +27 -0
- together/types/eval_create_params.py +263 -0
- together/types/eval_create_response.py +16 -0
- together/types/eval_list_params.py +21 -0
- together/types/eval_list_response.py +10 -0
- together/types/eval_status_response.py +100 -0
- together/types/evaluation_job.py +139 -0
- together/types/execute_response.py +108 -0
- together/types/file_delete_response.py +13 -0
- together/types/file_list.py +12 -0
- together/types/file_purpose.py +9 -0
- together/types/file_response.py +31 -0
- together/types/file_type.py +7 -0
- together/types/fine_tuning_cancel_response.py +194 -0
- together/types/fine_tuning_content_params.py +24 -0
- together/types/fine_tuning_delete_params.py +11 -0
- together/types/fine_tuning_delete_response.py +12 -0
- together/types/fine_tuning_list_checkpoints_response.py +21 -0
- together/types/fine_tuning_list_events_response.py +12 -0
- together/types/fine_tuning_list_response.py +199 -0
- together/types/finetune_event.py +41 -0
- together/types/finetune_event_type.py +33 -0
- together/types/finetune_response.py +177 -0
- together/types/hardware_list_params.py +16 -0
- together/types/hardware_list_response.py +58 -0
- together/types/image_data_b64.py +15 -0
- together/types/image_data_url.py +15 -0
- together/types/image_file.py +23 -0
- together/types/image_generate_params.py +85 -0
- together/types/job_list_response.py +47 -0
- together/types/job_retrieve_response.py +43 -0
- together/types/log_probs.py +18 -0
- together/types/model_list_response.py +10 -0
- together/types/model_object.py +42 -0
- together/types/model_upload_params.py +36 -0
- together/types/model_upload_response.py +23 -0
- together/types/rerank_create_params.py +36 -0
- together/types/rerank_create_response.py +36 -0
- together/types/tool_choice.py +23 -0
- together/types/tool_choice_param.py +23 -0
- together/types/tools_param.py +23 -0
- together/types/training_method_dpo.py +22 -0
- together/types/training_method_sft.py +18 -0
- together/types/video_create_params.py +86 -0
- together/types/video_job.py +57 -0
- together-2.0.0a8.dist-info/METADATA +680 -0
- together-2.0.0a8.dist-info/RECORD +164 -0
- {together-1.2.11.dist-info → together-2.0.0a8.dist-info}/WHEEL +1 -1
- together-2.0.0a8.dist-info/entry_points.txt +2 -0
- {together-1.2.11.dist-info → together-2.0.0a8.dist-info/licenses}/LICENSE +1 -1
- together/abstract/api_requestor.py +0 -723
- together/cli/api/chat.py +0 -276
- together/cli/api/completions.py +0 -119
- together/cli/api/finetune.py +0 -272
- together/cli/api/images.py +0 -82
- together/cli/api/models.py +0 -42
- together/client.py +0 -157
- together/constants.py +0 -31
- together/error.py +0 -191
- together/filemanager.py +0 -388
- together/legacy/__init__.py +0 -0
- together/legacy/base.py +0 -27
- together/legacy/complete.py +0 -93
- together/legacy/embeddings.py +0 -27
- together/legacy/files.py +0 -146
- together/legacy/finetune.py +0 -177
- together/legacy/images.py +0 -27
- together/legacy/models.py +0 -44
- together/resources/finetune.py +0 -489
- together/together_response.py +0 -50
- together/types/abstract.py +0 -26
- together/types/chat_completions.py +0 -171
- together/types/common.py +0 -65
- together/types/completions.py +0 -104
- together/types/embeddings.py +0 -35
- together/types/error.py +0 -16
- together/types/files.py +0 -89
- together/types/finetune.py +0 -265
- together/types/images.py +0 -42
- together/types/models.py +0 -44
- together/types/rerank.py +0 -43
- together/utils/api_helpers.py +0 -84
- together/utils/files.py +0 -204
- together/version.py +0 -6
- together-1.2.11.dist-info/METADATA +0 -408
- together-1.2.11.dist-info/RECORD +0 -58
- together-1.2.11.dist-info/entry_points.txt +0 -3
- /together/{abstract → lib/cli}/__init__.py +0 -0
- /together/{cli → lib/cli/api}/__init__.py +0 -0
- /together/{cli/api/__init__.py → py.typed} +0 -0
|
@@ -1,255 +1,920 @@
|
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
|
-
from typing import
|
|
5
|
+
from typing import Dict, Union
|
|
6
|
+
from typing_extensions import Literal, overload
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
4
9
|
|
|
5
|
-
from
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
from ..types import completion_create_params
|
|
11
|
+
from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
|
|
12
|
+
from .._utils import required_args, maybe_transform, async_maybe_transform
|
|
13
|
+
from .._compat import cached_property
|
|
14
|
+
from .._resource import SyncAPIResource, AsyncAPIResource
|
|
15
|
+
from .._response import (
|
|
16
|
+
to_raw_response_wrapper,
|
|
17
|
+
to_streamed_response_wrapper,
|
|
18
|
+
async_to_raw_response_wrapper,
|
|
19
|
+
async_to_streamed_response_wrapper,
|
|
13
20
|
)
|
|
21
|
+
from .._streaming import Stream, AsyncStream
|
|
22
|
+
from .._base_client import make_request_options
|
|
23
|
+
from ..types.completion import Completion
|
|
24
|
+
from ..types.completion_chunk import CompletionChunk
|
|
25
|
+
|
|
26
|
+
__all__ = ["CompletionsResource", "AsyncCompletionsResource"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CompletionsResource(SyncAPIResource):
|
|
30
|
+
@cached_property
|
|
31
|
+
def with_raw_response(self) -> CompletionsResourceWithRawResponse:
|
|
32
|
+
"""
|
|
33
|
+
This property can be used as a prefix for any HTTP method call to return
|
|
34
|
+
the raw response object instead of the parsed content.
|
|
35
|
+
|
|
36
|
+
For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
|
|
37
|
+
"""
|
|
38
|
+
return CompletionsResourceWithRawResponse(self)
|
|
14
39
|
|
|
40
|
+
@cached_property
|
|
41
|
+
def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
|
|
42
|
+
"""
|
|
43
|
+
An alternative to `.with_raw_response` that doesn't eagerly read the response body.
|
|
15
44
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
self
|
|
45
|
+
For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
|
|
46
|
+
"""
|
|
47
|
+
return CompletionsResourceWithStreamingResponse(self)
|
|
19
48
|
|
|
49
|
+
@overload
|
|
20
50
|
def create(
|
|
21
51
|
self,
|
|
22
52
|
*,
|
|
53
|
+
model: Union[
|
|
54
|
+
Literal[
|
|
55
|
+
"meta-llama/Llama-2-70b-hf",
|
|
56
|
+
"mistralai/Mistral-7B-v0.1",
|
|
57
|
+
"mistralai/Mixtral-8x7B-v0.1",
|
|
58
|
+
"Meta-Llama/Llama-Guard-7b",
|
|
59
|
+
],
|
|
60
|
+
str,
|
|
61
|
+
],
|
|
23
62
|
prompt: str,
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
presence_penalty: float |
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
63
|
+
echo: bool | Omit = omit,
|
|
64
|
+
frequency_penalty: float | Omit = omit,
|
|
65
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
66
|
+
logprobs: int | Omit = omit,
|
|
67
|
+
max_tokens: int | Omit = omit,
|
|
68
|
+
min_p: float | Omit = omit,
|
|
69
|
+
n: int | Omit = omit,
|
|
70
|
+
presence_penalty: float | Omit = omit,
|
|
71
|
+
repetition_penalty: float | Omit = omit,
|
|
72
|
+
safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
|
|
73
|
+
seed: int | Omit = omit,
|
|
74
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
75
|
+
stream: Literal[False] | Omit = omit,
|
|
76
|
+
temperature: float | Omit = omit,
|
|
77
|
+
top_k: int | Omit = omit,
|
|
78
|
+
top_p: float | Omit = omit,
|
|
79
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
80
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
81
|
+
extra_headers: Headers | None = None,
|
|
82
|
+
extra_query: Query | None = None,
|
|
83
|
+
extra_body: Body | None = None,
|
|
84
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
85
|
+
) -> Completion:
|
|
86
|
+
"""
|
|
87
|
+
Query a language, code, or image model.
|
|
44
88
|
|
|
45
89
|
Args:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
90
|
+
model: The name of the model to query.
|
|
91
|
+
|
|
92
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
93
|
+
|
|
94
|
+
prompt: A string providing context for the model to complete.
|
|
95
|
+
|
|
96
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
97
|
+
return prompt logprobs.
|
|
98
|
+
|
|
99
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
100
|
+
repeating tokens that have already been mentioned.
|
|
101
|
+
|
|
102
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
103
|
+
|
|
104
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
105
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
106
|
+
help assess model confidence in token predictions.
|
|
107
|
+
|
|
108
|
+
max_tokens: The maximum number of tokens to generate.
|
|
109
|
+
|
|
110
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
|
|
111
|
+
|
|
112
|
+
n: The number of completions to generate for each prompt.
|
|
113
|
+
|
|
114
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
115
|
+
a model talking about new topics.
|
|
116
|
+
|
|
117
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
118
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
119
|
+
|
|
120
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
121
|
+
available moderation models found
|
|
122
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
123
|
+
|
|
124
|
+
seed: Seed value for reproducibility.
|
|
125
|
+
|
|
126
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
127
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
128
|
+
token.
|
|
129
|
+
|
|
130
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
131
|
+
of waiting for the full model response. The stream terminates with
|
|
132
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
133
|
+
|
|
134
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
135
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
136
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
137
|
+
randomness in the output.
|
|
138
|
+
|
|
139
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
140
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
141
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
142
|
+
the generation process and can improve the quality of the generated text by
|
|
143
|
+
focusing on the most likely options.
|
|
144
|
+
|
|
145
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
146
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
147
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
148
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
149
|
+
more fluent and natural-sounding text.
|
|
150
|
+
|
|
151
|
+
extra_headers: Send extra headers
|
|
152
|
+
|
|
153
|
+
extra_query: Add additional query parameters to the request
|
|
154
|
+
|
|
155
|
+
extra_body: Add additional JSON properties to the request
|
|
156
|
+
|
|
157
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
158
|
+
"""
|
|
159
|
+
...
|
|
160
|
+
|
|
161
|
+
@overload
|
|
162
|
+
def create(
|
|
163
|
+
self,
|
|
164
|
+
*,
|
|
165
|
+
model: Union[
|
|
166
|
+
Literal[
|
|
167
|
+
"meta-llama/Llama-2-70b-hf",
|
|
168
|
+
"mistralai/Mistral-7B-v0.1",
|
|
169
|
+
"mistralai/Mixtral-8x7B-v0.1",
|
|
170
|
+
"Meta-Llama/Llama-Guard-7b",
|
|
171
|
+
],
|
|
172
|
+
str,
|
|
173
|
+
],
|
|
174
|
+
prompt: str,
|
|
175
|
+
stream: Literal[True],
|
|
176
|
+
echo: bool | Omit = omit,
|
|
177
|
+
frequency_penalty: float | Omit = omit,
|
|
178
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
179
|
+
logprobs: int | Omit = omit,
|
|
180
|
+
max_tokens: int | Omit = omit,
|
|
181
|
+
min_p: float | Omit = omit,
|
|
182
|
+
n: int | Omit = omit,
|
|
183
|
+
presence_penalty: float | Omit = omit,
|
|
184
|
+
repetition_penalty: float | Omit = omit,
|
|
185
|
+
safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
|
|
186
|
+
seed: int | Omit = omit,
|
|
187
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
188
|
+
temperature: float | Omit = omit,
|
|
189
|
+
top_k: int | Omit = omit,
|
|
190
|
+
top_p: float | Omit = omit,
|
|
191
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
192
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
193
|
+
extra_headers: Headers | None = None,
|
|
194
|
+
extra_query: Query | None = None,
|
|
195
|
+
extra_body: Body | None = None,
|
|
196
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
197
|
+
) -> Stream[CompletionChunk]:
|
|
198
|
+
"""
|
|
199
|
+
Query a language, code, or image model.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
model: The name of the model to query.
|
|
203
|
+
|
|
204
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
205
|
+
|
|
206
|
+
prompt: A string providing context for the model to complete.
|
|
207
|
+
|
|
208
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
209
|
+
of waiting for the full model response. The stream terminates with
|
|
210
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
211
|
+
|
|
212
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
213
|
+
return prompt logprobs.
|
|
214
|
+
|
|
215
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
216
|
+
repeating tokens that have already been mentioned.
|
|
217
|
+
|
|
218
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
219
|
+
|
|
220
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
221
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
222
|
+
help assess model confidence in token predictions.
|
|
223
|
+
|
|
224
|
+
max_tokens: The maximum number of tokens to generate.
|
|
225
|
+
|
|
226
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
|
|
227
|
+
|
|
228
|
+
n: The number of completions to generate for each prompt.
|
|
229
|
+
|
|
230
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
231
|
+
a model talking about new topics.
|
|
98
232
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
233
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
234
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
235
|
+
|
|
236
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
237
|
+
available moderation models found
|
|
238
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
239
|
+
|
|
240
|
+
seed: Seed value for reproducibility.
|
|
241
|
+
|
|
242
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
243
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
244
|
+
token.
|
|
245
|
+
|
|
246
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
247
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
248
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
249
|
+
randomness in the output.
|
|
250
|
+
|
|
251
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
252
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
253
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
254
|
+
the generation process and can improve the quality of the generated text by
|
|
255
|
+
focusing on the most likely options.
|
|
256
|
+
|
|
257
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
258
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
259
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
260
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
261
|
+
more fluent and natural-sounding text.
|
|
262
|
+
|
|
263
|
+
extra_headers: Send extra headers
|
|
264
|
+
|
|
265
|
+
extra_query: Add additional query parameters to the request
|
|
266
|
+
|
|
267
|
+
extra_body: Add additional JSON properties to the request
|
|
268
|
+
|
|
269
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
270
|
+
"""
|
|
271
|
+
...
|
|
272
|
+
|
|
273
|
+
@overload
|
|
274
|
+
def create(
|
|
275
|
+
self,
|
|
276
|
+
*,
|
|
277
|
+
model: Union[
|
|
278
|
+
Literal[
|
|
279
|
+
"meta-llama/Llama-2-70b-hf",
|
|
280
|
+
"mistralai/Mistral-7B-v0.1",
|
|
281
|
+
"mistralai/Mixtral-8x7B-v0.1",
|
|
282
|
+
"Meta-Llama/Llama-Guard-7b",
|
|
283
|
+
],
|
|
284
|
+
str,
|
|
285
|
+
],
|
|
286
|
+
prompt: str,
|
|
287
|
+
stream: bool,
|
|
288
|
+
echo: bool | Omit = omit,
|
|
289
|
+
frequency_penalty: float | Omit = omit,
|
|
290
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
291
|
+
logprobs: int | Omit = omit,
|
|
292
|
+
max_tokens: int | Omit = omit,
|
|
293
|
+
min_p: float | Omit = omit,
|
|
294
|
+
n: int | Omit = omit,
|
|
295
|
+
presence_penalty: float | Omit = omit,
|
|
296
|
+
repetition_penalty: float | Omit = omit,
|
|
297
|
+
safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
|
|
298
|
+
seed: int | Omit = omit,
|
|
299
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
300
|
+
temperature: float | Omit = omit,
|
|
301
|
+
top_k: int | Omit = omit,
|
|
302
|
+
top_p: float | Omit = omit,
|
|
303
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
304
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
305
|
+
extra_headers: Headers | None = None,
|
|
306
|
+
extra_query: Query | None = None,
|
|
307
|
+
extra_body: Body | None = None,
|
|
308
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
309
|
+
) -> Completion | Stream[CompletionChunk]:
|
|
310
|
+
"""
|
|
311
|
+
Query a language, code, or image model.
|
|
312
|
+
|
|
313
|
+
Args:
|
|
314
|
+
model: The name of the model to query.
|
|
315
|
+
|
|
316
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
317
|
+
|
|
318
|
+
prompt: A string providing context for the model to complete.
|
|
319
|
+
|
|
320
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
321
|
+
of waiting for the full model response. The stream terminates with
|
|
322
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
323
|
+
|
|
324
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
325
|
+
return prompt logprobs.
|
|
326
|
+
|
|
327
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
328
|
+
repeating tokens that have already been mentioned.
|
|
329
|
+
|
|
330
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
331
|
+
|
|
332
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
333
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
334
|
+
help assess model confidence in token predictions.
|
|
335
|
+
|
|
336
|
+
max_tokens: The maximum number of tokens to generate.
|
|
337
|
+
|
|
338
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
|
|
339
|
+
|
|
340
|
+
n: The number of completions to generate for each prompt.
|
|
341
|
+
|
|
342
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
343
|
+
a model talking about new topics.
|
|
344
|
+
|
|
345
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
346
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
347
|
+
|
|
348
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
349
|
+
available moderation models found
|
|
350
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
351
|
+
|
|
352
|
+
seed: Seed value for reproducibility.
|
|
353
|
+
|
|
354
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
355
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
356
|
+
token.
|
|
357
|
+
|
|
358
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
359
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
360
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
361
|
+
randomness in the output.
|
|
362
|
+
|
|
363
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
364
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
365
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
366
|
+
the generation process and can improve the quality of the generated text by
|
|
367
|
+
focusing on the most likely options.
|
|
368
|
+
|
|
369
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
370
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
371
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
372
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
373
|
+
more fluent and natural-sounding text.
|
|
374
|
+
|
|
375
|
+
extra_headers: Send extra headers
|
|
376
|
+
|
|
377
|
+
extra_query: Add additional query parameters to the request
|
|
378
|
+
|
|
379
|
+
extra_body: Add additional JSON properties to the request
|
|
380
|
+
|
|
381
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
382
|
+
"""
|
|
383
|
+
...
|
|
384
|
+
|
|
385
|
+
@required_args(["model", "prompt"], ["model", "prompt", "stream"])
|
|
386
|
+
def create(
|
|
387
|
+
self,
|
|
388
|
+
*,
|
|
389
|
+
model: Union[
|
|
390
|
+
Literal[
|
|
391
|
+
"meta-llama/Llama-2-70b-hf",
|
|
392
|
+
"mistralai/Mistral-7B-v0.1",
|
|
393
|
+
"mistralai/Mixtral-8x7B-v0.1",
|
|
394
|
+
"Meta-Llama/Llama-Guard-7b",
|
|
395
|
+
],
|
|
396
|
+
str,
|
|
397
|
+
],
|
|
398
|
+
prompt: str,
|
|
399
|
+
echo: bool | Omit = omit,
|
|
400
|
+
frequency_penalty: float | Omit = omit,
|
|
401
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
402
|
+
logprobs: int | Omit = omit,
|
|
403
|
+
max_tokens: int | Omit = omit,
|
|
404
|
+
min_p: float | Omit = omit,
|
|
405
|
+
n: int | Omit = omit,
|
|
406
|
+
presence_penalty: float | Omit = omit,
|
|
407
|
+
repetition_penalty: float | Omit = omit,
|
|
408
|
+
safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
|
|
409
|
+
seed: int | Omit = omit,
|
|
410
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
411
|
+
stream: Literal[False] | Literal[True] | Omit = omit,
|
|
412
|
+
temperature: float | Omit = omit,
|
|
413
|
+
top_k: int | Omit = omit,
|
|
414
|
+
top_p: float | Omit = omit,
|
|
415
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
416
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
417
|
+
extra_headers: Headers | None = None,
|
|
418
|
+
extra_query: Query | None = None,
|
|
419
|
+
extra_body: Body | None = None,
|
|
420
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
421
|
+
) -> Completion | Stream[CompletionChunk]:
|
|
422
|
+
return self._post(
|
|
423
|
+
"/completions",
|
|
424
|
+
body=maybe_transform(
|
|
425
|
+
{
|
|
426
|
+
"model": model,
|
|
427
|
+
"prompt": prompt,
|
|
428
|
+
"echo": echo,
|
|
429
|
+
"frequency_penalty": frequency_penalty,
|
|
430
|
+
"logit_bias": logit_bias,
|
|
431
|
+
"logprobs": logprobs,
|
|
432
|
+
"max_tokens": max_tokens,
|
|
433
|
+
"min_p": min_p,
|
|
434
|
+
"n": n,
|
|
435
|
+
"presence_penalty": presence_penalty,
|
|
436
|
+
"repetition_penalty": repetition_penalty,
|
|
437
|
+
"safety_model": safety_model,
|
|
438
|
+
"seed": seed,
|
|
439
|
+
"stop": stop,
|
|
440
|
+
"stream": stream,
|
|
441
|
+
"temperature": temperature,
|
|
442
|
+
"top_k": top_k,
|
|
443
|
+
"top_p": top_p,
|
|
444
|
+
},
|
|
445
|
+
completion_create_params.CompletionCreateParamsStreaming
|
|
446
|
+
if stream
|
|
447
|
+
else completion_create_params.CompletionCreateParamsNonStreaming,
|
|
448
|
+
),
|
|
449
|
+
options=make_request_options(
|
|
450
|
+
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
|
125
451
|
),
|
|
126
|
-
|
|
452
|
+
cast_to=Completion,
|
|
453
|
+
stream=stream or False,
|
|
454
|
+
stream_cls=Stream[CompletionChunk],
|
|
127
455
|
)
|
|
128
456
|
|
|
129
|
-
if stream:
|
|
130
|
-
# must be an iterator
|
|
131
|
-
assert not isinstance(response, TogetherResponse)
|
|
132
|
-
return (CompletionChunk(**line.data) for line in response)
|
|
133
|
-
assert isinstance(response, TogetherResponse)
|
|
134
|
-
return CompletionResponse(**response.data)
|
|
135
457
|
|
|
458
|
+
class AsyncCompletionsResource(AsyncAPIResource):
    """Async resource whose `create` method POSTs to the `/completions` endpoint."""

    @cached_property
    def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
        """
        Prefix any HTTP method call with this property to receive the raw
        response object rather than the parsed content.

        For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
        """
        raw_view = AsyncCompletionsResourceWithRawResponse(self)
        return raw_view

    @cached_property
    def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, except that the response body is not eagerly read.

        For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
        """
        streaming_view = AsyncCompletionsResourceWithStreamingResponse(self)
        return streaming_view

    @overload
    async def create(
        self,
        *,
        model: Union[
            Literal[
                "meta-llama/Llama-2-70b-hf",
                "mistralai/Mistral-7B-v0.1",
                "mistralai/Mixtral-8x7B-v0.1",
                "Meta-Llama/Llama-Guard-7b",
            ],
            str,
        ],
        prompt: str,
        echo: bool | Omit = omit,
        frequency_penalty: float | Omit = omit,
        logit_bias: Dict[str, float] | Omit = omit,
        logprobs: int | Omit = omit,
        max_tokens: int | Omit = omit,
        min_p: float | Omit = omit,
        n: int | Omit = omit,
        presence_penalty: float | Omit = omit,
        repetition_penalty: float | Omit = omit,
        safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
        seed: int | Omit = omit,
        stop: SequenceNotStr[str] | Omit = omit,
        stream: Literal[False] | Omit = omit,
        temperature: float | Omit = omit,
        top_k: int | Omit = omit,
        top_p: float | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Completion:
        """
        Query a language, code, or image model.

        Args:
          model: The name of the model to query.

              [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)

          prompt: A string providing context for the model to complete.

          echo: If true, the response will contain the prompt. Can be used with `logprobs` to
              return prompt logprobs.

          frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
              repeating tokens that have already been mentioned.

          logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.

          logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
              at each generation step, instead of just the sampled token. Log probabilities
              help assess model confidence in token predictions.

          max_tokens: The maximum number of tokens to generate.

          min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.

          n: The number of completions to generate for each prompt.

          presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
              a model talking about new topics.

          repetition_penalty: A number that controls the diversity of generated text by reducing the
              likelihood of repeated sequences. Higher values decrease repetition.

          safety_model: The name of the moderation model used to validate tokens. Choose from the
              available moderation models found
              [here](https://docs.together.ai/docs/inference-models#moderation-models).

          seed: Seed value for reproducibility.

          stop: A list of string sequences that will truncate (stop) inference text output. For
              example, "</s>" will stop generation as soon as the model generates the given
              token.

          stream: If true, stream tokens as Server-Sent Events as the model generates them instead
              of waiting for the full model response. The stream terminates with
              `data: [DONE]`. If false, return a single JSON object containing the results.

          temperature: A decimal number from 0-1 that determines the degree of randomness in the
              response. A temperature less than 1 favors more correctness and is appropriate
              for question answering or summarization. A value closer to 1 introduces more
              randomness in the output.

          top_k: An integer that's used to limit the number of choices for the next predicted
              word or token. It specifies the maximum number of tokens to consider at each
              step, based on their probability of occurrence. This technique helps to speed up
              the generation process and can improve the quality of the generated text by
              focusing on the most likely options.

          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
              adjust the number of choices for each predicted token based on the cumulative
              probabilities. It specifies a probability threshold below which all less likely
              tokens are filtered out. This technique helps maintain diversity and generate
              more fluent and natural-sounding text.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    async def create(
        self,
        *,
        model: Union[
            Literal[
                "meta-llama/Llama-2-70b-hf",
                "mistralai/Mistral-7B-v0.1",
                "mistralai/Mixtral-8x7B-v0.1",
                "Meta-Llama/Llama-Guard-7b",
            ],
            str,
        ],
        prompt: str,
        stream: Literal[True],
        echo: bool | Omit = omit,
        frequency_penalty: float | Omit = omit,
        logit_bias: Dict[str, float] | Omit = omit,
        logprobs: int | Omit = omit,
        max_tokens: int | Omit = omit,
        min_p: float | Omit = omit,
        n: int | Omit = omit,
        presence_penalty: float | Omit = omit,
        repetition_penalty: float | Omit = omit,
        safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
        seed: int | Omit = omit,
        stop: SequenceNotStr[str] | Omit = omit,
        temperature: float | Omit = omit,
        top_k: int | Omit = omit,
        top_p: float | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> AsyncStream[CompletionChunk]:
        """
        Query a language, code, or image model.

        Args:
          model: The name of the model to query.

              [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)

          prompt: A string providing context for the model to complete.

          stream: If true, stream tokens as Server-Sent Events as the model generates them instead
              of waiting for the full model response. The stream terminates with
              `data: [DONE]`. If false, return a single JSON object containing the results.

          echo: If true, the response will contain the prompt. Can be used with `logprobs` to
              return prompt logprobs.

          frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
              repeating tokens that have already been mentioned.

          logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.

          logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
              at each generation step, instead of just the sampled token. Log probabilities
              help assess model confidence in token predictions.

          max_tokens: The maximum number of tokens to generate.

          min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.

          n: The number of completions to generate for each prompt.

          presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
              a model talking about new topics.

          repetition_penalty: A number that controls the diversity of generated text by reducing the
              likelihood of repeated sequences. Higher values decrease repetition.

          safety_model: The name of the moderation model used to validate tokens. Choose from the
              available moderation models found
              [here](https://docs.together.ai/docs/inference-models#moderation-models).

          seed: Seed value for reproducibility.

          stop: A list of string sequences that will truncate (stop) inference text output. For
              example, "</s>" will stop generation as soon as the model generates the given
              token.

          temperature: A decimal number from 0-1 that determines the degree of randomness in the
              response. A temperature less than 1 favors more correctness and is appropriate
              for question answering or summarization. A value closer to 1 introduces more
              randomness in the output.

          top_k: An integer that's used to limit the number of choices for the next predicted
              word or token. It specifies the maximum number of tokens to consider at each
              step, based on their probability of occurrence. This technique helps to speed up
              the generation process and can improve the quality of the generated text by
              focusing on the most likely options.

          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
              adjust the number of choices for each predicted token based on the cumulative
              probabilities. It specifies a probability threshold below which all less likely
              tokens are filtered out. This technique helps maintain diversity and generate
              more fluent and natural-sounding text.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    async def create(
        self,
        *,
        model: Union[
            Literal[
                "meta-llama/Llama-2-70b-hf",
                "mistralai/Mistral-7B-v0.1",
                "mistralai/Mixtral-8x7B-v0.1",
                "Meta-Llama/Llama-Guard-7b",
            ],
            str,
        ],
        prompt: str,
        stream: bool,
        echo: bool | Omit = omit,
        frequency_penalty: float | Omit = omit,
        logit_bias: Dict[str, float] | Omit = omit,
        logprobs: int | Omit = omit,
        max_tokens: int | Omit = omit,
        min_p: float | Omit = omit,
        n: int | Omit = omit,
        presence_penalty: float | Omit = omit,
        repetition_penalty: float | Omit = omit,
        safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
        seed: int | Omit = omit,
        stop: SequenceNotStr[str] | Omit = omit,
        temperature: float | Omit = omit,
        top_k: int | Omit = omit,
        top_p: float | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Completion | AsyncStream[CompletionChunk]:
        """
        Query a language, code, or image model.

        Args:
          model: The name of the model to query.

              [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)

          prompt: A string providing context for the model to complete.

          stream: If true, stream tokens as Server-Sent Events as the model generates them instead
              of waiting for the full model response. The stream terminates with
              `data: [DONE]`. If false, return a single JSON object containing the results.

          echo: If true, the response will contain the prompt. Can be used with `logprobs` to
              return prompt logprobs.

          frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
              repeating tokens that have already been mentioned.

          logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.

          logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
              at each generation step, instead of just the sampled token. Log probabilities
              help assess model confidence in token predictions.

          max_tokens: The maximum number of tokens to generate.

          min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.

          n: The number of completions to generate for each prompt.

          presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
              a model talking about new topics.

          repetition_penalty: A number that controls the diversity of generated text by reducing the
              likelihood of repeated sequences. Higher values decrease repetition.

          safety_model: The name of the moderation model used to validate tokens. Choose from the
              available moderation models found
              [here](https://docs.together.ai/docs/inference-models#moderation-models).

          seed: Seed value for reproducibility.

          stop: A list of string sequences that will truncate (stop) inference text output. For
              example, "</s>" will stop generation as soon as the model generates the given
              token.

          temperature: A decimal number from 0-1 that determines the degree of randomness in the
              response. A temperature less than 1 favors more correctness and is appropriate
              for question answering or summarization. A value closer to 1 introduces more
              randomness in the output.

          top_k: An integer that's used to limit the number of choices for the next predicted
              word or token. It specifies the maximum number of tokens to consider at each
              step, based on their probability of occurrence. This technique helps to speed up
              the generation process and can improve the quality of the generated text by
              focusing on the most likely options.

          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
              adjust the number of choices for each predicted token based on the cumulative
              probabilities. It specifies a probability threshold below which all less likely
              tokens are filtered out. This technique helps maintain diversity and generate
              more fluent and natural-sounding text.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @required_args(["model", "prompt"], ["model", "prompt", "stream"])
    async def create(
        self,
        *,
        model: Union[
            Literal[
                "meta-llama/Llama-2-70b-hf",
                "mistralai/Mistral-7B-v0.1",
                "mistralai/Mixtral-8x7B-v0.1",
                "Meta-Llama/Llama-Guard-7b",
            ],
            str,
        ],
        prompt: str,
        echo: bool | Omit = omit,
        frequency_penalty: float | Omit = omit,
        logit_bias: Dict[str, float] | Omit = omit,
        logprobs: int | Omit = omit,
        max_tokens: int | Omit = omit,
        min_p: float | Omit = omit,
        n: int | Omit = omit,
        presence_penalty: float | Omit = omit,
        repetition_penalty: float | Omit = omit,
        safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
        seed: int | Omit = omit,
        stop: SequenceNotStr[str] | Omit = omit,
        stream: Literal[False] | Literal[True] | Omit = omit,
        temperature: float | Omit = omit,
        top_k: int | Omit = omit,
        top_p: float | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Completion | AsyncStream[CompletionChunk]:
        # Runtime implementation behind the three overloads above; the
        # parameter documentation lives on those overloads.

        # Every keyword goes into the request payload under its own name.
        payload = {
            "model": model,
            "prompt": prompt,
            "echo": echo,
            "frequency_penalty": frequency_penalty,
            "logit_bias": logit_bias,
            "logprobs": logprobs,
            "max_tokens": max_tokens,
            "min_p": min_p,
            "n": n,
            "presence_penalty": presence_penalty,
            "repetition_penalty": repetition_penalty,
            "safety_model": safety_model,
            "seed": seed,
            "stop": stop,
            "stream": stream,
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
        }

        # A truthy `stream` selects the streaming params type; anything falsy
        # selects the non-streaming one.
        if stream:
            params_type = completion_create_params.CompletionCreateParamsStreaming
        else:
            params_type = completion_create_params.CompletionCreateParamsNonStreaming

        request_body = await async_maybe_transform(payload, params_type)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )

        return await self._post(
            "/completions",
            body=request_body,
            options=request_options,
            cast_to=Completion,
            # Fall back to a plain False whenever `stream` is falsy.
            stream=stream or False,
            stream_cls=AsyncStream[CompletionChunk],
        )
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
class CompletionsResourceWithRawResponse:
    """Holds a `CompletionsResource` and exposes its `create` passed through `to_raw_response_wrapper`."""

    def __init__(self, completions: CompletionsResource) -> None:
        # Keep a reference to the underlying resource.
        self._completions = completions

        # Expose `create` under the same name, wrapped.
        wrapped_create = to_raw_response_wrapper(completions.create)
        self.create = wrapped_create
|
|
249
894
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
895
|
+
|
|
896
|
+
class AsyncCompletionsResourceWithRawResponse:
    """Holds an `AsyncCompletionsResource` and exposes its `create` passed through `async_to_raw_response_wrapper`."""

    def __init__(self, completions: AsyncCompletionsResource) -> None:
        # Keep a reference to the underlying async resource.
        self._completions = completions

        # Expose `create` under the same name, wrapped.
        wrapped_create = async_to_raw_response_wrapper(completions.create)
        self.create = wrapped_create
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
class CompletionsResourceWithStreamingResponse:
    """Holds a `CompletionsResource` and exposes its `create` passed through `to_streamed_response_wrapper`."""

    def __init__(self, completions: CompletionsResource) -> None:
        # Keep a reference to the underlying resource.
        self._completions = completions

        # Expose `create` under the same name, wrapped.
        wrapped_create = to_streamed_response_wrapper(completions.create)
        self.create = wrapped_create
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
class AsyncCompletionsResourceWithStreamingResponse:
    """Holds an `AsyncCompletionsResource` and exposes its `create` passed through `async_to_streamed_response_wrapper`."""

    def __init__(self, completions: AsyncCompletionsResource) -> None:
        # Keep a reference to the underlying async resource.
        self._completions = completions

        # Expose `create` under the same name, wrapped.
        wrapped_create = async_to_streamed_response_wrapper(completions.create)
        self.create = wrapped_create
|