together 1.5.17__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/__init__.py +101 -63
- together/_base_client.py +1995 -0
- together/_client.py +1033 -0
- together/_compat.py +219 -0
- together/_constants.py +14 -0
- together/_exceptions.py +108 -0
- together/_files.py +123 -0
- together/_models.py +857 -0
- together/_qs.py +150 -0
- together/_resource.py +43 -0
- together/_response.py +830 -0
- together/_streaming.py +370 -0
- together/_types.py +260 -0
- together/_utils/__init__.py +64 -0
- together/_utils/_compat.py +45 -0
- together/_utils/_datetime_parse.py +136 -0
- together/_utils/_logs.py +25 -0
- together/_utils/_proxy.py +65 -0
- together/_utils/_reflection.py +42 -0
- together/_utils/_resources_proxy.py +24 -0
- together/_utils/_streams.py +12 -0
- together/_utils/_sync.py +58 -0
- together/_utils/_transform.py +457 -0
- together/_utils/_typing.py +156 -0
- together/_utils/_utils.py +421 -0
- together/_version.py +4 -0
- together/lib/.keep +4 -0
- together/lib/__init__.py +23 -0
- together/{cli → lib/cli}/api/endpoints.py +108 -75
- together/lib/cli/api/evals.py +588 -0
- together/{cli → lib/cli}/api/files.py +20 -17
- together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +161 -120
- together/lib/cli/api/models.py +140 -0
- together/{cli → lib/cli}/api/utils.py +6 -7
- together/{cli → lib/cli}/cli.py +16 -24
- together/{constants.py → lib/constants.py} +17 -12
- together/lib/resources/__init__.py +11 -0
- together/lib/resources/files.py +999 -0
- together/lib/resources/fine_tuning.py +280 -0
- together/lib/resources/models.py +35 -0
- together/lib/types/__init__.py +13 -0
- together/lib/types/error.py +9 -0
- together/lib/types/fine_tuning.py +455 -0
- together/{utils → lib/utils}/__init__.py +6 -14
- together/{utils → lib/utils}/_log.py +11 -16
- together/lib/utils/files.py +628 -0
- together/lib/utils/serializer.py +10 -0
- together/{utils → lib/utils}/tools.py +19 -55
- together/resources/__init__.py +225 -33
- together/resources/audio/__init__.py +72 -21
- together/resources/audio/audio.py +198 -0
- together/resources/audio/speech.py +574 -122
- together/resources/audio/transcriptions.py +282 -0
- together/resources/audio/translations.py +256 -0
- together/resources/audio/voices.py +135 -0
- together/resources/batches.py +417 -0
- together/resources/chat/__init__.py +30 -21
- together/resources/chat/chat.py +102 -0
- together/resources/chat/completions.py +1063 -263
- together/resources/code_interpreter/__init__.py +33 -0
- together/resources/code_interpreter/code_interpreter.py +258 -0
- together/resources/code_interpreter/sessions.py +135 -0
- together/resources/completions.py +884 -225
- together/resources/embeddings.py +172 -68
- together/resources/endpoints.py +598 -395
- together/resources/evals.py +452 -0
- together/resources/files.py +398 -121
- together/resources/fine_tuning.py +1033 -0
- together/resources/hardware.py +181 -0
- together/resources/images.py +256 -108
- together/resources/jobs.py +214 -0
- together/resources/models.py +238 -90
- together/resources/rerank.py +190 -92
- together/resources/videos.py +374 -0
- together/types/__init__.py +65 -109
- together/types/audio/__init__.py +10 -0
- together/types/audio/speech_create_params.py +75 -0
- together/types/audio/transcription_create_params.py +54 -0
- together/types/audio/transcription_create_response.py +111 -0
- together/types/audio/translation_create_params.py +40 -0
- together/types/audio/translation_create_response.py +70 -0
- together/types/audio/voice_list_response.py +23 -0
- together/types/audio_speech_stream_chunk.py +16 -0
- together/types/autoscaling.py +13 -0
- together/types/autoscaling_param.py +15 -0
- together/types/batch_create_params.py +24 -0
- together/types/batch_create_response.py +14 -0
- together/types/batch_job.py +45 -0
- together/types/batch_list_response.py +10 -0
- together/types/chat/__init__.py +18 -0
- together/types/chat/chat_completion.py +60 -0
- together/types/chat/chat_completion_chunk.py +61 -0
- together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
- together/types/chat/chat_completion_structured_message_text_param.py +13 -0
- together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
- together/types/chat/chat_completion_usage.py +13 -0
- together/types/chat/chat_completion_warning.py +9 -0
- together/types/chat/completion_create_params.py +329 -0
- together/types/code_interpreter/__init__.py +5 -0
- together/types/code_interpreter/session_list_response.py +31 -0
- together/types/code_interpreter_execute_params.py +45 -0
- together/types/completion.py +42 -0
- together/types/completion_chunk.py +66 -0
- together/types/completion_create_params.py +138 -0
- together/types/dedicated_endpoint.py +44 -0
- together/types/embedding.py +24 -0
- together/types/embedding_create_params.py +31 -0
- together/types/endpoint_create_params.py +43 -0
- together/types/endpoint_list_avzones_response.py +11 -0
- together/types/endpoint_list_params.py +18 -0
- together/types/endpoint_list_response.py +41 -0
- together/types/endpoint_update_params.py +27 -0
- together/types/eval_create_params.py +263 -0
- together/types/eval_create_response.py +16 -0
- together/types/eval_list_params.py +21 -0
- together/types/eval_list_response.py +10 -0
- together/types/eval_status_response.py +100 -0
- together/types/evaluation_job.py +139 -0
- together/types/execute_response.py +108 -0
- together/types/file_delete_response.py +13 -0
- together/types/file_list.py +12 -0
- together/types/file_purpose.py +9 -0
- together/types/file_response.py +31 -0
- together/types/file_type.py +7 -0
- together/types/fine_tuning_cancel_response.py +194 -0
- together/types/fine_tuning_content_params.py +24 -0
- together/types/fine_tuning_delete_params.py +11 -0
- together/types/fine_tuning_delete_response.py +12 -0
- together/types/fine_tuning_list_checkpoints_response.py +21 -0
- together/types/fine_tuning_list_events_response.py +12 -0
- together/types/fine_tuning_list_response.py +199 -0
- together/types/finetune_event.py +41 -0
- together/types/finetune_event_type.py +33 -0
- together/types/finetune_response.py +177 -0
- together/types/hardware_list_params.py +16 -0
- together/types/hardware_list_response.py +58 -0
- together/types/image_data_b64.py +15 -0
- together/types/image_data_url.py +15 -0
- together/types/image_file.py +23 -0
- together/types/image_generate_params.py +85 -0
- together/types/job_list_response.py +47 -0
- together/types/job_retrieve_response.py +43 -0
- together/types/log_probs.py +18 -0
- together/types/model_list_response.py +10 -0
- together/types/model_object.py +42 -0
- together/types/model_upload_params.py +36 -0
- together/types/model_upload_response.py +23 -0
- together/types/rerank_create_params.py +36 -0
- together/types/rerank_create_response.py +36 -0
- together/types/tool_choice.py +23 -0
- together/types/tool_choice_param.py +23 -0
- together/types/tools_param.py +23 -0
- together/types/training_method_dpo.py +22 -0
- together/types/training_method_sft.py +18 -0
- together/types/video_create_params.py +86 -0
- together/types/video_job.py +57 -0
- together-2.0.0a8.dist-info/METADATA +680 -0
- together-2.0.0a8.dist-info/RECORD +164 -0
- {together-1.5.17.dist-info → together-2.0.0a8.dist-info}/WHEEL +1 -1
- together-2.0.0a8.dist-info/entry_points.txt +2 -0
- {together-1.5.17.dist-info → together-2.0.0a8.dist-info/licenses}/LICENSE +1 -1
- together/abstract/api_requestor.py +0 -729
- together/cli/api/chat.py +0 -276
- together/cli/api/completions.py +0 -119
- together/cli/api/images.py +0 -93
- together/cli/api/models.py +0 -55
- together/client.py +0 -176
- together/error.py +0 -194
- together/filemanager.py +0 -389
- together/legacy/__init__.py +0 -0
- together/legacy/base.py +0 -27
- together/legacy/complete.py +0 -93
- together/legacy/embeddings.py +0 -27
- together/legacy/files.py +0 -146
- together/legacy/finetune.py +0 -177
- together/legacy/images.py +0 -27
- together/legacy/models.py +0 -44
- together/resources/batch.py +0 -136
- together/resources/code_interpreter.py +0 -82
- together/resources/finetune.py +0 -1064
- together/together_response.py +0 -50
- together/types/abstract.py +0 -26
- together/types/audio_speech.py +0 -110
- together/types/batch.py +0 -53
- together/types/chat_completions.py +0 -197
- together/types/code_interpreter.py +0 -57
- together/types/common.py +0 -66
- together/types/completions.py +0 -107
- together/types/embeddings.py +0 -35
- together/types/endpoints.py +0 -123
- together/types/error.py +0 -16
- together/types/files.py +0 -90
- together/types/finetune.py +0 -398
- together/types/images.py +0 -44
- together/types/models.py +0 -45
- together/types/rerank.py +0 -43
- together/utils/api_helpers.py +0 -124
- together/utils/files.py +0 -425
- together/version.py +0 -6
- together-1.5.17.dist-info/METADATA +0 -525
- together-1.5.17.dist-info/RECORD +0 -69
- together-1.5.17.dist-info/entry_points.txt +0 -3
- /together/{abstract → lib/cli}/__init__.py +0 -0
- /together/{cli → lib/cli/api}/__init__.py +0 -0
- /together/{cli/api/__init__.py → py.typed} +0 -0
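The listing above amounts to a full rewrite: the hand-written 1.x core (together/client.py, together/abstract/api_requestor.py, the together/legacy/* shims) is removed in favor of a Stainless-generated core (_base_client.py, _client.py, _models.py, _response.py, _streaming.py), the CLI and helper code move under together/lib/, and per-endpoint request/response models land in together/types/. A rough sketch of what the new layout implies for callers — assuming the top-level client name carries over from 1.x, which this listing alone does not confirm:

# Hypothetical usage sketch; names are inferred from the file layout above,
# not from 2.0.0a8 documentation.
from together import Together  # assumption: the 1.x client name is kept

client = Together(api_key="...")  # assumption: api_key kwarg as in 1.x

# Resource modules map onto attribute paths:
#   together/resources/chat/completions.py -> client.chat.completions
#   together/resources/audio/speech.py     -> client.audio.speech
completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)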
|
@@ -1,297 +1,1097 @@
|
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
|
-
from typing import
|
|
5
|
+
from typing import Dict, Union, Iterable
|
|
6
|
+
from typing_extensions import Literal, overload
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
4
9
|
|
|
5
|
-
from
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
from ..._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
|
|
11
|
+
from ..._utils import required_args, maybe_transform, async_maybe_transform
|
|
12
|
+
from ..._compat import cached_property
|
|
13
|
+
from ..._resource import SyncAPIResource, AsyncAPIResource
|
|
14
|
+
from ..._response import (
|
|
15
|
+
to_raw_response_wrapper,
|
|
16
|
+
to_streamed_response_wrapper,
|
|
17
|
+
async_to_raw_response_wrapper,
|
|
18
|
+
async_to_streamed_response_wrapper,
|
|
13
19
|
)
|
|
20
|
+
from ..._streaming import Stream, AsyncStream
|
|
21
|
+
from ...types.chat import completion_create_params
|
|
22
|
+
from ..._base_client import make_request_options
|
|
23
|
+
from ...types.tools_param import ToolsParam
|
|
24
|
+
from ...types.chat.chat_completion import ChatCompletion
|
|
25
|
+
from ...types.chat.chat_completion_chunk import ChatCompletionChunk
|
|
26
|
+
|
|
27
|
+
__all__ = ["CompletionsResource", "AsyncCompletionsResource"]
|
|
28
|
+
|
|
14
29
|
|
|
30
|
+
class CompletionsResource(SyncAPIResource):
|
|
31
|
+
@cached_property
|
|
32
|
+
def with_raw_response(self) -> CompletionsResourceWithRawResponse:
|
|
33
|
+
"""
|
|
34
|
+
This property can be used as a prefix for any HTTP method call to return
|
|
35
|
+
the raw response object instead of the parsed content.
|
|
36
|
+
|
|
37
|
+
For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
|
|
38
|
+
"""
|
|
39
|
+
return CompletionsResourceWithRawResponse(self)
|
|
15
40
|
|
|
16
|
-
|
|
17
|
-
def
|
|
18
|
-
|
|
41
|
+
@cached_property
|
|
42
|
+
def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
|
|
43
|
+
"""
|
|
44
|
+
An alternative to `.with_raw_response` that doesn't eagerly read the response body.
|
|
19
45
|
|
|
46
|
+
For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
|
|
47
|
+
"""
|
|
48
|
+
return CompletionsResourceWithStreamingResponse(self)
|
|
49
|
+
|
|
50
|
+
@overload
|
|
20
51
|
def create(
|
|
21
52
|
self,
|
|
22
53
|
*,
|
|
23
|
-
messages:
|
|
24
|
-
model:
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
messages: Iterable[completion_create_params.Message],
|
|
55
|
+
model: Union[
|
|
56
|
+
Literal[
|
|
57
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
58
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
59
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
60
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
61
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
62
|
+
],
|
|
63
|
+
str,
|
|
64
|
+
],
|
|
65
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
66
|
+
echo: bool | Omit = omit,
|
|
67
|
+
frequency_penalty: float | Omit = omit,
|
|
68
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
69
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
70
|
+
logprobs: int | Omit = omit,
|
|
71
|
+
max_tokens: int | Omit = omit,
|
|
72
|
+
min_p: float | Omit = omit,
|
|
73
|
+
n: int | Omit = omit,
|
|
74
|
+
presence_penalty: float | Omit = omit,
|
|
75
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
76
|
+
repetition_penalty: float | Omit = omit,
|
|
77
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
78
|
+
safety_model: str | Omit = omit,
|
|
79
|
+
seed: int | Omit = omit,
|
|
80
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
81
|
+
stream: Literal[False] | Omit = omit,
|
|
82
|
+
temperature: float | Omit = omit,
|
|
83
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
84
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
85
|
+
top_k: int | Omit = omit,
|
|
86
|
+
top_p: float | Omit = omit,
|
|
87
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
88
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
89
|
+
extra_headers: Headers | None = None,
|
|
90
|
+
extra_query: Query | None = None,
|
|
91
|
+
extra_body: Body | None = None,
|
|
92
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
93
|
+
) -> ChatCompletion:
|
|
46
94
|
"""
|
|
47
|
-
|
|
95
|
+
Query a chat model.
|
|
48
96
|
|
|
49
97
|
Args:
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
98
|
+
messages: A list of messages comprising the conversation so far.
|
|
99
|
+
|
|
100
|
+
model: The name of the model to query.
|
|
101
|
+
|
|
102
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
103
|
+
|
|
104
|
+
context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
|
|
105
|
+
length of the model. When set to 'error', API will return 400 with appropriate
|
|
106
|
+
error message. When set to 'truncate', override the max_tokens with maximum
|
|
107
|
+
context length of the model.
|
|
108
|
+
|
|
109
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
110
|
+
return prompt logprobs.
|
|
111
|
+
|
|
112
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
113
|
+
repeating tokens that have already been mentioned.
|
|
114
|
+
|
|
115
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
116
|
+
|
|
117
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
118
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
119
|
+
help assess model confidence in token predictions.
|
|
120
|
+
|
|
121
|
+
max_tokens: The maximum number of tokens to generate.
|
|
122
|
+
|
|
123
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
|
|
124
|
+
|
|
125
|
+
n: The number of completions to generate for each prompt.
|
|
126
|
+
|
|
127
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
128
|
+
a model talking about new topics.
|
|
129
|
+
|
|
130
|
+
reasoning_effort: Controls the level of reasoning effort the model should apply when generating
|
|
131
|
+
responses. Higher values may result in more thoughtful and detailed responses
|
|
132
|
+
but may take longer to generate.
|
|
133
|
+
|
|
134
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
135
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
136
|
+
|
|
137
|
+
response_format: An object specifying the format that the model must output.
|
|
138
|
+
|
|
139
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
140
|
+
available moderation models found
|
|
141
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
142
|
+
|
|
143
|
+
seed: Seed value for reproducibility.
|
|
144
|
+
|
|
145
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
146
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
147
|
+
token.
|
|
148
|
+
|
|
149
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
150
|
+
of waiting for the full model response. The stream terminates with
|
|
151
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
152
|
+
|
|
153
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
154
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
155
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
156
|
+
randomness in the output.
|
|
157
|
+
|
|
158
|
+
tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
|
|
159
|
+
which lets the model pick between generating a message or calling a function.
|
|
160
|
+
|
|
161
|
+
tools: A list of tools the model may call. Currently, only functions are supported as a
|
|
162
|
+
tool. Use this to provide a list of functions the model may generate JSON inputs
|
|
163
|
+
for.
|
|
164
|
+
|
|
165
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
166
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
167
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
168
|
+
the generation process and can improve the quality of the generated text by
|
|
169
|
+
focusing on the most likely options.
|
|
170
|
+
|
|
171
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
172
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
173
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
174
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
175
|
+
more fluent and natural-sounding text.
|
|
176
|
+
|
|
177
|
+
extra_headers: Send extra headers
|
|
178
|
+
|
|
179
|
+
extra_query: Add additional query parameters to the request
|
|
180
|
+
|
|
181
|
+
extra_body: Add additional JSON properties to the request
|
|
182
|
+
|
|
183
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
110
184
|
"""
|
|
185
|
+
...
|
|
111
186
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
187
|
+
@overload
|
|
188
|
+
def create(
|
|
189
|
+
self,
|
|
190
|
+
*,
|
|
191
|
+
messages: Iterable[completion_create_params.Message],
|
|
192
|
+
model: Union[
|
|
193
|
+
Literal[
|
|
194
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
195
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
196
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
197
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
198
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
199
|
+
],
|
|
200
|
+
str,
|
|
201
|
+
],
|
|
202
|
+
stream: Literal[True],
|
|
203
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
204
|
+
echo: bool | Omit = omit,
|
|
205
|
+
frequency_penalty: float | Omit = omit,
|
|
206
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
207
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
208
|
+
logprobs: int | Omit = omit,
|
|
209
|
+
max_tokens: int | Omit = omit,
|
|
210
|
+
min_p: float | Omit = omit,
|
|
211
|
+
n: int | Omit = omit,
|
|
212
|
+
presence_penalty: float | Omit = omit,
|
|
213
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
214
|
+
repetition_penalty: float | Omit = omit,
|
|
215
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
216
|
+
safety_model: str | Omit = omit,
|
|
217
|
+
seed: int | Omit = omit,
|
|
218
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
219
|
+
temperature: float | Omit = omit,
|
|
220
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
221
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
222
|
+
top_k: int | Omit = omit,
|
|
223
|
+
top_p: float | Omit = omit,
|
|
224
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
225
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
226
|
+
extra_headers: Headers | None = None,
|
|
227
|
+
extra_query: Query | None = None,
|
|
228
|
+
extra_body: Body | None = None,
|
|
229
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
230
|
+
) -> Stream[ChatCompletionChunk]:
|
|
231
|
+
"""
|
|
232
|
+
Query a chat model.
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
messages: A list of messages comprising the conversation so far.
|
|
236
|
+
|
|
237
|
+
model: The name of the model to query.
|
|
238
|
+
|
|
239
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
240
|
+
|
|
241
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
242
|
+
of waiting for the full model response. The stream terminates with
|
|
243
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
244
|
+
|
|
245
|
+
context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
|
|
246
|
+
length of the model. When set to 'error', API will return 400 with appropriate
|
|
247
|
+
error message. When set to 'truncate', override the max_tokens with maximum
|
|
248
|
+
context length of the model.
|
|
249
|
+
|
|
250
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
251
|
+
return prompt logprobs.
|
|
252
|
+
|
|
253
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
254
|
+
repeating tokens that have already been mentioned.
|
|
255
|
+
|
|
256
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
257
|
+
|
|
258
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
259
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
260
|
+
help assess model confidence in token predictions.
|
|
261
|
+
|
|
262
|
+
max_tokens: The maximum number of tokens to generate.
|
|
263
|
+
|
|
264
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
|
|
265
|
+
|
|
266
|
+
n: The number of completions to generate for each prompt.
|
|
267
|
+
|
|
268
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
269
|
+
a model talking about new topics.
|
|
270
|
+
|
|
271
|
+
reasoning_effort: Controls the level of reasoning effort the model should apply when generating
|
|
272
|
+
responses. Higher values may result in more thoughtful and detailed responses
|
|
273
|
+
but may take longer to generate.
|
|
274
|
+
|
|
275
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
276
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
277
|
+
|
|
278
|
+
response_format: An object specifying the format that the model must output.
|
|
115
279
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
280
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
281
|
+
available moderation models found
|
|
282
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
283
|
+
|
|
284
|
+
seed: Seed value for reproducibility.
|
|
285
|
+
|
|
286
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
287
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
288
|
+
token.
|
|
289
|
+
|
|
290
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
291
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
292
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
293
|
+
randomness in the output.
|
|
294
|
+
|
|
295
|
+
tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
|
|
296
|
+
which lets the model pick between generating a message or calling a function.
|
|
297
|
+
|
|
298
|
+
tools: A list of tools the model may call. Currently, only functions are supported as a
|
|
299
|
+
tool. Use this to provide a list of functions the model may generate JSON inputs
|
|
300
|
+
for.
|
|
301
|
+
|
|
302
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
303
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
304
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
305
|
+
the generation process and can improve the quality of the generated text by
|
|
306
|
+
focusing on the most likely options.
|
|
307
|
+
|
|
308
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
309
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
310
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
311
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
312
|
+
more fluent and natural-sounding text.
|
|
313
|
+
|
|
314
|
+
extra_headers: Send extra headers
|
|
315
|
+
|
|
316
|
+
extra_query: Add additional query parameters to the request
|
|
317
|
+
|
|
318
|
+
extra_body: Add additional JSON properties to the request
|
|
319
|
+
|
|
320
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
321
|
+
"""
|
|
322
|
+
...
|
|
323
|
+
|
|
324
|
+
@overload
|
|
325
|
+
def create(
|
|
326
|
+
self,
|
|
327
|
+
*,
|
|
328
|
+
messages: Iterable[completion_create_params.Message],
|
|
329
|
+
model: Union[
|
|
330
|
+
Literal[
|
|
331
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
332
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
333
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
334
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
335
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
336
|
+
],
|
|
337
|
+
str,
|
|
338
|
+
],
|
|
339
|
+
stream: bool,
|
|
340
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
341
|
+
echo: bool | Omit = omit,
|
|
342
|
+
frequency_penalty: float | Omit = omit,
|
|
343
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
344
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
345
|
+
logprobs: int | Omit = omit,
|
|
346
|
+
max_tokens: int | Omit = omit,
|
|
347
|
+
min_p: float | Omit = omit,
|
|
348
|
+
n: int | Omit = omit,
|
|
349
|
+
presence_penalty: float | Omit = omit,
|
|
350
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
351
|
+
repetition_penalty: float | Omit = omit,
|
|
352
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
353
|
+
safety_model: str | Omit = omit,
|
|
354
|
+
seed: int | Omit = omit,
|
|
355
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
356
|
+
temperature: float | Omit = omit,
|
|
357
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
358
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
359
|
+
top_k: int | Omit = omit,
|
|
360
|
+
top_p: float | Omit = omit,
|
|
361
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
362
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
363
|
+
extra_headers: Headers | None = None,
|
|
364
|
+
extra_query: Query | None = None,
|
|
365
|
+
extra_body: Body | None = None,
|
|
366
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
367
|
+
) -> ChatCompletion | Stream[ChatCompletionChunk]:
|
|
368
|
+
"""
|
|
369
|
+
Query a chat model.
|
|
370
|
+
|
|
371
|
+
Args:
|
|
372
|
+
messages: A list of messages comprising the conversation so far.
|
|
373
|
+
|
|
374
|
+
model: The name of the model to query.
|
|
375
|
+
|
|
376
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
377
|
+
|
|
378
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
379
|
+
of waiting for the full model response. The stream terminates with
|
|
380
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
381
|
+
|
|
382
|
+
context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
|
|
383
|
+
length of the model. When set to 'error', API will return 400 with appropriate
|
|
384
|
+
error message. When set to 'truncate', override the max_tokens with maximum
|
|
385
|
+
context length of the model.
|
|
386
|
+
|
|
387
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
388
|
+
return prompt logprobs.
|
|
389
|
+
|
|
390
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
391
|
+
repeating tokens that have already been mentioned.
|
|
392
|
+
|
|
393
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
394
|
+
|
|
395
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
396
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
397
|
+
help assess model confidence in token predictions.
|
|
398
|
+
|
|
399
|
+
max_tokens: The maximum number of tokens to generate.
|
|
400
|
+
|
|
401
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
|
|
402
|
+
|
|
403
|
+
n: The number of completions to generate for each prompt.
|
|
404
|
+
|
|
405
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
406
|
+
a model talking about new topics.
|
|
407
|
+
|
|
408
|
+
reasoning_effort: Controls the level of reasoning effort the model should apply when generating
|
|
409
|
+
responses. Higher values may result in more thoughtful and detailed responses
|
|
410
|
+
but may take longer to generate.
|
|
411
|
+
|
|
412
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
413
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
414
|
+
|
|
415
|
+
response_format: An object specifying the format that the model must output.
|
|
416
|
+
|
|
417
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
418
|
+
available moderation models found
|
|
419
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
420
|
+
|
|
421
|
+
seed: Seed value for reproducibility.
|
|
422
|
+
|
|
423
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
424
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
425
|
+
token.
|
|
426
|
+
|
|
427
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
428
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
429
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
430
|
+
randomness in the output.
|
|
431
|
+
|
|
432
|
+
tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
|
|
433
|
+
which lets the model pick between generating a message or calling a function.
|
|
434
|
+
|
|
435
|
+
tools: A list of tools the model may call. Currently, only functions are supported as a
|
|
436
|
+
tool. Use this to provide a list of functions the model may generate JSON inputs
|
|
437
|
+
for.
|
|
438
|
+
|
|
439
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
440
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
441
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
442
|
+
the generation process and can improve the quality of the generated text by
|
|
443
|
+
focusing on the most likely options.
|
|
444
|
+
|
|
445
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
446
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
447
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
448
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
449
|
+
more fluent and natural-sounding text.
|
|
450
|
+
|
|
451
|
+
extra_headers: Send extra headers
|
|
452
|
+
|
|
453
|
+
extra_query: Add additional query parameters to the request
|
|
454
|
+
|
|
455
|
+
extra_body: Add additional JSON properties to the request
|
|
456
|
+
|
|
457
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
458
|
+
"""
|
|
459
|
+
...
|
|
460
|
+
|
|
461
|
+
@required_args(["messages", "model"], ["messages", "model", "stream"])
|
|
462
|
+
def create(
|
|
463
|
+
self,
|
|
464
|
+
*,
|
|
465
|
+
messages: Iterable[completion_create_params.Message],
|
|
466
|
+
model: Union[
|
|
467
|
+
Literal[
|
|
468
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
469
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
470
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
471
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
472
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
473
|
+
],
|
|
474
|
+
str,
|
|
475
|
+
],
|
|
476
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
477
|
+
echo: bool | Omit = omit,
|
|
478
|
+
frequency_penalty: float | Omit = omit,
|
|
479
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
480
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
481
|
+
logprobs: int | Omit = omit,
|
|
482
|
+
max_tokens: int | Omit = omit,
|
|
483
|
+
min_p: float | Omit = omit,
|
|
484
|
+
n: int | Omit = omit,
|
|
485
|
+
presence_penalty: float | Omit = omit,
|
|
486
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
487
|
+
repetition_penalty: float | Omit = omit,
|
|
488
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
489
|
+
safety_model: str | Omit = omit,
|
|
490
|
+
seed: int | Omit = omit,
|
|
491
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
492
|
+
stream: Literal[False] | Literal[True] | Omit = omit,
|
|
493
|
+
temperature: float | Omit = omit,
|
|
494
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
495
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
496
|
+
top_k: int | Omit = omit,
|
|
497
|
+
top_p: float | Omit = omit,
|
|
498
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
499
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
500
|
+
extra_headers: Headers | None = None,
|
|
501
|
+
extra_query: Query | None = None,
|
|
502
|
+
extra_body: Body | None = None,
|
|
503
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
504
|
+
) -> ChatCompletion | Stream[ChatCompletionChunk]:
|
|
505
|
+
return self._post(
|
|
506
|
+
"/chat/completions",
|
|
507
|
+
body=maybe_transform(
|
|
508
|
+
{
|
|
509
|
+
"messages": messages,
|
|
510
|
+
"model": model,
|
|
511
|
+
"context_length_exceeded_behavior": context_length_exceeded_behavior,
|
|
512
|
+
"echo": echo,
|
|
513
|
+
"frequency_penalty": frequency_penalty,
|
|
514
|
+
"function_call": function_call,
|
|
515
|
+
"logit_bias": logit_bias,
|
|
516
|
+
"logprobs": logprobs,
|
|
517
|
+
"max_tokens": max_tokens,
|
|
518
|
+
"min_p": min_p,
|
|
519
|
+
"n": n,
|
|
520
|
+
"presence_penalty": presence_penalty,
|
|
521
|
+
"reasoning_effort": reasoning_effort,
|
|
522
|
+
"repetition_penalty": repetition_penalty,
|
|
523
|
+
"response_format": response_format,
|
|
524
|
+
"safety_model": safety_model,
|
|
525
|
+
"seed": seed,
|
|
526
|
+
"stop": stop,
|
|
527
|
+
"stream": stream,
|
|
528
|
+
"temperature": temperature,
|
|
529
|
+
"tool_choice": tool_choice,
|
|
530
|
+
"tools": tools,
|
|
531
|
+
"top_k": top_k,
|
|
532
|
+
"top_p": top_p,
|
|
533
|
+
},
|
|
534
|
+
completion_create_params.CompletionCreateParamsStreaming
|
|
535
|
+
if stream
|
|
536
|
+
else completion_create_params.CompletionCreateParamsNonStreaming,
|
|
146
537
|
),
|
|
147
|
-
|
|
538
|
+
options=make_request_options(
|
|
539
|
+
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
|
540
|
+
),
|
|
541
|
+
cast_to=ChatCompletion,
|
|
542
|
+
stream=stream or False,
|
|
543
|
+
stream_cls=Stream[ChatCompletionChunk],
|
|
148
544
|
)
|
|
149
545
|
|
|
150
|
-
if stream:
|
|
151
|
-
# must be an iterator
|
|
152
|
-
assert not isinstance(response, TogetherResponse)
|
|
153
|
-
return (ChatCompletionChunk(**line.data) for line in response)
|
|
154
|
-
assert isinstance(response, TogetherResponse)
|
|
155
|
-
return ChatCompletionResponse(**response.data)
|
|
156
546
|
|
|
547
|
+
class AsyncCompletionsResource(AsyncAPIResource):
|
|
548
|
+
@cached_property
|
|
549
|
+
def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
|
|
550
|
+
"""
|
|
551
|
+
This property can be used as a prefix for any HTTP method call to return
|
|
552
|
+
the raw response object instead of the parsed content.
|
|
553
|
+
|
|
554
|
+
For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
|
|
555
|
+
"""
|
|
556
|
+
return AsyncCompletionsResourceWithRawResponse(self)
|
|
157
557
|
|
|
158
|
-
|
|
159
|
-
def
|
|
160
|
-
|
|
558
|
+
@cached_property
|
|
559
|
+
def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
|
|
560
|
+
"""
|
|
561
|
+
An alternative to `.with_raw_response` that doesn't eagerly read the response body.
|
|
562
|
+
|
|
563
|
+
For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
|
|
564
|
+
"""
|
|
565
|
+
return AsyncCompletionsResourceWithStreamingResponse(self)
|
|
161
566
|
|
|
567
|
+
@overload
|
|
162
568
|
async def create(
|
|
163
569
|
self,
|
|
164
570
|
*,
|
|
165
|
-
messages:
|
|
166
|
-
model:
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
571
|
+
messages: Iterable[completion_create_params.Message],
|
|
572
|
+
model: Union[
|
|
573
|
+
Literal[
|
|
574
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
575
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
576
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
577
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
578
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
579
|
+
],
|
|
580
|
+
str,
|
|
581
|
+
],
|
|
582
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
583
|
+
echo: bool | Omit = omit,
|
|
584
|
+
frequency_penalty: float | Omit = omit,
|
|
585
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
586
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
587
|
+
logprobs: int | Omit = omit,
|
|
588
|
+
max_tokens: int | Omit = omit,
|
|
589
|
+
min_p: float | Omit = omit,
|
|
590
|
+
n: int | Omit = omit,
|
|
591
|
+
presence_penalty: float | Omit = omit,
|
|
592
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
593
|
+
repetition_penalty: float | Omit = omit,
|
|
594
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
595
|
+
safety_model: str | Omit = omit,
|
|
596
|
+
seed: int | Omit = omit,
|
|
597
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
598
|
+
stream: Literal[False] | Omit = omit,
|
|
599
|
+
temperature: float | Omit = omit,
|
|
600
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
601
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
602
|
+
top_k: int | Omit = omit,
|
|
603
|
+
top_p: float | Omit = omit,
|
|
604
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
605
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
606
|
+
extra_headers: Headers | None = None,
|
|
607
|
+
extra_query: Query | None = None,
|
|
608
|
+
extra_body: Body | None = None,
|
|
609
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
610
|
+
) -> ChatCompletion:
|
|
188
611
|
"""
|
|
189
|
-
|
|
612
|
+
Query a chat model.
|
|
190
613
|
|
|
191
614
|
Args:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
615
|
+
messages: A list of messages comprising the conversation so far.
|
|
616
|
+
|
|
617
|
+
model: The name of the model to query.
|
|
618
|
+
|
|
619
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
620
|
+
|
|
621
|
+
context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
|
|
622
|
+
length of the model. When set to 'error', API will return 400 with appropriate
|
|
623
|
+
error message. When set to 'truncate', override the max_tokens with maximum
|
|
624
|
+
context length of the model.
|
|
625
|
+
|
|
626
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
627
|
+
return prompt logprobs.
|
|
628
|
+
|
|
629
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
630
|
+
repeating tokens that have already been mentioned.
|
|
631
|
+
|
|
632
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
633
|
+
|
|
634
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
635
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
636
|
+
help assess model confidence in token predictions.
|
|
637
|
+
|
|
638
|
+
max_tokens: The maximum number of tokens to generate.
|
|
639
|
+
|
|
640
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
|
|
641
|
+
|
|
642
|
+
n: The number of completions to generate for each prompt.
|
|
643
|
+
|
|
644
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
645
|
+
a model talking about new topics.
|
|
646
|
+
|
|
647
|
+
reasoning_effort: Controls the level of reasoning effort the model should apply when generating
|
|
648
|
+
responses. Higher values may result in more thoughtful and detailed responses
|
|
649
|
+
but may take longer to generate.
|
|
650
|
+
|
|
651
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
652
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
653
|
+
|
|
654
|
+
response_format: An object specifying the format that the model must output.
|
|
655
|
+
|
|
656
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
657
|
+
available moderation models found
|
|
658
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
659
|
+
|
|
660
|
+
seed: Seed value for reproducibility.
|
|
661
|
+
|
|
662
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
663
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
664
|
+
token.
|
|
665
|
+
|
|
666
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
667
|
+
of waiting for the full model response. The stream terminates with
|
|
668
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
669
|
+
|
|
670
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
671
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
672
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
673
|
+
randomness in the output.
|
|
674
|
+
|
|
675
|
+
tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
|
|
676
|
+
which lets the model pick between generating a message or calling a function.
|
|
677
|
+
|
|
678
|
+
tools: A list of tools the model may call. Currently, only functions are supported as a
|
|
679
|
+
tool. Use this to provide a list of functions the model may generate JSON inputs
|
|
680
|
+
for.
|
|
681
|
+
|
|
682
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
683
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
684
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
685
|
+
the generation process and can improve the quality of the generated text by
|
|
686
|
+
focusing on the most likely options.
|
|
687
|
+
|
|
688
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
689
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
690
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
691
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
692
|
+
more fluent and natural-sounding text.
|
|
693
|
+
|
|
694
|
+
extra_headers: Send extra headers
|
|
695
|
+
|
|
696
|
+
extra_query: Add additional query parameters to the request
|
|
697
|
+
|
|
698
|
+
extra_body: Add additional JSON properties to the request
|
|
699
|
+
|
|
700
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
252
701
|
"""
|
|
702
|
+
...
|
|
253
703
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
704
|
+
@overload
|
|
705
|
+
async def create(
|
|
706
|
+
self,
|
|
707
|
+
*,
|
|
708
|
+
messages: Iterable[completion_create_params.Message],
|
|
709
|
+
model: Union[
|
|
710
|
+
Literal[
|
|
711
|
+
"Qwen/Qwen2.5-72B-Instruct-Turbo",
|
|
712
|
+
"Qwen/Qwen2.5-7B-Instruct-Turbo",
|
|
713
|
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
714
|
+
"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
|
715
|
+
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
|
716
|
+
],
|
|
717
|
+
str,
|
|
718
|
+
],
|
|
719
|
+
stream: Literal[True],
|
|
720
|
+
context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
|
|
721
|
+
echo: bool | Omit = omit,
|
|
722
|
+
frequency_penalty: float | Omit = omit,
|
|
723
|
+
function_call: completion_create_params.FunctionCall | Omit = omit,
|
|
724
|
+
logit_bias: Dict[str, float] | Omit = omit,
|
|
725
|
+
logprobs: int | Omit = omit,
|
|
726
|
+
max_tokens: int | Omit = omit,
|
|
727
|
+
min_p: float | Omit = omit,
|
|
728
|
+
n: int | Omit = omit,
|
|
729
|
+
presence_penalty: float | Omit = omit,
|
|
730
|
+
reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
|
|
731
|
+
repetition_penalty: float | Omit = omit,
|
|
732
|
+
response_format: completion_create_params.ResponseFormat | Omit = omit,
|
|
733
|
+
safety_model: str | Omit = omit,
|
|
734
|
+
seed: int | Omit = omit,
|
|
735
|
+
stop: SequenceNotStr[str] | Omit = omit,
|
|
736
|
+
temperature: float | Omit = omit,
|
|
737
|
+
tool_choice: completion_create_params.ToolChoice | Omit = omit,
|
|
738
|
+
tools: Iterable[ToolsParam] | Omit = omit,
|
|
739
|
+
top_k: int | Omit = omit,
|
|
740
|
+
top_p: float | Omit = omit,
|
|
741
|
+
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
|
742
|
+
# The extra values given here take precedence over values defined on the client or passed to this method.
|
|
743
|
+
extra_headers: Headers | None = None,
|
|
744
|
+
extra_query: Query | None = None,
|
|
745
|
+
extra_body: Body | None = None,
|
|
746
|
+
timeout: float | httpx.Timeout | None | NotGiven = not_given,
|
|
747
|
+
) -> AsyncStream[ChatCompletionChunk]:
|
|
748
|
+
"""
|
|
749
|
+
Query a chat model.
|
|
750
|
+
|
|
751
|
+
Args:
|
|
752
|
+
messages: A list of messages comprising the conversation so far.
|
|
753
|
+
|
|
754
|
+
model: The name of the model to query.
|
|
755
|
+
|
|
756
|
+
[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
|
|
757
|
+
|
|
758
|
+
stream: If true, stream tokens as Server-Sent Events as the model generates them instead
|
|
759
|
+
of waiting for the full model response. The stream terminates with
|
|
760
|
+
`data: [DONE]`. If false, return a single JSON object containing the results.
|
|
761
|
+
|
|
762
|
+
context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
|
|
763
|
+
length of the model. When set to 'error', API will return 400 with appropriate
|
|
764
|
+
error message. When set to 'truncate', override the max_tokens with maximum
|
|
765
|
+
context length of the model.
|
|
766
|
+
|
|
767
|
+
echo: If true, the response will contain the prompt. Can be used with `logprobs` to
|
|
768
|
+
return prompt logprobs.
|
|
769
|
+
|
|
770
|
+
frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
|
|
771
|
+
repeating tokens that have already been mentioned.
|
|
772
|
+
|
|
773
|
+
logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
|
|
774
|
+
|
|
775
|
+
logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
|
|
776
|
+
at each generation step, instead of just the sampled token. Log probabilities
|
|
777
|
+
help assess model confidence in token predictions.
|
|
778
|
+
|
|
779
|
+
max_tokens: The maximum number of tokens to generate.
|
|
780
|
+
|
|
781
|
+
min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
|
|
782
|
+
|
|
783
|
+
n: The number of completions to generate for each prompt.
|
|
784
|
+
|
|
785
|
+
presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
|
|
786
|
+
a model talking about new topics.
|
|
787
|
+
|
|
788
|
+
reasoning_effort: Controls the level of reasoning effort the model should apply when generating
|
|
789
|
+
responses. Higher values may result in more thoughtful and detailed responses
|
|
790
|
+
but may take longer to generate.
|
|
791
|
+
|
|
792
|
+
repetition_penalty: A number that controls the diversity of generated text by reducing the
|
|
793
|
+
likelihood of repeated sequences. Higher values decrease repetition.
|
|
794
|
+
|
|
795
|
+
response_format: An object specifying the format that the model must output.
|
|
796
|
+
|
|
797
|
+
safety_model: The name of the moderation model used to validate tokens. Choose from the
|
|
798
|
+
available moderation models found
|
|
799
|
+
[here](https://docs.together.ai/docs/inference-models#moderation-models).
|
|
800
|
+
|
|
801
|
+
seed: Seed value for reproducibility.
|
|
802
|
+
|
|
803
|
+
stop: A list of string sequences that will truncate (stop) inference text output. For
|
|
804
|
+
example, "</s>" will stop generation as soon as the model generates the given
|
|
805
|
+
token.
|
|
806
|
+
|
|
807
|
+
temperature: A decimal number from 0-1 that determines the degree of randomness in the
|
|
808
|
+
response. A temperature less than 1 favors more correctness and is appropriate
|
|
809
|
+
for question answering or summarization. A value closer to 1 introduces more
|
|
810
|
+
randomness in the output.
|
|
811
|
+
|
|
812
|
+
tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
|
|
813
|
+
which lets the model pick between generating a message or calling a function.
|
|
814
|
+
|
|
815
|
+
tools: A list of tools the model may call. Currently, only functions are supported as a
|
|
816
|
+
tool. Use this to provide a list of functions the model may generate JSON inputs
|
|
817
|
+
for.
|
|
818
|
+
|
|
819
|
+
top_k: An integer that's used to limit the number of choices for the next predicted
|
|
820
|
+
word or token. It specifies the maximum number of tokens to consider at each
|
|
821
|
+
step, based on their probability of occurrence. This technique helps to speed up
|
|
822
|
+
the generation process and can improve the quality of the generated text by
|
|
823
|
+
focusing on the most likely options.
|
|
824
|
+
|
|
825
|
+
top_p: A percentage (also called the nucleus parameter) that's used to dynamically
|
|
826
|
+
adjust the number of choices for each predicted token based on the cumulative
|
|
827
|
+
probabilities. It specifies a probability threshold below which all less likely
|
|
828
|
+
tokens are filtered out. This technique helps maintain diversity and generate
|
|
829
|
+
more fluent and natural-sounding text.
|
|
830
|
+
|
|
831
|
+
extra_headers: Send extra headers
|
|
257
832
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
833
|
+
extra_query: Add additional query parameters to the request
|
|
834
|
+
|
|
835
|
+
extra_body: Add additional JSON properties to the request
|
|
836
|
+
|
|
837
|
+
timeout: Override the client-level default timeout for this request, in seconds
|
|
838
|
+
"""
|
|
839
|
+
...
+
+    @overload
+    async def create(
+        self,
+        *,
+        messages: Iterable[completion_create_params.Message],
+        model: Union[
+            Literal[
+                "Qwen/Qwen2.5-72B-Instruct-Turbo",
+                "Qwen/Qwen2.5-7B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            ],
+            str,
+        ],
+        stream: bool,
+        context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+        echo: bool | Omit = omit,
+        frequency_penalty: float | Omit = omit,
+        function_call: completion_create_params.FunctionCall | Omit = omit,
+        logit_bias: Dict[str, float] | Omit = omit,
+        logprobs: int | Omit = omit,
+        max_tokens: int | Omit = omit,
+        min_p: float | Omit = omit,
+        n: int | Omit = omit,
+        presence_penalty: float | Omit = omit,
+        reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+        repetition_penalty: float | Omit = omit,
+        response_format: completion_create_params.ResponseFormat | Omit = omit,
+        safety_model: str | Omit = omit,
+        seed: int | Omit = omit,
+        stop: SequenceNotStr[str] | Omit = omit,
+        temperature: float | Omit = omit,
+        tool_choice: completion_create_params.ToolChoice | Omit = omit,
+        tools: Iterable[ToolsParam] | Omit = omit,
+        top_k: int | Omit = omit,
+        top_p: float | Omit = omit,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = not_given,
+    ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+        """
+        Query a chat model.
+
+        Args:
+          messages: A list of messages comprising the conversation so far.
+
+          model: The name of the model to query.
+
+              [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+          stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+              of waiting for the full model response. The stream terminates with
+              `data: [DONE]`. If false, return a single JSON object containing the results.
+
+          context_length_exceeded_behavior: Defines the behavior of the API when max_tokens exceeds the maximum context
+              length of the model. When set to 'error', the API will return a 400 with an
+              appropriate error message. When set to 'truncate', max_tokens is overridden
+              with the model's maximum context length.
+
+          echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+              return prompt logprobs.
+
+          frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+              repeating tokens that have already been mentioned.
+
+          logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+          logprobs: An integer between 0 and 20 specifying how many of the top tokens to return log
+              probabilities for at each generation step, instead of just the sampled token.
+              Log probabilities help assess model confidence in token predictions.
+
+          max_tokens: The maximum number of tokens to generate.
+
+          min_p: A number between 0 and 1 that can be used as an alternative to top_p and top_k.
+
+          n: The number of completions to generate for each prompt.
+
+          presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+              a model talking about new topics.
+
+          reasoning_effort: Controls the level of reasoning effort the model should apply when generating
+              responses. Higher values may result in more thoughtful and detailed responses
+              but may take longer to generate.
+
+          repetition_penalty: A number that controls the diversity of generated text by reducing the
+              likelihood of repeated sequences. Higher values decrease repetition.
+
+          response_format: An object specifying the format that the model must output.
+
+          safety_model: The name of the moderation model used to validate tokens. Choose from the
+              available moderation models found
+              [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+          seed: Seed value for reproducibility.
+
+          stop: A list of string sequences that will truncate (stop) inference text output. For
+              example, "</s>" will stop generation as soon as the model generates the given
+              token.
+
+          temperature: A decimal number from 0 to 1 that determines the degree of randomness in the
+              response. A temperature less than 1 favors more correctness and is appropriate
+              for question answering or summarization. A value closer to 1 introduces more
+              randomness in the output.
+
+          tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
+              which lets the model pick between generating a message or calling a function.
+
+          tools: A list of tools the model may call. Currently, only functions are supported as a
+              tool. Use this to provide a list of functions the model may generate JSON inputs
+              for.
+
+          top_k: An integer that's used to limit the number of choices for the next predicted
+              word or token. It specifies the maximum number of tokens to consider at each
+              step, based on their probability of occurrence. This technique helps to speed up
+              the generation process and can improve the quality of the generated text by
+              focusing on the most likely options.
+
+          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+              adjust the number of choices for each predicted token based on the cumulative
+              probabilities. It specifies a probability threshold below which all less likely
+              tokens are filtered out. This technique helps maintain diversity and generate
+              more fluent and natural-sounding text.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        ...
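The overload above requires `stream` and returns `AsyncStream[ChatCompletionChunk]` when it is true. A minimal streaming sketch under the same assumptions as before; the `choices[0].delta.content` access assumes the OpenAI-compatible chunk shape these types mirror:

```python
import asyncio

from together import AsyncTogether  # assumed export, as above

async def main() -> None:
    client = AsyncTogether()

    # stream=True selects the AsyncStream[ChatCompletionChunk] return type;
    # the SSE terminator `data: [DONE]` is consumed by the stream itself.
    stream = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Write a haiku about version diffs."}],
        stream=True,
    )
    async for chunk in stream:
        # Guard both: a final chunk may carry no choices, and a delta may have no text.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()

asyncio.run(main())
```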
+
+    @required_args(["messages", "model"], ["messages", "model", "stream"])
+    async def create(
+        self,
+        *,
+        messages: Iterable[completion_create_params.Message],
+        model: Union[
+            Literal[
+                "Qwen/Qwen2.5-72B-Instruct-Turbo",
+                "Qwen/Qwen2.5-7B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+                "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            ],
+            str,
+        ],
+        context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+        echo: bool | Omit = omit,
+        frequency_penalty: float | Omit = omit,
+        function_call: completion_create_params.FunctionCall | Omit = omit,
+        logit_bias: Dict[str, float] | Omit = omit,
+        logprobs: int | Omit = omit,
+        max_tokens: int | Omit = omit,
+        min_p: float | Omit = omit,
+        n: int | Omit = omit,
+        presence_penalty: float | Omit = omit,
+        reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+        repetition_penalty: float | Omit = omit,
+        response_format: completion_create_params.ResponseFormat | Omit = omit,
+        safety_model: str | Omit = omit,
+        seed: int | Omit = omit,
+        stop: SequenceNotStr[str] | Omit = omit,
+        stream: Literal[False] | Literal[True] | Omit = omit,
+        temperature: float | Omit = omit,
+        tool_choice: completion_create_params.ToolChoice | Omit = omit,
+        tools: Iterable[ToolsParam] | Omit = omit,
+        top_k: int | Omit = omit,
+        top_p: float | Omit = omit,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = not_given,
+    ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+        return await self._post(
+            "/chat/completions",
+            body=await async_maybe_transform(
+                {
+                    "messages": messages,
+                    "model": model,
+                    "context_length_exceeded_behavior": context_length_exceeded_behavior,
+                    "echo": echo,
+                    "frequency_penalty": frequency_penalty,
+                    "function_call": function_call,
+                    "logit_bias": logit_bias,
+                    "logprobs": logprobs,
+                    "max_tokens": max_tokens,
+                    "min_p": min_p,
+                    "n": n,
+                    "presence_penalty": presence_penalty,
+                    "reasoning_effort": reasoning_effort,
+                    "repetition_penalty": repetition_penalty,
+                    "response_format": response_format,
+                    "safety_model": safety_model,
+                    "seed": seed,
+                    "stop": stop,
+                    "stream": stream,
+                    "temperature": temperature,
+                    "tool_choice": tool_choice,
+                    "tools": tools,
+                    "top_k": top_k,
+                    "top_p": top_p,
+                },
+                completion_create_params.CompletionCreateParamsStreaming
+                if stream
+                else completion_create_params.CompletionCreateParamsNonStreaming,
+            ),
+            options=make_request_options(
+                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
-
+            cast_to=ChatCompletion,
+            stream=stream or False,
+            stream_cls=AsyncStream[ChatCompletionChunk],
         )

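Because the implementation forwards `tools` and `tool_choice` into the request body unchanged via `async_maybe_transform`, tool calling needs no extra plumbing on the caller's side. A sketch under the same assumptions; `get_weather` and its schema are purely illustrative, and `message.tool_calls` assumes the OpenAI-compatible response shape:

```python
import asyncio
import json

from together import AsyncTogether  # assumed export, as above

# Illustrative tool definition; `get_weather` is hypothetical, not part of the SDK.
# Only functions are supported as tools, per the docstring above.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current temperature for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}

async def main() -> None:
    client = AsyncTogether()
    completion = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
        tools=[WEATHER_TOOL],
        tool_choice="auto",  # default: model picks between text and a tool call
    )
    message = completion.choices[0].message
    if message.tool_calls:  # OpenAI-compatible attribute shape (assumption)
        call = message.tool_calls[0]
        print(call.function.name, json.loads(call.function.arguments))
    else:
        print(message.content)

asyncio.run(main())
```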
-
-
-
-
-
-
+
+class CompletionsResourceWithRawResponse:
+    def __init__(self, completions: CompletionsResource) -> None:
+        self._completions = completions
+
+        self.create = to_raw_response_wrapper(
+            completions.create,
+        )
+
+
+class AsyncCompletionsResourceWithRawResponse:
+    def __init__(self, completions: AsyncCompletionsResource) -> None:
+        self._completions = completions
+
+        self.create = async_to_raw_response_wrapper(
+            completions.create,
+        )
+
+
+class CompletionsResourceWithStreamingResponse:
+    def __init__(self, completions: CompletionsResource) -> None:
+        self._completions = completions
+
+        self.create = to_streamed_response_wrapper(
+            completions.create,
+        )
+
+
+class AsyncCompletionsResourceWithStreamingResponse:
+    def __init__(self, completions: AsyncCompletionsResource) -> None:
+        self._completions = completions
+
+        self.create = async_to_streamed_response_wrapper(
+            completions.create,
+        )