together 1.5.34__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. together/__init__.py +101 -114
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +65 -81
  30. together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
  33. together/{cli → lib/cli}/api/models.py +34 -27
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +16 -26
  36. together/{constants.py → lib/constants.py} +11 -24
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +397 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/{utils → lib/utils}/files.py +90 -288
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -39
  50. together/resources/audio/__init__.py +72 -48
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -128
  53. together/resources/audio/transcriptions.py +247 -261
  54. together/resources/audio/translations.py +221 -241
  55. together/resources/audio/voices.py +111 -41
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +589 -477
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -129
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +258 -104
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +223 -193
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +286 -214
  75. together/types/__init__.py +66 -167
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_create_response.py +10 -0
  157. together/types/video_job.py +57 -0
  158. together-2.0.0a6.dist-info/METADATA +729 -0
  159. together-2.0.0a6.dist-info/RECORD +165 -0
  160. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
  161. together-2.0.0a6.dist-info/entry_points.txt +2 -0
  162. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
  163. together/abstract/api_requestor.py +0 -770
  164. together/cli/api/chat.py +0 -298
  165. together/cli/api/completions.py +0 -119
  166. together/cli/api/images.py +0 -93
  167. together/cli/api/utils.py +0 -139
  168. together/client.py +0 -186
  169. together/error.py +0 -194
  170. together/filemanager.py +0 -635
  171. together/legacy/__init__.py +0 -0
  172. together/legacy/base.py +0 -27
  173. together/legacy/complete.py +0 -93
  174. together/legacy/embeddings.py +0 -27
  175. together/legacy/files.py +0 -146
  176. together/legacy/finetune.py +0 -177
  177. together/legacy/images.py +0 -27
  178. together/legacy/models.py +0 -44
  179. together/resources/batch.py +0 -165
  180. together/resources/code_interpreter.py +0 -82
  181. together/resources/evaluation.py +0 -808
  182. together/resources/finetune.py +0 -1388
  183. together/together_response.py +0 -50
  184. together/types/abstract.py +0 -26
  185. together/types/audio_speech.py +0 -311
  186. together/types/batch.py +0 -54
  187. together/types/chat_completions.py +0 -210
  188. together/types/code_interpreter.py +0 -57
  189. together/types/common.py +0 -67
  190. together/types/completions.py +0 -107
  191. together/types/embeddings.py +0 -35
  192. together/types/endpoints.py +0 -123
  193. together/types/error.py +0 -16
  194. together/types/evaluation.py +0 -93
  195. together/types/files.py +0 -93
  196. together/types/finetune.py +0 -464
  197. together/types/images.py +0 -42
  198. together/types/models.py +0 -96
  199. together/types/rerank.py +0 -43
  200. together/types/videos.py +0 -69
  201. together/utils/api_helpers.py +0 -124
  202. together/version.py +0 -6
  203. together-1.5.34.dist-info/METADATA +0 -583
  204. together-1.5.34.dist-info/RECORD +0 -77
  205. together-1.5.34.dist-info/entry_points.txt +0 -3
  206. /together/{abstract → lib/cli}/__init__.py +0 -0
  207. /together/{cli → lib/cli/api}/__init__.py +0 -0
  208. /together/{cli/api/__init__.py → py.typed} +0 -0
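The hunk below appears to be together/resources/chat/completions.py (its +1063/-263 counts match entry 59 above): the hand-written ChatCompletions/AsyncChatCompletions classes are replaced by Stainless-generated CompletionsResource/AsyncCompletionsResource classes with typed, overloaded create() methods for streaming and non-streaming calls. As a rough orientation, here is a minimal usage sketch of the regenerated surface. It assumes the top-level client is still exported as Together from together/__init__.py and that response field names follow the OpenAI-style models under together/types/chat/ — neither is confirmed by this diff, so treat the sketch as illustrative only.

    # Hypothetical sketch against the 2.0.0a6 resource layout shown below.
    # Assumes `Together` is still the exported client class and that
    # TOGETHER_API_KEY is set in the environment.
    from together import Together

    client = Together()

    # Non-streaming call: returns a ChatCompletion model.
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(completion.choices[0].message.content)  # field names assumed from the generated types

    # Streaming call: stream=True returns a Stream[ChatCompletionChunk] of Server-Sent Events.
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Count to three."}],
        stream=True,
    ):
        print(chunk.choices[0].delta.content or "", end="")
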
@@ -1,297 +1,1097 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
1
3
  from __future__ import annotations
2
4
 
3
- from typing import Any, AsyncGenerator, Dict, Iterator, List
5
+ from typing import Dict, Union, Iterable
6
+ from typing_extensions import Literal, overload
7
+
8
+ import httpx
4
9
 
5
- from together.abstract import api_requestor
6
- from together.together_response import TogetherResponse
7
- from together.types import (
8
- ChatCompletionChunk,
9
- ChatCompletionRequest,
10
- ChatCompletionResponse,
11
- TogetherClient,
12
- TogetherRequest,
10
+ from ..._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
11
+ from ..._utils import required_args, maybe_transform, async_maybe_transform
12
+ from ..._compat import cached_property
13
+ from ..._resource import SyncAPIResource, AsyncAPIResource
14
+ from ..._response import (
15
+ to_raw_response_wrapper,
16
+ to_streamed_response_wrapper,
17
+ async_to_raw_response_wrapper,
18
+ async_to_streamed_response_wrapper,
13
19
  )
20
+ from ..._streaming import Stream, AsyncStream
21
+ from ...types.chat import completion_create_params
22
+ from ..._base_client import make_request_options
23
+ from ...types.tools_param import ToolsParam
24
+ from ...types.chat.chat_completion import ChatCompletion
25
+ from ...types.chat.chat_completion_chunk import ChatCompletionChunk
26
+
27
+ __all__ = ["CompletionsResource", "AsyncCompletionsResource"]
28
+
14
29
 
30
+ class CompletionsResource(SyncAPIResource):
31
+ @cached_property
32
+ def with_raw_response(self) -> CompletionsResourceWithRawResponse:
33
+ """
34
+ This property can be used as a prefix for any HTTP method call to return
35
+ the raw response object instead of the parsed content.
36
+
37
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
38
+ """
39
+ return CompletionsResourceWithRawResponse(self)
15
40
 
16
- class ChatCompletions:
17
- def __init__(self, client: TogetherClient) -> None:
18
- self._client = client
41
+ @cached_property
42
+ def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
43
+ """
44
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
19
45
 
46
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
47
+ """
48
+ return CompletionsResourceWithStreamingResponse(self)
49
+
50
+ @overload
20
51
  def create(
21
52
  self,
22
53
  *,
23
- messages: List[Dict[str, Any]],
24
- model: str,
25
- max_tokens: int | None = None,
26
- stop: List[str] | None = None,
27
- temperature: float | None = None,
28
- top_p: float | None = None,
29
- top_k: int | None = None,
30
- repetition_penalty: float | None = None,
31
- presence_penalty: float | None = None,
32
- frequency_penalty: float | None = None,
33
- min_p: float | None = None,
34
- logit_bias: Dict[str, float] | None = None,
35
- seed: int | None = None,
36
- stream: bool = False,
37
- logprobs: int | None = None,
38
- echo: bool | None = None,
39
- n: int | None = None,
40
- safety_model: str | None = None,
41
- response_format: Dict[str, Any] | None = None,
42
- tools: List[Dict[str, Any]] | None = None,
43
- tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
44
- **kwargs: Any,
45
- ) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]:
54
+ messages: Iterable[completion_create_params.Message],
55
+ model: Union[
56
+ Literal[
57
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
58
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
59
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
60
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
61
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
62
+ ],
63
+ str,
64
+ ],
65
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
66
+ echo: bool | Omit = omit,
67
+ frequency_penalty: float | Omit = omit,
68
+ function_call: completion_create_params.FunctionCall | Omit = omit,
69
+ logit_bias: Dict[str, float] | Omit = omit,
70
+ logprobs: int | Omit = omit,
71
+ max_tokens: int | Omit = omit,
72
+ min_p: float | Omit = omit,
73
+ n: int | Omit = omit,
74
+ presence_penalty: float | Omit = omit,
75
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
76
+ repetition_penalty: float | Omit = omit,
77
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
78
+ safety_model: str | Omit = omit,
79
+ seed: int | Omit = omit,
80
+ stop: SequenceNotStr[str] | Omit = omit,
81
+ stream: Literal[False] | Omit = omit,
82
+ temperature: float | Omit = omit,
83
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
84
+ tools: Iterable[ToolsParam] | Omit = omit,
85
+ top_k: int | Omit = omit,
86
+ top_p: float | Omit = omit,
87
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
88
+ # The extra values given here take precedence over values defined on the client or passed to this method.
89
+ extra_headers: Headers | None = None,
90
+ extra_query: Query | None = None,
91
+ extra_body: Body | None = None,
92
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
93
+ ) -> ChatCompletion:
46
94
  """
47
- Method to generate completions based on a given prompt using a specified model.
95
+ Query a chat model.
48
96
 
49
97
  Args:
50
- messages (List[Dict[str, str]]): A list of messages in the format
51
- `[{"role": together.types.chat_completions.MessageRole, "content": TEXT}, ...]`
52
- model (str): The name of the model to query.
53
- max_tokens (int, optional): The maximum number of tokens to generate.
54
- Defaults to 512.
55
- stop (List[str], optional): List of strings at which to stop generation.
56
- Defaults to None.
57
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
58
- Defaults to None.
59
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
60
- of choices for each predicted token based on the cumulative probabilities.
61
- Defaults to None.
62
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
63
- next predicted word or token.
64
- Defaults to None.
65
- repetition_penalty (float, optional): A number that controls the diversity of generated text
66
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
67
- Defaults to None.
68
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
69
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
70
- Must be in the range [-2, 2].
71
- Defaults to None.
72
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
73
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
74
- Must be in the range [-2, 2].
75
- Defaults to None.
76
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
77
- be considered during sampling.
78
- Must be in the range [0, 1].
79
- Defaults to None.
80
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
81
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
82
- Defaults to None.
83
- seed (int, optional): A seed value to use for reproducibility.
84
- stream (bool, optional): Flag indicating whether to stream the generated completions.
85
- Defaults to False.
86
- logprobs (int, optional): Number of top-k logprobs to return
87
- Defaults to None.
88
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
89
- Defaults to None.
90
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
91
- Defaults to None.
92
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
93
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
94
- Defaults to None.
95
- response_format (Dict[str, Any], optional): An object specifying the format that the model must output.
96
- Defaults to None.
97
- tools (Dict[str, str | Dict[str, str | Dict[str, Any]]], optional): A list of tools the model may call.
98
- Currently, only functions are supported as a tool.
99
- Use this to provide a list of functions the model may generate JSON inputs for.
100
- Defaults to None
101
- tool_choice: Controls which (if any) function is called by the model. auto means the model can pick
102
- between generating a message or calling a function. Specifying a particular function
103
- via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
104
- Sets to `auto` if None.
105
- Defaults to None.
106
-
107
- Returns:
108
- ChatCompletionResponse | Iterator[ChatCompletionChunk]: Object containing the completions
109
- or an iterator over completion chunks.
98
+ messages: A list of messages comprising the conversation so far.
99
+
100
+ model: The name of the model to query.
101
+
102
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
103
+
104
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
105
+ length of the model. When set to 'error', API will return 400 with appropriate
106
+ error message. When set to 'truncate', override the max_tokens with maximum
107
+ context length of the model.
108
+
109
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
110
+ return prompt logprobs.
111
+
112
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
113
+ repeating tokens that have already been mentioned.
114
+
115
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
116
+
117
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
118
+ at each generation step, instead of just the sampled token. Log probabilities
119
+ help assess model confidence in token predictions.
120
+
121
+ max_tokens: The maximum number of tokens to generate.
122
+
123
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
124
+
125
+ n: The number of completions to generate for each prompt.
126
+
127
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
128
+ a model talking about new topics.
129
+
130
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
131
+ responses. Higher values may result in more thoughtful and detailed responses
132
+ but may take longer to generate.
133
+
134
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
135
+ likelihood of repeated sequences. Higher values decrease repetition.
136
+
137
+ response_format: An object specifying the format that the model must output.
138
+
139
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
140
+ available moderation models found
141
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
142
+
143
+ seed: Seed value for reproducibility.
144
+
145
+ stop: A list of string sequences that will truncate (stop) inference text output. For
146
+ example, "</s>" will stop generation as soon as the model generates the given
147
+ token.
148
+
149
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
150
+ of waiting for the full model response. The stream terminates with
151
+ `data: [DONE]`. If false, return a single JSON object containing the results.
152
+
153
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
154
+ response. A temperature less than 1 favors more correctness and is appropriate
155
+ for question answering or summarization. A value closer to 1 introduces more
156
+ randomness in the output.
157
+
158
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
159
+ which lets the model pick between generating a message or calling a function.
160
+
161
+ tools: A list of tools the model may call. Currently, only functions are supported as a
162
+ tool. Use this to provide a list of functions the model may generate JSON inputs
163
+ for.
164
+
165
+ top_k: An integer that's used to limit the number of choices for the next predicted
166
+ word or token. It specifies the maximum number of tokens to consider at each
167
+ step, based on their probability of occurrence. This technique helps to speed up
168
+ the generation process and can improve the quality of the generated text by
169
+ focusing on the most likely options.
170
+
171
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
172
+ adjust the number of choices for each predicted token based on the cumulative
173
+ probabilities. It specifies a probability threshold below which all less likely
174
+ tokens are filtered out. This technique helps maintain diversity and generate
175
+ more fluent and natural-sounding text.
176
+
177
+ extra_headers: Send extra headers
178
+
179
+ extra_query: Add additional query parameters to the request
180
+
181
+ extra_body: Add additional JSON properties to the request
182
+
183
+ timeout: Override the client-level default timeout for this request, in seconds
110
184
  """
185
+ ...
111
186
 
112
- requestor = api_requestor.APIRequestor(
113
- client=self._client,
114
- )
187
+ @overload
188
+ def create(
189
+ self,
190
+ *,
191
+ messages: Iterable[completion_create_params.Message],
192
+ model: Union[
193
+ Literal[
194
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
195
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
196
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
197
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
198
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
199
+ ],
200
+ str,
201
+ ],
202
+ stream: Literal[True],
203
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
204
+ echo: bool | Omit = omit,
205
+ frequency_penalty: float | Omit = omit,
206
+ function_call: completion_create_params.FunctionCall | Omit = omit,
207
+ logit_bias: Dict[str, float] | Omit = omit,
208
+ logprobs: int | Omit = omit,
209
+ max_tokens: int | Omit = omit,
210
+ min_p: float | Omit = omit,
211
+ n: int | Omit = omit,
212
+ presence_penalty: float | Omit = omit,
213
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
214
+ repetition_penalty: float | Omit = omit,
215
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
216
+ safety_model: str | Omit = omit,
217
+ seed: int | Omit = omit,
218
+ stop: SequenceNotStr[str] | Omit = omit,
219
+ temperature: float | Omit = omit,
220
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
221
+ tools: Iterable[ToolsParam] | Omit = omit,
222
+ top_k: int | Omit = omit,
223
+ top_p: float | Omit = omit,
224
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
225
+ # The extra values given here take precedence over values defined on the client or passed to this method.
226
+ extra_headers: Headers | None = None,
227
+ extra_query: Query | None = None,
228
+ extra_body: Body | None = None,
229
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
230
+ ) -> Stream[ChatCompletionChunk]:
231
+ """
232
+ Query a chat model.
233
+
234
+ Args:
235
+ messages: A list of messages comprising the conversation so far.
236
+
237
+ model: The name of the model to query.
238
+
239
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
240
+
241
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
242
+ of waiting for the full model response. The stream terminates with
243
+ `data: [DONE]`. If false, return a single JSON object containing the results.
244
+
245
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
246
+ length of the model. When set to 'error', API will return 400 with appropriate
247
+ error message. When set to 'truncate', override the max_tokens with maximum
248
+ context length of the model.
249
+
250
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
251
+ return prompt logprobs.
252
+
253
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
254
+ repeating tokens that have already been mentioned.
255
+
256
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
257
+
258
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
259
+ at each generation step, instead of just the sampled token. Log probabilities
260
+ help assess model confidence in token predictions.
261
+
262
+ max_tokens: The maximum number of tokens to generate.
263
+
264
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
265
+
266
+ n: The number of completions to generate for each prompt.
267
+
268
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
269
+ a model talking about new topics.
270
+
271
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
272
+ responses. Higher values may result in more thoughtful and detailed responses
273
+ but may take longer to generate.
274
+
275
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
276
+ likelihood of repeated sequences. Higher values decrease repetition.
277
+
278
+ response_format: An object specifying the format that the model must output.
115
279
 
116
- parameter_payload = ChatCompletionRequest(
117
- model=model,
118
- messages=messages,
119
- top_p=top_p,
120
- top_k=top_k,
121
- temperature=temperature,
122
- max_tokens=max_tokens,
123
- stop=stop,
124
- repetition_penalty=repetition_penalty,
125
- presence_penalty=presence_penalty,
126
- frequency_penalty=frequency_penalty,
127
- min_p=min_p,
128
- logit_bias=logit_bias,
129
- seed=seed,
130
- stream=stream,
131
- logprobs=logprobs,
132
- echo=echo,
133
- n=n,
134
- safety_model=safety_model,
135
- response_format=response_format,
136
- tools=tools,
137
- tool_choice=tool_choice,
138
- **kwargs,
139
- ).model_dump(exclude_none=True)
140
-
141
- response, _, _ = requestor.request(
142
- options=TogetherRequest(
143
- method="POST",
144
- url="chat/completions",
145
- params=parameter_payload,
280
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
281
+ available moderation models found
282
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
283
+
284
+ seed: Seed value for reproducibility.
285
+
286
+ stop: A list of string sequences that will truncate (stop) inference text output. For
287
+ example, "</s>" will stop generation as soon as the model generates the given
288
+ token.
289
+
290
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
291
+ response. A temperature less than 1 favors more correctness and is appropriate
292
+ for question answering or summarization. A value closer to 1 introduces more
293
+ randomness in the output.
294
+
295
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
296
+ which lets the model pick between generating a message or calling a function.
297
+
298
+ tools: A list of tools the model may call. Currently, only functions are supported as a
299
+ tool. Use this to provide a list of functions the model may generate JSON inputs
300
+ for.
301
+
302
+ top_k: An integer that's used to limit the number of choices for the next predicted
303
+ word or token. It specifies the maximum number of tokens to consider at each
304
+ step, based on their probability of occurrence. This technique helps to speed up
305
+ the generation process and can improve the quality of the generated text by
306
+ focusing on the most likely options.
307
+
308
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
309
+ adjust the number of choices for each predicted token based on the cumulative
310
+ probabilities. It specifies a probability threshold below which all less likely
311
+ tokens are filtered out. This technique helps maintain diversity and generate
312
+ more fluent and natural-sounding text.
313
+
314
+ extra_headers: Send extra headers
315
+
316
+ extra_query: Add additional query parameters to the request
317
+
318
+ extra_body: Add additional JSON properties to the request
319
+
320
+ timeout: Override the client-level default timeout for this request, in seconds
321
+ """
322
+ ...
323
+
324
+ @overload
325
+ def create(
326
+ self,
327
+ *,
328
+ messages: Iterable[completion_create_params.Message],
329
+ model: Union[
330
+ Literal[
331
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
332
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
333
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
334
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
335
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
336
+ ],
337
+ str,
338
+ ],
339
+ stream: bool,
340
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
341
+ echo: bool | Omit = omit,
342
+ frequency_penalty: float | Omit = omit,
343
+ function_call: completion_create_params.FunctionCall | Omit = omit,
344
+ logit_bias: Dict[str, float] | Omit = omit,
345
+ logprobs: int | Omit = omit,
346
+ max_tokens: int | Omit = omit,
347
+ min_p: float | Omit = omit,
348
+ n: int | Omit = omit,
349
+ presence_penalty: float | Omit = omit,
350
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
351
+ repetition_penalty: float | Omit = omit,
352
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
353
+ safety_model: str | Omit = omit,
354
+ seed: int | Omit = omit,
355
+ stop: SequenceNotStr[str] | Omit = omit,
356
+ temperature: float | Omit = omit,
357
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
358
+ tools: Iterable[ToolsParam] | Omit = omit,
359
+ top_k: int | Omit = omit,
360
+ top_p: float | Omit = omit,
361
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
362
+ # The extra values given here take precedence over values defined on the client or passed to this method.
363
+ extra_headers: Headers | None = None,
364
+ extra_query: Query | None = None,
365
+ extra_body: Body | None = None,
366
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
367
+ ) -> ChatCompletion | Stream[ChatCompletionChunk]:
368
+ """
369
+ Query a chat model.
370
+
371
+ Args:
372
+ messages: A list of messages comprising the conversation so far.
373
+
374
+ model: The name of the model to query.
375
+
376
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
377
+
378
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
379
+ of waiting for the full model response. The stream terminates with
380
+ `data: [DONE]`. If false, return a single JSON object containing the results.
381
+
382
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
383
+ length of the model. When set to 'error', API will return 400 with appropriate
384
+ error message. When set to 'truncate', override the max_tokens with maximum
385
+ context length of the model.
386
+
387
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
388
+ return prompt logprobs.
389
+
390
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
391
+ repeating tokens that have already been mentioned.
392
+
393
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
394
+
395
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
396
+ at each generation step, instead of just the sampled token. Log probabilities
397
+ help assess model confidence in token predictions.
398
+
399
+ max_tokens: The maximum number of tokens to generate.
400
+
401
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
402
+
403
+ n: The number of completions to generate for each prompt.
404
+
405
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
406
+ a model talking about new topics.
407
+
408
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
409
+ responses. Higher values may result in more thoughtful and detailed responses
410
+ but may take longer to generate.
411
+
412
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
413
+ likelihood of repeated sequences. Higher values decrease repetition.
414
+
415
+ response_format: An object specifying the format that the model must output.
416
+
417
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
418
+ available moderation models found
419
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
420
+
421
+ seed: Seed value for reproducibility.
422
+
423
+ stop: A list of string sequences that will truncate (stop) inference text output. For
424
+ example, "</s>" will stop generation as soon as the model generates the given
425
+ token.
426
+
427
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
428
+ response. A temperature less than 1 favors more correctness and is appropriate
429
+ for question answering or summarization. A value closer to 1 introduces more
430
+ randomness in the output.
431
+
432
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
433
+ which lets the model pick between generating a message or calling a function.
434
+
435
+ tools: A list of tools the model may call. Currently, only functions are supported as a
436
+ tool. Use this to provide a list of functions the model may generate JSON inputs
437
+ for.
438
+
439
+ top_k: An integer that's used to limit the number of choices for the next predicted
440
+ word or token. It specifies the maximum number of tokens to consider at each
441
+ step, based on their probability of occurrence. This technique helps to speed up
442
+ the generation process and can improve the quality of the generated text by
443
+ focusing on the most likely options.
444
+
445
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
446
+ adjust the number of choices for each predicted token based on the cumulative
447
+ probabilities. It specifies a probability threshold below which all less likely
448
+ tokens are filtered out. This technique helps maintain diversity and generate
449
+ more fluent and natural-sounding text.
450
+
451
+ extra_headers: Send extra headers
452
+
453
+ extra_query: Add additional query parameters to the request
454
+
455
+ extra_body: Add additional JSON properties to the request
456
+
457
+ timeout: Override the client-level default timeout for this request, in seconds
458
+ """
459
+ ...
460
+
461
+ @required_args(["messages", "model"], ["messages", "model", "stream"])
462
+ def create(
463
+ self,
464
+ *,
465
+ messages: Iterable[completion_create_params.Message],
466
+ model: Union[
467
+ Literal[
468
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
469
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
470
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
471
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
472
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
473
+ ],
474
+ str,
475
+ ],
476
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
477
+ echo: bool | Omit = omit,
478
+ frequency_penalty: float | Omit = omit,
479
+ function_call: completion_create_params.FunctionCall | Omit = omit,
480
+ logit_bias: Dict[str, float] | Omit = omit,
481
+ logprobs: int | Omit = omit,
482
+ max_tokens: int | Omit = omit,
483
+ min_p: float | Omit = omit,
484
+ n: int | Omit = omit,
485
+ presence_penalty: float | Omit = omit,
486
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
487
+ repetition_penalty: float | Omit = omit,
488
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
489
+ safety_model: str | Omit = omit,
490
+ seed: int | Omit = omit,
491
+ stop: SequenceNotStr[str] | Omit = omit,
492
+ stream: Literal[False] | Literal[True] | Omit = omit,
493
+ temperature: float | Omit = omit,
494
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
495
+ tools: Iterable[ToolsParam] | Omit = omit,
496
+ top_k: int | Omit = omit,
497
+ top_p: float | Omit = omit,
498
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
499
+ # The extra values given here take precedence over values defined on the client or passed to this method.
500
+ extra_headers: Headers | None = None,
501
+ extra_query: Query | None = None,
502
+ extra_body: Body | None = None,
503
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
504
+ ) -> ChatCompletion | Stream[ChatCompletionChunk]:
505
+ return self._post(
506
+ "/chat/completions",
507
+ body=maybe_transform(
508
+ {
509
+ "messages": messages,
510
+ "model": model,
511
+ "context_length_exceeded_behavior": context_length_exceeded_behavior,
512
+ "echo": echo,
513
+ "frequency_penalty": frequency_penalty,
514
+ "function_call": function_call,
515
+ "logit_bias": logit_bias,
516
+ "logprobs": logprobs,
517
+ "max_tokens": max_tokens,
518
+ "min_p": min_p,
519
+ "n": n,
520
+ "presence_penalty": presence_penalty,
521
+ "reasoning_effort": reasoning_effort,
522
+ "repetition_penalty": repetition_penalty,
523
+ "response_format": response_format,
524
+ "safety_model": safety_model,
525
+ "seed": seed,
526
+ "stop": stop,
527
+ "stream": stream,
528
+ "temperature": temperature,
529
+ "tool_choice": tool_choice,
530
+ "tools": tools,
531
+ "top_k": top_k,
532
+ "top_p": top_p,
533
+ },
534
+ completion_create_params.CompletionCreateParamsStreaming
535
+ if stream
536
+ else completion_create_params.CompletionCreateParamsNonStreaming,
146
537
  ),
147
- stream=stream,
538
+ options=make_request_options(
539
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
540
+ ),
541
+ cast_to=ChatCompletion,
542
+ stream=stream or False,
543
+ stream_cls=Stream[ChatCompletionChunk],
148
544
  )
149
545
 
150
- if stream:
151
- # must be an iterator
152
- assert not isinstance(response, TogetherResponse)
153
- return (ChatCompletionChunk(**line.data) for line in response)
154
- assert isinstance(response, TogetherResponse)
155
- return ChatCompletionResponse(**response.data)
156
546
 
547
+ class AsyncCompletionsResource(AsyncAPIResource):
548
+ @cached_property
549
+ def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
550
+ """
551
+ This property can be used as a prefix for any HTTP method call to return
552
+ the raw response object instead of the parsed content.
553
+
554
+ For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
555
+ """
556
+ return AsyncCompletionsResourceWithRawResponse(self)
157
557
 
158
- class AsyncChatCompletions:
159
- def __init__(self, client: TogetherClient) -> None:
160
- self._client = client
558
+ @cached_property
559
+ def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
560
+ """
561
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
562
+
563
+ For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
564
+ """
565
+ return AsyncCompletionsResourceWithStreamingResponse(self)
161
566
 
567
+ @overload
162
568
  async def create(
163
569
  self,
164
570
  *,
165
- messages: List[Dict[str, str]],
166
- model: str,
167
- max_tokens: int | None = None,
168
- stop: List[str] | None = None,
169
- temperature: float | None = None,
170
- top_p: float | None = None,
171
- top_k: int | None = None,
172
- repetition_penalty: float | None = None,
173
- presence_penalty: float | None = None,
174
- frequency_penalty: float | None = None,
175
- min_p: float | None = None,
176
- logit_bias: Dict[str, float] | None = None,
177
- seed: int | None = None,
178
- stream: bool = False,
179
- logprobs: int | None = None,
180
- echo: bool | None = None,
181
- n: int | None = None,
182
- safety_model: str | None = None,
183
- response_format: Dict[str, Any] | None = None,
184
- tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None,
185
- tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
186
- **kwargs: Any,
187
- ) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse:
571
+ messages: Iterable[completion_create_params.Message],
572
+ model: Union[
573
+ Literal[
574
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
575
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
576
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
577
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
578
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
579
+ ],
580
+ str,
581
+ ],
582
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
583
+ echo: bool | Omit = omit,
584
+ frequency_penalty: float | Omit = omit,
585
+ function_call: completion_create_params.FunctionCall | Omit = omit,
586
+ logit_bias: Dict[str, float] | Omit = omit,
587
+ logprobs: int | Omit = omit,
588
+ max_tokens: int | Omit = omit,
589
+ min_p: float | Omit = omit,
590
+ n: int | Omit = omit,
591
+ presence_penalty: float | Omit = omit,
592
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
593
+ repetition_penalty: float | Omit = omit,
594
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
595
+ safety_model: str | Omit = omit,
596
+ seed: int | Omit = omit,
597
+ stop: SequenceNotStr[str] | Omit = omit,
598
+ stream: Literal[False] | Omit = omit,
599
+ temperature: float | Omit = omit,
600
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
601
+ tools: Iterable[ToolsParam] | Omit = omit,
602
+ top_k: int | Omit = omit,
603
+ top_p: float | Omit = omit,
604
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
605
+ # The extra values given here take precedence over values defined on the client or passed to this method.
606
+ extra_headers: Headers | None = None,
607
+ extra_query: Query | None = None,
608
+ extra_body: Body | None = None,
609
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
610
+ ) -> ChatCompletion:
188
611
  """
189
- Async method to generate completions based on a given prompt using a specified model.
612
+ Query a chat model.
190
613
 
191
614
  Args:
192
- messages (List[Dict[str, str]]): A list of messages in the format
193
- `[{"role": together.types.chat_completions.MessageRole, "content": TEXT}, ...]`
194
- model (str): The name of the model to query.
195
- max_tokens (int, optional): The maximum number of tokens to generate.
196
- Defaults to 512.
197
- stop (List[str], optional): List of strings at which to stop generation.
198
- Defaults to None.
199
- temperature (float, optional): A decimal number that determines the degree of randomness in the response.
200
- Defaults to None.
201
- top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
202
- of choices for each predicted token based on the cumulative probabilities.
203
- Defaults to None.
204
- top_k (int, optional): The top_k parameter is used to limit the number of choices for the
205
- next predicted word or token.
206
- Defaults to None.
207
- repetition_penalty (float, optional): A number that controls the diversity of generated text
208
- by reducing the likelihood of repeated sequences. Higher values decrease repetition.
209
- Defaults to None.
210
- presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
211
- appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
212
- Must be in the range [-2, 2].
213
- Defaults to None.
214
- frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
215
- of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
216
- Must be in the range [-2, 2].
217
- Defaults to None.
218
- min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
219
- be considered during sampling.
220
- Must be in the range [0, 1].
221
- Defaults to None.
222
- logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
223
- likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
224
- Defaults to None.
225
- seed (int, optional): A seed value to use for reproducibility.
226
- stream (bool, optional): Flag indicating whether to stream the generated completions.
227
- Defaults to False.
228
- logprobs (int, optional): Number of top-k logprobs to return
229
- Defaults to None.
230
- echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
231
- Defaults to None.
232
- n (int, optional): Number of completions to generate. Setting to None will return a single generation.
233
- Defaults to None.
234
- safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
235
- models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
236
- Defaults to None.
237
- response_format (Dict[str, Any], optional): An object specifying the format that the model must output.
238
- Defaults to None.
239
- tools (Dict[str, str | Dict[str, str | Dict[str, Any]]], optional): A list of tools the model may call.
240
- Currently, only functions are supported as a tool.
241
- Use this to provide a list of functions the model may generate JSON inputs for.
242
- Defaults to None
243
- tool_choice: Controls which (if any) function is called by the model. auto means the model can pick
244
- between generating a message or calling a function. Specifying a particular function
245
- via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
246
- Sets to `auto` if None.
247
- Defaults to None.
248
-
249
- Returns:
250
- AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: Object containing the completions
251
- or an iterator over completion chunks.
615
+ messages: A list of messages comprising the conversation so far.
616
+
617
+ model: The name of the model to query.
618
+
619
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
620
+
621
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
622
+ length of the model. When set to 'error', API will return 400 with appropriate
623
+ error message. When set to 'truncate', override the max_tokens with maximum
624
+ context length of the model.
625
+
626
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
627
+ return prompt logprobs.
628
+
629
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
630
+ repeating tokens that have already been mentioned.
631
+
632
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
633
+
634
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
635
+ at each generation step, instead of just the sampled token. Log probabilities
636
+ help assess model confidence in token predictions.
637
+
638
+ max_tokens: The maximum number of tokens to generate.
639
+
640
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
641
+
642
+ n: The number of completions to generate for each prompt.
643
+
644
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
645
+ a model talking about new topics.
646
+
647
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
648
+ responses. Higher values may result in more thoughtful and detailed responses
649
+ but may take longer to generate.
650
+
651
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
652
+ likelihood of repeated sequences. Higher values decrease repetition.
653
+
654
+ response_format: An object specifying the format that the model must output.
655
+
656
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
657
+ available moderation models found
658
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
659
+
660
+ seed: Seed value for reproducibility.
661
+
662
+ stop: A list of string sequences that will truncate (stop) inference text output. For
663
+ example, "</s>" will stop generation as soon as the model generates the given
664
+ token.
665
+
666
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
667
+ of waiting for the full model response. The stream terminates with
668
+ `data: [DONE]`. If false, return a single JSON object containing the results.
669
+
670
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
671
+ response. A temperature less than 1 favors more correctness and is appropriate
672
+ for question answering or summarization. A value closer to 1 introduces more
673
+ randomness in the output.
674
+
675
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
676
+ which lets the model pick between generating a message or calling a function.
677
+
678
+ tools: A list of tools the model may call. Currently, only functions are supported as a
679
+ tool. Use this to provide a list of functions the model may generate JSON inputs
680
+ for.
681
+
682
+ top_k: An integer that's used to limit the number of choices for the next predicted
683
+ word or token. It specifies the maximum number of tokens to consider at each
684
+ step, based on their probability of occurrence. This technique helps to speed up
685
+ the generation process and can improve the quality of the generated text by
686
+ focusing on the most likely options.
687
+
688
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
689
+ adjust the number of choices for each predicted token based on the cumulative
690
+ probabilities. It specifies a probability threshold below which all less likely
691
+ tokens are filtered out. This technique helps maintain diversity and generate
692
+ more fluent and natural-sounding text.
693
+
694
+ extra_headers: Send extra headers
695
+
696
+ extra_query: Add additional query parameters to the request
697
+
698
+ extra_body: Add additional JSON properties to the request
699
+
700
+ timeout: Override the client-level default timeout for this request, in seconds
252
701
  """
702
+ ...
253
703
 
254
- requestor = api_requestor.APIRequestor(
255
- client=self._client,
256
- )
+ @overload
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ stream: Literal[True],
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> AsyncStream[ChatCompletionChunk]:
+ """
+ Query a chat model.
+
+ Args:
+ messages: A list of messages comprising the conversation so far.
+
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
+ length of the model. When set to 'error', API will return 400 with appropriate
+ error message. When set to 'truncate', override the max_tokens with maximum
+ context length of the model.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
+ responses. Higher values may result in more thoughtful and detailed responses
+ but may take longer to generate.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ response_format: An object specifying the format that the model must output.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
+ which lets the model pick between generating a message or calling a function.
+
+ tools: A list of tools the model may call. Currently, only functions are supported as a
+ tool. Use this to provide a list of functions the model may generate JSON inputs
+ for.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers

- parameter_payload = ChatCompletionRequest(
- model=model,
- messages=messages,
- top_p=top_p,
- top_k=top_k,
- temperature=temperature,
- max_tokens=max_tokens,
- stop=stop,
- repetition_penalty=repetition_penalty,
- presence_penalty=presence_penalty,
- frequency_penalty=frequency_penalty,
- min_p=min_p,
- logit_bias=logit_bias,
- seed=seed,
- stream=stream,
- logprobs=logprobs,
- echo=echo,
- n=n,
- safety_model=safety_model,
- response_format=response_format,
- tools=tools,
- tool_choice=tool_choice,
- **kwargs,
- ).model_dump(exclude_none=True)
-
- response, _, _ = await requestor.arequest(
- options=TogetherRequest(
- method="POST",
- url="chat/completions",
- params=parameter_payload,
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
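
Editor's note: this overload is selected when stream=True is passed literally, so the declared return type is AsyncStream[ChatCompletionChunk] and consumption is plain async iteration. A hedged sketch follows; the choices[0].delta.content access is the conventional chunk shape, not something shown in this hunk.

import asyncio

from together import AsyncTogether  # assumed client entry point


async def main() -> None:
    client = AsyncTogether()

    stream = await client.chat.completions.create(
        model="Qwen/Qwen2.5-7B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Write a haiku about code review."}],
        stream=True,  # selects the AsyncStream[ChatCompletionChunk] overload
    )

    async for chunk in stream:
        # Assumed chunk shape: each SSE event carries a partial delta; the stream
        # ends when the server sends `data: [DONE]`.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()


asyncio.run(main())
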
+
+ @overload
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ stream: bool,
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+ """
+ Query a chat model.
+
+ Args:
+ messages: A list of messages comprising the conversation so far.
+
+ model: The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+ stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+ of waiting for the full model response. The stream terminates with
+ `data: [DONE]`. If false, return a single JSON object containing the results.
+
+ context_length_exceeded_behavior: Defined the behavior of the API when max_tokens exceed the maximum context
+ length of the model. When set to 'error', API will return 400 with appropriate
+ error message. When set to 'truncate', override the max_tokens with maximum
+ context length of the model.
+
+ echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+ return prompt logprobs.
+
+ frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+ repeating tokens that have already been mentioned.
+
+ logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+ logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+ at each generation step, instead of just the sampled token. Log probabilities
+ help assess model confidence in token predictions.
+
+ max_tokens: The maximum number of tokens to generate.
+
+ min_p: A number between 0 and 1 that can be used as an alternative to top_p and top-k.
+
+ n: The number of completions to generate for each prompt.
+
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+ a model talking about new topics.
+
+ reasoning_effort: Controls the level of reasoning effort the model should apply when generating
+ responses. Higher values may result in more thoughtful and detailed responses
+ but may take longer to generate.
+
+ repetition_penalty: A number that controls the diversity of generated text by reducing the
+ likelihood of repeated sequences. Higher values decrease repetition.
+
+ response_format: An object specifying the format that the model must output.
+
+ safety_model: The name of the moderation model used to validate tokens. Choose from the
+ available moderation models found
+ [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+ seed: Seed value for reproducibility.
+
+ stop: A list of string sequences that will truncate (stop) inference text output. For
+ example, "</s>" will stop generation as soon as the model generates the given
+ token.
+
+ temperature: A decimal number from 0-1 that determines the degree of randomness in the
+ response. A temperature less than 1 favors more correctness and is appropriate
+ for question answering or summarization. A value closer to 1 introduces more
+ randomness in the output.
+
+ tool_choice: Controls which (if any) function is called by the model. By default uses `auto`,
+ which lets the model pick between generating a message or calling a function.
+
+ tools: A list of tools the model may call. Currently, only functions are supported as a
+ tool. Use this to provide a list of functions the model may generate JSON inputs
+ for.
+
+ top_k: An integer that's used to limit the number of choices for the next predicted
+ word or token. It specifies the maximum number of tokens to consider at each
+ step, based on their probability of occurrence. This technique helps to speed up
+ the generation process and can improve the quality of the generated text by
+ focusing on the most likely options.
+
+ top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+ adjust the number of choices for each predicted token based on the cumulative
+ probabilities. It specifies a probability threshold below which all less likely
+ tokens are filtered out. This technique helps maintain diversity and generate
+ more fluent and natural-sounding text.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
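
Editor's note: when stream is only known at runtime, this bool overload applies and the caller receives the union ChatCompletion | AsyncStream[ChatCompletionChunk]. One way to narrow it is an isinstance check against the unparameterized AsyncStream. The import path and chunk shape below are assumptions, not taken from this diff.

from together import AsyncTogether  # assumed client entry point
from together._streaming import AsyncStream  # module exists in this release; exact import path is an assumption


async def ask(client: AsyncTogether, prompt: str, stream: bool) -> str:
    result = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=stream,
    )

    if isinstance(result, AsyncStream):
        # Streaming branch: join the partial deltas into one string (assumed chunk shape).
        parts: list[str] = []
        async for chunk in result:
            if chunk.choices and chunk.choices[0].delta.content:
                parts.append(chunk.choices[0].delta.content)
        return "".join(parts)

    # Non-streaming branch: a single parsed ChatCompletion.
    return result.choices[0].message.content or ""
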
+
+ @required_args(["messages", "model"], ["messages", "model", "stream"])
+ async def create(
+ self,
+ *,
+ messages: Iterable[completion_create_params.Message],
+ model: Union[
+ Literal[
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ ],
+ str,
+ ],
+ context_length_exceeded_behavior: Literal["truncate", "error"] | Omit = omit,
+ echo: bool | Omit = omit,
+ frequency_penalty: float | Omit = omit,
+ function_call: completion_create_params.FunctionCall | Omit = omit,
+ logit_bias: Dict[str, float] | Omit = omit,
+ logprobs: int | Omit = omit,
+ max_tokens: int | Omit = omit,
+ min_p: float | Omit = omit,
+ n: int | Omit = omit,
+ presence_penalty: float | Omit = omit,
+ reasoning_effort: Literal["low", "medium", "high"] | Omit = omit,
+ repetition_penalty: float | Omit = omit,
+ response_format: completion_create_params.ResponseFormat | Omit = omit,
+ safety_model: str | Omit = omit,
+ seed: int | Omit = omit,
+ stop: SequenceNotStr[str] | Omit = omit,
+ stream: Literal[False] | Literal[True] | Omit = omit,
+ temperature: float | Omit = omit,
+ tool_choice: completion_create_params.ToolChoice | Omit = omit,
+ tools: Iterable[ToolsParam] | Omit = omit,
+ top_k: int | Omit = omit,
+ top_p: float | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
+ return await self._post(
+ "/chat/completions",
+ body=await async_maybe_transform(
+ {
+ "messages": messages,
+ "model": model,
+ "context_length_exceeded_behavior": context_length_exceeded_behavior,
+ "echo": echo,
+ "frequency_penalty": frequency_penalty,
+ "function_call": function_call,
+ "logit_bias": logit_bias,
+ "logprobs": logprobs,
+ "max_tokens": max_tokens,
+ "min_p": min_p,
+ "n": n,
+ "presence_penalty": presence_penalty,
+ "reasoning_effort": reasoning_effort,
+ "repetition_penalty": repetition_penalty,
+ "response_format": response_format,
+ "safety_model": safety_model,
+ "seed": seed,
+ "stop": stop,
+ "stream": stream,
+ "temperature": temperature,
+ "tool_choice": tool_choice,
+ "tools": tools,
+ "top_k": top_k,
+ "top_p": top_p,
+ },
+ completion_create_params.CompletionCreateParamsStreaming
+ if stream
+ else completion_create_params.CompletionCreateParamsNonStreaming,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
  ),
- stream=stream,
+ cast_to=ChatCompletion,
+ stream=stream or False,
+ stream_cls=AsyncStream[ChatCompletionChunk],
  )

- if stream:
- # must be an iterator
- assert not isinstance(response, TogetherResponse)
- return (ChatCompletionChunk(**line.data) async for line in response)
- assert isinstance(response, TogetherResponse)
- return ChatCompletionResponse(**response.data)
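
Editor's note: the removed lines above are the 1.5.x request path: a ChatCompletionRequest dumped with exclude_none=True, a hand-built APIRequestor call, and a bare async generator of ChatCompletionChunk for streaming. The added implementation routes the same payload through _post() and hands streaming to AsyncStream via stream_cls, so the visible change for callers is mainly the return types. A rough sketch of the 2.x non-streaming call under those assumptions (the 1.5.x behavior is inferred from the removed code, not restated from its docs):

from together import AsyncTogether  # assumed client entry point


async def non_streaming_call(client: AsyncTogether) -> None:
    completion = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Hello"}],
        stream=False,  # `stream or False` in the new implementation keeps this a plain POST
    )
    # 1.5.x returned a ChatCompletionResponse built directly from the raw payload;
    # 2.x returns a ChatCompletion parsed by the generated client machinery.
    print(type(completion).__name__)
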
+
+ class CompletionsResourceWithRawResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_raw_response_wrapper(
+ completions.create,
+ )
+
+
+ class AsyncCompletionsResourceWithRawResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_raw_response_wrapper(
+ completions.create,
+ )
+
+
+ class CompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: CompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = to_streamed_response_wrapper(
+ completions.create,
+ )
+
+
+ class AsyncCompletionsResourceWithStreamingResponse:
+ def __init__(self, completions: AsyncCompletionsResource) -> None:
+ self._completions = completions
+
+ self.create = async_to_streamed_response_wrapper(
+ completions.create,
+ )
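
Editor's note: the four wrapper classes above re-expose create through to_raw_response_wrapper / to_streamed_response_wrapper and their async counterparts. Assuming the resource surfaces them through with_raw_response and with_streaming_response properties, as is conventional for clients generated in this style (the property names are not shown in this hunk), usage would look roughly like the sketch below; the .headers and .parse() accessors are also assumptions based on the wrapper pattern, not lines from this diff.

from together import Together  # assumed synchronous client entry point

client = Together()

# Raw-response access: get the HTTP-level response first, then the parsed model on demand.
response = client.chat.completions.with_raw_response.create(
    model="Qwen/Qwen2.5-72B-Instruct-Turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.headers.get("x-request-id"))  # inspect response headers (assumed accessor)
completion = response.parse()                # parsed ChatCompletion model (assumed accessor)
print(completion.choices[0].message.content)
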