together 1.5.34__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. together/__init__.py +101 -114
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +65 -81
  30. together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
  33. together/{cli → lib/cli}/api/models.py +34 -27
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +16 -26
  36. together/{constants.py → lib/constants.py} +11 -24
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +397 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/{utils → lib/utils}/files.py +90 -288
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -39
  50. together/resources/audio/__init__.py +72 -48
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -128
  53. together/resources/audio/transcriptions.py +247 -261
  54. together/resources/audio/translations.py +221 -241
  55. together/resources/audio/voices.py +111 -41
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +589 -477
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -129
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +258 -104
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +223 -193
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +286 -214
  75. together/types/__init__.py +66 -167
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_create_response.py +10 -0
  157. together/types/video_job.py +57 -0
  158. together-2.0.0a6.dist-info/METADATA +729 -0
  159. together-2.0.0a6.dist-info/RECORD +165 -0
  160. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
  161. together-2.0.0a6.dist-info/entry_points.txt +2 -0
  162. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
  163. together/abstract/api_requestor.py +0 -770
  164. together/cli/api/chat.py +0 -298
  165. together/cli/api/completions.py +0 -119
  166. together/cli/api/images.py +0 -93
  167. together/cli/api/utils.py +0 -139
  168. together/client.py +0 -186
  169. together/error.py +0 -194
  170. together/filemanager.py +0 -635
  171. together/legacy/__init__.py +0 -0
  172. together/legacy/base.py +0 -27
  173. together/legacy/complete.py +0 -93
  174. together/legacy/embeddings.py +0 -27
  175. together/legacy/files.py +0 -146
  176. together/legacy/finetune.py +0 -177
  177. together/legacy/images.py +0 -27
  178. together/legacy/models.py +0 -44
  179. together/resources/batch.py +0 -165
  180. together/resources/code_interpreter.py +0 -82
  181. together/resources/evaluation.py +0 -808
  182. together/resources/finetune.py +0 -1388
  183. together/together_response.py +0 -50
  184. together/types/abstract.py +0 -26
  185. together/types/audio_speech.py +0 -311
  186. together/types/batch.py +0 -54
  187. together/types/chat_completions.py +0 -210
  188. together/types/code_interpreter.py +0 -57
  189. together/types/common.py +0 -67
  190. together/types/completions.py +0 -107
  191. together/types/embeddings.py +0 -35
  192. together/types/endpoints.py +0 -123
  193. together/types/error.py +0 -16
  194. together/types/evaluation.py +0 -93
  195. together/types/files.py +0 -93
  196. together/types/finetune.py +0 -464
  197. together/types/images.py +0 -42
  198. together/types/models.py +0 -96
  199. together/types/rerank.py +0 -43
  200. together/types/videos.py +0 -69
  201. together/utils/api_helpers.py +0 -124
  202. together/version.py +0 -6
  203. together-1.5.34.dist-info/METADATA +0 -583
  204. together-1.5.34.dist-info/RECORD +0 -77
  205. together-1.5.34.dist-info/entry_points.txt +0 -3
  206. /together/{abstract → lib/cli}/__init__.py +0 -0
  207. /together/{cli → lib/cli/api}/__init__.py +0 -0
  208. /together/{cli/api/__init__.py → py.typed} +0 -0
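
Many of the renames above move modules under a new together.lib package (for example together/utils/* → together/lib/utils/*). A small compatibility sketch under that assumption follows; whether these modules remain importable as public API in 2.0.0a6 is not established by the file list alone.

# Hedged compatibility sketch for the module moves listed above: the 2.0.0a6
# wheel relocates together/utils/* to together/lib/utils/*, so version-agnostic
# code can try the new path first. Importing these internal modules directly is
# an assumption; they may not be part of the supported public API.
try:
    from together.lib.utils import files as file_utils  # 2.0.0a6 layout
except ImportError:
    from together.utils import files as file_utils  # 1.5.34 layout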
together/resources/completions.py
@@ -1,261 +1,920 @@
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
  from __future__ import annotations

- from typing import AsyncGenerator, Dict, Iterator, List, Any
+ from typing import Dict, Union
+ from typing_extensions import Literal, overload
+
+ import httpx

- from together.abstract import api_requestor
- from together.together_response import TogetherResponse
- from together.types import (
-     CompletionChunk,
-     CompletionRequest,
-     CompletionResponse,
-     TogetherClient,
-     TogetherRequest,
+ from ..types import completion_create_params
+ from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+ from .._utils import required_args, maybe_transform, async_maybe_transform
+ from .._compat import cached_property
+ from .._resource import SyncAPIResource, AsyncAPIResource
+ from .._response import (
+     to_raw_response_wrapper,
+     to_streamed_response_wrapper,
+     async_to_raw_response_wrapper,
+     async_to_streamed_response_wrapper,
  )
+ from .._streaming import Stream, AsyncStream
+ from .._base_client import make_request_options
+ from ..types.completion import Completion
+ from ..types.completion_chunk import CompletionChunk
+
+ __all__ = ["CompletionsResource", "AsyncCompletionsResource"]
+
+
+ class CompletionsResource(SyncAPIResource):
+     @cached_property
+     def with_raw_response(self) -> CompletionsResourceWithRawResponse:
+         """
+         This property can be used as a prefix for any HTTP method call to return
+         the raw response object instead of the parsed content.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+         """
+         return CompletionsResourceWithRawResponse(self)

+     @cached_property
+     def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse:
+         """
+         An alternative to `.with_raw_response` that doesn't eagerly read the response body.

- class Completions:
-     def __init__(self, client: TogetherClient) -> None:
-         self._client = client
+         For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+         """
+         return CompletionsResourceWithStreamingResponse(self)

+     @overload
      def create(
          self,
          *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
          prompt: str,
-         model: str,
-         max_tokens: int | None = 512,
-         stop: List[str] | None = None,
-         temperature: float | None = None,
-         top_p: float | None = None,
-         top_k: int | None = None,
-         repetition_penalty: float | None = None,
-         presence_penalty: float | None = None,
-         frequency_penalty: float | None = None,
-         min_p: float | None = None,
-         logit_bias: Dict[str, float] | None = None,
-         seed: int | None = None,
-         stream: bool = False,
-         logprobs: int | None = None,
-         echo: bool | None = None,
-         n: int | None = None,
-         safety_model: str | None = None,
-         **kwargs: Any,
-     ) -> CompletionResponse | Iterator[CompletionChunk]:
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion:
          """
-         Method to generate completions based on a given prompt using a specified model.
+         Query a language, code, or image model.

          Args:
-             prompt (str): A string providing context for the model to complete.
-             model (str): The name of the model to query.
-             max_tokens (int, optional): The maximum number of tokens to generate.
-                 Defaults to 512.
-             stop (List[str], optional): List of strings at which to stop generation.
-                 Defaults to None.
-             temperature (float, optional): A decimal number that determines the degree of randomness in the response.
-                 Defaults to None.
-             top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
-                 of choices for each predicted token based on the cumulative probabilities.
-                 Defaults to None.
-             top_k (int, optional): The top_k parameter is used to limit the number of choices for the
-                 next predicted word or token.
-                 Defaults to None.
-             repetition_penalty (float, optional): A number that controls the diversity of generated text
-                 by reducing the likelihood of repeated sequences. Higher values decrease repetition.
-                 Defaults to None.
-             presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
-                 appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
-                 of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
-                 be considered during sampling.
-                 Must be in the range [0, 1].
-                 Defaults to None.
-             logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
-                 likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
-                 Defaults to None.
-             seed (int, optional): Seed value for reproducibility.
-             stream (bool, optional): Flag indicating whether to stream the generated completions.
-                 Defaults to False.
-             logprobs (int, optional): Number of top-k logprobs to return
-                 Defaults to None.
-             echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
-                 Defaults to None.
-             n (int, optional): Number of completions to generate. Setting to None will return a single generation.
-                 Defaults to None.
-             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
-                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
-                 Defaults to None.
-
-         Returns:
-             CompletionResponse | Iterator[CompletionChunk]: Object containing the completions
-                 or an iterator over completion chunks.
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
          """
+         ...

-         requestor = api_requestor.APIRequestor(
-             client=self._client,
-         )
+     @overload
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: Literal[True],
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Stream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).

-         parameter_payload = CompletionRequest(
-             model=model,
-             prompt=prompt,
-             top_p=top_p,
-             top_k=top_k,
-             temperature=temperature,
-             max_tokens=max_tokens,
-             stop=stop,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             frequency_penalty=frequency_penalty,
-             seed=seed,
-             min_p=min_p,
-             logit_bias=logit_bias,
-             stream=stream,
-             logprobs=logprobs,
-             echo=echo,
-             n=n,
-             safety_model=safety_model,
-             **kwargs,
-         ).model_dump(exclude_none=True)
-
-         response, _, _ = requestor.request(
-             options=TogetherRequest(
-                 method="POST",
-                 url="completions",
-                 params=parameter_payload,
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @overload
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: bool,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | Stream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+     def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Literal[True] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | Stream[CompletionChunk]:
+         return self._post(
+             "/completions",
+             body=maybe_transform(
+                 {
+                     "model": model,
+                     "prompt": prompt,
+                     "echo": echo,
+                     "frequency_penalty": frequency_penalty,
+                     "logit_bias": logit_bias,
+                     "logprobs": logprobs,
+                     "max_tokens": max_tokens,
+                     "min_p": min_p,
+                     "n": n,
+                     "presence_penalty": presence_penalty,
+                     "repetition_penalty": repetition_penalty,
+                     "safety_model": safety_model,
+                     "seed": seed,
+                     "stop": stop,
+                     "stream": stream,
+                     "temperature": temperature,
+                     "top_k": top_k,
+                     "top_p": top_p,
+                 },
+                 completion_create_params.CompletionCreateParamsStreaming
+                 if stream
+                 else completion_create_params.CompletionCreateParamsNonStreaming,
+             ),
+             options=make_request_options(
+                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
              ),
-             stream=stream,
+             cast_to=Completion,
+             stream=stream or False,
+             stream_cls=Stream[CompletionChunk],
          )

-         if stream:
-             # must be an iterator
-             assert not isinstance(response, TogetherResponse)
-             return (CompletionChunk(**line.data) for line in response)
-         assert isinstance(response, TogetherResponse)
-         return CompletionResponse(**response.data)

+ class AsyncCompletionsResource(AsyncAPIResource):
+     @cached_property
+     def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse:
+         """
+         This property can be used as a prefix for any HTTP method call to return
+         the raw response object instead of the parsed content.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#accessing-raw-response-data-eg-headers
+         """
+         return AsyncCompletionsResourceWithRawResponse(self)

- class AsyncCompletions:
-     def __init__(self, client: TogetherClient) -> None:
-         self._client = client
+     @cached_property
+     def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse:
+         """
+         An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+         For more information, see https://www.github.com/togethercomputer/together-py#with_streaming_response
+         """
+         return AsyncCompletionsResourceWithStreamingResponse(self)

+     @overload
      async def create(
          self,
          *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
          prompt: str,
-         model: str,
-         max_tokens: int | None = 512,
-         stop: List[str] | None = None,
-         temperature: float | None = None,
-         top_p: float | None = None,
-         top_k: int | None = None,
-         repetition_penalty: float | None = None,
-         presence_penalty: float | None = None,
-         frequency_penalty: float | None = None,
-         min_p: float | None = None,
-         logit_bias: Dict[str, float] | None = None,
-         seed: int | None = None,
-         stream: bool = False,
-         logprobs: int | None = None,
-         echo: bool | None = None,
-         n: int | None = None,
-         safety_model: str | None = None,
-         **kwargs: Any,
-     ) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse:
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion:
          """
-         Async method to generate completions based on a given prompt using a specified model.
+         Query a language, code, or image model.

          Args:
-             prompt (str): A string providing context for the model to complete.
-             model (str): The name of the model to query.
-             max_tokens (int, optional): The maximum number of tokens to generate.
-                 Defaults to 512.
-             stop (List[str], optional): List of strings at which to stop generation.
-                 Defaults to None.
-             temperature (float, optional): A decimal number that determines the degree of randomness in the response.
-                 Defaults to None.
-             top_p (float, optional): The top_p (nucleus) parameter is used to dynamically adjust the number
-                 of choices for each predicted token based on the cumulative probabilities.
-                 Defaults to None.
-             top_k (int, optional): The top_k parameter is used to limit the number of choices for the
-                 next predicted word or token.
-                 Defaults to None.
-             repetition_penalty (float, optional): A number that controls the diversity of generated text
-                 by reducing the likelihood of repeated sequences. Higher values decrease repetition.
-                 Defaults to None.
-             presence_penalty (float, optional): A number that controls the likelihood of tokens based on if they have
-                 appeared in the text. Positive values decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             frequency_penalty (float, optional): A number that controls the likelihood of tokens based on the frequency
-                 of their appearance in the text. Positive decrease the likelihood of repeated tokens or phrases.
-                 Must be in the range [-2, 2].
-                 Defaults to None.
-             min_p (float, optional): A number that controls the minimum percentage value that a token must reach to
-                 be considered during sampling.
-                 Must be in the range [0, 1].
-                 Defaults to None.
-             logit_bias (Dict[str, float], optional): A dictionary of tokens and their bias values that modify the
-                 likelihood of specific tokens being sampled. Bias values must be in the range [-100, 100].
-                 Defaults to None.
-             seed (int, optional): Seed value for reproducibility.
-             stream (bool, optional): Flag indicating whether to stream the generated completions.
-                 Defaults to False.
-             logprobs (int, optional): Number of top-k logprobs to return
-                 Defaults to None.
-             echo (bool, optional): Echo prompt in output. Can be used with logprobs to return prompt logprobs.
-                 Defaults to None.
-             n (int, optional): Number of completions to generate. Setting to None will return a single generation.
-                 Defaults to None.
-             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
-                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
-                 Defaults to None.
-
-         Returns:
-             AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions
-                 or an iterator over completion chunks.
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
          """
+         ...

-         requestor = api_requestor.APIRequestor(
-             client=self._client,
-         )
+     @overload
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: Literal[True],
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> AsyncStream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.
+
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @overload
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         stream: bool,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | AsyncStream[CompletionChunk]:
+         """
+         Query a language, code, or image model.
+
+         Args:
+           model: The name of the model to query.
+
+               [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)
+
+           prompt: A string providing context for the model to complete.
+
+           stream: If true, stream tokens as Server-Sent Events as the model generates them instead
+               of waiting for the full model response. The stream terminates with
+               `data: [DONE]`. If false, return a single JSON object containing the results.
+
+           echo: If true, the response will contain the prompt. Can be used with `logprobs` to
+               return prompt logprobs.
+
+           frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+               repeating tokens that have already been mentioned.
+
+           logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output.
+
+           logprobs: An integer between 0 and 20 of the top k tokens to return log probabilities for
+               at each generation step, instead of just the sampled token. Log probabilities
+               help assess model confidence in token predictions.
+
+           max_tokens: The maximum number of tokens to generate.
+
+           min_p: A number between 0 and 1 that can be used as an alternative to top-p and top-k.
+
+           n: The number of completions to generate for each prompt.
+
+           presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of
+               a model talking about new topics.
+
+           repetition_penalty: A number that controls the diversity of generated text by reducing the
+               likelihood of repeated sequences. Higher values decrease repetition.
+
+           safety_model: The name of the moderation model used to validate tokens. Choose from the
+               available moderation models found
+               [here](https://docs.together.ai/docs/inference-models#moderation-models).
+
+           seed: Seed value for reproducibility.
+
+           stop: A list of string sequences that will truncate (stop) inference text output. For
+               example, "</s>" will stop generation as soon as the model generates the given
+               token.
+
+           temperature: A decimal number from 0-1 that determines the degree of randomness in the
+               response. A temperature less than 1 favors more correctness and is appropriate
+               for question answering or summarization. A value closer to 1 introduces more
+               randomness in the output.
+
+           top_k: An integer that's used to limit the number of choices for the next predicted
+               word or token. It specifies the maximum number of tokens to consider at each
+               step, based on their probability of occurrence. This technique helps to speed up
+               the generation process and can improve the quality of the generated text by
+               focusing on the most likely options.

-         parameter_payload = CompletionRequest(
-             model=model,
-             prompt=prompt,
-             top_p=top_p,
-             top_k=top_k,
-             temperature=temperature,
-             max_tokens=max_tokens,
-             stop=stop,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             frequency_penalty=frequency_penalty,
-             min_p=min_p,
-             logit_bias=logit_bias,
-             seed=seed,
-             stream=stream,
-             logprobs=logprobs,
-             echo=echo,
-             n=n,
-             safety_model=safety_model,
-             **kwargs,
-         ).model_dump(exclude_none=True)
-
-         response, _, _ = await requestor.arequest(
-             options=TogetherRequest(
-                 method="POST",
-                 url="completions",
-                 params=parameter_payload,
+           top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+               adjust the number of choices for each predicted token based on the cumulative
+               probabilities. It specifies a probability threshold below which all less likely
+               tokens are filtered out. This technique helps maintain diversity and generate
+               more fluent and natural-sounding text.
+
+           extra_headers: Send extra headers
+
+           extra_query: Add additional query parameters to the request
+
+           extra_body: Add additional JSON properties to the request
+
+           timeout: Override the client-level default timeout for this request, in seconds
+         """
+         ...
+
+     @required_args(["model", "prompt"], ["model", "prompt", "stream"])
+     async def create(
+         self,
+         *,
+         model: Union[
+             Literal[
+                 "meta-llama/Llama-2-70b-hf",
+                 "mistralai/Mistral-7B-v0.1",
+                 "mistralai/Mixtral-8x7B-v0.1",
+                 "Meta-Llama/Llama-Guard-7b",
+             ],
+             str,
+         ],
+         prompt: str,
+         echo: bool | Omit = omit,
+         frequency_penalty: float | Omit = omit,
+         logit_bias: Dict[str, float] | Omit = omit,
+         logprobs: int | Omit = omit,
+         max_tokens: int | Omit = omit,
+         min_p: float | Omit = omit,
+         n: int | Omit = omit,
+         presence_penalty: float | Omit = omit,
+         repetition_penalty: float | Omit = omit,
+         safety_model: Union[Literal["Meta-Llama/Llama-Guard-7b"], str] | Omit = omit,
+         seed: int | Omit = omit,
+         stop: SequenceNotStr[str] | Omit = omit,
+         stream: Literal[False] | Literal[True] | Omit = omit,
+         temperature: float | Omit = omit,
+         top_k: int | Omit = omit,
+         top_p: float | Omit = omit,
+         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+         # The extra values given here take precedence over values defined on the client or passed to this method.
+         extra_headers: Headers | None = None,
+         extra_query: Query | None = None,
+         extra_body: Body | None = None,
+         timeout: float | httpx.Timeout | None | NotGiven = not_given,
+     ) -> Completion | AsyncStream[CompletionChunk]:
+         return await self._post(
+             "/completions",
+             body=await async_maybe_transform(
+                 {
+                     "model": model,
+                     "prompt": prompt,
+                     "echo": echo,
+                     "frequency_penalty": frequency_penalty,
+                     "logit_bias": logit_bias,
+                     "logprobs": logprobs,
+                     "max_tokens": max_tokens,
+                     "min_p": min_p,
+                     "n": n,
+                     "presence_penalty": presence_penalty,
+                     "repetition_penalty": repetition_penalty,
+                     "safety_model": safety_model,
+                     "seed": seed,
+                     "stop": stop,
+                     "stream": stream,
+                     "temperature": temperature,
+                     "top_k": top_k,
+                     "top_p": top_p,
+                 },
+                 completion_create_params.CompletionCreateParamsStreaming
+                 if stream
+                 else completion_create_params.CompletionCreateParamsNonStreaming,
              ),
-             stream=stream,
+             options=make_request_options(
+                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+             ),
+             cast_to=Completion,
+             stream=stream or False,
+             stream_cls=AsyncStream[CompletionChunk],
+         )
+
+
+ class CompletionsResourceWithRawResponse:
+     def __init__(self, completions: CompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = to_raw_response_wrapper(
+             completions.create,
          )

-         if stream:
-             # must be an iterator
-             assert not isinstance(response, TogetherResponse)
-             return (CompletionChunk(**line.data) async for line in response)
-         assert isinstance(response, TogetherResponse)
-         return CompletionResponse(**response.data)
+
+ class AsyncCompletionsResourceWithRawResponse:
+     def __init__(self, completions: AsyncCompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = async_to_raw_response_wrapper(
+             completions.create,
+         )
+
+
+ class CompletionsResourceWithStreamingResponse:
+     def __init__(self, completions: CompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = to_streamed_response_wrapper(
+             completions.create,
+         )
+
+
+ class AsyncCompletionsResourceWithStreamingResponse:
+     def __init__(self, completions: AsyncCompletionsResource) -> None:
+         self._completions = completions
+
+         self.create = async_to_streamed_response_wrapper(
+             completions.create,
+         )