vectorvein 0.1.93__tar.gz → 0.1.94__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vectorvein-0.1.93 → vectorvein-0.1.94}/PKG-INFO +1 -1
- {vectorvein-0.1.93 → vectorvein-0.1.94}/pyproject.toml +1 -1
- vectorvein-0.1.94/src/vectorvein/chat_clients/minimax_client.py +13 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/openai_compatible_client.py +32 -2
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/utils.py +4 -1
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/defaults.py +24 -44
- vectorvein-0.1.93/src/vectorvein/chat_clients/minimax_client.py +0 -548
- {vectorvein-0.1.93 → vectorvein-0.1.94}/README.md +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/__init__.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/__init__.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/anthropic_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/baichuan_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/base_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/deepseek_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/gemini_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/groq_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/local_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/mistral_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/moonshot_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/openai_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/py.typed +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/qwen_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/stepfun_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/xai_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/yi_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/zhipuai_client.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/py.typed +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/server/token_server.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/settings/__init__.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/settings/py.typed +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/enums.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/exception.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/llm_parameters.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/py.typed +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/utilities/media_processing.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/utilities/rate_limiter.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/utilities/retry.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/graph/edge.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/graph/node.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/graph/port.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/graph/workflow.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/__init__.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/audio_generation.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/control_flows.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/file_processing.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/image_generation.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/llms.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/media_editing.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/media_processing.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/output.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/relational_db.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/text_processing.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/tools.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/triggers.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/vector_db.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/video_generation.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/nodes/web_crawlers.py +0 -0
- {vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/workflow/utils/json_to_code.py +0 -0
vectorvein-0.1.94/src/vectorvein/chat_clients/minimax_client.py
ADDED
@@ -0,0 +1,13 @@
+from ..types.enums import BackendType
+from ..types.defaults import MINIMAX_DEFAULT_MODEL
+from .openai_compatible_client import OpenAICompatibleChatClient, AsyncOpenAICompatibleChatClient
+
+
+class MiniMaxChatClient(OpenAICompatibleChatClient):
+    DEFAULT_MODEL = MINIMAX_DEFAULT_MODEL
+    BACKEND_NAME = BackendType.MiniMax
+
+
+class AsyncMiniMaxChatClient(AsyncOpenAICompatibleChatClient):
+    DEFAULT_MODEL = MINIMAX_DEFAULT_MODEL
+    BACKEND_NAME = BackendType.MiniMax
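The new module above replaces the hand-rolled HTTP client (deleted at the bottom of this diff) with thin subclasses of the OpenAI-compatible clients, so MiniMax is now driven through the same code path as every other backend. A minimal usage sketch, assuming backend settings and endpoints are already configured elsewhere; the model name, prompt, and the `.content` attribute access are illustrative assumptions, not taken from this diff:

```python
from vectorvein.chat_clients.minimax_client import MiniMaxChatClient

# Hypothetical example: "abab6.5s-chat" and the message are placeholders,
# and ChatCompletionMessage is assumed to expose a .content attribute.
client = MiniMaxChatClient(model="abab6.5s-chat", stream=False)
message = client.create_completion(
    messages=[{"role": "user", "content": "Hello"}],
)
print(message.content)
```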
{vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/openai_compatible_client.py
RENAMED
@@ -181,7 +181,21 @@ class OpenAICompatibleChatClient(BaseChatClient):
 
         if tools:
             if self.model_setting.function_call_available:
-                tools_params = dict(tools=tools, tool_choice=tool_choice)
+                _tools = tools
+                if self.BACKEND_NAME.value == BackendType.MiniMax.value:  # MiniMax just has to be different
+                    _tools = []
+                    for tool in tools:
+                        _tools.append(
+                            {
+                                "type": "function",
+                                "function": {
+                                    "name": tool["function"]["name"],
+                                    "description": tool["function"].get("description", ""),
+                                    "parameters": json.dumps(tool["function"].get("parameters", {})),
+                                },
+                            }
+                        )
+                tools_params = dict(tools=_tools, tool_choice=tool_choice)
             else:
                 tools_str = json.dumps(tools, ensure_ascii=False, indent=None)
                 additional_system_prompt = generate_tool_use_system_prompt(tools=tools_str)
@@ -256,6 +270,7 @@ class OpenAICompatibleChatClient(BaseChatClient):
                 if chunk.choices[0].delta.tool_calls:
                     for index, tool_call in enumerate(chunk.choices[0].delta.tool_calls):
                         tool_call.index = index
+                        tool_call.type = "function"  # another issue caused by MiniMax's non-standard behavior
                     yield ChatCompletionDeltaMessage(**chunk.choices[0].delta.model_dump(), usage=usage)
                 else:
                     message = chunk.choices[0].delta.model_dump()
@@ -509,7 +524,21 @@ class AsyncOpenAICompatibleChatClient(BaseAsyncChatClient):
 
         if tools:
             if self.model_setting.function_call_available:
-                tools_params = dict(tools=tools, tool_choice=tool_choice)
+                _tools = tools
+                if self.BACKEND_NAME.value == BackendType.MiniMax.value:
+                    _tools = []
+                    for tool in tools:
+                        _tools.append(
+                            {
+                                "type": "function",
+                                "function": {
+                                    "name": tool["function"]["name"],
+                                    "description": tool["function"].get("description", ""),
+                                    "parameters": json.dumps(tool["function"].get("parameters", {})),
+                                },
+                            }
+                        )
+                tools_params = dict(tools=_tools, tool_choice=tool_choice)
             else:
                 tools_str = json.dumps(tools, ensure_ascii=False, indent=None)
                 additional_system_prompt = generate_tool_use_system_prompt(tools=tools_str)
@@ -584,6 +613,7 @@ class AsyncOpenAICompatibleChatClient(BaseAsyncChatClient):
                 if chunk.choices[0].delta.tool_calls:
                     for index, tool_call in enumerate(chunk.choices[0].delta.tool_calls):
                         tool_call.index = index
+                        tool_call.type = "function"
                     yield ChatCompletionDeltaMessage(**chunk.choices[0].delta.model_dump(), usage=usage)
                 else:
                     message = chunk.choices[0].delta.model_dump()
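Both the sync and async hunks above apply the same special case for MiniMax: OpenAI-style tool definitions are passed through unchanged for every other backend, while for MiniMax the `function.parameters` object must be re-serialized as a JSON string. A standalone sketch of that conversion; the helper name and the example tool definition are illustrative, not part of the package API:

```python
import json


def to_minimax_tools(tools):
    """Sketch of the conversion applied when BACKEND_NAME is MiniMax:
    identical to the OpenAI tool schema except that "parameters" becomes
    a JSON string instead of a nested object."""
    converted = []
    for tool in tools:
        converted.append(
            {
                "type": "function",
                "function": {
                    "name": tool["function"]["name"],
                    "description": tool["function"].get("description", ""),
                    "parameters": json.dumps(tool["function"].get("parameters", {})),
                },
            }
        )
    return converted


# Illustrative OpenAI-style tool definition (placeholder, not from the diff)
openai_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the weather for a city",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}
print(to_minimax_tools([openai_tool])[0]["function"]["parameters"])  # prints a JSON string
```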
{vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/chat_clients/utils.py
RENAMED
@@ -311,10 +311,13 @@ def get_token_counts(text: str | dict, model: str = "", use_token_server_first:
         result = response.json()
         return result["usage"]["prompt_tokens"]
     else:
-        return len(
+        return len(get_gpt_4o_encoding().encode(text))
 
 
 def calculate_image_tokens(width: int, height: int, model: str = "gpt-4o"):
+    if model.startswith("moonshot"):
+        return 1024
+
     if width > 2048 or height > 2048:
         aspect_ratio = width / height
         if aspect_ratio > 1:
{vectorvein-0.1.93 → vectorvein-0.1.94}/src/vectorvein/types/defaults.py
RENAMED
@@ -33,6 +33,28 @@ MOONSHOT_MODELS: Final[Dict[str, Dict[str, Any]]] = {
         "function_call_available": True,
         "response_format_available": True,
     },
+    "moonshot-v1-8k-vision-preview": {
+        "id": "moonshot-v1-8k-vision-preview",
+        "context_length": 8192,
+        "max_output_tokens": 4096,
+        "function_call_available": True,
+        "response_format_available": True,
+        "native_multimodal": True,
+    },
+    "moonshot-v1-32k-vision-preview": {
+        "id": "moonshot-v1-32k-vision-preview",
+        "context_length": 32768,
+        "function_call_available": True,
+        "response_format_available": True,
+        "native_multimodal": True,
+    },
+    "moonshot-v1-128k-vision-preview": {
+        "id": "moonshot-v1-128k-vision-preview",
+        "context_length": 131072,
+        "function_call_available": True,
+        "response_format_available": True,
+        "native_multimodal": True,
+    },
 }
 
 # Deepseek models
@@ -293,13 +315,6 @@ QWEN_MODELS: Final[Dict[str, Dict[str, Any]]] = {
 # Yi models
 YI_DEFAULT_MODEL: Final[str] = "yi-lightning"
 YI_MODELS: Final[Dict[str, Dict[str, Any]]] = {
-    "yi-large": {
-        "id": "yi-large",
-        "context_length": 32000,
-        "max_output_tokens": 4096,
-        "function_call_available": False,
-        "response_format_available": False,
-    },
     "yi-lightning": {
         "id": "yi-lightning",
         "context_length": 16000,
@@ -307,44 +322,9 @@ YI_MODELS: Final[Dict[str, Dict[str, Any]]] = {
         "function_call_available": False,
         "response_format_available": False,
     },
-    "yi-
-        "id": "yi-
+    "yi-vision-v2": {
+        "id": "yi-vision-v2",
         "context_length": 16000,
-        "max_output_tokens": 4096,
-        "function_call_available": False,
-        "response_format_available": False,
-    },
-    "yi-large-fc": {
-        "id": "yi-large-fc",
-        "context_length": 32000,
-        "max_output_tokens": 4096,
-        "function_call_available": True,
-        "response_format_available": False,
-    },
-    "yi-medium": {
-        "id": "yi-medium",
-        "context_length": 16000,
-        "max_output_tokens": 4096,
-        "function_call_available": False,
-        "response_format_available": False,
-    },
-    "yi-medium-200k": {
-        "id": "yi-medium-200k",
-        "context_length": 200000,
-        "max_output_tokens": 4096,
-        "function_call_available": False,
-        "response_format_available": False,
-    },
-    "yi-spark": {
-        "id": "yi-spark",
-        "context_length": 16000,
-        "max_output_tokens": 4096,
-        "function_call_available": False,
-        "response_format_available": False,
-    },
-    "yi-vision": {
-        "id": "yi-vision",
-        "context_length": 4000,
         "max_output_tokens": 2000,
         "function_call_available": False,
         "response_format_available": False,
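The defaults tables above are plain dictionaries keyed by model name; the chat clients read per-model limits and capability flags from them (for example `context_length`, `function_call_available`, and the `native_multimodal` flag added for the Moonshot vision-preview entries). A small illustrative lookup against the new entries, values taken from this diff:

```python
from vectorvein.types.defaults import MOONSHOT_MODELS

# Illustrative only: inspect one of the newly added vision-preview entries.
model = MOONSHOT_MODELS["moonshot-v1-32k-vision-preview"]
print(model["context_length"])           # 32768
print(model.get("native_multimodal"))    # True
print(model["function_call_available"])  # True
```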
vectorvein-0.1.93/src/vectorvein/chat_clients/minimax_client.py
DELETED
@@ -1,548 +0,0 @@
-# @Author: Bi Ying
-# @Date: 2024-07-26 14:48:55
-import json
-from functools import cached_property
-from typing import Iterable, Literal, Generator, AsyncGenerator, overload, Any
-
-import httpx
-
-from ..types import defaults as defs
-from .utils import cutoff_messages, get_token_counts
-from .base_client import BaseChatClient, BaseAsyncChatClient
-from ..types.enums import ContextLengthControlType, BackendType
-from ..types.llm_parameters import (
-    NotGiven,
-    NOT_GIVEN,
-    ToolParam,
-    ToolChoice,
-    ChatCompletionMessage,
-    ChatCompletionDeltaMessage,
-    ChatCompletionStreamOptionsParam,
-)
-
-
-def extract_tool_calls(response):
-    try:
-        message = response["choices"][0].get("delta") or response["choices"][0].get("message", {})
-        tool_calls = message.get("tool_calls")
-        if tool_calls:
-            return {
-                "tool_calls": [
-                    {
-                        "index": index,
-                        "id": tool_call["id"],
-                        "function": tool_call["function"],
-                        "type": "function",
-                    }
-                    for index, tool_call in enumerate(tool_calls)
-                ]
-            }
-        else:
-            return {}
-    except Exception:
-        return {}
-
-
-class MiniMaxChatClient(BaseChatClient):
-    DEFAULT_MODEL: str = defs.MINIMAX_DEFAULT_MODEL
-    BACKEND_NAME: BackendType = BackendType.MiniMax
-
-    def __init__(
-        self,
-        model: str = defs.MINIMAX_DEFAULT_MODEL,
-        stream: bool = True,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        context_length_control: ContextLengthControlType = defs.CONTEXT_LENGTH_CONTROL,
-        random_endpoint: bool = True,
-        endpoint_id: str = "",
-        http_client: httpx.Client | None = None,
-        backend_name: str | None = None,
-    ):
-        super().__init__(
-            model,
-            stream,
-            temperature,
-            context_length_control,
-            random_endpoint,
-            endpoint_id,
-            http_client,
-            backend_name,
-        )
-        self.model_id = None
-        self.endpoint = None
-
-    @cached_property
-    def raw_client(self):
-        self.endpoint, self.model_id = self._set_endpoint()
-        if not self.http_client:
-            self.http_client = httpx.Client(timeout=300, proxy=self.endpoint.proxy)
-        return self.http_client
-
-    @overload
-    def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[False] = False,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> ChatCompletionMessage:
-        pass
-
-    @overload
-    def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[True],
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> Generator[ChatCompletionDeltaMessage, None, None]:
-        pass
-
-    @overload
-    def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: bool,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> ChatCompletionMessage | Generator[ChatCompletionDeltaMessage, Any, None]:
-        pass
-
-    def create_completion(
-        self,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[False] | Literal[True] = False,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ):
-        if model is not None:
-            self.model = model
-        if stream is not None:
-            self.stream = stream
-        if temperature is not None:
-            self.temperature = temperature
-        if isinstance(tool_choice, NotGiven):
-            tool_choice = "auto"
-
-        self.model_setting = self.backend_settings.models[self.model]
-        if self.model_id is None:
-            self.model_id = self.model_setting.id
-
-        self.endpoint, self.model_id = self._set_endpoint()
-
-        if not skip_cutoff and self.context_length_control == ContextLengthControlType.Latest:
-            messages = cutoff_messages(
-                messages,
-                max_count=self.model_setting.context_length,
-                backend=self.BACKEND_NAME,
-                model=self.model_setting.id,
-            )
-
-        if tools:
-            tools_params = {
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "name": tool["function"]["name"],
-                            "description": tool["function"].get("description", ""),
-                            "parameters": json.dumps(
-                                tool["function"].get("parameters", {})
-                            ),  # insists on being different: parameters is a string
-                        },
-                    }
-                    for tool in tools
-                ],
-                "tool_choice": tool_choice,
-            }
-        else:
-            tools_params = {}
-
-        if top_p:
-            top_p_params = {"top_p": top_p}
-        else:
-            top_p_params = {}
-
-        temperature_params = {}
-        if temperature:
-            temperature_params = {"temperature": temperature}
-
-        if max_tokens is None:
-            max_output_tokens = self.model_setting.max_output_tokens
-            if max_output_tokens is not None:
-                token_counts = get_token_counts(
-                    text={"messages": messages, "tools_params": tools_params},
-                    model=self.model,
-                    use_token_server_first=True,
-                )
-                max_tokens = self.model_setting.context_length - token_counts
-                max_tokens = min(max(max_tokens, 1), max_output_tokens)
-            else:
-                token_counts = get_token_counts(
-                    text={"messages": messages, "tools_params": tools_params},
-                    model=self.model,
-                    use_token_server_first=True,
-                )
-                max_tokens = self.model_setting.context_length - token_counts
-
-        self.url = self.endpoint.api_base or "https://api.minimax.chat/v1/text/chatcompletion_v2"
-        self.headers = {"Authorization": f"Bearer {self.endpoint.api_key}", "Content-Type": "application/json"}
-
-        request_body = {
-            "model": self.model_id,
-            "messages": messages,
-            "max_tokens": max_tokens,
-            "stream": self.stream,
-            "mask_sensitive_info": False,
-            **temperature_params,
-            **top_p_params,
-            **tools_params,
-            **kwargs,
-        }
-
-        raw_client = self.raw_client
-
-        if self.stream:
-
-            def generator():
-                with raw_client.stream(
-                    "POST",
-                    url=self.url,
-                    headers=self.headers,
-                    json=request_body,
-                    timeout=300,
-                ) as response:
-                    for chunk in response.iter_lines():
-                        if chunk:
-                            chunk_data = json.loads(chunk[6:])
-                            if chunk_data["object"] != "chat.completion.chunk":
-                                continue
-                            tool_calls_params = extract_tool_calls(chunk_data)
-                            has_tool_calls = True if tool_calls_params else False
-                            if has_tool_calls:
-                                yield ChatCompletionDeltaMessage(
-                                    **{
-                                        "content": chunk_data["choices"][0]["delta"].get("content"),
-                                        "role": "assistant",
-                                        **tool_calls_params,
-                                    }
-                                )
-                            else:
-                                yield ChatCompletionDeltaMessage(
-                                    **{
-                                        "content": chunk_data["choices"][0]["delta"]["content"],
-                                        "role": "assistant",
-                                    }
-                                )
-
-            return generator()
-        else:
-            response = raw_client.post(
-                url=self.url,
-                headers=self.headers,
-                json=request_body,
-                timeout=300,
-            )
-            result = response.json()
-            tool_calls_params = extract_tool_calls(result)
-            return ChatCompletionMessage(
-                **{
-                    "content": result["choices"][0]["message"].get("content"),
-                    "usage": {
-                        "prompt_tokens": 0,
-                        "completion_tokens": result["usage"]["total_tokens"],
-                        "total_tokens": result["usage"]["total_tokens"],
-                    },
-                    "role": "assistant",
-                    **tool_calls_params,
-                }
-            )
-
-
-class AsyncMiniMaxChatClient(BaseAsyncChatClient):
-    DEFAULT_MODEL: str = defs.MINIMAX_DEFAULT_MODEL
-    BACKEND_NAME: BackendType = BackendType.MiniMax
-
-    def __init__(
-        self,
-        model: str = defs.MINIMAX_DEFAULT_MODEL,
-        stream: bool = True,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        context_length_control: ContextLengthControlType = defs.CONTEXT_LENGTH_CONTROL,
-        random_endpoint: bool = True,
-        endpoint_id: str = "",
-        http_client: httpx.AsyncClient | None = None,
-        backend_name: str | None = None,
-    ):
-        super().__init__(
-            model,
-            stream,
-            temperature,
-            context_length_control,
-            random_endpoint,
-            endpoint_id,
-            http_client,
-            backend_name,
-        )
-        self.model_id = None
-        self.endpoint = None
-
-    @cached_property
-    def raw_client(self):
-        self.endpoint, self.model_id = self._set_endpoint()
-        if not self.http_client:
-            self.http_client = httpx.AsyncClient(timeout=300, proxy=self.endpoint.proxy)
-        return self.http_client
-
-    @overload
-    async def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[False] = False,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> ChatCompletionMessage:
-        pass
-
-    @overload
-    async def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[True],
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> AsyncGenerator[ChatCompletionDeltaMessage, Any]:
-        pass
-
-    @overload
-    async def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: bool,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ) -> ChatCompletionMessage | AsyncGenerator[ChatCompletionDeltaMessage, Any]:
-        pass
-
-    async def create_completion(
-        self,
-        *,
-        messages: list,
-        model: str | None = None,
-        stream: Literal[False] | Literal[True] = False,
-        temperature: float | None | NotGiven = NOT_GIVEN,
-        max_tokens: int | None = None,
-        tools: Iterable[ToolParam] | NotGiven = NOT_GIVEN,
-        tool_choice: ToolChoice | NotGiven = NOT_GIVEN,
-        response_format: dict | None = None,
-        stream_options: ChatCompletionStreamOptionsParam | None = None,
-        top_p: float | NotGiven | None = NOT_GIVEN,
-        skip_cutoff: bool = False,
-        **kwargs,
-    ):
-        if model is not None:
-            self.model = model
-        if stream is not None:
-            self.stream = stream
-        if temperature is not None:
-            self.temperature = temperature
-        if isinstance(tool_choice, NotGiven):
-            tool_choice = "auto"
-
-        self.model_setting = self.backend_settings.models[self.model]
-        if self.model_id is None:
-            self.model_id = self.model_setting.id
-
-        self.endpoint, self.model_id = self._set_endpoint()
-
-        if not skip_cutoff and self.context_length_control == ContextLengthControlType.Latest:
-            messages = cutoff_messages(
-                messages,
-                max_count=self.model_setting.context_length,
-                backend=self.BACKEND_NAME,
-                model=self.model_setting.id,
-            )
-
-        if tools:
-            tools_params = {
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "name": tool["function"]["name"],
-                            "description": tool["function"].get("description", ""),
-                            "parameters": json.dumps(tool["function"].get("parameters", {})),
-                        },
-                    }
-                    for tool in tools
-                ],
-                "tool_choice": tool_choice,
-            }
-        else:
-            tools_params = {}
-
-        if top_p:
-            top_p_params = {"top_p": top_p}
-        else:
-            top_p_params = {}
-
-        temperature_params = {}
-        if temperature:
-            temperature_params = {"temperature": temperature}
-
-        if max_tokens is None:
-            max_output_tokens = self.model_setting.max_output_tokens
-            if max_output_tokens is not None:
-                token_counts = get_token_counts(
-                    text={"messages": messages, "tools_params": tools_params},
-                    model=self.model,
-                    use_token_server_first=True,
-                )
-                max_tokens = self.model_setting.context_length - token_counts
-                max_tokens = min(max(max_tokens, 1), max_output_tokens)
-            else:
-                token_counts = get_token_counts(
-                    text={"messages": messages, "tools_params": tools_params},
-                    model=self.model,
-                    use_token_server_first=True,
-                )
-                max_tokens = self.model_setting.context_length - token_counts
-
-        self.url = self.endpoint.api_base or "https://api.minimax.chat/v1/text/chatcompletion_v2"
-        self.headers = {"Authorization": f"Bearer {self.endpoint.api_key}", "Content-Type": "application/json"}
-
-        request_body = {
-            "model": self.model_id,
-            "messages": messages,
-            "max_tokens": max_tokens,
-            "stream": self.stream,
-            "mask_sensitive_info": False,
-            **temperature_params,
-            **top_p_params,
-            **tools_params,
-            **kwargs,
-        }
-
-        raw_client = self.raw_client
-
-        if self.stream:
-
-            async def generator():
-                async with raw_client.stream(
-                    "POST",
-                    url=self.url,
-                    headers=self.headers,
-                    json=request_body,
-                    timeout=300,
-                ) as response:
-                    has_tool_calls = False
-                    async for chunk in response.aiter_lines():
-                        if chunk:
-                            chunk_data = json.loads(chunk[6:])
-                            if chunk_data["object"] != "chat.completion.chunk":
-                                continue
-                            tool_calls_params = extract_tool_calls(chunk_data)
-                            has_tool_calls = True if tool_calls_params else False
-                            if has_tool_calls:
-                                yield ChatCompletionDeltaMessage(
-                                    **{
-                                        "content": chunk_data["choices"][0]["delta"].get("content"),
-                                        "role": "assistant",
-                                        **tool_calls_params,
-                                    }
-                                )
-                            else:
-                                yield ChatCompletionDeltaMessage(
-                                    **{
-                                        "content": chunk_data["choices"][0]["delta"]["content"],
-                                        "role": "assistant",
-                                    }
-                                )
-
-            return generator()
-        else:
-            response = await raw_client.post(
-                url=self.url,
-                headers=self.headers,
-                json=request_body,
-                timeout=300,
-            )
-            result = response.json()
-            tool_calls_params = extract_tool_calls(result)
-            return ChatCompletionMessage(
-                **{
-                    "content": result["choices"][0]["message"].get("content"),
-                    "usage": {
-                        "prompt_tokens": 0,
-                        "completion_tokens": result["usage"]["total_tokens"],
-                        "total_tokens": result["usage"]["total_tokens"],
-                    },
-                    "role": "assistant",
-                    **tool_calls_params,
-                }
-            )
-
-    async def __aexit__(self, exc_type, exc, tb):
-        await self.http_client.aclose()