langroid 0.56.10__py3-none-any.whl → 0.56.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/base.py CHANGED
@@ -2142,7 +2142,7 @@ class Agent(ABC):
             completion_tokens = self.num_tokens(response.message)
             if response.function_call is not None:
                 completion_tokens += self.num_tokens(str(response.function_call))
-            cost = self.compute_token_cost(prompt_tokens, completion_tokens)
+            cost = self.compute_token_cost(prompt_tokens, 0, completion_tokens)
             response.usage = LLMTokenUsage(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
@@ -2166,9 +2166,11 @@ class Agent(ABC):
         if print_response_stats:
             print(self.indent + self.token_stats_str)

-    def compute_token_cost(self, prompt: int, completion: int) -> float:
+    def compute_token_cost(self, prompt: int, cached: int, completion: int) -> float:
         price = cast(LanguageModel, self.llm).chat_cost()
-        return (price[0] * prompt + price[1] * completion) / 1000
+        return (
+            price[0] * (prompt - cached) + price[1] * cached + price[2] * completion
+        ) / 1000

     def ask_agent(
         self,
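Note (added for illustration, not part of the package): chat_cost() now returns a per-1K-token (input, cached, output) price triple, and the formula above bills cached prompt tokens at the discounted rate. A minimal sketch of the arithmetic, using the GPT-4o rates from model_info.py and made-up token counts:

```python
# (input, cached, output) USD per 1K tokens, derived from GPT-4o's
# 2.50 / 1.25 / 10.00 USD per million tokens in model_info.py
price = (2.50 / 1000, 1.25 / 1000, 10.00 / 1000)

prompt, cached, completion = 1000, 400, 200  # hypothetical call
cost = (
    price[0] * (prompt - cached)  # non-cached prompt tokens at the full input rate
    + price[1] * cached           # cached prompt tokens at the discounted rate
    + price[2] * completion       # completion tokens at the output rate
) / 1000
print(round(cost, 6))  # 0.004 (USD)
```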
langroid/agent/chat_agent.py CHANGED
@@ -2068,3 +2068,15 @@ class ChatAgent(Agent):
             return str(self.message_history[i])
         else:
             return "\n".join([str(m) for m in self.message_history[i:]])
+
+    def __del__(self) -> None:
+        """
+        Cleanup method called when the ChatAgent is garbage collected.
+        Note: We don't close LLM clients here because they may be shared
+        across multiple agents when client caching is enabled.
+        The clients are managed centrally and cleaned up via atexit hooks.
+        """
+        # Previously we closed clients here, but this caused issues when
+        # multiple agents shared the same cached client instance.
+        # Clients are now managed centrally in langroid.language_models.client_cache
+        pass
langroid/language_models/base.py CHANGED
@@ -91,10 +91,6 @@ class LLMConfig(BaseSettings):
     # reasoning output from reasoning models
     cache_config: None | CacheDBConfig = RedisCacheConfig()
     thought_delimiters: Tuple[str, str] = ("<think>", "</think>")
-
-    # Dict of model -> (input/prompt cost, output/completion cost)
-    chat_cost_per_1k_tokens: Tuple[float, float] = (0.0, 0.0)
-    completion_cost_per_1k_tokens: Tuple[float, float] = (0.0, 0.0)
     retry_params: RetryParams = RetryParams()

     @property
@@ -131,7 +127,7 @@ class LLMFunctionCall(BaseModel):
         if not isinstance(dict_or_list, dict):
             raise ValueError(
                 f"""
-                Invalid function args: {fun_args_str}
+                Invalid function args: {fun_args_str}
                 parsed as {dict_or_list},
                 which is not a valid dict.
                 """
@@ -224,12 +220,14 @@ class LLMTokenUsage(BaseModel):
     """

     prompt_tokens: int = 0
+    cached_tokens: int = 0
     completion_tokens: int = 0
     cost: float = 0.0
     calls: int = 0  # how many API calls - not used as of 2025-04-04

     def reset(self) -> None:
         self.prompt_tokens = 0
+        self.cached_tokens = 0
         self.completion_tokens = 0
         self.cost = 0.0
         self.calls = 0
@@ -237,7 +235,8 @@ class LLMTokenUsage(BaseModel):
     def __str__(self) -> str:
         return (
             f"Tokens = "
-            f"(prompt {self.prompt_tokens}, completion {self.completion_tokens}), "
+            f"(prompt {self.prompt_tokens}, cached {self.cached_tokens}, "
+            f"completion {self.completion_tokens}), "
             f"Cost={self.cost}, Calls={self.calls}"
         )

@@ -462,9 +461,9 @@ class LanguageModel(ABC):
         if type(config) is LLMConfig:
             raise ValueError(
                 """
-                Cannot create a Language Model object from LLMConfig.
-                Please specify a specific subclass of LLMConfig e.g.,
-                OpenAIGPTConfig. If you are creating a ChatAgent from
+                Cannot create a Language Model object from LLMConfig.
+                Please specify a specific subclass of LLMConfig e.g.,
+                OpenAIGPTConfig. If you are creating a ChatAgent from
                 a ChatAgentConfig, please specify the `llm` field of this config
                 as a specific subclass of LLMConfig, e.g., OpenAIGPTConfig.
                 """
@@ -666,8 +665,15 @@ class LanguageModel(ABC):
     def completion_context_length(self) -> int:
         return self.config.completion_context_length or DEFAULT_CONTEXT_LENGTH

-    def chat_cost(self) -> Tuple[float, float]:
-        return self.config.chat_cost_per_1k_tokens
+    def chat_cost(self) -> Tuple[float, float, float]:
+        """
+        Return the cost per 1000 tokens for chat completions.
+
+        Returns:
+            Tuple[float, float, float]: (input_cost, cached_cost, output_cost)
+                per 1000 tokens
+        """
+        return (0.0, 0.0, 0.0)

     def reset_usage_cost(self) -> None:
         for mdl in [self.config.chat_model, self.config.completion_model]:
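Note (illustration only, not part of the diff): the base-class default now returns a neutral (0.0, 0.0, 0.0) instead of reading the removed chat_cost_per_1k_tokens config field; concrete LLM subclasses are expected to override it, as OpenAIGPT does further down in this diff. A hypothetical subclass would only need to supply the three per-1K rates:

```python
from typing import Tuple

from langroid.language_models.base import LanguageModel


class MyLLM(LanguageModel):  # hypothetical subclass; other abstract methods omitted
    def chat_cost(self) -> Tuple[float, float, float]:
        # (input, cached, output) USD per 1K tokens -- made-up numbers
        return (0.001, 0.00025, 0.004)
```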
@@ -754,18 +760,18 @@ class LanguageModel(ABC):

         prompt = f"""
         You are an expert at understanding a CHAT HISTORY between an AI Assistant
-        and a User, and you are highly skilled in rephrasing the User's FOLLOW-UP
-        QUESTION/REQUEST as a STANDALONE QUESTION/REQUEST that can be understood
+        and a User, and you are highly skilled in rephrasing the User's FOLLOW-UP
+        QUESTION/REQUEST as a STANDALONE QUESTION/REQUEST that can be understood
         WITHOUT the context of the chat history.
-
-        Below is the CHAT HISTORY. When the User asks you to rephrase a
-        FOLLOW-UP QUESTION/REQUEST, your ONLY task is to simply return the
-        question REPHRASED as a STANDALONE QUESTION/REQUEST, without any additional
+
+        Below is the CHAT HISTORY. When the User asks you to rephrase a
+        FOLLOW-UP QUESTION/REQUEST, your ONLY task is to simply return the
+        question REPHRASED as a STANDALONE QUESTION/REQUEST, without any additional
         text or context.
-
+
         <CHAT_HISTORY>
         {history}
-        </CHAT_HISTORY>
+        </CHAT_HISTORY>
         """.strip()

         follow_up_question = f"""
langroid/language_models/client_cache.py ADDED
@@ -0,0 +1,255 @@
+"""
+Client caching/singleton pattern for LLM clients to prevent connection pool exhaustion.
+"""
+
+import atexit
+import hashlib
+import weakref
+from typing import Any, Dict, Optional, Union, cast
+
+from cerebras.cloud.sdk import AsyncCerebras, Cerebras
+from groq import AsyncGroq, Groq
+from httpx import Timeout
+from openai import AsyncOpenAI, OpenAI
+
+# Cache for client instances, keyed by hashed configuration parameters
+_client_cache: Dict[str, Any] = {}
+
+# Keep track of clients for cleanup
+_all_clients: weakref.WeakSet[Any] = weakref.WeakSet()
+
+
+def _get_cache_key(client_type: str, **kwargs: Any) -> str:
+    """
+    Generate a cache key from client type and configuration parameters.
+    Uses the same approach as OpenAIGPT._cache_lookup for consistency.
+
+    Args:
+        client_type: Type of client (e.g., "openai", "groq", "cerebras")
+        **kwargs: Configuration parameters (api_key, base_url, timeout, etc.)
+
+    Returns:
+        SHA256 hash of the configuration as a hex string
+    """
+    # Convert kwargs to sorted string representation
+    sorted_kwargs_str = str(sorted(kwargs.items()))
+
+    # Create raw key combining client type and sorted kwargs
+    raw_key = f"{client_type}:{sorted_kwargs_str}"
+
+    # Hash the key for consistent length and to handle complex objects
+    hashed_key = hashlib.sha256(raw_key.encode()).hexdigest()
+
+    return hashed_key
+
+
+def get_openai_client(
+    api_key: str,
+    base_url: Optional[str] = None,
+    organization: Optional[str] = None,
+    timeout: Union[float, Timeout] = 120.0,
+    default_headers: Optional[Dict[str, str]] = None,
+) -> OpenAI:
+    """
+    Get or create a singleton OpenAI client with the given configuration.
+
+    Args:
+        api_key: OpenAI API key
+        base_url: Optional base URL for API
+        organization: Optional organization ID
+        timeout: Request timeout
+        default_headers: Optional default headers
+
+    Returns:
+        OpenAI client instance
+    """
+    if isinstance(timeout, (int, float)):
+        timeout = Timeout(timeout)
+
+    cache_key = _get_cache_key(
+        "openai",
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    if cache_key in _client_cache:
+        return cast(OpenAI, _client_cache[cache_key])
+
+    client = OpenAI(
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_openai_client(
+    api_key: str,
+    base_url: Optional[str] = None,
+    organization: Optional[str] = None,
+    timeout: Union[float, Timeout] = 120.0,
+    default_headers: Optional[Dict[str, str]] = None,
+) -> AsyncOpenAI:
+    """
+    Get or create a singleton AsyncOpenAI client with the given configuration.
+
+    Args:
+        api_key: OpenAI API key
+        base_url: Optional base URL for API
+        organization: Optional organization ID
+        timeout: Request timeout
+        default_headers: Optional default headers
+
+    Returns:
+        AsyncOpenAI client instance
+    """
+    if isinstance(timeout, (int, float)):
+        timeout = Timeout(timeout)
+
+    cache_key = _get_cache_key(
+        "async_openai",
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    if cache_key in _client_cache:
+        return cast(AsyncOpenAI, _client_cache[cache_key])
+
+    client = AsyncOpenAI(
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_groq_client(api_key: str) -> Groq:
+    """
+    Get or create a singleton Groq client with the given configuration.
+
+    Args:
+        api_key: Groq API key
+
+    Returns:
+        Groq client instance
+    """
+    cache_key = _get_cache_key("groq", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(Groq, _client_cache[cache_key])
+
+    client = Groq(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_groq_client(api_key: str) -> AsyncGroq:
+    """
+    Get or create a singleton AsyncGroq client with the given configuration.
+
+    Args:
+        api_key: Groq API key
+
+    Returns:
+        AsyncGroq client instance
+    """
+    cache_key = _get_cache_key("async_groq", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(AsyncGroq, _client_cache[cache_key])
+
+    client = AsyncGroq(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_cerebras_client(api_key: str) -> Cerebras:
+    """
+    Get or create a singleton Cerebras client with the given configuration.
+
+    Args:
+        api_key: Cerebras API key
+
+    Returns:
+        Cerebras client instance
+    """
+    cache_key = _get_cache_key("cerebras", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(Cerebras, _client_cache[cache_key])
+
+    client = Cerebras(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_cerebras_client(api_key: str) -> AsyncCerebras:
+    """
+    Get or create a singleton AsyncCerebras client with the given configuration.
+
+    Args:
+        api_key: Cerebras API key
+
+    Returns:
+        AsyncCerebras client instance
+    """
+    cache_key = _get_cache_key("async_cerebras", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(AsyncCerebras, _client_cache[cache_key])
+
+    client = AsyncCerebras(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def _cleanup_clients() -> None:
+    """
+    Cleanup function to close all cached clients on exit.
+    Called automatically via atexit.
+    """
+    import inspect
+
+    for client in list(_all_clients):
+        if hasattr(client, "close") and callable(client.close):
+            try:
+                # Check if close is a coroutine function (async)
+                if inspect.iscoroutinefunction(client.close):
+                    # For async clients, we can't await in atexit
+                    # They will be cleaned up by the OS
+                    pass
+                else:
+                    # Sync clients can be closed directly
+                    client.close()
+            except Exception:
+                pass  # Ignore errors during cleanup
+
+
+# Register cleanup function to run on exit
+atexit.register(_cleanup_clients)
+
+
+# For testing purposes
+def _clear_cache() -> None:
+    """Clear the client cache. Only for testing."""
+    _client_cache.clear()
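Rough usage sketch (not part of the package; the key and timeouts are placeholders): calls with identical configuration return the same client object, so many agents share one connection pool instead of exhausting sockets:

```python
from langroid.language_models.client_cache import _clear_cache, get_openai_client

c1 = get_openai_client(api_key="sk-...", timeout=60.0)
c2 = get_openai_client(api_key="sk-...", timeout=60.0)
assert c1 is c2        # identical config -> same cached instance

c3 = get_openai_client(api_key="sk-...", timeout=30.0)
assert c3 is not c1    # different timeout -> different cache key

_clear_cache()         # test-only helper defined above
```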
langroid/language_models/model_info.py CHANGED
@@ -69,7 +69,9 @@ class GeminiModel(ModelName):
     GEMINI_1_5_FLASH = "gemini-1.5-flash"
     GEMINI_1_5_FLASH_8B = "gemini-1.5-flash-8b"
     GEMINI_1_5_PRO = "gemini-1.5-pro"
-    GEMINI_2_5_PRO = "gemini-2.5-pro-exp-02-05"
+    GEMINI_2_5_PRO = "gemini-2.5-pro"
+    GEMINI_2_5_FLASH = "gemini-2.5-flash"
+    GEMINI_2_5_FLASH_LITE_PREVIEW = "gemini-2.5-flash-lite-preview-06-17"
     GEMINI_2_PRO = "gemini-2.0-pro-exp-02-05"
     GEMINI_2_FLASH = "gemini-2.0-flash"
     GEMINI_2_FLASH_LITE = "gemini-2.0-flash-lite-preview"
@@ -108,6 +110,7 @@ class ModelInfo(BaseModel):
     max_cot_tokens: int = 0  # max chain of thought (thinking) tokens where applicable
     max_output_tokens: int = 8192  # Maximum number of output tokens - model dependent
     input_cost_per_million: float = 0.0  # Cost in USD per million input tokens
+    cached_cost_per_million: float = 0.0  # Cost in USD per million cached tokens
     output_cost_per_million: float = 0.0  # Cost in USD per million output tokens
     allows_streaming: bool = True  # Whether model supports streaming output
     allows_system_message: bool = True  # Whether model supports system messages
@@ -173,6 +176,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
         output_cost_per_million=0.40,
         description="GPT-4.1",
     ),
@@ -182,6 +186,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=0.40,
+        cached_cost_per_million=0.10,
         output_cost_per_million=1.60,
         description="GPT-4.1 Mini",
     ),
@@ -191,6 +196,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=2.00,
+        cached_cost_per_million=0.50,
         output_cost_per_million=8.00,
         description="GPT-4.1",
     ),
@@ -200,6 +206,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=16_384,
         input_cost_per_million=2.5,
+        cached_cost_per_million=1.25,
         output_cost_per_million=10.0,
         has_structured_output=True,
         description="GPT-4o (128K context)",
@@ -210,6 +217,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=16_384,
         input_cost_per_million=0.15,
+        cached_cost_per_million=0.075,
         output_cost_per_million=0.60,
         has_structured_output=True,
         description="GPT-4o Mini",
@@ -220,6 +228,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=15.0,
+        cached_cost_per_million=7.50,
         output_cost_per_million=60.0,
         allows_streaming=True,
         allows_system_message=False,
@@ -233,8 +242,9 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         provider=ModelProvider.OPENAI,
         context_length=200_000,
         max_output_tokens=100_000,
-        input_cost_per_million=10.0,
-        output_cost_per_million=40.0,
+        input_cost_per_million=2.0,
+        cached_cost_per_million=0.50,
+        output_cost_per_million=8.0,
         allows_streaming=True,
         allows_system_message=False,
         unsupported_params=["temperature"],
@@ -248,6 +258,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=65_536,
         input_cost_per_million=1.1,
+        cached_cost_per_million=0.55,
         output_cost_per_million=4.4,
         allows_streaming=False,
         allows_system_message=False,
@@ -262,6 +273,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=1.1,
+        cached_cost_per_million=0.55,
         output_cost_per_million=4.4,
         allows_streaming=False,
         allows_system_message=False,
@@ -276,6 +288,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=1.10,
+        cached_cost_per_million=0.275,
         output_cost_per_million=4.40,
         allows_streaming=False,
         allows_system_message=False,
@@ -291,6 +304,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=8192,
         input_cost_per_million=3.0,
+        cached_cost_per_million=0.30,
         output_cost_per_million=15.0,
         description="Claude 3.5 Sonnet",
     ),
@@ -300,6 +314,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=15.0,
+        cached_cost_per_million=1.50,
         output_cost_per_million=75.0,
         description="Claude 3 Opus",
     ),
@@ -309,6 +324,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=3.0,
+        cached_cost_per_million=0.30,
         output_cost_per_million=15.0,
         description="Claude 3 Sonnet",
     ),
@@ -318,6 +334,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=0.25,
+        cached_cost_per_million=0.03,
         output_cost_per_million=1.25,
         description="Claude 3 Haiku",
     ),
@@ -328,6 +345,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=64_000,
         max_output_tokens=8_000,
         input_cost_per_million=0.27,
+        cached_cost_per_million=0.07,
         output_cost_per_million=1.10,
         description="DeepSeek Chat",
     ),
@@ -337,6 +355,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=64_000,
         max_output_tokens=8_000,
         input_cost_per_million=0.55,
+        cached_cost_per_million=0.14,
         output_cost_per_million=2.19,
         description="DeepSeek-R1 Reasoning LM",
     ),
@@ -347,6 +366,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_056_768,
         max_output_tokens=8192,
         input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
         output_cost_per_million=0.40,
         rename_params={"max_tokens": "max_completion_tokens"},
         description="Gemini 2.0 Flash",
@@ -401,6 +421,40 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         rename_params={"max_tokens": "max_completion_tokens"},
         description="Gemini 2.0 Flash Thinking",
     ),
+    # Gemini 2.5 Models
+    GeminiModel.GEMINI_2_5_PRO.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_PRO.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=1_048_576,
+        max_output_tokens=65_536,
+        input_cost_per_million=1.25,
+        cached_cost_per_million=0.31,
+        output_cost_per_million=10.0,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Pro",
+    ),
+    GeminiModel.GEMINI_2_5_FLASH.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_FLASH.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=1_048_576,
+        max_output_tokens=65_536,
+        input_cost_per_million=0.30,
+        cached_cost_per_million=0.075,
+        output_cost_per_million=2.50,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Flash",
+    ),
+    GeminiModel.GEMINI_2_5_FLASH_LITE_PREVIEW.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_FLASH_LITE_PREVIEW.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=65_536,
+        max_output_tokens=65_536,
+        input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
+        output_cost_per_million=0.40,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Flash Lite Preview",
+    ),
 }

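Illustration (not part of the diff): once these entries exist, the new cached rate is read from MODEL_INFO like any other field; OpenAIGPT.chat_cost(), further down, converts the per-million figures to the per-1K tuple:

```python
from langroid.language_models.model_info import MODEL_INFO, GeminiModel

info = MODEL_INFO[GeminiModel.GEMINI_2_5_FLASH.value]
price_per_1k = (
    info.input_cost_per_million / 1000,   # 0.0003
    info.cached_cost_per_million / 1000,  # 0.000075
    info.output_cost_per_million / 1000,  # 0.0025
)
print(price_per_1k)
```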
 
langroid/language_models/openai_gpt.py CHANGED
@@ -45,6 +45,14 @@ from langroid.language_models.base import (
     StreamEventType,
     ToolChoiceTypes,
 )
+from langroid.language_models.client_cache import (
+    get_async_cerebras_client,
+    get_async_groq_client,
+    get_async_openai_client,
+    get_cerebras_client,
+    get_groq_client,
+    get_openai_client,
+)
 from langroid.language_models.config import HFPromptFormatterConfig
 from langroid.language_models.model_info import (
     DeepSeekModel,
@@ -256,6 +264,9 @@ class OpenAIGPTConfig(LLMConfig):
     temperature: float = 0.2
     seed: int | None = 42
     params: OpenAICallParams | None = None
+    use_cached_client: bool = (
+        True  # Whether to reuse cached clients (prevents resource exhaustion)
+    )
     # these can be any model name that is served at an OpenAI-compatible API end point
     chat_model: str = default_openai_chat_model
     chat_model_orig: str = default_openai_chat_model
@@ -529,24 +540,26 @@ class OpenAIGPT(LanguageModel):
             self.config.chat_model = self.config.chat_model.replace("groq/", "")
             if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("GROQ_API_KEY", DUMMY_API_KEY)
-            self.client = Groq(
-                api_key=self.api_key,
-            )
-            self.async_client = AsyncGroq(
-                api_key=self.api_key,
-            )
+            if self.config.use_cached_client:
+                self.client = get_groq_client(api_key=self.api_key)
+                self.async_client = get_async_groq_client(api_key=self.api_key)
+            else:
+                # Create new clients without caching
+                self.client = Groq(api_key=self.api_key)
+                self.async_client = AsyncGroq(api_key=self.api_key)
         elif self.is_cerebras:
             # use cerebras-specific client
             self.config.chat_model = self.config.chat_model.replace("cerebras/", "")
             if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("CEREBRAS_API_KEY", DUMMY_API_KEY)
-            self.client = Cerebras(
-                api_key=self.api_key,
-            )
-            # TODO there is not async client, so should we do anything here?
-            self.async_client = AsyncCerebras(
-                api_key=self.api_key,
-            )
+            if self.config.use_cached_client:
+                self.client = get_cerebras_client(api_key=self.api_key)
+                # TODO there is not async client, so should we do anything here?
+                self.async_client = get_async_cerebras_client(api_key=self.api_key)
+            else:
+                # Create new clients without caching
+                self.client = Cerebras(api_key=self.api_key)
+                self.async_client = AsyncCerebras(api_key=self.api_key)
         else:
             # in these cases, there's no specific client: OpenAI python client suffices
             if self.is_litellm_proxy:
@@ -618,20 +631,37 @@ class OpenAIGPT(LanguageModel):
                 # Add Portkey-specific headers
                 self.config.headers.update(self.config.portkey_params.get_headers())

-            self.client = OpenAI(
-                api_key=self.api_key,
-                base_url=self.api_base,
-                organization=self.config.organization,
-                timeout=Timeout(self.config.timeout),
-                default_headers=self.config.headers,
-            )
-            self.async_client = AsyncOpenAI(
-                api_key=self.api_key,
-                organization=self.config.organization,
-                base_url=self.api_base,
-                timeout=Timeout(self.config.timeout),
-                default_headers=self.config.headers,
-            )
+            if self.config.use_cached_client:
+                self.client = get_openai_client(
+                    api_key=self.api_key,
+                    base_url=self.api_base,
+                    organization=self.config.organization,
+                    timeout=Timeout(self.config.timeout),
+                    default_headers=self.config.headers,
+                )
+                self.async_client = get_async_openai_client(
+                    api_key=self.api_key,
+                    base_url=self.api_base,
+                    organization=self.config.organization,
+                    timeout=Timeout(self.config.timeout),
+                    default_headers=self.config.headers,
+                )
+            else:
+                # Create new clients without caching
+                self.client = OpenAI(
+                    api_key=self.api_key,
+                    base_url=self.api_base,
+                    organization=self.config.organization,
+                    timeout=Timeout(self.config.timeout),
+                    default_headers=self.config.headers,
+                )
+                self.async_client = AsyncOpenAI(
+                    api_key=self.api_key,
+                    base_url=self.api_base,
+                    organization=self.config.organization,
+                    timeout=Timeout(self.config.timeout),
+                    default_headers=self.config.headers,
+                )

         self.cache: CacheDB | None = None
         use_cache = self.config.cache_config is not None
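Sketch (not part of the diff; model name and settings are arbitrary): use_cached_client defaults to True, so two OpenAIGPT instances built with the same key, base URL, organization, timeout and headers are expected to share one underlying client; set it to False to restore the old per-instance clients:

```python
from langroid.language_models.openai_gpt import OpenAIGPT, OpenAIGPTConfig

llm_a = OpenAIGPT(OpenAIGPTConfig(chat_model="gpt-4o-mini"))
llm_b = OpenAIGPT(OpenAIGPTConfig(chat_model="gpt-4o-mini"))
# Expected with identical settings: both wrap the same cached OpenAI client.
assert llm_a.client is llm_b.client

llm_c = OpenAIGPT(OpenAIGPTConfig(chat_model="gpt-4o-mini", use_cached_client=False))
assert llm_c.client is not llm_a.client  # dedicated, uncached client
```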
@@ -736,14 +766,21 @@ class OpenAIGPT(LanguageModel):
             or self.completion_info().context_length
         )

-    def chat_cost(self) -> Tuple[float, float]:
+    def chat_cost(self) -> Tuple[float, float, float]:
         """
-        (Prompt, Generation) cost per 1000 tokens, for chat-completion
+        (Prompt, Cached, Generation) cost per 1000 tokens, for chat-completion
         models/endpoints.
         Get it from the dict, otherwise fail-over to general method
         """
         info = self.info()
-        return (info.input_cost_per_million / 1000, info.output_cost_per_million / 1000)
+        cached_cost_per_million = info.cached_cost_per_million
+        if not cached_cost_per_million:
+            cached_cost_per_million = info.input_cost_per_million
+        return (
+            info.input_cost_per_million / 1000,
+            cached_cost_per_million / 1000,
+            info.output_cost_per_million / 1000,
+        )

     def set_stream(self, stream: bool) -> bool:
         """Enable or disable streaming output from API.
@@ -1399,6 +1436,16 @@ class OpenAIGPT(LanguageModel):
         # and the reasoning may be included in the message content
         # within delimiters like <think> ... </think>
         reasoning, completion = self.get_reasoning_final(completion)
+
+        prompt_tokens = usage.get("prompt_tokens", 0)
+        prompt_tokens_details: Any = usage.get("prompt_tokens_details", {})
+        cached_tokens = (
+            prompt_tokens_details.get("cached_tokens", 0)
+            if isinstance(prompt_tokens_details, dict)
+            else 0
+        )
+        completion_tokens = usage.get("completion_tokens", 0)
+
         return (
             LLMResponse(
                 message=completion,
@@ -1408,11 +1455,13 @@ class OpenAIGPT(LanguageModel):
                 oai_tool_calls=tool_calls or None if len(tool_deltas) > 0 else None,
                 function_call=function_call if has_function else None,
                 usage=LLMTokenUsage(
-                    prompt_tokens=usage.get("prompt_tokens", 0),
-                    completion_tokens=usage.get("completion_tokens", 0),
+                    prompt_tokens=prompt_tokens,
+                    cached_tokens=cached_tokens,
+                    completion_tokens=completion_tokens,
                     cost=self._cost_chat_model(
-                        usage.get("prompt_tokens", 0),
-                        usage.get("completion_tokens", 0),
+                        prompt_tokens,
+                        cached_tokens,
+                        completion_tokens,
                     ),
                 ),
             ),
@@ -1449,9 +1498,11 @@ class OpenAIGPT(LanguageModel):
             return hashed_key, None
         return hashed_key, cached_val

-    def _cost_chat_model(self, prompt: int, completion: int) -> float:
+    def _cost_chat_model(self, prompt: int, cached: int, completion: int) -> float:
         price = self.chat_cost()
-        return (price[0] * prompt + price[1] * completion) / 1000
+        return (
+            price[0] * (prompt - cached) + price[1] * cached + price[2] * completion
+        ) / 1000

     def _get_non_stream_token_usage(
         self, cached: bool, response: Dict[str, Any]
@@ -1469,14 +1520,24 @@ class OpenAIGPT(LanguageModel):
         """
         cost = 0.0
         prompt_tokens = 0
+        cached_tokens = 0
         completion_tokens = 0
-        if not cached and not self.get_stream() and response["usage"] is not None:
-            prompt_tokens = response["usage"]["prompt_tokens"] or 0
-            completion_tokens = response["usage"]["completion_tokens"] or 0
-            cost = self._cost_chat_model(prompt_tokens, completion_tokens)
+
+        usage = response.get("usage")
+        if not cached and not self.get_stream() and usage is not None:
+            prompt_tokens = usage.get("prompt_tokens") or 0
+            prompt_tokens_details = usage.get("prompt_tokens_details", {})
+            cached_tokens = prompt_tokens_details.get("cached_tokens") or 0
+            completion_tokens = usage.get("completion_tokens") or 0
+            cost = self._cost_chat_model(
+                prompt_tokens, cached_tokens, completion_tokens
+            )

         return LLMTokenUsage(
-            prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, cost=cost
+            prompt_tokens=prompt_tokens,
+            cached_tokens=cached_tokens,
+            completion_tokens=completion_tokens,
+            cost=cost,
         )

     def generate(self, prompt: str, max_tokens: int = 200) -> LLMResponse:
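For reference (illustrative values, not from the diff): this is the OpenAI-style usage payload shape the parsing above expects; cached_tokens lives under prompt_tokens_details:

```python
usage = {
    "prompt_tokens": 1200,
    "completion_tokens": 300,
    "prompt_tokens_details": {"cached_tokens": 1024},
}

prompt_tokens = usage.get("prompt_tokens") or 0
prompt_tokens_details = usage.get("prompt_tokens_details", {})
cached_tokens = prompt_tokens_details.get("cached_tokens") or 0
completion_tokens = usage.get("completion_tokens") or 0
print(prompt_tokens, cached_tokens, completion_tokens)  # 1200 1024 300
```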
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.56.10
+Version: 0.56.12
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -3,9 +3,9 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=HIcYAqGeA9OK0Hlscym2FI5Oax9QFljDZoVgRlomhRk,4014
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=GVE_vdtDUJpldACH4LQwjqbQ11UDn9thr2-uBXk0RjU,86009
+langroid/agent/base.py,sha256=S71qbnNdpEeR33RlUPEyxHNuNkj9LQ7iNAZk7lcAYNk,86078
 langroid/agent/batch.py,sha256=wpE9RqCNDVDhAXkCB7wEqfCIEAi6qKcrhaZ-Zr9T4C0,21375
-langroid/agent/chat_agent.py,sha256=2HIYzYxkrGkRIS97ioKfIqjaW3RbX89M39LjzBobBEY,88381
+langroid/agent/chat_agent.py,sha256=pBnLGlAA6d2MK_1qa4GyhFZHnDf_RrUDli7__PKRRz4,88956
 langroid/agent/chat_document.py,sha256=0e6zYkqIorMIVbCsxOul9ziwAPPOWDsBsRV9E8ux-WI,18055
 langroid/agent/done_sequence_parser.py,sha256=oUPzQCkkAo-5qos3ndSV47Lre7O_LoGWwTybjE9sCwc,4381
 langroid/agent/openai_assistant.py,sha256=JkAcs02bIrgPNVvUWVR06VCthc5-ulla2QMBzux_q6o,34340
@@ -73,11 +73,12 @@ langroid/embedding_models/protoc/embeddings_pb2.pyi,sha256=UkNy7BrNsmQm0vLb3NtGX
 langroid/embedding_models/protoc/embeddings_pb2_grpc.py,sha256=9dYQqkW3JPyBpSEjeGXTNpSqAkC-6FPtBHyteVob2Y8,2452
 langroid/language_models/__init__.py,sha256=3aD2qC1lz8v12HX4B-dilv27gNxYdGdeu1QvDlkqqHs,1095
 langroid/language_models/azure_openai.py,sha256=SW0Fp_y6HpERr9l6TtF6CYsKgKwjUf_hSL_2mhTV4wI,5034
-langroid/language_models/base.py,sha256=OlPgmhQS2o3Y5DLoO1IEBUp0kIOeQdYsZsd25sz7DY8,28485
+langroid/language_models/base.py,sha256=r0MckcZGmuv_opKR2xvjzOz94mmWCzn9LJKgqyBjJ7c,28559
+langroid/language_models/client_cache.py,sha256=YtGcpalYkS_ckMU12J7VmUOGmVv1wzuLUBxgIagcpmA,6896
 langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
 langroid/language_models/mock_lm.py,sha256=tA9JpURznsMZ59iRhFYMmaYQzAc0D0BT-PiJIV58sAk,4079
-langroid/language_models/model_info.py,sha256=0e011vJZMi7XU9OkKT6doxlybrNJfMlP54klLDDNgFg,14939
-langroid/language_models/openai_gpt.py,sha256=Xyg2VHGmA3VgPIS5ppLZeeU2Aai0qMKF9ia-oIjqRNM,86616
+langroid/language_models/model_info.py,sha256=LzRfZsWmOm7WF6KGJfcN0aVdRqk0URNuDGMMz6cFt50,17121
+langroid/language_models/openai_gpt.py,sha256=FFiJa9_j_bTiA8SzBv7xssuc7LGxT_TI7Pcg8XLJnzE,89230
 langroid/language_models/provider_params.py,sha256=fX25NAmYUIc1-nliMKpmTGZO6D6RpyTXtSDdZCZdb5w,5464
 langroid/language_models/utils.py,sha256=n55Oe2_V_4VNGhytvPWLYC-0tFS07RTjN83KWl-p_MI,6032
 langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
@@ -137,7 +138,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=ZYrT9mxoUCx_67Qzb5xnkWuFG12rfe30yAg4NgG2ueA,19168
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.56.10.dist-info/METADATA,sha256=QYPsEwh24uWqM4OwExH1tSmsWlnj-cyFRkTAXd2Rl64,65745
-langroid-0.56.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.56.10.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.56.10.dist-info/RECORD,,
+langroid-0.56.12.dist-info/METADATA,sha256=4GwWmlU70uCD1SbRHgLjgDeSUJdCHf4f0VY_5I0yXwE,65745
+langroid-0.56.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.56.12.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.56.12.dist-info/RECORD,,