langroid 0.56.10__tar.gz → 0.56.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.56.10 → langroid-0.56.12}/PKG-INFO +1 -1
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/base.py +5 -3
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/chat_agent.py +12 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/base.py +25 -19
- langroid-0.56.12/langroid/language_models/client_cache.py +255 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/model_info.py +57 -3
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/openai_gpt.py +102 -41
- {langroid-0.56.10 → langroid-0.56.12}/pyproject.toml +1 -1
- {langroid-0.56.10 → langroid-0.56.12}/.gitignore +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/LICENSE +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/README.md +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/batch.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/callbacks/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/callbacks/chainlit.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/chat_document.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/done_sequence_parser.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/openai_assistant.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/arangodb/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/arangodb/system_messages.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/arangodb/tools.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/arangodb/utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/doc_chat_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/doc_chat_task.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_rag/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/lance_tools.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/neo4j/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/neo4j/system_messages.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/neo4j/tools.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/relevance_extractor_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/retriever_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/utils/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/utils/system_message.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/sql/utils/tools.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/special/table_chat_agent.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/task.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tool_message.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/exa_search_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/file_tools.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/google_search_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/mcp/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/mcp/decorators.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/mcp/fastmcp_client.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/metaphor_search_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/orchestration.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/recipient_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/retrieval_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/rewind_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/segment_extract_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/task_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/tools/tavily_search_tool.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/agent/xml_tool_message.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/cachedb/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/cachedb/base.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/cachedb/redis_cachedb.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/base.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/models.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/protoc/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/protoc/embeddings.proto +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/embedding_models/remote_embeds.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/exceptions.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/azure_openai.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/config.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/mock_lm.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/prompt_formatter/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/prompt_formatter/base.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/provider_params.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/language_models/utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/mcp/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/mcp/server/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/mytypes.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/agent_chats.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/code_parser.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/document_parser.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/file_attachment.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/md_parser.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/para_sentence_split.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/parse_json.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/parser.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/pdf_utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/repo_loader.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/routing.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/search.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/spider.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/table_loader.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/url_loader.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/urls.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/parsing/web_search.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/prompts/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/prompts/dialog.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/prompts/prompts_config.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/prompts/templates.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/py.typed +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/pydantic_v1/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/pydantic_v1/main.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/algorithms/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/algorithms/graph.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/configuration.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/constants.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/git_utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/globals.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/logging.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/object_registry.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/output/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/output/citations.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/output/printing.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/output/status.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/pandas_utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/pydantic_utils.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/system.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/utils/types.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/__init__.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/base.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/chromadb.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/lancedb.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/meilisearch.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/pineconedb.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/postgres.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/qdrantdb.py +0 -0
- {langroid-0.56.10 → langroid-0.56.12}/langroid/vector_store/weaviatedb.py +0 -0
langroid/agent/base.py

@@ -2142,7 +2142,7 @@ class Agent(ABC):
         completion_tokens = self.num_tokens(response.message)
         if response.function_call is not None:
             completion_tokens += self.num_tokens(str(response.function_call))
-        cost = self.compute_token_cost(prompt_tokens, completion_tokens)
+        cost = self.compute_token_cost(prompt_tokens, 0, completion_tokens)
         response.usage = LLMTokenUsage(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,

@@ -2166,9 +2166,11 @@ class Agent(ABC):
         if print_response_stats:
             print(self.indent + self.token_stats_str)

-    def compute_token_cost(self, prompt: int, completion: int) -> float:
+    def compute_token_cost(self, prompt: int, cached: int, completion: int) -> float:
         price = cast(LanguageModel, self.llm).chat_cost()
-        return (price[0] * prompt + price[1] * completion) / 1000
+        return (
+            price[0] * (prompt - cached) + price[1] * cached + price[2] * completion
+        ) / 1000

     def ask_agent(
         self,
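Note on compute_token_cost: the new signature bills cached prompt tokens at the middle rate of the three-tuple returned by chat_cost(). A minimal worked example of the arithmetic, with illustrative per-1k prices (GPT-4o's per-million rates from model_info.py divided by 1000):

    # price = (input, cached, output) in USD per 1k tokens -- illustrative values
    price = (0.0025, 0.00125, 0.01)
    prompt, cached, completion = 1200, 800, 300
    cost = (
        price[0] * (prompt - cached)  # 400 uncached prompt tokens at full rate
        + price[1] * cached           # 800 cached prompt tokens at the discount
        + price[2] * completion       # 300 completion tokens
    ) / 1000
    # cost == 0.005 USD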
langroid/agent/chat_agent.py

@@ -2068,3 +2068,15 @@ class ChatAgent(Agent):
             return str(self.message_history[i])
         else:
             return "\n".join([str(m) for m in self.message_history[i:]])
+
+    def __del__(self) -> None:
+        """
+        Cleanup method called when the ChatAgent is garbage collected.
+        Note: We don't close LLM clients here because they may be shared
+        across multiple agents when client caching is enabled.
+        The clients are managed centrally and cleaned up via atexit hooks.
+        """
+        # Previously we closed clients here, but this caused issues when
+        # multiple agents shared the same cached client instance.
+        # Clients are now managed centrally in langroid.language_models.client_cache
+        pass
langroid/language_models/base.py

@@ -91,10 +91,6 @@ class LLMConfig(BaseSettings):
     # reasoning output from reasoning models
     cache_config: None | CacheDBConfig = RedisCacheConfig()
     thought_delimiters: Tuple[str, str] = ("<think>", "</think>")
-
-    # Dict of model -> (input/prompt cost, output/completion cost)
-    chat_cost_per_1k_tokens: Tuple[float, float] = (0.0, 0.0)
-    completion_cost_per_1k_tokens: Tuple[float, float] = (0.0, 0.0)
     retry_params: RetryParams = RetryParams()

     @property

@@ -131,7 +127,7 @@ class LLMFunctionCall(BaseModel):
         if not isinstance(dict_or_list, dict):
             raise ValueError(
                 f"""
-                Invalid function args: {fun_args_str}
+                Invalid function args: {fun_args_str}
                 parsed as {dict_or_list},
                 which is not a valid dict.
                 """

@@ -224,12 +220,14 @@ class LLMTokenUsage(BaseModel):
     """

     prompt_tokens: int = 0
+    cached_tokens: int = 0
     completion_tokens: int = 0
     cost: float = 0.0
     calls: int = 0  # how many API calls - not used as of 2025-04-04

     def reset(self) -> None:
         self.prompt_tokens = 0
+        self.cached_tokens = 0
         self.completion_tokens = 0
         self.cost = 0.0
         self.calls = 0
@@ -237,7 +235,8 @@ class LLMTokenUsage(BaseModel):
     def __str__(self) -> str:
         return (
             f"Tokens = "
-            f"(prompt {self.prompt_tokens}, completion {self.completion_tokens}), "
+            f"(prompt {self.prompt_tokens}, cached {self.cached_tokens}, "
+            f"completion {self.completion_tokens}), "
             f"Cost={self.cost}, Calls={self.calls}"
         )
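With cached_tokens added to the stats line, a usage object holding the illustrative values prompt=1200, cached=800, completion=300 would now render as:

    Tokens = (prompt 1200, cached 800, completion 300), Cost=0.005, Calls=1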
@@ -462,9 +461,9 @@ class LanguageModel(ABC):
         if type(config) is LLMConfig:
             raise ValueError(
                 """
-                Cannot create a Language Model object from LLMConfig.
-                Please specify a specific subclass of LLMConfig e.g.,
-                OpenAIGPTConfig. If you are creating a ChatAgent from
+                Cannot create a Language Model object from LLMConfig.
+                Please specify a specific subclass of LLMConfig e.g.,
+                OpenAIGPTConfig. If you are creating a ChatAgent from
                 a ChatAgentConfig, please specify the `llm` field of this config
                 as a specific subclass of LLMConfig, e.g., OpenAIGPTConfig.
                 """
@@ -666,8 +665,15 @@ class LanguageModel(ABC):
     def completion_context_length(self) -> int:
         return self.config.completion_context_length or DEFAULT_CONTEXT_LENGTH

-    def chat_cost(self) -> Tuple[float, float]:
-        return self.config.chat_cost_per_1k_tokens
+    def chat_cost(self) -> Tuple[float, float, float]:
+        """
+        Return the cost per 1000 tokens for chat completions.
+
+        Returns:
+            Tuple[float, float, float]: (input_cost, cached_cost, output_cost)
+                per 1000 tokens
+        """
+        return (0.0, 0.0, 0.0)

     def reset_usage_cost(self) -> None:
         for mdl in [self.config.chat_model, self.config.completion_model]:
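The base implementation now returns a zero three-tuple, to be overridden by concrete models. A hedged sketch of such an override, deriving per-1k prices from the per-million fields in ModelInfo (the info() accessor name is an assumption here, not necessarily the package's exact API):

    from typing import Tuple

    def chat_cost(self) -> Tuple[float, float, float]:
        # ModelInfo stores USD per million tokens; compute_token_cost
        # expects USD per 1k tokens, hence the /1000 conversion.
        info = self.info()  # assumed accessor returning a ModelInfo
        return (
            info.input_cost_per_million / 1000,
            info.cached_cost_per_million / 1000,
            info.output_cost_per_million / 1000,
        )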
@@ -754,18 +760,18 @@ class LanguageModel(ABC):

         prompt = f"""
         You are an expert at understanding a CHAT HISTORY between an AI Assistant
-        and a User, and you are highly skilled in rephrasing the User's FOLLOW-UP
-        QUESTION/REQUEST as a STANDALONE QUESTION/REQUEST that can be understood
+        and a User, and you are highly skilled in rephrasing the User's FOLLOW-UP
+        QUESTION/REQUEST as a STANDALONE QUESTION/REQUEST that can be understood
         WITHOUT the context of the chat history.
-
-        Below is the CHAT HISTORY. When the User asks you to rephrase a
-        FOLLOW-UP QUESTION/REQUEST, your ONLY task is to simply return the
-        question REPHRASED as a STANDALONE QUESTION/REQUEST, without any additional
+
+        Below is the CHAT HISTORY. When the User asks you to rephrase a
+        FOLLOW-UP QUESTION/REQUEST, your ONLY task is to simply return the
+        question REPHRASED as a STANDALONE QUESTION/REQUEST, without any additional
         text or context.
-
+
         <CHAT_HISTORY>
         {history}
-        </CHAT_HISTORY>
+        </CHAT_HISTORY>
         """.strip()

         follow_up_question = f"""
langroid/language_models/client_cache.py (new file)

@@ -0,0 +1,255 @@
+"""
+Client caching/singleton pattern for LLM clients to prevent connection pool exhaustion.
+"""
+
+import atexit
+import hashlib
+import weakref
+from typing import Any, Dict, Optional, Union, cast
+
+from cerebras.cloud.sdk import AsyncCerebras, Cerebras
+from groq import AsyncGroq, Groq
+from httpx import Timeout
+from openai import AsyncOpenAI, OpenAI
+
+# Cache for client instances, keyed by hashed configuration parameters
+_client_cache: Dict[str, Any] = {}
+
+# Keep track of clients for cleanup
+_all_clients: weakref.WeakSet[Any] = weakref.WeakSet()
+
+
+def _get_cache_key(client_type: str, **kwargs: Any) -> str:
+    """
+    Generate a cache key from client type and configuration parameters.
+    Uses the same approach as OpenAIGPT._cache_lookup for consistency.
+
+    Args:
+        client_type: Type of client (e.g., "openai", "groq", "cerebras")
+        **kwargs: Configuration parameters (api_key, base_url, timeout, etc.)
+
+    Returns:
+        SHA256 hash of the configuration as a hex string
+    """
+    # Convert kwargs to sorted string representation
+    sorted_kwargs_str = str(sorted(kwargs.items()))
+
+    # Create raw key combining client type and sorted kwargs
+    raw_key = f"{client_type}:{sorted_kwargs_str}"
+
+    # Hash the key for consistent length and to handle complex objects
+    hashed_key = hashlib.sha256(raw_key.encode()).hexdigest()
+
+    return hashed_key
+
+
+def get_openai_client(
+    api_key: str,
+    base_url: Optional[str] = None,
+    organization: Optional[str] = None,
+    timeout: Union[float, Timeout] = 120.0,
+    default_headers: Optional[Dict[str, str]] = None,
+) -> OpenAI:
+    """
+    Get or create a singleton OpenAI client with the given configuration.
+
+    Args:
+        api_key: OpenAI API key
+        base_url: Optional base URL for API
+        organization: Optional organization ID
+        timeout: Request timeout
+        default_headers: Optional default headers
+
+    Returns:
+        OpenAI client instance
+    """
+    if isinstance(timeout, (int, float)):
+        timeout = Timeout(timeout)
+
+    cache_key = _get_cache_key(
+        "openai",
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    if cache_key in _client_cache:
+        return cast(OpenAI, _client_cache[cache_key])
+
+    client = OpenAI(
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_openai_client(
+    api_key: str,
+    base_url: Optional[str] = None,
+    organization: Optional[str] = None,
+    timeout: Union[float, Timeout] = 120.0,
+    default_headers: Optional[Dict[str, str]] = None,
+) -> AsyncOpenAI:
+    """
+    Get or create a singleton AsyncOpenAI client with the given configuration.
+
+    Args:
+        api_key: OpenAI API key
+        base_url: Optional base URL for API
+        organization: Optional organization ID
+        timeout: Request timeout
+        default_headers: Optional default headers
+
+    Returns:
+        AsyncOpenAI client instance
+    """
+    if isinstance(timeout, (int, float)):
+        timeout = Timeout(timeout)
+
+    cache_key = _get_cache_key(
+        "async_openai",
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    if cache_key in _client_cache:
+        return cast(AsyncOpenAI, _client_cache[cache_key])
+
+    client = AsyncOpenAI(
+        api_key=api_key,
+        base_url=base_url,
+        organization=organization,
+        timeout=timeout,
+        default_headers=default_headers,
+    )
+
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_groq_client(api_key: str) -> Groq:
+    """
+    Get or create a singleton Groq client with the given configuration.
+
+    Args:
+        api_key: Groq API key
+
+    Returns:
+        Groq client instance
+    """
+    cache_key = _get_cache_key("groq", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(Groq, _client_cache[cache_key])
+
+    client = Groq(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_groq_client(api_key: str) -> AsyncGroq:
+    """
+    Get or create a singleton AsyncGroq client with the given configuration.
+
+    Args:
+        api_key: Groq API key
+
+    Returns:
+        AsyncGroq client instance
+    """
+    cache_key = _get_cache_key("async_groq", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(AsyncGroq, _client_cache[cache_key])
+
+    client = AsyncGroq(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_cerebras_client(api_key: str) -> Cerebras:
+    """
+    Get or create a singleton Cerebras client with the given configuration.
+
+    Args:
+        api_key: Cerebras API key
+
+    Returns:
+        Cerebras client instance
+    """
+    cache_key = _get_cache_key("cerebras", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(Cerebras, _client_cache[cache_key])
+
+    client = Cerebras(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def get_async_cerebras_client(api_key: str) -> AsyncCerebras:
+    """
+    Get or create a singleton AsyncCerebras client with the given configuration.
+
+    Args:
+        api_key: Cerebras API key
+
+    Returns:
+        AsyncCerebras client instance
+    """
+    cache_key = _get_cache_key("async_cerebras", api_key=api_key)
+
+    if cache_key in _client_cache:
+        return cast(AsyncCerebras, _client_cache[cache_key])
+
+    client = AsyncCerebras(api_key=api_key)
+    _client_cache[cache_key] = client
+    _all_clients.add(client)
+    return client
+
+
+def _cleanup_clients() -> None:
+    """
+    Cleanup function to close all cached clients on exit.
+    Called automatically via atexit.
+    """
+    import inspect
+
+    for client in list(_all_clients):
+        if hasattr(client, "close") and callable(client.close):
+            try:
+                # Check if close is a coroutine function (async)
+                if inspect.iscoroutinefunction(client.close):
+                    # For async clients, we can't await in atexit
+                    # They will be cleaned up by the OS
+                    pass
+                else:
+                    # Sync clients can be closed directly
+                    client.close()
+            except Exception:
+                pass  # Ignore errors during cleanup
+
+
+# Register cleanup function to run on exit
+atexit.register(_cleanup_clients)
+
+
+# For testing purposes
+def _clear_cache() -> None:
+    """Clear the client cache. Only for testing."""
+    _client_cache.clear()
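The point of client_cache.py: agents created with identical LLM configurations now share a single client (and its connection pool) rather than opening a new one per agent. A short usage sketch (placeholder API key; behavior follows the cache-key logic above):

    from langroid.language_models.client_cache import get_openai_client

    a = get_openai_client(api_key="sk-...")  # placeholder key
    b = get_openai_client(api_key="sk-...")
    assert a is b  # identical config -> same cached instance

    # any differing parameter yields a different cache key, hence a new client
    c = get_openai_client(api_key="sk-...", base_url="http://localhost:8000/v1")
    assert c is not a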
langroid/language_models/model_info.py

@@ -69,7 +69,9 @@ class GeminiModel(ModelName):
     GEMINI_1_5_FLASH = "gemini-1.5-flash"
     GEMINI_1_5_FLASH_8B = "gemini-1.5-flash-8b"
     GEMINI_1_5_PRO = "gemini-1.5-pro"
-    GEMINI_2_5_PRO = "gemini-2.5-pro
+    GEMINI_2_5_PRO = "gemini-2.5-pro"
+    GEMINI_2_5_FLASH = "gemini-2.5-flash"
+    GEMINI_2_5_FLASH_LITE_PREVIEW = "gemini-2.5-flash-lite-preview-06-17"
     GEMINI_2_PRO = "gemini-2.0-pro-exp-02-05"
     GEMINI_2_FLASH = "gemini-2.0-flash"
     GEMINI_2_FLASH_LITE = "gemini-2.0-flash-lite-preview"

@@ -108,6 +110,7 @@ class ModelInfo(BaseModel):
     max_cot_tokens: int = 0  # max chain of thought (thinking) tokens where applicable
     max_output_tokens: int = 8192  # Maximum number of output tokens - model dependent
     input_cost_per_million: float = 0.0  # Cost in USD per million input tokens
+    cached_cost_per_million: float = 0.0  # Cost in USD per million cached tokens
     output_cost_per_million: float = 0.0  # Cost in USD per million output tokens
     allows_streaming: bool = True  # Whether model supports streaming output
     allows_system_message: bool = True  # Whether model supports system messages
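The hunks below populate the new field for each provider; e.g., OpenAI's 50% prompt-caching discount on GPT-4o appears as cached_cost_per_million=1.25 against input_cost_per_million=2.5. A quick sanity check, assuming the registry keeps model-name strings as keys:

    from langroid.language_models.model_info import MODEL_INFO

    info = MODEL_INFO["gpt-4o"]
    assert info.input_cost_per_million == 2.5
    assert info.cached_cost_per_million == 1.25  # half the uncached input rate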
@@ -173,6 +176,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
         output_cost_per_million=0.40,
         description="GPT-4.1",
     ),

@@ -182,6 +186,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=0.40,
+        cached_cost_per_million=0.10,
         output_cost_per_million=1.60,
         description="GPT-4.1 Mini",
     ),

@@ -191,6 +196,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_047_576,
         max_output_tokens=32_768,
         input_cost_per_million=2.00,
+        cached_cost_per_million=0.50,
         output_cost_per_million=8.00,
         description="GPT-4.1",
     ),

@@ -200,6 +206,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=16_384,
         input_cost_per_million=2.5,
+        cached_cost_per_million=1.25,
         output_cost_per_million=10.0,
         has_structured_output=True,
         description="GPT-4o (128K context)",

@@ -210,6 +217,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=16_384,
         input_cost_per_million=0.15,
+        cached_cost_per_million=0.075,
         output_cost_per_million=0.60,
         has_structured_output=True,
         description="GPT-4o Mini",

@@ -220,6 +228,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=15.0,
+        cached_cost_per_million=7.50,
         output_cost_per_million=60.0,
         allows_streaming=True,
         allows_system_message=False,

@@ -233,8 +242,9 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         provider=ModelProvider.OPENAI,
         context_length=200_000,
         max_output_tokens=100_000,
-        input_cost_per_million=
-
+        input_cost_per_million=2.0,
+        cached_cost_per_million=0.50,
+        output_cost_per_million=8.0,
         allows_streaming=True,
         allows_system_message=False,
         unsupported_params=["temperature"],

@@ -248,6 +258,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=128_000,
         max_output_tokens=65_536,
         input_cost_per_million=1.1,
+        cached_cost_per_million=0.55,
         output_cost_per_million=4.4,
         allows_streaming=False,
         allows_system_message=False,

@@ -262,6 +273,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=1.1,
+        cached_cost_per_million=0.55,
         output_cost_per_million=4.4,
         allows_streaming=False,
         allows_system_message=False,

@@ -276,6 +288,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=100_000,
         input_cost_per_million=1.10,
+        cached_cost_per_million=0.275,
         output_cost_per_million=4.40,
         allows_streaming=False,
         allows_system_message=False,

@@ -291,6 +304,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=8192,
         input_cost_per_million=3.0,
+        cached_cost_per_million=0.30,
         output_cost_per_million=15.0,
         description="Claude 3.5 Sonnet",
     ),

@@ -300,6 +314,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=15.0,
+        cached_cost_per_million=1.50,
         output_cost_per_million=75.0,
         description="Claude 3 Opus",
     ),

@@ -309,6 +324,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=3.0,
+        cached_cost_per_million=0.30,
         output_cost_per_million=15.0,
         description="Claude 3 Sonnet",
     ),

@@ -318,6 +334,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=200_000,
         max_output_tokens=4096,
         input_cost_per_million=0.25,
+        cached_cost_per_million=0.03,
         output_cost_per_million=1.25,
         description="Claude 3 Haiku",
     ),

@@ -328,6 +345,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=64_000,
         max_output_tokens=8_000,
         input_cost_per_million=0.27,
+        cached_cost_per_million=0.07,
         output_cost_per_million=1.10,
         description="DeepSeek Chat",
     ),

@@ -337,6 +355,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=64_000,
         max_output_tokens=8_000,
         input_cost_per_million=0.55,
+        cached_cost_per_million=0.14,
         output_cost_per_million=2.19,
         description="DeepSeek-R1 Reasoning LM",
     ),

@@ -347,6 +366,7 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         context_length=1_056_768,
         max_output_tokens=8192,
         input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
         output_cost_per_million=0.40,
         rename_params={"max_tokens": "max_completion_tokens"},
         description="Gemini 2.0 Flash",

@@ -401,6 +421,40 @@ MODEL_INFO: Dict[str, ModelInfo] = {
         rename_params={"max_tokens": "max_completion_tokens"},
         description="Gemini 2.0 Flash Thinking",
     ),
+    # Gemini 2.5 Models
+    GeminiModel.GEMINI_2_5_PRO.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_PRO.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=1_048_576,
+        max_output_tokens=65_536,
+        input_cost_per_million=1.25,
+        cached_cost_per_million=0.31,
+        output_cost_per_million=10.0,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Pro",
+    ),
+    GeminiModel.GEMINI_2_5_FLASH.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_FLASH.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=1_048_576,
+        max_output_tokens=65_536,
+        input_cost_per_million=0.30,
+        cached_cost_per_million=0.075,
+        output_cost_per_million=2.50,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Flash",
+    ),
+    GeminiModel.GEMINI_2_5_FLASH_LITE_PREVIEW.value: ModelInfo(
+        name=GeminiModel.GEMINI_2_5_FLASH_LITE_PREVIEW.value,
+        provider=ModelProvider.GOOGLE,
+        context_length=65_536,
+        max_output_tokens=65_536,
+        input_cost_per_million=0.10,
+        cached_cost_per_million=0.025,
+        output_cost_per_million=0.40,
+        rename_params={"max_tokens": "max_completion_tokens"},
+        description="Gemini 2.5 Flash Lite Preview",
+    ),
 }