lm-deluge 0.0.53__tar.gz → 0.0.55__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lm-deluge has been flagged as possibly problematic.
- {lm_deluge-0.0.53/src/lm_deluge.egg-info → lm_deluge-0.0.55}/PKG-INFO +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/pyproject.toml +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/__init__.py +3 -4
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/base.py +6 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/response.py +28 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/client.py +67 -124
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/extract.py +7 -5
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/__init__.py +4 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/anthropic.py +20 -2
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/google.py +20 -12
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openai.py +18 -8
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tracker.py +17 -10
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/usage.py +30 -21
- {lm_deluge-0.0.53 → lm_deluge-0.0.55/src/lm_deluge.egg-info}/PKG-INFO +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/LICENSE +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/README.md +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/setup.cfg +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/agent.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/anthropic.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/gemini.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/openai.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/batches.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cli.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/config.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/gemini_limits.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cerebras.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cohere.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/deepseek.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/fireworks.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/grok.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/groq.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/meta.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openrouter.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/together.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/cerebras.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/meta.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/prompt.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/request_context.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/harmony.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_native_mcp_server.py +0 -0
src/lm_deluge/__init__.py
@@ -1,10 +1,9 @@
-from .client import LLMClient, SamplingParams
+from .client import APIResponse, LLMClient, SamplingParams
+from .file import File
 from .prompt import Conversation, Message
 from .tool import Tool
-from .file import File
-import dotenv

-dotenv.load_dotenv()
+# dotenv.load_dotenv() - don't do this, fucks with other packages

 __all__ = [
     "LLMClient",

src/lm_deluge/api_requests/base.py
@@ -52,6 +52,9 @@ class APIRequestBase(ABC):
         self, base_headers: dict[str, str], exclude_patterns: list[str] | None = None
     ) -> dict[str, str]:
         """Merge extra_headers with base headers, giving priority to extra_headers."""
+        # Filter out None values from base headers (e.g., missing API keys)
+        base_headers = {k: v for k, v in base_headers.items() if v is not None}
+
         if not self.context.extra_headers:
             return base_headers

@@ -69,6 +72,9 @@ class APIRequestBase(ABC):
         # Start with base headers, then overlay filtered extra headers (extra takes precedence)
         merged = dict(base_headers)
         merged.update(filtered_extra)
+
+        # Filter out None values from final merged headers
+        merged = {k: v for k, v in merged.items() if v is not None}
         return merged

     def handle_success(self, data):
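
Note on the change above: the effect is just two dict comprehensions that drop None values before and after the merge. A standalone sketch of that behavior, using plain dicts only (merge_headers is an illustrative name, not an lm-deluge function):

def merge_headers(base_headers: dict, extra_headers: dict | None = None) -> dict:
    # Drop None values from the base headers (e.g., an unset API key)
    base_headers = {k: v for k, v in base_headers.items() if v is not None}
    merged = dict(base_headers)
    merged.update(extra_headers or {})
    # Drop None values again after overlaying the extras
    return {k: v for k, v in merged.items() if v is not None}

print(merge_headers({"x-api-key": None, "anthropic-version": "2023-06-01"}, {"x-custom": "1"}))
# -> {'anthropic-version': '2023-06-01', 'x-custom': '1'}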

src/lm_deluge/api_requests/response.py
@@ -84,10 +84,37 @@ class APIResponse:
             and api_model.input_cost is not None
             and api_model.output_cost is not None
         ):
+            # Calculate input cost, accounting for cached vs non-cached tokens
+            # Different providers report tokens differently:
+            # - Anthropic/Bedrock: input_tokens is ONLY non-cached, cache_read_tokens is separate
+            # - OpenAI/Gemini: input_tokens INCLUDES cached, cache_read_tokens is a subset
+            cache_read_tokens = self.usage.cache_read_tokens or 0
+
+            if api_model.api_spec in ("anthropic", "bedrock"):
+                # For Anthropic: input_tokens already excludes cache, so use directly
+                non_cached_input_tokens = self.usage.input_tokens
+            else:
+                # For OpenAI/Gemini: input_tokens includes cache, so subtract it
+                non_cached_input_tokens = self.usage.input_tokens - cache_read_tokens
+
             self.cost = (
-
+                non_cached_input_tokens * api_model.input_cost / 1e6
                 + self.usage.output_tokens * api_model.output_cost / 1e6
             )
+
+            # Add cost for cache read tokens (at reduced rate)
+            if cache_read_tokens > 0 and api_model.cached_input_cost is not None:
+                self.cost += cache_read_tokens * api_model.cached_input_cost / 1e6
+
+            # Add cost for cache write tokens (only for Anthropic)
+            if (
+                self.usage.cache_write_tokens
+                and self.usage.cache_write_tokens > 0
+                and api_model.cache_write_cost is not None
+            ):
+                self.cost += (
+                    self.usage.cache_write_tokens * api_model.cache_write_cost / 1e6
+                )
         elif self.content is not None and self.completion is not None:
             pass
             # print(
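
A rough worked example of the new cost math, using the Sonnet-class rates added elsewhere in this diff (3.00 input, 0.30 cached input, 3.75 cache write, 15.00 output, all per million tokens); the token counts are made up:

# Anthropic-style usage: input_tokens already excludes cached reads
input_tokens, cache_read, cache_write, output_tokens = 1_000, 9_000, 2_000, 500

cost = (
    input_tokens * 3.00 / 1e6      # non-cached input
    + output_tokens * 15.00 / 1e6  # output
    + cache_read * 0.30 / 1e6      # cache reads at the reduced rate
    + cache_write * 3.75 / 1e6     # cache writes (Anthropic/Bedrock only)
)
print(round(cost, 6))  # ≈ 0.0207

# For OpenAI/Gemini-style usage the non-cached portion would instead be
# input_tokens - cache_read, since their input_tokens figure includes cached reads.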

src/lm_deluge/client.py
@@ -30,6 +30,7 @@ class _LLMClient(BaseModel):
     """

     model_names: str | list[str] = ["gpt-4.1-mini"]
+    name: str | None = None
     max_requests_per_minute: int = 1_000
     max_tokens_per_minute: int = 100_000
     max_concurrent_requests: int = 225
@@ -69,6 +70,7 @@ class _LLMClient(BaseModel):
             max_requests_per_minute=self.max_requests_per_minute,
             max_tokens_per_minute=self.max_tokens_per_minute,
             max_concurrent_requests=self.max_concurrent_requests,
+            client_name=self.name or "LLMClient",
             progress_style=self.progress,
             use_progress_bar=show_progress,
         )
@@ -80,6 +82,22 @@ class _LLMClient(BaseModel):
             self._tracker.log_final_status()
             self._tracker = None

+    def reset_tracker(self):
+        """Reset tracker by closing and reopening with fresh state.
+
+        Useful when reusing a client across multiple batches and you want
+        the progress bar to start from 0 instead of showing cumulative totals.
+        """
+        if self._tracker is None:
+            return
+
+        # Close existing tracker (including progress bar)
+        show_progress = self._tracker.use_progress_bar
+        self.close()
+
+        # Create fresh tracker
+        self.open(total=0, show_progress=show_progress)
+
     # NEW! Builder methods
     def with_model(self, model: str):
         self.model_names = [model]
@@ -153,6 +171,13 @@ class _LLMClient(BaseModel):
         # normalize weights
        self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # Auto-generate name if not provided
+        if self.name is None:
+            if len(self.model_names) == 1:
+                self.name = self.model_names[0]
+            else:
+                self.name = "LLMClient"
+
         # Validate logprobs settings across all sampling params
         if self.logprobs or any(sp.logprobs for sp in self.sampling_params):
             print("Logprobs enabled.")
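
A hypothetical usage sketch for the new name field and reset_tracker() added above (the constructor argument and method names follow this diff; the batching workflow itself is assumed, not taken from the package docs). The client.py diff continues below.

from lm_deluge import LLMClient

client = LLMClient("gpt-4.1-mini", name="summarizer")  # name is shown in progress output
client.open(total=0, show_progress=True)
# ... run a first batch of requests ...

# Between batches, reset the tracker so the progress bar starts again from 0
client.reset_tracker()
# ... run a second batch ...
client.close()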

@@ -353,147 +378,61 @@ class _LLMClient(BaseModel):
         cache: CachePattern | None = None,
         use_responses_api: bool = False,
     ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
-
-        prompts = prompts_to_conversations(prompts)
-        ids = list(range(len(prompts)))
-        results: list[APIResponse | None] = [None for _ in range(len(prompts))]
-        contexts: list[RequestContext | None] = [None for _ in range(len(prompts))]
-        inflight_tasks: set[asyncio.Task[None]] = set()
-        # Use existing tracker if client has been opened; otherwise open/close automatically
-        tracker: StatusTracker
-        tracker_preopened = self._tracker is not None
-        if tracker_preopened:
-            tracker = self._tracker  # type: ignore[assignment]
-            tracker.add_to_total(len(prompts))
-        else:
-            self.open(total=len(prompts), show_progress=show_progress)
-            tracker = self._tracker  # type: ignore[assignment]
-        assert tracker is not None
-
-        # Create retry queue for failed requests
-        retry_queue: asyncio.Queue[RequestContext] = asyncio.Queue()
+        """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

-
-
-
-        #
-
-        next_is_retry = False  # Track whether next_context is a retry
-        prompts_not_finished = True
-        prompts_iter = iter(zip(ids, prompts))
-
-        while True:
-            # Get next context (retry or new) - only if we don't already have one waiting
-            if next_context is None:
-                if not retry_queue.empty():
-                    next_context = retry_queue.get_nowait()
-                    next_is_retry = True
-                    print(f"Retrying request {next_context.task_id}.")
-                elif prompts_not_finished:
-                    try:
-                        task_id, prompt = next(prompts_iter)
-                        model, sampling_params = self._select_model()
-                        assert isinstance(prompt, Conversation)
-                        next_context = RequestContext(
-                            task_id=task_id,
-                            model_name=model,
-                            prompt=prompt,
-                            sampling_params=sampling_params,
-                            attempts_left=self.max_attempts,
-                            request_timeout=self.request_timeout,
-                            status_tracker=tracker,
-                            tools=tools,
-                            cache=cache,
-                            use_responses_api=use_responses_api,
-                            extra_headers=self.extra_headers,
-                            force_local_mcp=self.force_local_mcp,
-                        )
-
-                        next_is_retry = False
-                    except StopIteration:
-                        prompts_not_finished = False
-
-            # Dispatch using shared capacity gate (consistent with start_nowait)
-            if next_context:
-                # Wait here until we have capacity to launch this context
-                await self._wait_for_capacity(
-                    next_context.num_tokens, tracker, retry=next_is_retry
-                )
-
-                # Launch simplified request processing
-                contexts[next_context.task_id] = next_context
-
-                async def process_and_store(ctx: RequestContext):
-                    try:
-                        response = await self.process_single_request(ctx, retry_queue)
-                        results[ctx.task_id] = response
-                    except BaseException as exc:
-                        # Capture cancellations and other BaseExceptions before fallback response fires.
-                        error_response = APIResponse(
-                            id=ctx.task_id,
-                            model_internal=ctx.model_name,
-                            prompt=ctx.prompt,
-                            sampling_params=ctx.sampling_params,
-                            status_code=None,
-                            is_error=True,
-                            error_message=f"{type(exc).__name__}: {exc}",
-                            raw_response={"exception_repr": repr(exc)},
-                        )
-                        results[ctx.task_id] = error_response
-                        if ctx.status_tracker:
-                            ctx.status_tracker.task_failed(ctx.task_id)
-                        raise
-
-                task = asyncio.create_task(process_and_store(next_context))
-                inflight_tasks.add(task)
-                task.add_done_callback(inflight_tasks.discard)
-                next_context = None  # Reset after successful dispatch
-                next_is_retry = False
-
-            # Update progress - original logic
-            tracker.update_pbar()
-
-            # Check completion: consider final outcomes, not in-progress count
-            # This avoids rare hangs if in-progress is miscounted (e.g., double-increment).
-            if (tracker.num_tasks_succeeded + tracker.num_tasks_failed) >= len(
-                prompts
-            ) and retry_queue.empty():
-                break
+        This implementation creates all tasks upfront and waits for them to complete,
+        avoiding issues with tracker state accumulating across multiple calls.
+        """
+        # Convert prompts to Conversations
+        prompts = prompts_to_conversations(prompts)

-
-
+        # Ensure tracker exists (start_nowait will call add_to_total for each task)
+        if self._tracker is None:
+            self.open(total=0, show_progress=show_progress)
+            tracker_preopened = False
+        else:
+            tracker_preopened = True
+
+        # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
+        task_ids = []
+        for prompt in prompts:
+            assert isinstance(prompt, Conversation)
+            task_id = self.start_nowait(
+                prompt,
+                tools=tools,
+                cache=cache,
+                use_responses_api=use_responses_api,
+            )
+            task_ids.append(task_id)

-
-
+        # Wait for all tasks to complete
+        results = await self.wait_for_all(task_ids)

+        # Close tracker if we opened it
         if not tracker_preopened:
             self.close()

+        # Defensive check: This should rarely happen, but provides a safety net
         for idx, response in enumerate(results):
             if response is None:
-
-
-
-
-                    if ctx
-                    else self.sampling_params[0]
-                    if self.sampling_params
-                    else SamplingParams()
+                # This should only happen if there's a bug in _run_context
+                print(
+                    f"WARNING: result[{idx}] is None! Creating defensive error response. "
+                    f"Please report this bug."
                 )
-                model_name = ctx.model_name if ctx else self.model_names[0]
-                assert isinstance(
-                    prompt, Conversation
-                ), "expected prompt to be a conversation"
                 results[idx] = APIResponse(
                     id=idx,
-                    model_internal=
-                    prompt=
-                    sampling_params=sampling_params
+                    model_internal=self.model_names[0],
+                    prompt=prompts[idx],  # type: ignore
+                    sampling_params=self.sampling_params[0]
+                    if self.sampling_params
+                    else SamplingParams(),
                     status_code=None,
                     is_error=True,
                     error_message="Internal error: no response produced.",
                 )

+        # Handle return format
         if return_completions_only:
             return [r.completion if r is not None else None for r in results]

@@ -795,6 +734,7 @@ class _LLMClient(BaseModel):
 def LLMClient(
     model_names: str,
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -821,6 +761,7 @@ def LLMClient(
 def LLMClient(
     model_names: list[str],
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -846,6 +787,7 @@ def LLMClient(
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -883,6 +825,7 @@ def LLMClient(
     # Simply pass everything to the Pydantic constructor
     return _LLMClient(
         model_names=model_names,
+        name=name,
         max_requests_per_minute=max_requests_per_minute,
         max_tokens_per_minute=max_tokens_per_minute,
         max_concurrent_requests=max_concurrent_requests,
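
The rewritten process_prompts_async above delegates to start_nowait/wait_for_all instead of running its own dispatch loop, retry queue, and in-flight task set. The same "enqueue everything, then await everything" shape in plain asyncio, purely for illustration (no lm-deluge types involved):

import asyncio

async def fake_request(i: int) -> str:
    await asyncio.sleep(0.01)  # stand-in for one rate-limited API call
    return f"response {i}"

async def process_all(n: int) -> list[str]:
    # "start_nowait": create every task up front
    tasks = [asyncio.create_task(fake_request(i)) for i in range(n)]
    # "wait_for_all": collect results in submission order
    return list(await asyncio.gather(*tasks))

print(asyncio.run(process_all(3)))  # ['response 0', 'response 1', 'response 2']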

src/lm_deluge/llm_tools/extract.py
@@ -1,11 +1,12 @@
 import asyncio
 import io
 import json
+import os
 from typing import Any

+from lm_deluge.client import _LLMClient
 from lm_deluge.file import File

-from ..client import LLMClient
 from ..prompt import Conversation
 from ..util.json import load_json

@@ -18,7 +19,7 @@ except ImportError:
 async def extract_async(
     inputs: list[str | Any],
     schema: Any,
-    client:
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,
@@ -32,12 +33,13 @@ async def extract_async(
         raise ValueError("schema must be a pydantic model or a dict.")

     # warn if json_mode is not True
+    has_warned = os.environ.get("LM_DELUGE_WARN_JSON_MODE", False)
     for sp in client.sampling_params:
-        if sp.json_mode is False:
+        if sp.json_mode is False and not has_warned:
             print(
                 "Warning: json_mode is False for one or more sampling params. You may get invalid output."
             )
-
+            os.environ["LM_DELUGE_WARN_JSON_MODE"] = "True"
     # check_schema(schema_dict) -- figure out later
     if document_name is None:
         document_name = "text"
@@ -111,7 +113,7 @@
 def extract(
     inputs: list[str | Any],
     schema: Any,
-    client:
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,

src/lm_deluge/models/__init__.py
@@ -29,7 +29,8 @@ class APIModel:
     api_base: str
     api_key_env_var: str
     api_spec: str
-    cached_input_cost: float | None = 0
+    cached_input_cost: float | None = 0  # $ per million cached/read input tokens
+    cache_write_cost: float | None = 0  # $ per million cache write tokens
     input_cost: float | None = 0  # $ per million input tokens
     output_cost: float | None = 0  # $ per million output tokens
     supports_json: bool = False
@@ -89,6 +90,7 @@ def register_model(
     api_spec: str = "openai",
     input_cost: float | None = 0,  # $ per million input tokens
     cached_input_cost: float | None = 0,
+    cache_write_cost: float | None = 0,  # $ per million cache write tokens
     output_cost: float | None = 0,  # $ per million output tokens
     supports_json: bool = False,
     supports_logprobs: bool = False,
@@ -106,6 +108,7 @@ def register_model(
         api_key_env_var=api_key_env_var,
         api_spec=api_spec,
         cached_input_cost=cached_input_cost,
+        cache_write_cost=cache_write_cost,
         input_cost=input_cost,
         output_cost=output_cost,
         supports_json=supports_json,

src/lm_deluge/models/anthropic.py
@@ -18,6 +18,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -30,6 +32,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -43,6 +47,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -56,6 +62,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -68,6 +76,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -81,6 +91,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -93,6 +105,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -116,8 +130,10 @@ ANTHROPIC_MODELS = {
         "api_key_env_var": "ANTHROPIC_API_KEY",
         "supports_json": False,
         "api_spec": "anthropic",
-        "input_cost":
-        "
+        "input_cost": 0.8,
+        "cached_input_cost": 0.08,
+        "cache_write_cost": 1.00,
+        "output_cost": 4.00,
         "requests_per_minute": 20_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up
     },
@@ -129,6 +145,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 0.25,
+        "cache_write_cost": 0.30,
+        "cached_input_cost": 0.03,
         "output_cost": 1.25,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up

src/lm_deluge/models/google.py
@@ -18,6 +18,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -31,8 +32,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.
-        "output_cost": 0.
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -45,8 +46,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -59,8 +61,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.
-        "
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -74,6 +77,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -89,6 +93,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -102,8 +107,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.
-        "output_cost": 0.
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -116,8 +121,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost":
-        "
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -130,8 +136,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.
-        "
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -145,6 +152,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,

src/lm_deluge/models/openai.py
@@ -75,8 +75,8 @@ OPENAI_MODELS = {
         "supports_logprobs": False,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "output_cost":
+        "input_cost": 3.0,
+        "output_cost": 12.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -90,8 +90,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 2.0,
+        "cached_input_cost": 0.50,
+        "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -106,6 +107,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.275,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -121,6 +123,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 2.0,
+        "cached_input_cost": 0.50,
         "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -136,6 +139,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.4,
+        "cached_input_cost": 0.10,
         "output_cost": 1.6,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -151,6 +155,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -181,6 +186,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.55,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -196,6 +202,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 15.0,
+        "cached_input_cost": 7.50,
         "output_cost": 60.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -225,8 +232,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 1.1,
+        "cached_input_cost": 0.55,
+        "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -240,8 +248,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 2.50,
+        "cached_input_cost": 1.25,
+        "output_cost": 10.0,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 30_000_000,
     },
@@ -255,6 +264,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.15,
+        "cached_input_cost": 0.075,
         "output_cost": 0.6,
         "requests_per_minute": 60_000,
         "tokens_per_minute": 250_000_000,

src/lm_deluge/tracker.py
@@ -13,7 +13,6 @@ from rich.progress import (
     TaskID,
     TextColumn,
 )
-from rich.text import Text
 from tqdm.auto import tqdm

 SECONDS_TO_PAUSE_AFTER_RATE_LIMIT_ERROR = 5
@@ -24,6 +23,7 @@ class StatusTracker:
     max_requests_per_minute: int
     max_tokens_per_minute: int
     max_concurrent_requests: int
+    client_name: str = "LLMClient"
     num_tasks_started: int = 0
     num_tasks_in_progress: int = 0
     num_tasks_succeeded: int = 0
@@ -187,14 +187,16 @@ class StatusTracker:

     def _init_rich_display(self, total: int):
         """Initialize Rich display components."""
-        self._rich_console = Console()
+        self._rich_console = Console(highlight=False)
+        # Escape square brackets so Rich doesn't interpret them as markup
+        description = f"[bold blue]\\[{self.client_name}][/bold blue] Processing..."
         self._rich_progress = Progress(
             SpinnerColumn(),
-            TextColumn("
+            TextColumn("[progress.description]{task.description}"),
             BarColumn(),
             MofNCompleteColumn(),
         )
-        self._rich_task_id = self._rich_progress.add_task(
+        self._rich_task_id = self._rich_progress.add_task(description, total=total)
         self._rich_stop_event = asyncio.Event()
         self._rich_display_task = asyncio.create_task(self._rich_display_updater())

@@ -217,12 +219,17 @@ class StatusTracker:
                     total=self.progress_bar_total,
                 )

-                tokens_info = f"
-                reqs_info = f"
-                in_progress =
-
+                tokens_info = f"{self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k TPM"
+                reqs_info = f"{int(self.available_request_capacity)}/{self.max_requests_per_minute} RPM"
+                in_progress = (
+                    f" [gold3]In Progress:[/gold3] {int(self.num_tasks_in_progress)} "
+                    + ("requests" if self.num_tasks_in_progress != 1 else "request")
+                )
+                capacity_text = (
+                    f" [gold3]Capacity:[/gold3] {tokens_info} • {reqs_info}"
+                )

-                display = Group(self._rich_progress, capacity_text)
+                display = Group(self._rich_progress, in_progress, capacity_text)
                 live.update(display)

                 await asyncio.sleep(0.1)
@@ -252,7 +259,7 @@ class StatusTracker:
             return
         while not self._manual_stop_event.is_set():
             print(
-                f"Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
+                f"[{self.client_name}] Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
             )
             await asyncio.sleep(self.progress_print_interval)

src/lm_deluge/usage.py
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import Optional


 @dataclass
@@ -13,8 +12,8 @@ class Usage:

     input_tokens: int = 0
     output_tokens: int = 0
-    cache_read_tokens:
-    cache_write_tokens:
+    cache_read_tokens: int = 0
+    cache_write_tokens: int = 0

     @property
     def total_input_tokens(self) -> int:
@@ -47,18 +46,29 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("input_tokens", 0),
             output_tokens=usage_data.get("output_tokens", 0),
-            cache_read_tokens=usage_data.get("cache_read_input_tokens"),
-            cache_write_tokens=usage_data.get("cache_creation_input_tokens"),
+            cache_read_tokens=usage_data.get("cache_read_input_tokens", 0),
+            cache_write_tokens=usage_data.get("cache_creation_input_tokens", 0),
         )

     @classmethod
     def from_openai_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from OpenAI API response usage data.
+        """Create Usage from OpenAI API response usage data.
+
+        OpenAI supports prompt caching - cached tokens appear in prompt_tokens_details.cached_tokens.
+        Caching is automatic for prompts over 1024 tokens.
+        """
+        prompt_tokens_details = usage_data.get("prompt_tokens_details", {})
+        cached_tokens = (
+            prompt_tokens_details.get("cached_tokens", 0)
+            if prompt_tokens_details
+            else 0
+        )
+
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # OpenAI doesn't charge separately for cache writes
         )

     @classmethod
@@ -67,18 +77,23 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=0,  # Mistral doesn't support caching
+            cache_write_tokens=0,
         )

     @classmethod
     def from_gemini_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from Gemini API response usage data.
+        """Create Usage from Gemini API response usage data.
+
+        Gemini supports context caching - cached tokens appear in cachedContentTokenCount.
+        """
+        cached_tokens = usage_data.get("cachedContentTokenCount", 0)
+
         return cls(
             input_tokens=usage_data.get("promptTokenCount", 0),
             output_tokens=usage_data.get("candidatesTokenCount", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # Gemini doesn't charge separately for cache writes
         )

     def to_dict(self) -> dict:
@@ -100,8 +115,8 @@ class Usage:
         return cls(
             input_tokens=data.get("input_tokens", 0),
             output_tokens=data.get("output_tokens", 0),
-            cache_read_tokens=data.get("cache_read_tokens"),
-            cache_write_tokens=data.get("cache_write_tokens"),
+            cache_read_tokens=data.get("cache_read_tokens", 0),
+            cache_write_tokens=data.get("cache_write_tokens", 0),
         )

     def __add__(self, other: "Usage") -> "Usage":
@@ -111,14 +126,8 @@ class Usage:
             output_tokens=self.output_tokens + other.output_tokens,
             cache_read_tokens=(
                 (self.cache_read_tokens or 0) + (other.cache_read_tokens or 0)
-                if self.cache_read_tokens is not None
-                or other.cache_read_tokens is not None
-                else None
             ),
             cache_write_tokens=(
                 (self.cache_write_tokens or 0) + (other.cache_write_tokens or 0)
-                if self.cache_write_tokens is not None
-                or other.cache_write_tokens is not None
-                else None
             ),
         )
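
A quick check of the new cache-aware parsing, feeding from_openai_usage a hand-written payload shaped like an OpenAI usage block (the numbers are made up; the import path follows this package layout, and whether Usage is also re-exported at the top level isn't shown here):

from lm_deluge.usage import Usage

u = Usage.from_openai_usage(
    {
        "prompt_tokens": 1200,
        "completion_tokens": 80,
        "prompt_tokens_details": {"cached_tokens": 1024},
    }
)
print(u.input_tokens, u.output_tokens, u.cache_read_tokens, u.cache_write_tokens)
# -> 1200 80 1024 0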