lm-deluge 0.0.53__tar.gz → 0.0.55__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lm-deluge might be problematic.

Files changed (80)
  1. {lm_deluge-0.0.53/src/lm_deluge.egg-info → lm_deluge-0.0.55}/PKG-INFO +1 -1
  2. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/pyproject.toml +1 -1
  3. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/__init__.py +3 -4
  4. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/base.py +6 -0
  5. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/response.py +28 -1
  6. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/client.py +67 -124
  7. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/extract.py +7 -5
  8. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/__init__.py +4 -1
  9. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/anthropic.py +20 -2
  10. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/google.py +20 -12
  11. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openai.py +18 -8
  12. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tracker.py +17 -10
  13. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/usage.py +30 -21
  14. {lm_deluge-0.0.53 → lm_deluge-0.0.55/src/lm_deluge.egg-info}/PKG-INFO +1 -1
  15. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/LICENSE +0 -0
  16. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/README.md +0 -0
  17. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/setup.cfg +0 -0
  18. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/agent.py +0 -0
  19. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/__init__.py +0 -0
  20. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/anthropic.py +0 -0
  21. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/bedrock.py +0 -0
  22. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/common.py +0 -0
  23. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  24. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  25. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  26. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  27. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  28. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/gemini.py +0 -0
  29. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/mistral.py +0 -0
  30. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/openai.py +0 -0
  31. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/batches.py +0 -0
  32. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
  33. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
  34. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
  35. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
  36. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/base.py +0 -0
  37. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/openai.py +0 -0
  38. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cache.py +0 -0
  39. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cli.py +0 -0
  40. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/config.py +0 -0
  41. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/embed.py +0 -0
  42. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/errors.py +0 -0
  43. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/file.py +0 -0
  44. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/gemini_limits.py +0 -0
  45. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/image.py +0 -0
  46. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/__init__.py +0 -0
  47. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/classify.py +0 -0
  48. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/locate.py +0 -0
  49. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/ocr.py +0 -0
  50. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/score.py +0 -0
  51. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/translate.py +0 -0
  52. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/bedrock.py +0 -0
  53. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cerebras.py +0 -0
  54. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cohere.py +0 -0
  55. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/deepseek.py +0 -0
  56. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/fireworks.py +0 -0
  57. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/grok.py +0 -0
  58. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/groq.py +0 -0
  59. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/meta.py +0 -0
  60. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/mistral.py +0 -0
  61. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openrouter.py +0 -0
  62. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/together.py +0 -0
  63. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/cerebras.py +0 -0
  64. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/meta.py +0 -0
  65. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/prompt.py +0 -0
  66. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/request_context.py +0 -0
  67. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/rerank.py +0 -0
  68. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tool.py +0 -0
  69. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/harmony.py +0 -0
  70. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/json.py +0 -0
  71. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/logprobs.py +0 -0
  72. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/spatial.py +0 -0
  73. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/validation.py +0 -0
  74. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/xml.py +0 -0
  75. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
  76. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  77. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/requires.txt +0 -0
  78. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/top_level.txt +0 -0
  79. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_builtin_tools.py +0 -0
  80. {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_native_mcp_server.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.53
+Version: 0.0.55
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "lm_deluge"
-version = "0.0.53"
+version = "0.0.55"
 authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
 description = "Python utility for using LLM API models."
 readme = "README.md"
@@ -1,10 +1,9 @@
-from .client import LLMClient, SamplingParams, APIResponse
+from .client import APIResponse, LLMClient, SamplingParams
+from .file import File
 from .prompt import Conversation, Message
 from .tool import Tool
-from .file import File
-import dotenv
 
-dotenv.load_dotenv()
+# dotenv.load_dotenv() - don't do this, fucks with other packages
 
 __all__ = [
     "LLMClient",
@@ -52,6 +52,9 @@ class APIRequestBase(ABC):
         self, base_headers: dict[str, str], exclude_patterns: list[str] | None = None
     ) -> dict[str, str]:
         """Merge extra_headers with base headers, giving priority to extra_headers."""
+        # Filter out None values from base headers (e.g., missing API keys)
+        base_headers = {k: v for k, v in base_headers.items() if v is not None}
+
         if not self.context.extra_headers:
             return base_headers
 
@@ -69,6 +72,9 @@ class APIRequestBase(ABC):
         # Start with base headers, then overlay filtered extra headers (extra takes precedence)
         merged = dict(base_headers)
         merged.update(filtered_extra)
+
+        # Filter out None values from final merged headers
+        merged = {k: v for k, v in merged.items() if v is not None}
         return merged
 
     def handle_success(self, data):
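
To make the merge semantics concrete, a small illustrative sketch (the header names and values are invented; only the filter-then-overlay order mirrors the diff):

base = {"x-api-key": None, "content-type": "application/json"}  # None, e.g. a missing API key
extra = {"x-request-source": "batch-job"}                       # hypothetical extra_headers entry

merged = {k: v for k, v in base.items() if v is not None}   # drop unset base headers first
merged.update(extra)                                         # extra headers take precedence
merged = {k: v for k, v in merged.items() if v is not None}  # final None sweep, as in the diff
# -> {"content-type": "application/json", "x-request-source": "batch-job"}
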
@@ -84,10 +84,37 @@ class APIResponse:
             and api_model.input_cost is not None
             and api_model.output_cost is not None
         ):
+            # Calculate input cost, accounting for cached vs non-cached tokens
+            # Different providers report tokens differently:
+            # - Anthropic/Bedrock: input_tokens is ONLY non-cached, cache_read_tokens is separate
+            # - OpenAI/Gemini: input_tokens INCLUDES cached, cache_read_tokens is a subset
+            cache_read_tokens = self.usage.cache_read_tokens or 0
+
+            if api_model.api_spec in ("anthropic", "bedrock"):
+                # For Anthropic: input_tokens already excludes cache, so use directly
+                non_cached_input_tokens = self.usage.input_tokens
+            else:
+                # For OpenAI/Gemini: input_tokens includes cache, so subtract it
+                non_cached_input_tokens = self.usage.input_tokens - cache_read_tokens
+
             self.cost = (
-                self.usage.input_tokens * api_model.input_cost / 1e6
+                non_cached_input_tokens * api_model.input_cost / 1e6
                 + self.usage.output_tokens * api_model.output_cost / 1e6
             )
+
+            # Add cost for cache read tokens (at reduced rate)
+            if cache_read_tokens > 0 and api_model.cached_input_cost is not None:
+                self.cost += cache_read_tokens * api_model.cached_input_cost / 1e6
+
+            # Add cost for cache write tokens (only for Anthropic)
+            if (
+                self.usage.cache_write_tokens
+                and self.usage.cache_write_tokens > 0
+                and api_model.cache_write_cost is not None
+            ):
+                self.cost += (
+                    self.usage.cache_write_tokens * api_model.cache_write_cost / 1e6
+                )
         elif self.content is not None and self.completion is not None:
             pass
             # print(
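
The arithmetic now splits input tokens into cached and non-cached buckets. A worked example with invented token counts and the $2.00/M input, $0.50/M cached-input, $8.00/M output rates that one of the OpenAI entries in this diff uses (OpenAI-style accounting, so input_tokens includes the cached portion):

input_tokens = 10_000       # includes cached tokens for OpenAI/Gemini
cache_read_tokens = 8_000   # subset of input_tokens
output_tokens = 500

input_cost, cached_input_cost, output_cost = 2.00, 0.50, 8.00  # $ per million tokens

non_cached = input_tokens - cache_read_tokens  # 2,000 tokens billed at the full rate
cost = (
    non_cached * input_cost / 1e6                  # $0.004
    + cache_read_tokens * cached_input_cost / 1e6  # $0.004
    + output_tokens * output_cost / 1e6            # $0.004
)
print(round(cost, 3))  # 0.012
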
@@ -30,6 +30,7 @@ class _LLMClient(BaseModel):
     """
 
     model_names: str | list[str] = ["gpt-4.1-mini"]
+    name: str | None = None
     max_requests_per_minute: int = 1_000
     max_tokens_per_minute: int = 100_000
     max_concurrent_requests: int = 225
@@ -69,6 +70,7 @@ class _LLMClient(BaseModel):
             max_requests_per_minute=self.max_requests_per_minute,
             max_tokens_per_minute=self.max_tokens_per_minute,
             max_concurrent_requests=self.max_concurrent_requests,
+            client_name=self.name or "LLMClient",
             progress_style=self.progress,
             use_progress_bar=show_progress,
         )
@@ -80,6 +82,22 @@ class _LLMClient(BaseModel):
             self._tracker.log_final_status()
             self._tracker = None
 
+    def reset_tracker(self):
+        """Reset tracker by closing and reopening with fresh state.
+
+        Useful when reusing a client across multiple batches and you want
+        the progress bar to start from 0 instead of showing cumulative totals.
+        """
+        if self._tracker is None:
+            return
+
+        # Close existing tracker (including progress bar)
+        show_progress = self._tracker.use_progress_bar
+        self.close()
+
+        # Create fresh tracker
+        self.open(total=0, show_progress=show_progress)
+
     # NEW! Builder methods
     def with_model(self, model: str):
         self.model_names = [model]
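
A minimal sketch of reusing one client across batches with the new name and reset_tracker() additions. The name parameter, open/close, and reset_tracker calls come from this diff; the batch entry point is assumed to be process_prompts_async (its def line is not shown in these hunks), and the prompts are placeholders:

import asyncio

from lm_deluge import LLMClient

async def main():
    client = LLMClient("gpt-4.1-mini", name="batch-runner")  # name shows up in the progress bar
    client.open(total=0, show_progress=True)                 # keep one tracker across batches

    first = await client.process_prompts_async(["prompt A", "prompt B"])
    client.reset_tracker()  # next batch's progress bar starts from 0 instead of accumulating
    second = await client.process_prompts_async(["prompt C"])

    client.close()
    return first, second

asyncio.run(main())
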
@@ -153,6 +171,13 @@ class _LLMClient(BaseModel):
         # normalize weights
         self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]
 
+        # Auto-generate name if not provided
+        if self.name is None:
+            if len(self.model_names) == 1:
+                self.name = self.model_names[0]
+            else:
+                self.name = "LLMClient"
+
         # Validate logprobs settings across all sampling params
         if self.logprobs or any(sp.logprobs for sp in self.sampling_params):
             print("Logprobs enabled.")
@@ -353,147 +378,61 @@ class _LLMClient(BaseModel):
         cache: CachePattern | None = None,
         use_responses_api: bool = False,
     ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
-        # Convert prompts to Conversations - no upfront cache checking for dynamic caching!
-        prompts = prompts_to_conversations(prompts)
-        ids = list(range(len(prompts)))
-        results: list[APIResponse | None] = [None for _ in range(len(prompts))]
-        contexts: list[RequestContext | None] = [None for _ in range(len(prompts))]
-        inflight_tasks: set[asyncio.Task[None]] = set()
-        # Use existing tracker if client has been opened; otherwise open/close automatically
-        tracker: StatusTracker
-        tracker_preopened = self._tracker is not None
-        if tracker_preopened:
-            tracker = self._tracker  # type: ignore[assignment]
-            tracker.add_to_total(len(prompts))
-        else:
-            self.open(total=len(prompts), show_progress=show_progress)
-            tracker = self._tracker  # type: ignore[assignment]
-        assert tracker is not None
-
-        # Create retry queue for failed requests
-        retry_queue: asyncio.Queue[RequestContext] = asyncio.Queue()
+        """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.
 
-        # Calculate sleep time for rate limiting (legacy; gating happens in _wait_for_capacity)
-        seconds_to_sleep_each_loop = (60.0 * 0.9) / tracker.max_requests_per_minute
-
-        # Main dispatch loop - using original pattern but with all prompts
-        next_context = None  # Persist across iterations like original
-        next_is_retry = False  # Track whether next_context is a retry
-        prompts_not_finished = True
-        prompts_iter = iter(zip(ids, prompts))
-
-        while True:
-            # Get next context (retry or new) - only if we don't already have one waiting
-            if next_context is None:
-                if not retry_queue.empty():
-                    next_context = retry_queue.get_nowait()
-                    next_is_retry = True
-                    print(f"Retrying request {next_context.task_id}.")
-                elif prompts_not_finished:
-                    try:
-                        task_id, prompt = next(prompts_iter)
-                        model, sampling_params = self._select_model()
-                        assert isinstance(prompt, Conversation)
-                        next_context = RequestContext(
-                            task_id=task_id,
-                            model_name=model,
-                            prompt=prompt,
-                            sampling_params=sampling_params,
-                            attempts_left=self.max_attempts,
-                            request_timeout=self.request_timeout,
-                            status_tracker=tracker,
-                            tools=tools,
-                            cache=cache,
-                            use_responses_api=use_responses_api,
-                            extra_headers=self.extra_headers,
-                            force_local_mcp=self.force_local_mcp,
-                        )
-
-                        next_is_retry = False
-                    except StopIteration:
-                        prompts_not_finished = False
-
-            # Dispatch using shared capacity gate (consistent with start_nowait)
-            if next_context:
-                # Wait here until we have capacity to launch this context
-                await self._wait_for_capacity(
-                    next_context.num_tokens, tracker, retry=next_is_retry
-                )
-
-                # Launch simplified request processing
-                contexts[next_context.task_id] = next_context
-
-                async def process_and_store(ctx: RequestContext):
-                    try:
-                        response = await self.process_single_request(ctx, retry_queue)
-                        results[ctx.task_id] = response
-                    except BaseException as exc:
-                        # Capture cancellations and other BaseExceptions before fallback response fires.
-                        error_response = APIResponse(
-                            id=ctx.task_id,
-                            model_internal=ctx.model_name,
-                            prompt=ctx.prompt,
-                            sampling_params=ctx.sampling_params,
-                            status_code=None,
-                            is_error=True,
-                            error_message=f"{type(exc).__name__}: {exc}",
-                            raw_response={"exception_repr": repr(exc)},
-                        )
-                        results[ctx.task_id] = error_response
-                        if ctx.status_tracker:
-                            ctx.status_tracker.task_failed(ctx.task_id)
-                        raise
-
-                task = asyncio.create_task(process_and_store(next_context))
-                inflight_tasks.add(task)
-                task.add_done_callback(inflight_tasks.discard)
-                next_context = None  # Reset after successful dispatch
-                next_is_retry = False
-
-            # Update progress - original logic
-            tracker.update_pbar()
-
-            # Check completion: consider final outcomes, not in-progress count
-            # This avoids rare hangs if in-progress is miscounted (e.g., double-increment).
-            if (tracker.num_tasks_succeeded + tracker.num_tasks_failed) >= len(
-                prompts
-            ) and retry_queue.empty():
-                break
+        This implementation creates all tasks upfront and waits for them to complete,
+        avoiding issues with tracker state accumulating across multiple calls.
+        """
+        # Convert prompts to Conversations
+        prompts = prompts_to_conversations(prompts)
 
-            # Yield briefly to allow in-flight tasks to progress
-            await asyncio.sleep(min(0.01, seconds_to_sleep_each_loop))
+        # Ensure tracker exists (start_nowait will call add_to_total for each task)
+        if self._tracker is None:
+            self.open(total=0, show_progress=show_progress)
+            tracker_preopened = False
+        else:
+            tracker_preopened = True
+
+        # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
+        task_ids = []
+        for prompt in prompts:
+            assert isinstance(prompt, Conversation)
+            task_id = self.start_nowait(
+                prompt,
+                tools=tools,
+                cache=cache,
+                use_responses_api=use_responses_api,
+            )
+            task_ids.append(task_id)
 
-        if inflight_tasks:
-            await asyncio.gather(*inflight_tasks, return_exceptions=True)
+        # Wait for all tasks to complete
+        results = await self.wait_for_all(task_ids)
 
+        # Close tracker if we opened it
         if not tracker_preopened:
             self.close()
 
+        # Defensive check: This should rarely happen, but provides a safety net
         for idx, response in enumerate(results):
             if response is None:
-                ctx = contexts[idx]
-                prompt = ctx.prompt if ctx else prompts[idx]
-                sampling_params = (
-                    ctx.sampling_params
-                    if ctx
-                    else self.sampling_params[0]
-                    if self.sampling_params
-                    else SamplingParams()
+                # This should only happen if there's a bug in _run_context
+                print(
+                    f"WARNING: result[{idx}] is None! Creating defensive error response. "
+                    f"Please report this bug."
                 )
-                model_name = ctx.model_name if ctx else self.model_names[0]
-                assert isinstance(
-                    prompt, Conversation
-                ), "expected prompt to be a conversation"
                 results[idx] = APIResponse(
                     id=idx,
-                    model_internal=model_name,
-                    prompt=prompt,
-                    sampling_params=sampling_params,
+                    model_internal=self.model_names[0],
+                    prompt=prompts[idx],  # type: ignore
+                    sampling_params=self.sampling_params[0]
+                    if self.sampling_params
+                    else SamplingParams(),
                     status_code=None,
                     is_error=True,
                     error_message="Internal error: no response produced.",
                 )
 
+        # Handle return format
         if return_completions_only:
             return [r.completion if r is not None else None for r in results]
 
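
The same fan-out can be driven by hand, since the rewritten method is now a thin wrapper over start_nowait and wait_for_all; a rough sketch, assuming start_nowait accepts the same prompt types as the batch call and the client is already open:

async def fan_out(client, prompts):
    # mirror of the new implementation: enqueue everything, then await the whole batch
    task_ids = [client.start_nowait(p) for p in prompts]
    return await client.wait_for_all(task_ids)
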
@@ -795,6 +734,7 @@ class _LLMClient(BaseModel):
 def LLMClient(
     model_names: str,
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -821,6 +761,7 @@ def LLMClient(
 def LLMClient(
     model_names: list[str],
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -846,6 +787,7 @@ def LLMClient(
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -883,6 +825,7 @@ def LLMClient(
     # Simply pass everything to the Pydantic constructor
     return _LLMClient(
         model_names=model_names,
+        name=name,
         max_requests_per_minute=max_requests_per_minute,
         max_tokens_per_minute=max_tokens_per_minute,
         max_concurrent_requests=max_concurrent_requests,
@@ -1,11 +1,12 @@
 import asyncio
 import io
 import json
+import os
 from typing import Any
 
+from lm_deluge.client import _LLMClient
 from lm_deluge.file import File
 
-from ..client import LLMClient
 from ..prompt import Conversation
 from ..util.json import load_json
 
@@ -18,7 +19,7 @@ except ImportError:
 async def extract_async(
     inputs: list[str | Any],
     schema: Any,
-    client: LLMClient,
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,
@@ -32,12 +33,13 @@ async def extract_async(
         raise ValueError("schema must be a pydantic model or a dict.")
 
     # warn if json_mode is not True
+    has_warned = os.environ.get("LM_DELUGE_WARN_JSON_MODE", False)
     for sp in client.sampling_params:
-        if sp.json_mode is False:
+        if sp.json_mode is False and not has_warned:
             print(
                 "Warning: json_mode is False for one or more sampling params. You may get invalid output."
             )
-            break
+            os.environ["LM_DELUGE_WARN_JSON_MODE"] = "True"
     # check_schema(schema_dict) -- figure out later
     if document_name is None:
         document_name = "text"
@@ -111,7 +113,7 @@ async def extract_async(
 def extract(
     inputs: list[str | Any],
     schema: Any,
-    client: LLMClient,
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,
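
A hedged usage sketch of extract() with the narrower _LLMClient annotation: the public LLMClient factory still satisfies it, since it returns a _LLMClient. The Invoice model and input text are illustrative, not part of the package:

from pydantic import BaseModel

from lm_deluge import LLMClient
from lm_deluge.llm_tools.extract import extract

class Invoice(BaseModel):  # illustrative schema
    vendor: str
    total: float

client = LLMClient("gpt-4.1-mini")
# If any sampling params have json_mode=False, the warning now prints at most once
# per process, gated by the LM_DELUGE_WARN_JSON_MODE environment variable.
results = extract(["Invoice #12 from Acme Corp, total due $90.00"], Invoice, client)
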
@@ -29,7 +29,8 @@ class APIModel:
     api_base: str
     api_key_env_var: str
     api_spec: str
-    cached_input_cost: float | None = 0
+    cached_input_cost: float | None = 0  # $ per million cached/read input tokens
+    cache_write_cost: float | None = 0  # $ per million cache write tokens
     input_cost: float | None = 0  # $ per million input tokens
     output_cost: float | None = 0  # $ per million output tokens
     supports_json: bool = False
@@ -89,6 +90,7 @@ def register_model(
     api_spec: str = "openai",
     input_cost: float | None = 0,  # $ per million input tokens
     cached_input_cost: float | None = 0,
+    cache_write_cost: float | None = 0,  # $ per million cache write tokens
     output_cost: float | None = 0,  # $ per million output tokens
     supports_json: bool = False,
     supports_logprobs: bool = False,
@@ -106,6 +108,7 @@ def register_model(
         api_key_env_var=api_key_env_var,
         api_spec=api_spec,
         cached_input_cost=cached_input_cost,
+        cache_write_cost=cache_write_cost,
         input_cost=input_cost,
         output_cost=output_cost,
         supports_json=supports_json,
@@ -18,6 +18,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -30,6 +32,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -43,6 +47,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -56,6 +62,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -68,6 +76,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -81,6 +91,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -93,6 +105,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -116,8 +130,10 @@ ANTHROPIC_MODELS = {
         "api_key_env_var": "ANTHROPIC_API_KEY",
         "supports_json": False,
         "api_spec": "anthropic",
-        "input_cost": 1.00,
-        "output_cost": 5.00,
+        "input_cost": 0.8,
+        "cached_input_cost": 0.08,
+        "cache_write_cost": 1.00,
+        "output_cost": 4.00,
         "requests_per_minute": 20_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up
     },
@@ -129,6 +145,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 0.25,
+        "cache_write_cost": 0.30,
+        "cached_input_cost": 0.03,
         "output_cost": 1.25,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up
@@ -18,6 +18,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -31,8 +32,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -45,8 +46,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -59,8 +61,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -74,6 +77,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -89,6 +93,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -102,8 +107,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -116,8 +121,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -130,8 +136,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.1,
-        "output_cost": 0.4,
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -145,6 +152,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -75,8 +75,8 @@ OPENAI_MODELS = {
         "supports_logprobs": False,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost": 2.0,
-        "output_cost": 8.0,
+        "input_cost": 3.0,
+        "output_cost": 12.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -90,8 +90,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost": 10.0,
-        "output_cost": 40.0,
+        "input_cost": 2.0,
+        "cached_input_cost": 0.50,
+        "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -106,6 +107,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.275,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -121,6 +123,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 2.0,
+        "cached_input_cost": 0.50,
         "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -136,6 +139,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.4,
+        "cached_input_cost": 0.10,
         "output_cost": 1.6,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -151,6 +155,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -181,6 +186,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.55,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -196,6 +202,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 15.0,
+        "cached_input_cost": 7.50,
         "output_cost": 60.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -225,8 +232,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost": 3.0,
-        "output_cost": 15.0,
+        "input_cost": 1.1,
+        "cached_input_cost": 0.55,
+        "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -240,8 +248,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost": 5.0,
-        "output_cost": 15.0,
+        "input_cost": 2.50,
+        "cached_input_cost": 1.25,
+        "output_cost": 10.0,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 30_000_000,
     },
@@ -255,6 +264,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.15,
+        "cached_input_cost": 0.075,
         "output_cost": 0.6,
         "requests_per_minute": 60_000,
         "tokens_per_minute": 250_000_000,
@@ -13,7 +13,6 @@ from rich.progress import (
     TaskID,
     TextColumn,
 )
-from rich.text import Text
 from tqdm.auto import tqdm
 
 SECONDS_TO_PAUSE_AFTER_RATE_LIMIT_ERROR = 5
@@ -24,6 +23,7 @@ class StatusTracker:
     max_requests_per_minute: int
     max_tokens_per_minute: int
     max_concurrent_requests: int
+    client_name: str = "LLMClient"
     num_tasks_started: int = 0
     num_tasks_in_progress: int = 0
     num_tasks_succeeded: int = 0
@@ -187,14 +187,16 @@ class StatusTracker:
 
     def _init_rich_display(self, total: int):
         """Initialize Rich display components."""
-        self._rich_console = Console()
+        self._rich_console = Console(highlight=False)
+        # Escape square brackets so Rich doesn't interpret them as markup
+        description = f"[bold blue]\\[{self.client_name}][/bold blue] Processing..."
         self._rich_progress = Progress(
             SpinnerColumn(),
-            TextColumn("Processing requests..."),
+            TextColumn("[progress.description]{task.description}"),
             BarColumn(),
             MofNCompleteColumn(),
         )
-        self._rich_task_id = self._rich_progress.add_task("requests", total=total)
+        self._rich_task_id = self._rich_progress.add_task(description, total=total)
         self._rich_stop_event = asyncio.Event()
         self._rich_display_task = asyncio.create_task(self._rich_display_updater())
 
@@ -217,12 +219,17 @@ class StatusTracker:
                     total=self.progress_bar_total,
                 )
 
-                tokens_info = f"TPM Capacity: {self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k"
-                reqs_info = f"RPM Capacity: {int(self.available_request_capacity)}/{self.max_requests_per_minute}"
-                in_progress = f"In Progress: {int(self.num_tasks_in_progress)}"
-                capacity_text = Text(f"{in_progress} {tokens_info} • {reqs_info}")
+                tokens_info = f"{self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k TPM"
+                reqs_info = f"{int(self.available_request_capacity)}/{self.max_requests_per_minute} RPM"
+                in_progress = (
+                    f" [gold3]In Progress:[/gold3] {int(self.num_tasks_in_progress)} "
+                    + ("requests" if self.num_tasks_in_progress != 1 else "request")
+                )
+                capacity_text = (
+                    f" [gold3]Capacity:[/gold3] {tokens_info} • {reqs_info}"
+                )
 
-                display = Group(self._rich_progress, capacity_text)
+                display = Group(self._rich_progress, in_progress, capacity_text)
                 live.update(display)
 
                 await asyncio.sleep(0.1)
@@ -252,7 +259,7 @@ class StatusTracker:
             return
         while not self._manual_stop_event.is_set():
             print(
-                f"Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
+                f"[{self.client_name}] Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
             )
             await asyncio.sleep(self.progress_print_interval)
 
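
For reference, a standalone sketch of the same Rich setup showing why the brackets are escaped: Rich treats [ ... ] as console markup, so the literal [client_name] prefix needs the backslash (client_name here is a stand-in value):

from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TextColumn,
)

client_name = "gpt-4.1-mini"
description = f"[bold blue]\\[{client_name}][/bold blue] Processing..."  # "\\[" renders as a literal "["

progress = Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    MofNCompleteColumn(),
)
task_id = progress.add_task(description, total=100)
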
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import Optional
 
 
 @dataclass
@@ -13,8 +12,8 @@ class Usage:
 
     input_tokens: int = 0
     output_tokens: int = 0
-    cache_read_tokens: Optional[int] = None  # Tokens read from cache (Anthropic)
-    cache_write_tokens: Optional[int] = None  # Tokens written to cache (Anthropic)
+    cache_read_tokens: int = 0
+    cache_write_tokens: int = 0
 
     @property
     def total_input_tokens(self) -> int:
@@ -47,18 +46,29 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("input_tokens", 0),
             output_tokens=usage_data.get("output_tokens", 0),
-            cache_read_tokens=usage_data.get("cache_read_input_tokens"),
-            cache_write_tokens=usage_data.get("cache_creation_input_tokens"),
+            cache_read_tokens=usage_data.get("cache_read_input_tokens", 0),
+            cache_write_tokens=usage_data.get("cache_creation_input_tokens", 0),
         )
 
     @classmethod
     def from_openai_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from OpenAI API response usage data."""
+        """Create Usage from OpenAI API response usage data.
+
+        OpenAI supports prompt caching - cached tokens appear in prompt_tokens_details.cached_tokens.
+        Caching is automatic for prompts over 1024 tokens.
+        """
+        prompt_tokens_details = usage_data.get("prompt_tokens_details", {})
+        cached_tokens = (
+            prompt_tokens_details.get("cached_tokens", 0)
+            if prompt_tokens_details
+            else 0
+        )
+
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=None,  # OpenAI doesn't support caching yet
-            cache_write_tokens=None,
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # OpenAI doesn't charge separately for cache writes
         )
 
     @classmethod
@@ -67,18 +77,23 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=None,  # Mistral doesn't support caching
-            cache_write_tokens=None,
+            cache_read_tokens=0,  # Mistral doesn't support caching
+            cache_write_tokens=0,
         )
 
     @classmethod
     def from_gemini_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from Gemini API response usage data."""
+        """Create Usage from Gemini API response usage data.
+
+        Gemini supports context caching - cached tokens appear in cachedContentTokenCount.
+        """
+        cached_tokens = usage_data.get("cachedContentTokenCount", 0)
+
         return cls(
             input_tokens=usage_data.get("promptTokenCount", 0),
             output_tokens=usage_data.get("candidatesTokenCount", 0),
-            cache_read_tokens=None,  # Gemini doesn't support caching yet
-            cache_write_tokens=None,
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # Gemini doesn't charge separately for cache writes
        )
 
     def to_dict(self) -> dict:
@@ -100,8 +115,8 @@ class Usage:
         return cls(
             input_tokens=data.get("input_tokens", 0),
             output_tokens=data.get("output_tokens", 0),
-            cache_read_tokens=data.get("cache_read_tokens"),
-            cache_write_tokens=data.get("cache_write_tokens"),
+            cache_read_tokens=data.get("cache_read_tokens", 0),
+            cache_write_tokens=data.get("cache_write_tokens", 0),
         )
 
     def __add__(self, other: "Usage") -> "Usage":
@@ -111,14 +126,8 @@ class Usage:
             output_tokens=self.output_tokens + other.output_tokens,
             cache_read_tokens=(
                 (self.cache_read_tokens or 0) + (other.cache_read_tokens or 0)
-                if self.cache_read_tokens is not None
-                or other.cache_read_tokens is not None
-                else None
             ),
             cache_write_tokens=(
                 (self.cache_write_tokens or 0) + (other.cache_write_tokens or 0)
-                if self.cache_write_tokens is not None
-                or other.cache_write_tokens is not None
-                else None
             ),
         )
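
A quick check of the new accounting on a hand-written OpenAI-style usage payload (numbers invented; field names match the parsing above):

from lm_deluge.usage import Usage

usage = Usage.from_openai_usage(
    {
        "prompt_tokens": 10_000,
        "completion_tokens": 500,
        "prompt_tokens_details": {"cached_tokens": 8_000},
    }
)
assert usage.input_tokens == 10_000
assert usage.cache_read_tokens == 8_000
assert usage.output_tokens == 500

# __add__ now always sums the cache counters instead of propagating None
combined = usage + Usage(input_tokens=1_000, output_tokens=50)
assert combined.cache_read_tokens == 8_000
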
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.53
+Version: 0.0.55
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10