lm-deluge 0.0.54.tar.gz → 0.0.56.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lm-deluge has been flagged as potentially problematic.

Files changed (80)
  1. {lm_deluge-0.0.54/src/lm_deluge.egg-info → lm_deluge-0.0.56}/PKG-INFO +1 -1
  2. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/pyproject.toml +1 -1
  3. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/base.py +6 -0
  4. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/response.py +28 -1
  5. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/client.py +16 -0
  6. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/__init__.py +4 -1
  7. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/anthropic.py +20 -2
  8. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/google.py +20 -12
  9. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/openai.py +18 -8
  10. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/tracker.py +78 -10
  11. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/usage.py +30 -21
  12. {lm_deluge-0.0.54 → lm_deluge-0.0.56/src/lm_deluge.egg-info}/PKG-INFO +1 -1
  13. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/LICENSE +0 -0
  14. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/README.md +0 -0
  15. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/setup.cfg +0 -0
  16. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/__init__.py +0 -0
  17. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/agent.py +0 -0
  18. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/__init__.py +0 -0
  19. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/anthropic.py +0 -0
  20. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/bedrock.py +0 -0
  21. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/common.py +0 -0
  22. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  23. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  24. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  25. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  26. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  27. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/gemini.py +0 -0
  28. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/mistral.py +0 -0
  29. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/api_requests/openai.py +0 -0
  30. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/batches.py +0 -0
  31. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
  32. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
  33. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
  34. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
  35. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/base.py +0 -0
  36. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/built_in_tools/openai.py +0 -0
  37. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/cache.py +0 -0
  38. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/cli.py +0 -0
  39. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/config.py +0 -0
  40. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/embed.py +0 -0
  41. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/errors.py +0 -0
  42. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/file.py +0 -0
  43. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/gemini_limits.py +0 -0
  44. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/image.py +0 -0
  45. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/__init__.py +0 -0
  46. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/classify.py +0 -0
  47. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/extract.py +0 -0
  48. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/locate.py +0 -0
  49. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/ocr.py +0 -0
  50. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/score.py +0 -0
  51. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/llm_tools/translate.py +0 -0
  52. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/bedrock.py +0 -0
  53. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/cerebras.py +0 -0
  54. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/cohere.py +0 -0
  55. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/deepseek.py +0 -0
  56. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/fireworks.py +0 -0
  57. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/grok.py +0 -0
  58. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/groq.py +0 -0
  59. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/meta.py +0 -0
  60. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/mistral.py +0 -0
  61. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/openrouter.py +0 -0
  62. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/models/together.py +0 -0
  63. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/presets/cerebras.py +0 -0
  64. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/presets/meta.py +0 -0
  65. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/prompt.py +0 -0
  66. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/request_context.py +0 -0
  67. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/rerank.py +0 -0
  68. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/tool.py +0 -0
  69. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/harmony.py +0 -0
  70. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/json.py +0 -0
  71. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/logprobs.py +0 -0
  72. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/spatial.py +0 -0
  73. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/validation.py +0 -0
  74. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge/util/xml.py +0 -0
  75. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
  76. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  77. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge.egg-info/requires.txt +0 -0
  78. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/src/lm_deluge.egg-info/top_level.txt +0 -0
  79. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/tests/test_builtin_tools.py +0 -0
  80. {lm_deluge-0.0.54 → lm_deluge-0.0.56}/tests/test_native_mcp_server.py +0 -0
{lm_deluge-0.0.54/src/lm_deluge.egg-info → lm_deluge-0.0.56}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.54
+ Version: 0.0.56
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10

pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]

  [project]
  name = "lm_deluge"
- version = "0.0.54"
+ version = "0.0.56"
  authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
  description = "Python utility for using LLM API models."
  readme = "README.md"

src/lm_deluge/api_requests/base.py
@@ -52,6 +52,9 @@ class APIRequestBase(ABC):
  self, base_headers: dict[str, str], exclude_patterns: list[str] | None = None
  ) -> dict[str, str]:
  """Merge extra_headers with base headers, giving priority to extra_headers."""
+ # Filter out None values from base headers (e.g., missing API keys)
+ base_headers = {k: v for k, v in base_headers.items() if v is not None}
+
  if not self.context.extra_headers:
  return base_headers

@@ -69,6 +72,9 @@ class APIRequestBase(ABC):
  # Start with base headers, then overlay filtered extra headers (extra takes precedence)
  merged = dict(base_headers)
  merged.update(filtered_extra)
+
+ # Filter out None values from final merged headers
+ merged = {k: v for k, v in merged.items() if v is not None}
  return merged

  def handle_success(self, data):
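
The net effect of these two hunks, as a standalone sketch (merge_headers and the sample headers below are illustrative names, not the package's internal API):

    def merge_headers(base: dict, extra: dict | None) -> dict:
        # Drop None values first (e.g. an auth header built from a missing API key env var)
        merged = {k: v for k, v in base.items() if v is not None}
        if extra:
            merged.update(extra)  # extra headers take precedence over base headers
        # Drop None values again in case any arrived via the extra headers
        return {k: v for k, v in merged.items() if v is not None}

    print(merge_headers({"x-api-key": None, "content-type": "application/json"}, {"x-trace-id": "abc"}))
    # {'content-type': 'application/json', 'x-trace-id': 'abc'}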
src/lm_deluge/api_requests/response.py
@@ -84,10 +84,37 @@ class APIResponse:
  and api_model.input_cost is not None
  and api_model.output_cost is not None
  ):
+ # Calculate input cost, accounting for cached vs non-cached tokens
+ # Different providers report tokens differently:
+ # - Anthropic/Bedrock: input_tokens is ONLY non-cached, cache_read_tokens is separate
+ # - OpenAI/Gemini: input_tokens INCLUDES cached, cache_read_tokens is a subset
+ cache_read_tokens = self.usage.cache_read_tokens or 0
+
+ if api_model.api_spec in ("anthropic", "bedrock"):
+ # For Anthropic: input_tokens already excludes cache, so use directly
+ non_cached_input_tokens = self.usage.input_tokens
+ else:
+ # For OpenAI/Gemini: input_tokens includes cache, so subtract it
+ non_cached_input_tokens = self.usage.input_tokens - cache_read_tokens
+
  self.cost = (
- self.usage.input_tokens * api_model.input_cost / 1e6
+ non_cached_input_tokens * api_model.input_cost / 1e6
  + self.usage.output_tokens * api_model.output_cost / 1e6
  )
+
+ # Add cost for cache read tokens (at reduced rate)
+ if cache_read_tokens > 0 and api_model.cached_input_cost is not None:
+ self.cost += cache_read_tokens * api_model.cached_input_cost / 1e6
+
+ # Add cost for cache write tokens (only for Anthropic)
+ if (
+ self.usage.cache_write_tokens
+ and self.usage.cache_write_tokens > 0
+ and api_model.cache_write_cost is not None
+ ):
+ self.cost += (
+ self.usage.cache_write_tokens * api_model.cache_write_cost / 1e6
+ )
  elif self.content is not None and self.completion is not None:
  pass
  # print(
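
A worked example of the new cost formula, using the Claude Sonnet rates that appear in the anthropic.py hunks below ($3/M input, $0.30/M cache reads, $3.75/M cache writes, $15/M output); the token counts are made up. For Anthropic/Bedrock, input_tokens already excludes the cached portion; for OpenAI/Gemini the cached tokens would first be subtracted from input_tokens:

    input_cost, cached_input_cost, cache_write_cost, output_cost = 3.0, 0.30, 3.75, 15.0

    # Anthropic-style usage: input_tokens excludes cache reads and writes
    input_tokens, cache_read, cache_write, output_tokens = 1_000, 10_000, 2_000, 500

    cost = (
        input_tokens * input_cost / 1e6          # $0.0030
        + output_tokens * output_cost / 1e6      # $0.0075
        + cache_read * cached_input_cost / 1e6   # $0.0030
        + cache_write * cache_write_cost / 1e6   # $0.0075
    )
    print(f"${cost:.4f}")  # $0.0210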
src/lm_deluge/client.py
@@ -30,6 +30,7 @@ class _LLMClient(BaseModel):
  """

  model_names: str | list[str] = ["gpt-4.1-mini"]
+ name: str | None = None
  max_requests_per_minute: int = 1_000
  max_tokens_per_minute: int = 100_000
  max_concurrent_requests: int = 225
@@ -69,6 +70,7 @@ class _LLMClient(BaseModel):
  max_requests_per_minute=self.max_requests_per_minute,
  max_tokens_per_minute=self.max_tokens_per_minute,
  max_concurrent_requests=self.max_concurrent_requests,
+ client_name=self.name or "LLMClient",
  progress_style=self.progress,
  use_progress_bar=show_progress,
  )
@@ -169,6 +171,13 @@ class _LLMClient(BaseModel):
  # normalize weights
  self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+ # Auto-generate name if not provided
+ if self.name is None:
+ if len(self.model_names) == 1:
+ self.name = self.model_names[0]
+ else:
+ self.name = "LLMClient"
+
  # Validate logprobs settings across all sampling params
  if self.logprobs or any(sp.logprobs for sp in self.sampling_params):
  print("Logprobs enabled.")
@@ -286,6 +295,7 @@ class _LLMClient(BaseModel):
  # Handle successful response
  if not response.is_error:
  context.status_tracker.task_succeeded(context.task_id)
+ context.status_tracker.track_usage(response)
  # Cache successful responses immediately
  if self.cache and response.completion:
  # print(f"DEBUG: Caching successful response")
@@ -324,6 +334,8 @@ class _LLMClient(BaseModel):

  # No retries left or no retry queue - final failure
  context.status_tracker.task_failed(context.task_id)
+ # Track usage even for failed requests if they made an API call
+ context.status_tracker.track_usage(response)
  context.maybe_callback(response, context.status_tracker)

  # Print final error message
@@ -725,6 +737,7 @@ class _LLMClient(BaseModel):
  def LLMClient(
  model_names: str,
  *,
+ name: str | None = None,
  max_requests_per_minute: int = 1_000,
  max_tokens_per_minute: int = 100_000,
  max_concurrent_requests: int = 225,
@@ -751,6 +764,7 @@ def LLMClient(
  def LLMClient(
  model_names: list[str],
  *,
+ name: str | None = None,
  max_requests_per_minute: int = 1_000,
  max_tokens_per_minute: int = 100_000,
  max_concurrent_requests: int = 225,
@@ -776,6 +790,7 @@ def LLMClient(
  def LLMClient(
  model_names: str | list[str] = "gpt-4.1-mini",
  *,
+ name: str | None = None,
  max_requests_per_minute: int = 1_000,
  max_tokens_per_minute: int = 100_000,
  max_concurrent_requests: int = 225,
@@ -813,6 +828,7 @@ def LLMClient(
  # Simply pass everything to the Pydantic constructor
  return _LLMClient(
  model_names=model_names,
+ name=name,
  max_requests_per_minute=max_requests_per_minute,
  max_tokens_per_minute=max_tokens_per_minute,
  max_concurrent_requests=max_concurrent_requests,
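
A short usage sketch of the new name parameter (assuming LLMClient is importable from the package root; the client name is illustrative):

    from lm_deluge import LLMClient  # assumed import path

    # With a single model and no explicit name, the client is named after the model.
    # Here the name is overridden, so progress output reads e.g.
    # "[summarizer] Completed 10/100 requests".
    client = LLMClient("gpt-4.1-mini", name="summarizer")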
src/lm_deluge/models/__init__.py
@@ -29,7 +29,8 @@ class APIModel:
  api_base: str
  api_key_env_var: str
  api_spec: str
- cached_input_cost: float | None = 0
+ cached_input_cost: float | None = 0 # $ per million cached/read input tokens
+ cache_write_cost: float | None = 0 # $ per million cache write tokens
  input_cost: float | None = 0 # $ per million input tokens
  output_cost: float | None = 0 # $ per million output tokens
  supports_json: bool = False
@@ -89,6 +90,7 @@ def register_model(
  api_spec: str = "openai",
  input_cost: float | None = 0, # $ per million input tokens
  cached_input_cost: float | None = 0,
+ cache_write_cost: float | None = 0, # $ per million cache write tokens
  output_cost: float | None = 0, # $ per million output tokens
  supports_json: bool = False,
  supports_logprobs: bool = False,
@@ -106,6 +108,7 @@ def register_model(
  api_key_env_var=api_key_env_var,
  api_spec=api_spec,
  cached_input_cost=cached_input_cost,
+ cache_write_cost=cache_write_cost,
  input_cost=input_cost,
  output_cost=output_cost,
  supports_json=supports_json,
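
A hedged sketch of registering a custom model with the new cache_write_cost field (the model id, endpoint, and the name/api_base parameter names are assumptions; only the api_spec, api_key_env_var, and cost parameters are visible in the hunks above):

    from lm_deluge.models import register_model  # assumed import path

    register_model(
        name="my-claude-variant",               # hypothetical model id
        api_base="https://api.example.com/v1",  # hypothetical endpoint
        api_key_env_var="MY_PROVIDER_API_KEY",
        api_spec="anthropic",
        input_cost=3.0,          # $ per million non-cached input tokens
        cached_input_cost=0.30,  # $ per million cache-read tokens
        cache_write_cost=3.75,   # $ per million cache-write tokens (new in this release)
        output_cost=15.0,        # $ per million output tokens
    )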
src/lm_deluge/models/anthropic.py
@@ -18,6 +18,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 3.0,
+ "cached_input_cost": 0.30,
+ "cache_write_cost": 3.75,
  "output_cost": 15.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -30,6 +32,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 15.0,
+ "cached_input_cost": 1.50,
+ "cache_write_cost": 18.75,
  "output_cost": 75.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -43,6 +47,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 15.0,
+ "cached_input_cost": 1.50,
+ "cache_write_cost": 18.75,
  "output_cost": 75.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -56,6 +62,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 3.0,
+ "cached_input_cost": 0.30,
+ "cache_write_cost": 3.75,
  "output_cost": 15.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -68,6 +76,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 3.0,
+ "cached_input_cost": 0.30,
+ "cache_write_cost": 3.75,
  "output_cost": 15.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -81,6 +91,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 3.0,
+ "cached_input_cost": 0.30,
+ "cache_write_cost": 3.75,
  "output_cost": 15.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -93,6 +105,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 3.0,
+ "cached_input_cost": 0.30,
+ "cache_write_cost": 3.75,
  "output_cost": 15.0,
  "requests_per_minute": 4_000,
  "tokens_per_minute": 400_000,
@@ -116,8 +130,10 @@ ANTHROPIC_MODELS = {
  "api_key_env_var": "ANTHROPIC_API_KEY",
  "supports_json": False,
  "api_spec": "anthropic",
- "input_cost": 1.00,
- "output_cost": 5.00,
+ "input_cost": 0.8,
+ "cached_input_cost": 0.08,
+ "cache_write_cost": 1.00,
+ "output_cost": 4.00,
  "requests_per_minute": 20_000,
  "tokens_per_minute": 4_000_000, # supposed to be this but they fucked up
  },
@@ -129,6 +145,8 @@ ANTHROPIC_MODELS = {
  "supports_json": False,
  "api_spec": "anthropic",
  "input_cost": 0.25,
+ "cache_write_cost": 0.30,
+ "cached_input_cost": 0.03,
  "output_cost": 1.25,
  "requests_per_minute": 10_000,
  "tokens_per_minute": 4_000_000, # supposed to be this but they fucked up
src/lm_deluge/models/google.py
@@ -18,6 +18,7 @@ GOOGLE_MODELS = {
  "supports_logprobs": False,
  "api_spec": "openai",
  "input_cost": 0.1,
+ "cached_input_cost": 0.025,
  "output_cost": 0.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -31,8 +32,8 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "openai",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 0.075,
+ "output_cost": 0.3,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": False,
@@ -45,8 +46,9 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "openai",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 1.25,
+ "cached_input_cost": 0.31,
+ "output_cost": 10.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -59,8 +61,9 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "openai",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 0.3,
+ "cached_input_cost": 0.075,
+ "output_cost": 2.5,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -74,6 +77,7 @@ GOOGLE_MODELS = {
  "supports_logprobs": False,
  "api_spec": "openai",
  "input_cost": 0.1,
+ "cached_input_cost": 0.025,
  "output_cost": 0.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -89,6 +93,7 @@ GOOGLE_MODELS = {
  "supports_logprobs": False,
  "api_spec": "gemini",
  "input_cost": 0.1,
+ "cached_input_cost": 0.025,
  "output_cost": 0.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -102,8 +107,8 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "gemini",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 0.075,
+ "output_cost": 0.3,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": False,
@@ -116,8 +121,9 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "gemini",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 1.25,
+ "cached_input_cost": 0.31,
+ "output_cost": 10.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -130,8 +136,9 @@ GOOGLE_MODELS = {
  "supports_json": True,
  "supports_logprobs": False,
  "api_spec": "gemini",
- "input_cost": 0.1,
- "output_cost": 0.4,
+ "input_cost": 0.3,
+ "cached_input_cost": 0.075,
+ "output_cost": 2.5,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -145,6 +152,7 @@ GOOGLE_MODELS = {
  "supports_logprobs": False,
  "api_spec": "gemini",
  "input_cost": 0.1,
+ "cached_input_cost": 0.025,
  "output_cost": 0.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
src/lm_deluge/models/openai.py
@@ -75,8 +75,8 @@ OPENAI_MODELS = {
  "supports_logprobs": False,
  "supports_responses": True,
  "api_spec": "openai",
- "input_cost": 2.0,
- "output_cost": 8.0,
+ "input_cost": 3.0,
+ "output_cost": 12.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": False,
@@ -90,8 +90,9 @@ OPENAI_MODELS = {
  "supports_logprobs": True,
  "supports_responses": True,
  "api_spec": "openai",
- "input_cost": 10.0,
- "output_cost": 40.0,
+ "input_cost": 2.0,
+ "cached_input_cost": 0.50,
+ "output_cost": 8.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -106,6 +107,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 1.1,
+ "cached_input_cost": 0.275,
  "output_cost": 4.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -121,6 +123,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 2.0,
+ "cached_input_cost": 0.50,
  "output_cost": 8.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -136,6 +139,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 0.4,
+ "cached_input_cost": 0.10,
  "output_cost": 1.6,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -151,6 +155,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 0.1,
+ "cached_input_cost": 0.025,
  "output_cost": 0.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -181,6 +186,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 1.1,
+ "cached_input_cost": 0.55,
  "output_cost": 4.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -196,6 +202,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 15.0,
+ "cached_input_cost": 7.50,
  "output_cost": 60.0,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
@@ -225,8 +232,9 @@ OPENAI_MODELS = {
  "supports_logprobs": True,
  "supports_responses": True,
  "api_spec": "openai",
- "input_cost": 3.0,
- "output_cost": 15.0,
+ "input_cost": 1.1,
+ "cached_input_cost": 0.55,
+ "output_cost": 4.4,
  "requests_per_minute": 20,
  "tokens_per_minute": 100_000,
  "reasoning_model": True,
@@ -240,8 +248,9 @@ OPENAI_MODELS = {
  "supports_logprobs": True,
  "supports_responses": True,
  "api_spec": "openai",
- "input_cost": 5.0,
- "output_cost": 15.0,
+ "input_cost": 2.50,
+ "cached_input_cost": 1.25,
+ "output_cost": 10.0,
  "requests_per_minute": 10_000,
  "tokens_per_minute": 30_000_000,
  },
@@ -255,6 +264,7 @@ OPENAI_MODELS = {
  "supports_responses": True,
  "api_spec": "openai",
  "input_cost": 0.15,
+ "cached_input_cost": 0.075,
  "output_cost": 0.6,
  "requests_per_minute": 60_000,
  "tokens_per_minute": 250_000_000,
src/lm_deluge/tracker.py
@@ -13,7 +13,6 @@ from rich.progress import (
  TaskID,
  TextColumn,
  )
- from rich.text import Text
  from tqdm.auto import tqdm

  SECONDS_TO_PAUSE_AFTER_RATE_LIMIT_ERROR = 5
@@ -24,6 +23,7 @@ class StatusTracker:
  max_requests_per_minute: int
  max_tokens_per_minute: int
  max_concurrent_requests: int
+ client_name: str = "LLMClient"
  num_tasks_started: int = 0
  num_tasks_in_progress: int = 0
  num_tasks_succeeded: int = 0
@@ -33,6 +33,13 @@ class StatusTracker:
  total_requests: int = 0
  retry_queue: asyncio.Queue = field(default_factory=asyncio.Queue)

+ # Cumulative usage tracking
+ total_cost: float = 0.0
+ total_input_tokens: int = 0 # non-cached input tokens
+ total_cache_read_tokens: int = 0
+ total_cache_write_tokens: int = 0
+ total_output_tokens: int = 0
+
  # Progress bar configuration
  use_progress_bar: bool = True
  progress_bar_total: int | None = None
@@ -131,6 +138,25 @@ class StatusTracker:
  self.num_tasks_in_progress -= 1
  self.num_tasks_failed += 1

+ def track_usage(self, response):
+ """Accumulate usage statistics from a completed request.
+
+ Args:
+ response: APIResponse object containing usage and cost information
+ """
+ if response.cost:
+ self.total_cost += response.cost
+
+ if response.usage:
+ self.total_output_tokens += response.usage.output_tokens
+ self.total_input_tokens += response.usage.input_tokens
+
+ if response.usage.cache_read_tokens:
+ self.total_cache_read_tokens += response.usage.cache_read_tokens
+
+ if response.usage.cache_write_tokens:
+ self.total_cache_write_tokens += response.usage.cache_write_tokens
+
  def log_final_status(self):
  # Close progress bar before printing final status
  self.close_progress_bar()
@@ -144,6 +170,22 @@ class StatusTracker:
  f"{self.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate."
  )

+ # Display cumulative usage stats if available
+ if self.total_cost > 0 or self.total_input_tokens > 0 or self.total_output_tokens > 0:
+ usage_parts = []
+ if self.total_cost > 0:
+ usage_parts.append(f"Cost: ${self.total_cost:.4f}")
+ if self.total_input_tokens > 0 or self.total_output_tokens > 0:
+ usage_parts.append(
+ f"Tokens: {self.total_input_tokens:,} in / {self.total_output_tokens:,} out"
+ )
+ if self.total_cache_read_tokens > 0:
+ usage_parts.append(f"Cache: {self.total_cache_read_tokens:,} read")
+ if self.total_cache_write_tokens > 0:
+ usage_parts.append(f"{self.total_cache_write_tokens:,} write")
+
+ print(" | ".join(usage_parts))
+
  @property
  def pbar(self) -> tqdm | None:
  """Backward compatibility property to access progress bar."""
@@ -187,14 +229,16 @@ class StatusTracker:

  def _init_rich_display(self, total: int):
  """Initialize Rich display components."""
- self._rich_console = Console()
+ self._rich_console = Console(highlight=False)
+ # Escape square brackets so Rich doesn't interpret them as markup
+ description = f"[bold blue]\\[{self.client_name}][/bold blue] Processing..."
  self._rich_progress = Progress(
  SpinnerColumn(),
- TextColumn("Processing requests..."),
+ TextColumn("[progress.description]{task.description}"),
  BarColumn(),
  MofNCompleteColumn(),
  )
- self._rich_task_id = self._rich_progress.add_task("requests", total=total)
+ self._rich_task_id = self._rich_progress.add_task(description, total=total)
  self._rich_stop_event = asyncio.Event()
  self._rich_display_task = asyncio.create_task(self._rich_display_updater())

@@ -217,12 +261,36 @@ class StatusTracker:
  total=self.progress_bar_total,
  )

- tokens_info = f"TPM Capacity: {self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k"
- reqs_info = f"RPM Capacity: {int(self.available_request_capacity)}/{self.max_requests_per_minute}"
- in_progress = f"In Progress: {int(self.num_tasks_in_progress)}"
- capacity_text = Text(f"{in_progress} {tokens_info} • {reqs_info}")
+ tokens_info = f"{self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k TPM"
+ reqs_info = f"{int(self.available_request_capacity)}/{self.max_requests_per_minute} RPM"
+ in_progress = (
+ f" [gold3]In Progress:[/gold3] {int(self.num_tasks_in_progress)} "
+ + ("requests" if self.num_tasks_in_progress != 1 else "request")
+ )
+ capacity_text = (
+ f" [gold3]Capacity:[/gold3] {tokens_info} • {reqs_info}"
+ )

- display = Group(self._rich_progress, capacity_text)
+ # Format usage stats
+ usage_parts = []
+ if self.total_cost > 0:
+ usage_parts.append(f"${self.total_cost:.4f}")
+ if self.total_input_tokens > 0 or self.total_output_tokens > 0:
+ input_k = self.total_input_tokens / 1000
+ output_k = self.total_output_tokens / 1000
+ usage_parts.append(f"{input_k:.1f}k in • {output_k:.1f}k out")
+ if self.total_cache_read_tokens > 0:
+ cache_k = self.total_cache_read_tokens / 1000
+ usage_parts.append(f"{cache_k:.1f}k cached")
+
+ usage_text = ""
+ if usage_parts:
+ usage_text = f" [gold3]Usage:[/gold3] {' • '.join(usage_parts)}"
+
+ if usage_text:
+ display = Group(self._rich_progress, in_progress, capacity_text, usage_text)
+ else:
+ display = Group(self._rich_progress, in_progress, capacity_text)
  live.update(display)

  await asyncio.sleep(0.1)
@@ -252,7 +320,7 @@ class StatusTracker:
  return
  while not self._manual_stop_event.is_set():
  print(
- f"Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
+ f"[{self.client_name}] Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
  )
  await asyncio.sleep(self.progress_print_interval)

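
With illustrative totals, the summary line that the new log_final_status code prints is assembled like this:

    total_cost, total_input, total_output = 0.021, 1_000, 500
    total_cache_read, total_cache_write = 10_000, 2_000

    parts = [
        f"Cost: ${total_cost:.4f}",
        f"Tokens: {total_input:,} in / {total_output:,} out",
        f"Cache: {total_cache_read:,} read",
        f"{total_cache_write:,} write",
    ]
    print(" | ".join(parts))
    # Cost: $0.0210 | Tokens: 1,000 in / 500 out | Cache: 10,000 read | 2,000 write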
src/lm_deluge/usage.py
@@ -1,5 +1,4 @@
  from dataclasses import dataclass
- from typing import Optional


  @dataclass
@@ -13,8 +12,8 @@ class Usage:

  input_tokens: int = 0
  output_tokens: int = 0
- cache_read_tokens: Optional[int] = None # Tokens read from cache (Anthropic)
- cache_write_tokens: Optional[int] = None # Tokens written to cache (Anthropic)
+ cache_read_tokens: int = 0
+ cache_write_tokens: int = 0

  @property
  def total_input_tokens(self) -> int:
@@ -47,18 +46,29 @@ class Usage:
  return cls(
  input_tokens=usage_data.get("input_tokens", 0),
  output_tokens=usage_data.get("output_tokens", 0),
- cache_read_tokens=usage_data.get("cache_read_input_tokens"),
- cache_write_tokens=usage_data.get("cache_creation_input_tokens"),
+ cache_read_tokens=usage_data.get("cache_read_input_tokens", 0),
+ cache_write_tokens=usage_data.get("cache_creation_input_tokens", 0),
  )

  @classmethod
  def from_openai_usage(cls, usage_data: dict) -> "Usage":
- """Create Usage from OpenAI API response usage data."""
+ """Create Usage from OpenAI API response usage data.
+
+ OpenAI supports prompt caching - cached tokens appear in prompt_tokens_details.cached_tokens.
+ Caching is automatic for prompts over 1024 tokens.
+ """
+ prompt_tokens_details = usage_data.get("prompt_tokens_details", {})
+ cached_tokens = (
+ prompt_tokens_details.get("cached_tokens", 0)
+ if prompt_tokens_details
+ else 0
+ )
+
  return cls(
  input_tokens=usage_data.get("prompt_tokens", 0),
  output_tokens=usage_data.get("completion_tokens", 0),
- cache_read_tokens=None, # OpenAI doesn't support caching yet
- cache_write_tokens=None,
+ cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+ cache_write_tokens=0, # OpenAI doesn't charge separately for cache writes
  )

  @classmethod
@@ -67,18 +77,23 @@ class Usage:
  return cls(
  input_tokens=usage_data.get("prompt_tokens", 0),
  output_tokens=usage_data.get("completion_tokens", 0),
- cache_read_tokens=None, # Mistral doesn't support caching
- cache_write_tokens=None,
+ cache_read_tokens=0, # Mistral doesn't support caching
+ cache_write_tokens=0,
  )

  @classmethod
  def from_gemini_usage(cls, usage_data: dict) -> "Usage":
- """Create Usage from Gemini API response usage data."""
+ """Create Usage from Gemini API response usage data.
+
+ Gemini supports context caching - cached tokens appear in cachedContentTokenCount.
+ """
+ cached_tokens = usage_data.get("cachedContentTokenCount", 0)
+
  return cls(
  input_tokens=usage_data.get("promptTokenCount", 0),
  output_tokens=usage_data.get("candidatesTokenCount", 0),
- cache_read_tokens=None, # Gemini doesn't support caching yet
- cache_write_tokens=None,
+ cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+ cache_write_tokens=0, # Gemini doesn't charge separately for cache writes
  )

  def to_dict(self) -> dict:
@@ -100,8 +115,8 @@ class Usage:
  return cls(
  input_tokens=data.get("input_tokens", 0),
  output_tokens=data.get("output_tokens", 0),
- cache_read_tokens=data.get("cache_read_tokens"),
- cache_write_tokens=data.get("cache_write_tokens"),
+ cache_read_tokens=data.get("cache_read_tokens", 0),
+ cache_write_tokens=data.get("cache_write_tokens", 0),
  )

  def __add__(self, other: "Usage") -> "Usage":
@@ -111,14 +126,8 @@ class Usage:
  output_tokens=self.output_tokens + other.output_tokens,
  cache_read_tokens=(
  (self.cache_read_tokens or 0) + (other.cache_read_tokens or 0)
- if self.cache_read_tokens is not None
- or other.cache_read_tokens is not None
- else None
  ),
  cache_write_tokens=(
  (self.cache_write_tokens or 0) + (other.cache_write_tokens or 0)
- if self.cache_write_tokens is not None
- or other.cache_write_tokens is not None
- else None
  ),
  )
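
A quick sketch of how the reworked constructors pick up cached tokens from each provider's usage payload (abbreviated example payloads, not captured responses; the import path is assumed):

    from lm_deluge.usage import Usage  # assumed import path

    # OpenAI: cached tokens are a subset of prompt_tokens, under prompt_tokens_details
    u1 = Usage.from_openai_usage(
        {"prompt_tokens": 11_000, "completion_tokens": 500,
         "prompt_tokens_details": {"cached_tokens": 10_000}}
    )

    # Gemini: cached tokens are reported as cachedContentTokenCount
    u2 = Usage.from_gemini_usage(
        {"promptTokenCount": 2_000, "candidatesTokenCount": 300,
         "cachedContentTokenCount": 1_500}
    )

    total = u1 + u2  # __add__ now always produces plain ints, never None
    print(total.input_tokens, total.output_tokens, total.cache_read_tokens)
    # 13000 800 11500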
{lm_deluge-0.0.54 → lm_deluge-0.0.56/src/lm_deluge.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.54
+ Version: 0.0.56
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10
The remaining files (13-80 in the list above) are unchanged between 0.0.54 and 0.0.56.