llm-cost-guard 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_cost_guard/__init__.py +39 -0
- llm_cost_guard/backends/__init__.py +52 -0
- llm_cost_guard/backends/base.py +121 -0
- llm_cost_guard/backends/memory.py +265 -0
- llm_cost_guard/backends/sqlite.py +425 -0
- llm_cost_guard/budget.py +306 -0
- llm_cost_guard/cli.py +464 -0
- llm_cost_guard/clients/__init__.py +11 -0
- llm_cost_guard/clients/anthropic.py +231 -0
- llm_cost_guard/clients/openai.py +262 -0
- llm_cost_guard/exceptions.py +71 -0
- llm_cost_guard/integrations/__init__.py +12 -0
- llm_cost_guard/integrations/cache.py +189 -0
- llm_cost_guard/integrations/langchain.py +257 -0
- llm_cost_guard/models.py +123 -0
- llm_cost_guard/pricing/__init__.py +7 -0
- llm_cost_guard/pricing/anthropic.yaml +88 -0
- llm_cost_guard/pricing/bedrock.yaml +215 -0
- llm_cost_guard/pricing/loader.py +221 -0
- llm_cost_guard/pricing/openai.yaml +148 -0
- llm_cost_guard/pricing/vertex.yaml +133 -0
- llm_cost_guard/providers/__init__.py +69 -0
- llm_cost_guard/providers/anthropic.py +115 -0
- llm_cost_guard/providers/base.py +72 -0
- llm_cost_guard/providers/bedrock.py +135 -0
- llm_cost_guard/providers/openai.py +110 -0
- llm_cost_guard/rate_limit.py +233 -0
- llm_cost_guard/span.py +143 -0
- llm_cost_guard/tokenizers/__init__.py +7 -0
- llm_cost_guard/tokenizers/base.py +207 -0
- llm_cost_guard/tracker.py +718 -0
- llm_cost_guard-0.1.0.dist-info/METADATA +357 -0
- llm_cost_guard-0.1.0.dist-info/RECORD +36 -0
- llm_cost_guard-0.1.0.dist-info/WHEEL +4 -0
- llm_cost_guard-0.1.0.dist-info/entry_points.txt +2 -0
- llm_cost_guard-0.1.0.dist-info/licenses/LICENSE +21 -0
llm_cost_guard/pricing/loader.py
@@ -0,0 +1,221 @@
"""
Pricing data loader for LLM Cost Guard.
"""

import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
import logging
import yaml

from llm_cost_guard.models import ModelPricing, ModelType
from llm_cost_guard.exceptions import PricingNotFoundError

logger = logging.getLogger(__name__)

# Default pricing data directory
PRICING_DIR = Path(__file__).parent


class PricingLoader:
    """Loads and manages pricing data for LLM providers."""

    def __init__(
        self,
        pricing_overrides: Optional[Dict[str, Dict[str, Any]]] = None,
        pricing_stale_warning_days: int = 7,
        pricing_stale_error_days: int = 30,
        bedrock_region: str = "us-east-1",
    ):
        self._pricing_data: Dict[str, Dict[str, ModelPricing]] = {}
        self._pricing_versions: Dict[str, str] = {}
        self._pricing_overrides = pricing_overrides or {}
        self._stale_warning_days = pricing_stale_warning_days
        self._stale_error_days = pricing_stale_error_days
        self._bedrock_region = bedrock_region
        self._last_loaded: Optional[datetime] = None

        self._load_all_pricing()

    def _load_all_pricing(self) -> None:
        """Load pricing from all YAML files."""
        for yaml_file in PRICING_DIR.glob("*.yaml"):
            provider = yaml_file.stem
            self._load_provider_pricing(provider, yaml_file)

        self._last_loaded = datetime.now()

    def _load_provider_pricing(self, provider: str, yaml_path: Path) -> None:
        """Load pricing for a specific provider."""
        try:
            with open(yaml_path, "r") as f:
                data = yaml.safe_load(f)

            if not data:
                return

            self._pricing_versions[provider] = data.get("version", "unknown")
            self._pricing_data[provider] = {}

            models = data.get("models", {})
            for model_name, model_data in models.items():
                model_type_str = model_data.get("model_type", "chat")
                model_type = ModelType(model_type_str) if model_type_str else ModelType.CHAT

                pricing = ModelPricing(
                    input_cost_per_1k=model_data.get("input_cost_per_1k", 0.0),
                    output_cost_per_1k=model_data.get("output_cost_per_1k", 0.0),
                    cached_input_cost_per_1k=model_data.get("cached_input_cost_per_1k"),
                    context_window=model_data.get("context_window", 128000),
                    model_type=model_type,
                    image_cost_per_image=model_data.get("image_cost_per_image"),
                    audio_cost_per_minute=model_data.get("audio_cost_per_minute"),
                    embedding_dimensions=model_data.get("embedding_dimensions"),
                )
                self._pricing_data[provider][model_name] = pricing

        except Exception as e:
            logger.warning(f"Failed to load pricing for {provider}: {e}")

    def get_pricing(self, provider: str, model: str) -> ModelPricing:
        """Get pricing for a specific model."""
        # Normalize provider and model names
        provider = provider.lower()
        model_lower = model.lower()

        # Check overrides first
        override_key = f"{provider}/{model}"
        if override_key in self._pricing_overrides:
            override = self._pricing_overrides[override_key]
            return ModelPricing(
                input_cost_per_1k=override.get("input_cost_per_1k", 0.0),
                output_cost_per_1k=override.get("output_cost_per_1k", 0.0),
                cached_input_cost_per_1k=override.get("cached_input_cost_per_1k"),
                context_window=override.get("context_window", 128000),
            )

        # Check loaded pricing
        if provider in self._pricing_data:
            provider_pricing = self._pricing_data[provider]

            # Try exact match
            if model in provider_pricing:
                return provider_pricing[model]

            # Try lowercase match
            if model_lower in provider_pricing:
                return provider_pricing[model_lower]

            # Try prefix match (for versioned models like gpt-4-0613)
            for known_model in provider_pricing:
                if model_lower.startswith(known_model) or known_model.startswith(model_lower):
                    return provider_pricing[known_model]

        raise PricingNotFoundError(
            f"Pricing not found for {provider}/{model}", provider=provider, model=model
        )

    def calculate_cost(
        self,
        provider: str,
        model: str,
        input_tokens: int,
        output_tokens: int,
        cached_tokens: int = 0,
    ) -> tuple[float, float, float]:
        """
        Calculate cost for a call.
        Returns (input_cost, output_cost, total_cost).
        """
        pricing = self.get_pricing(provider, model)

        # Calculate input cost (considering cache)
        regular_input_tokens = input_tokens - cached_tokens
        input_cost = (regular_input_tokens / 1000) * pricing.input_cost_per_1k

        # Add cached token cost if applicable
        if cached_tokens > 0 and pricing.cached_input_cost_per_1k is not None:
            input_cost += (cached_tokens / 1000) * pricing.cached_input_cost_per_1k

        # Calculate output cost
        output_cost = (output_tokens / 1000) * pricing.output_cost_per_1k

        total_cost = input_cost + output_cost

        return input_cost, output_cost, total_cost

    def estimate_cost(
        self,
        provider: str,
        model: str,
        input_tokens: int,
        max_output_tokens: int = 4096,
    ) -> float:
        """Estimate maximum cost for a call (for budget reservation)."""
        pricing = self.get_pricing(provider, model)

        input_cost = (input_tokens / 1000) * pricing.input_cost_per_1k
        output_cost = (max_output_tokens / 1000) * pricing.output_cost_per_1k

        return input_cost + output_cost

    @property
    def last_updated(self) -> Optional[datetime]:
        """Get when pricing was last loaded."""
        return self._last_loaded

    @property
    def pricing_version(self) -> Dict[str, str]:
        """Get pricing versions for all providers."""
        return dict(self._pricing_versions)

    @property
    def is_stale(self) -> bool:
        """Check if pricing data is stale (beyond warning threshold)."""
        if self._last_loaded is None:
            return True

        age_days = (datetime.now() - self._last_loaded).days
        return age_days >= self._stale_warning_days

    @property
    def is_very_stale(self) -> bool:
        """Check if pricing data is very stale (beyond error threshold)."""
        if self._last_loaded is None:
            return True

        age_days = (datetime.now() - self._last_loaded).days
        return age_days >= self._stale_error_days

    def get_all_models(self, provider: Optional[str] = None) -> Dict[str, list[str]]:
        """Get all known models, optionally filtered by provider."""
        if provider:
            return {provider: list(self._pricing_data.get(provider, {}).keys())}
        return {p: list(models.keys()) for p, models in self._pricing_data.items()}

    def refresh(self) -> None:
        """Reload pricing data from files."""
        self._pricing_data.clear()
        self._pricing_versions.clear()
        self._load_all_pricing()


# Global pricing loader instance
_global_loader: Optional[PricingLoader] = None


def get_pricing(provider: str, model: str) -> ModelPricing:
    """Get pricing for a model using the global loader."""
    global _global_loader
    if _global_loader is None:
        _global_loader = PricingLoader()
    return _global_loader.get_pricing(provider, model)


def get_pricing_loader() -> PricingLoader:
    """Get the global pricing loader instance."""
    global _global_loader
    if _global_loader is None:
        _global_loader = PricingLoader()
    return _global_loader
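As a quick illustration of how this loader is driven (an editor's sketch, not part of the package; it assumes the installed wheel ships the bundled YAML files and uses the gpt-4o-mini rates from the openai.yaml shown below):

    # Editor's sketch: exercising PricingLoader.calculate_cost against bundled pricing.
    from llm_cost_guard.pricing.loader import PricingLoader

    loader = PricingLoader()

    # gpt-4o-mini: $0.00015 per 1K input tokens, $0.0006 per 1K output tokens
    input_cost, output_cost, total_cost = loader.calculate_cost(
        provider="openai",
        model="gpt-4o-mini",
        input_tokens=12_000,
        output_tokens=800,
    )
    # 12 * 0.00015 + 0.8 * 0.0006 = 0.0018 + 0.00048 = 0.00228
    print(f"total cost: ${total_cost:.5f}")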
llm_cost_guard/pricing/openai.yaml
@@ -0,0 +1,148 @@
version: "2026-01-15"
models:
  # GPT-4o models
  gpt-4o:
    input_cost_per_1k: 0.0025
    output_cost_per_1k: 0.01
    cached_input_cost_per_1k: 0.00125
    context_window: 128000
    model_type: chat

  gpt-4o-2024-11-20:
    input_cost_per_1k: 0.0025
    output_cost_per_1k: 0.01
    cached_input_cost_per_1k: 0.00125
    context_window: 128000
    model_type: chat

  gpt-4o-mini:
    input_cost_per_1k: 0.00015
    output_cost_per_1k: 0.0006
    cached_input_cost_per_1k: 0.000075
    context_window: 128000
    model_type: chat

  gpt-4o-mini-2024-07-18:
    input_cost_per_1k: 0.00015
    output_cost_per_1k: 0.0006
    cached_input_cost_per_1k: 0.000075
    context_window: 128000
    model_type: chat

  # GPT-4 Turbo
  gpt-4-turbo:
    input_cost_per_1k: 0.01
    output_cost_per_1k: 0.03
    context_window: 128000
    model_type: chat

  gpt-4-turbo-preview:
    input_cost_per_1k: 0.01
    output_cost_per_1k: 0.03
    context_window: 128000
    model_type: chat

  # GPT-4
  gpt-4:
    input_cost_per_1k: 0.03
    output_cost_per_1k: 0.06
    context_window: 8192
    model_type: chat

  gpt-4-32k:
    input_cost_per_1k: 0.06
    output_cost_per_1k: 0.12
    context_window: 32768
    model_type: chat

  # GPT-3.5 Turbo
  gpt-3.5-turbo:
    input_cost_per_1k: 0.0005
    output_cost_per_1k: 0.0015
    context_window: 16385
    model_type: chat

  gpt-3.5-turbo-0125:
    input_cost_per_1k: 0.0005
    output_cost_per_1k: 0.0015
    context_window: 16385
    model_type: chat

  gpt-3.5-turbo-instruct:
    input_cost_per_1k: 0.0015
    output_cost_per_1k: 0.002
    context_window: 4096
    model_type: completion

  # o1 reasoning models
  o1:
    input_cost_per_1k: 0.015
    output_cost_per_1k: 0.06
    cached_input_cost_per_1k: 0.0075
    context_window: 200000
    model_type: chat

  o1-preview:
    input_cost_per_1k: 0.015
    output_cost_per_1k: 0.06
    context_window: 128000
    model_type: chat

  o1-mini:
    input_cost_per_1k: 0.003
    output_cost_per_1k: 0.012
    cached_input_cost_per_1k: 0.0015
    context_window: 128000
    model_type: chat

  # Embedding models
  text-embedding-3-small:
    input_cost_per_1k: 0.00002
    output_cost_per_1k: 0.0
    context_window: 8191
    model_type: embedding
    embedding_dimensions: 1536

  text-embedding-3-large:
    input_cost_per_1k: 0.00013
    output_cost_per_1k: 0.0
    context_window: 8191
    model_type: embedding
    embedding_dimensions: 3072

  text-embedding-ada-002:
    input_cost_per_1k: 0.0001
    output_cost_per_1k: 0.0
    context_window: 8191
    model_type: embedding
    embedding_dimensions: 1536

  # Image models (DALL-E)
  dall-e-3:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    image_cost_per_image: 0.04  # standard quality 1024x1024
    model_type: image

  dall-e-2:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    image_cost_per_image: 0.02
    model_type: image

  # Audio models
  whisper-1:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    audio_cost_per_minute: 0.006
    model_type: audio

  tts-1:
    input_cost_per_1k: 0.015
    output_cost_per_1k: 0.0
    model_type: audio

  tts-1-hd:
    input_cost_per_1k: 0.03
    output_cost_per_1k: 0.0
    model_type: audio
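These bundled rates can be superseded at runtime: get_pricing in loader.py checks the pricing_overrides mapping (keyed "provider/model") before falling back to the YAML data. A sketch of what an override might look like (editor's illustration; "gpt-4o-mini-ft-acme" is a made-up fine-tune name, and the attribute access on the returned ModelPricing assumes its field names match the constructor arguments shown above):

    # Editor's sketch: supplying a runtime price override for a model not in openai.yaml.
    from llm_cost_guard.pricing.loader import PricingLoader

    loader = PricingLoader(
        pricing_overrides={
            "openai/gpt-4o-mini-ft-acme": {
                "input_cost_per_1k": 0.0003,
                "output_cost_per_1k": 0.0012,
                "context_window": 128000,
            }
        }
    )
    pricing = loader.get_pricing("openai", "gpt-4o-mini-ft-acme")
    print(pricing.input_cost_per_1k)  # 0.0003, taken from the override rather than the YAML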
llm_cost_guard/pricing/vertex.yaml
@@ -0,0 +1,133 @@
version: "2026-01-15"
# Google Vertex AI / Gemini pricing
models:
  # Gemini 1.5 Pro
  gemini-1.5-pro:
    input_cost_per_1k: 0.00125
    output_cost_per_1k: 0.005
    context_window: 2000000
    model_type: chat

  gemini-1.5-pro-001:
    input_cost_per_1k: 0.00125
    output_cost_per_1k: 0.005
    context_window: 2000000
    model_type: chat

  gemini-1.5-pro-002:
    input_cost_per_1k: 0.00125
    output_cost_per_1k: 0.005
    context_window: 2000000
    model_type: chat

  # Gemini 1.5 Flash
  gemini-1.5-flash:
    input_cost_per_1k: 0.000075
    output_cost_per_1k: 0.0003
    context_window: 1000000
    model_type: chat

  gemini-1.5-flash-001:
    input_cost_per_1k: 0.000075
    output_cost_per_1k: 0.0003
    context_window: 1000000
    model_type: chat

  gemini-1.5-flash-002:
    input_cost_per_1k: 0.000075
    output_cost_per_1k: 0.0003
    context_window: 1000000
    model_type: chat

  # Gemini 1.0 Pro
  gemini-1.0-pro:
    input_cost_per_1k: 0.0005
    output_cost_per_1k: 0.0015
    context_window: 32760
    model_type: chat

  gemini-1.0-pro-001:
    input_cost_per_1k: 0.0005
    output_cost_per_1k: 0.0015
    context_window: 32760
    model_type: chat

  gemini-1.0-pro-002:
    input_cost_per_1k: 0.0005
    output_cost_per_1k: 0.0015
    context_window: 32760
    model_type: chat

  # Gemini 2.0 Flash
  gemini-2.0-flash-exp:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    context_window: 1000000
    model_type: chat

  # PaLM 2 (legacy)
  text-bison:
    input_cost_per_1k: 0.00025
    output_cost_per_1k: 0.0005
    context_window: 8192
    model_type: chat

  text-bison-32k:
    input_cost_per_1k: 0.00025
    output_cost_per_1k: 0.0005
    context_window: 32000
    model_type: chat

  chat-bison:
    input_cost_per_1k: 0.00025
    output_cost_per_1k: 0.0005
    context_window: 8192
    model_type: chat

  chat-bison-32k:
    input_cost_per_1k: 0.00025
    output_cost_per_1k: 0.0005
    context_window: 32000
    model_type: chat

  # Embeddings
  textembedding-gecko:
    input_cost_per_1k: 0.00001
    output_cost_per_1k: 0.0
    context_window: 3072
    model_type: embedding
    embedding_dimensions: 768

  textembedding-gecko-multilingual:
    input_cost_per_1k: 0.00001
    output_cost_per_1k: 0.0
    context_window: 3072
    model_type: embedding
    embedding_dimensions: 768

  text-embedding-004:
    input_cost_per_1k: 0.00001
    output_cost_per_1k: 0.0
    context_window: 2048
    model_type: embedding
    embedding_dimensions: 768

  text-multilingual-embedding-002:
    input_cost_per_1k: 0.00001
    output_cost_per_1k: 0.0
    context_window: 2048
    model_type: embedding
    embedding_dimensions: 768

  # Image models
  imagen-3.0-generate-001:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    image_cost_per_image: 0.04
    model_type: image

  imagen-3.0-fast-generate-001:
    input_cost_per_1k: 0.0
    output_cost_per_1k: 0.0
    image_cost_per_image: 0.02
    model_type: image
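Because _load_all_pricing keys providers by YAML file stem, the table above is exposed under the provider name "vertex". A small lookup sketch via the module-level helpers (editor's illustration; the attribute names on ModelPricing are assumed from the constructor arguments in loader.py):

    # Editor's sketch: the provider name comes from the file stem ("vertex.yaml" -> "vertex").
    from llm_cost_guard.pricing.loader import get_pricing, get_pricing_loader

    pricing = get_pricing("vertex", "gemini-1.5-flash")
    print(pricing.input_cost_per_1k)   # 0.000075 per the entry above
    print(pricing.context_window)      # 1000000

    # The shared global loader also reports which pricing snapshot is loaded.
    print(get_pricing_loader().pricing_version)  # e.g. {"vertex": "2026-01-15", ...}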
llm_cost_guard/providers/__init__.py
@@ -0,0 +1,69 @@
"""
LLM provider integrations for LLM Cost Guard.
"""

from llm_cost_guard.providers.base import Provider
from llm_cost_guard.providers.openai import OpenAIProvider
from llm_cost_guard.providers.anthropic import AnthropicProvider
from llm_cost_guard.providers.bedrock import BedrockProvider

__all__ = [
    "Provider",
    "OpenAIProvider",
    "AnthropicProvider",
    "BedrockProvider",
    "get_provider",
    "detect_provider",
]


def get_provider(name: str) -> Provider:
    """Get a provider by name."""
    providers = {
        "openai": OpenAIProvider,
        "anthropic": AnthropicProvider,
        "bedrock": BedrockProvider,
    }

    name = name.lower()
    if name not in providers:
        raise ValueError(f"Unknown provider: {name}. Available: {list(providers.keys())}")

    return providers[name]()


def detect_provider(model: str) -> str:
    """Detect the provider from a model name."""
    model_lower = model.lower()

    # OpenAI models
    if any(
        prefix in model_lower
        for prefix in ["gpt-", "o1", "text-embedding", "dall-e", "whisper", "tts-"]
    ):
        return "openai"

    # Anthropic models
    if "claude" in model_lower and not model_lower.startswith("anthropic."):
        return "anthropic"

    # AWS Bedrock models (have provider prefix)
    if any(
        model_lower.startswith(prefix)
        for prefix in [
            "anthropic.",
            "amazon.",
            "meta.",
            "mistral.",
            "cohere.",
            "ai21.",
        ]
    ):
        return "bedrock"

    # Google Vertex AI
    if any(prefix in model_lower for prefix in ["gemini", "palm", "text-bison", "chat-bison"]):
        return "vertex"

    # Default to OpenAI
    return "openai"
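A short sketch of how these two helpers compose (editor's illustration; the model strings are examples only):

    # Editor's sketch: routing model names to provider names, then to Provider instances.
    from llm_cost_guard.providers import detect_provider, get_provider

    print(detect_provider("gpt-4o-mini"))                              # "openai"
    print(detect_provider("claude-3-5-sonnet-20241022"))               # "anthropic"
    print(detect_provider("anthropic.claude-3-sonnet-20240229-v1:0"))  # "bedrock" (prefixed ID)
    print(detect_provider("gemini-1.5-pro"))                           # "vertex"

    # get_provider only knows openai/anthropic/bedrock; "vertex" would raise ValueError.
    provider = get_provider(detect_provider("claude-3-5-sonnet-20241022"))
    print(provider.name)  # "anthropic"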
llm_cost_guard/providers/anthropic.py
@@ -0,0 +1,115 @@
"""
Anthropic provider for LLM Cost Guard.
"""

from typing import Any

from llm_cost_guard.models import UsageData
from llm_cost_guard.providers.base import Provider


class AnthropicProvider(Provider):
    """Anthropic API provider."""

    @property
    def name(self) -> str:
        return "anthropic"

    def extract_usage(self, response: Any) -> UsageData:
        """Extract token usage from an Anthropic API response."""
        usage = UsageData()

        # Handle dictionary response
        if isinstance(response, dict):
            usage_data = response.get("usage", {})
            usage.input_tokens = usage_data.get("input_tokens", 0)
            usage.output_tokens = usage_data.get("output_tokens", 0)

            # Check for cached tokens
            usage.cached_tokens = usage_data.get("cache_read_input_tokens", 0)
            cache_creation = usage_data.get("cache_creation_input_tokens", 0)

            # Total tokens
            usage.total_tokens = usage.input_tokens + usage.output_tokens

            return usage

        # Handle Anthropic client response object
        if hasattr(response, "usage") and response.usage is not None:
            usage.input_tokens = getattr(response.usage, "input_tokens", 0) or 0
            usage.output_tokens = getattr(response.usage, "output_tokens", 0) or 0

            # Anthropic prompt caching
            if hasattr(response.usage, "cache_read_input_tokens"):
                usage.cached_tokens = response.usage.cache_read_input_tokens or 0

            usage.total_tokens = usage.input_tokens + usage.output_tokens

        return usage

    def extract_model(self, response: Any) -> str:
        """Extract the model name from an Anthropic API response."""
        if isinstance(response, dict):
            return response.get("model", "unknown")

        if hasattr(response, "model"):
            return response.model or "unknown"

        return "unknown"

    def extract_cached_tokens(self, response: Any) -> int:
        """Extract cached token count from an Anthropic API response."""
        usage = self.extract_usage(response)
        return usage.cached_tokens

    def normalize_model_name(self, model: str) -> str:
        """Normalize Anthropic model name."""
        # Anthropic model names are usually already normalized
        return model


class AnthropicStreamingHandler:
    """Handler for streaming Anthropic responses."""

    def __init__(self):
        self.input_tokens = 0
        self.output_tokens = 0
        self.model = "unknown"
        self._started = False

    def handle_event(self, event: Any) -> None:
        """Process a streaming event."""
        if isinstance(event, dict):
            event_type = event.get("type", "")

            if event_type == "message_start":
                message = event.get("message", {})
                self.model = message.get("model", self.model)
                usage = message.get("usage", {})
                self.input_tokens = usage.get("input_tokens", 0)

            elif event_type == "message_delta":
                usage = event.get("usage", {})
                self.output_tokens = usage.get("output_tokens", 0)

        else:
            # Handle event objects
            event_type = getattr(event, "type", "")

            if event_type == "message_start":
                if hasattr(event, "message"):
                    self.model = getattr(event.message, "model", self.model)
                    if hasattr(event.message, "usage"):
                        self.input_tokens = getattr(event.message.usage, "input_tokens", 0)

            elif event_type == "message_delta":
                if hasattr(event, "usage"):
                    self.output_tokens = getattr(event.usage, "output_tokens", 0)

    def get_usage(self) -> UsageData:
        """Get final usage data."""
        return UsageData(
            input_tokens=self.input_tokens,
            output_tokens=self.output_tokens,
            total_tokens=self.input_tokens + self.output_tokens,
        )
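To exercise the two extraction paths above, a sketch using hand-built dicts shaped like Anthropic Messages API responses and stream events (editor's illustration, not part of the package):

    # Editor's sketch: dict-shaped response and stream events, mirroring the branches above.
    from llm_cost_guard.providers.anthropic import AnthropicProvider, AnthropicStreamingHandler

    provider = AnthropicProvider()
    usage = provider.extract_usage({
        "model": "claude-3-5-sonnet-20241022",
        "usage": {"input_tokens": 1200, "output_tokens": 250, "cache_read_input_tokens": 400},
    })
    print(usage.input_tokens, usage.output_tokens, usage.cached_tokens)  # 1200 250 400

    handler = AnthropicStreamingHandler()
    handler.handle_event({
        "type": "message_start",
        "message": {"model": "claude-3-5-sonnet-20241022", "usage": {"input_tokens": 1200}},
    })
    handler.handle_event({"type": "message_delta", "usage": {"output_tokens": 250}})
    print(handler.get_usage().total_tokens)  # 1450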