llm-cost-guard 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_cost_guard/__init__.py +39 -0
- llm_cost_guard/backends/__init__.py +52 -0
- llm_cost_guard/backends/base.py +121 -0
- llm_cost_guard/backends/memory.py +265 -0
- llm_cost_guard/backends/sqlite.py +425 -0
- llm_cost_guard/budget.py +306 -0
- llm_cost_guard/cli.py +464 -0
- llm_cost_guard/clients/__init__.py +11 -0
- llm_cost_guard/clients/anthropic.py +231 -0
- llm_cost_guard/clients/openai.py +262 -0
- llm_cost_guard/exceptions.py +71 -0
- llm_cost_guard/integrations/__init__.py +12 -0
- llm_cost_guard/integrations/cache.py +189 -0
- llm_cost_guard/integrations/langchain.py +257 -0
- llm_cost_guard/models.py +123 -0
- llm_cost_guard/pricing/__init__.py +7 -0
- llm_cost_guard/pricing/anthropic.yaml +88 -0
- llm_cost_guard/pricing/bedrock.yaml +215 -0
- llm_cost_guard/pricing/loader.py +221 -0
- llm_cost_guard/pricing/openai.yaml +148 -0
- llm_cost_guard/pricing/vertex.yaml +133 -0
- llm_cost_guard/providers/__init__.py +69 -0
- llm_cost_guard/providers/anthropic.py +115 -0
- llm_cost_guard/providers/base.py +72 -0
- llm_cost_guard/providers/bedrock.py +135 -0
- llm_cost_guard/providers/openai.py +110 -0
- llm_cost_guard/rate_limit.py +233 -0
- llm_cost_guard/span.py +143 -0
- llm_cost_guard/tokenizers/__init__.py +7 -0
- llm_cost_guard/tokenizers/base.py +207 -0
- llm_cost_guard/tracker.py +718 -0
- llm_cost_guard-0.1.0.dist-info/METADATA +357 -0
- llm_cost_guard-0.1.0.dist-info/RECORD +36 -0
- llm_cost_guard-0.1.0.dist-info/WHEEL +4 -0
- llm_cost_guard-0.1.0.dist-info/entry_points.txt +2 -0
- llm_cost_guard-0.1.0.dist-info/licenses/LICENSE +21 -0
llm_cost_guard/integrations/langchain.py
ADDED
@@ -0,0 +1,257 @@
+"""
+LangChain integration for LLM Cost Guard.
+"""
+
+import functools
+import logging
+from typing import Any, Callable, Dict, List, Optional, TypeVar, Union
+
+logger = logging.getLogger(__name__)
+
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+try:
+    from langchain_core.callbacks.base import BaseCallbackHandler
+    from langchain_core.outputs import LLMResult
+
+    LANGCHAIN_AVAILABLE = True
+except ImportError:
+    LANGCHAIN_AVAILABLE = False
+    BaseCallbackHandler = object  # type: ignore
+    LLMResult = None  # type: ignore
+
+
+class CostTrackingCallback(BaseCallbackHandler):
+    """
+    LangChain callback handler for cost tracking.
+
+    Usage:
+        from llm_cost_guard import CostTracker
+        from llm_cost_guard.integrations.langchain import CostTrackingCallback
+
+        tracker = CostTracker()
+        callback = CostTrackingCallback(tracker)
+
+        llm = ChatOpenAI(model="gpt-4o", callbacks=[callback])
+        result = llm.invoke("Hello!")
+    """
+
+    def __init__(
+        self,
+        tracker: Any,  # CostTracker
+        tags: Optional[Dict[str, str]] = None,
+    ):
+        """
+        Initialize the callback handler.
+
+        Args:
+            tracker: CostTracker instance
+            tags: Default tags to apply to all tracked calls
+        """
+        if not LANGCHAIN_AVAILABLE:
+            raise ImportError(
+                "LangChain is required for this integration. "
+                "Install with: pip install llm-cost-guard[langchain]"
+            )
+
+        super().__init__()
+        self._tracker = tracker
+        self._default_tags = tags or {}
+
+        # Track in-flight calls
+        self._run_info: Dict[str, Dict[str, Any]] = {}
+
+    def on_llm_start(
+        self,
+        serialized: Dict[str, Any],
+        prompts: List[str],
+        *,
+        run_id: Any,
+        parent_run_id: Optional[Any] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Record the start of an LLM call."""
+        import time
+
+        self._run_info[str(run_id)] = {
+            "start_time": time.time(),
+            "model": serialized.get("kwargs", {}).get("model_name", "unknown"),
+            "prompts": prompts,
+            "tags": tags or [],
+            "metadata": metadata or {},
+        }
+
+    def on_chat_model_start(
+        self,
+        serialized: Dict[str, Any],
+        messages: List[List[Any]],
+        *,
+        run_id: Any,
+        parent_run_id: Optional[Any] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Record the start of a chat model call."""
+        import time
+
+        model = serialized.get("kwargs", {}).get("model_name")
+        if not model:
+            model = serialized.get("kwargs", {}).get("model", "unknown")
+
+        self._run_info[str(run_id)] = {
+            "start_time": time.time(),
+            "model": model,
+            "messages": messages,
+            "tags": tags or [],
+            "metadata": metadata or {},
+        }
+
+    def on_llm_end(
+        self,
+        response: "LLMResult",
+        *,
+        run_id: Any,
+        parent_run_id: Optional[Any] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Record the end of an LLM call."""
+        import time
+
+        run_id_str = str(run_id)
+        if run_id_str not in self._run_info:
+            return
+
+        run_info = self._run_info.pop(run_id_str)
+        latency_ms = int((time.time() - run_info["start_time"]) * 1000)
+
+        # Extract usage from response
+        input_tokens = 0
+        output_tokens = 0
+
+        if response.llm_output:
+            token_usage = response.llm_output.get("token_usage", {})
+            input_tokens = token_usage.get("prompt_tokens", 0)
+            output_tokens = token_usage.get("completion_tokens", 0)
+
+            # Also check for model-specific usage
+            if "usage" in response.llm_output:
+                usage = response.llm_output["usage"]
+                input_tokens = usage.get("prompt_tokens", usage.get("input_tokens", input_tokens))
+                output_tokens = usage.get(
+                    "completion_tokens", usage.get("output_tokens", output_tokens)
+                )
+
+        # Detect provider from model
+        from llm_cost_guard.providers import detect_provider
+
+        model = run_info["model"]
+        provider = detect_provider(model)
+
+        # Build tags
+        tags = dict(self._default_tags)
+        for tag in run_info.get("tags", []):
+            if ":" in tag:
+                key, value = tag.split(":", 1)
+                tags[key] = value
+            else:
+                tags[tag] = "true"
+
+        # Record the call
+        try:
+            self._tracker.record(
+                provider=provider,
+                model=model,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                tags=tags,
+                success=True,
+                latency_ms=latency_ms,
+                metadata=run_info.get("metadata", {}),
+            )
+        except Exception as e:
+            logger.warning(f"Failed to record LangChain call: {e}")
+
+    def on_llm_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: Any,
+        parent_run_id: Optional[Any] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Record an LLM call error."""
+        import time
+
+        run_id_str = str(run_id)
+        if run_id_str not in self._run_info:
+            return
+
+        run_info = self._run_info.pop(run_id_str)
+        latency_ms = int((time.time() - run_info["start_time"]) * 1000)
+
+        # Detect provider from model
+        from llm_cost_guard.providers import detect_provider
+
+        model = run_info["model"]
+        provider = detect_provider(model)
+
+        # Build tags
+        tags = dict(self._default_tags)
+        for tag in run_info.get("tags", []):
+            if ":" in tag:
+                key, value = tag.split(":", 1)
+                tags[key] = value
+            else:
+                tags[tag] = "true"
+
+        # Record the failed call
+        try:
+            self._tracker.record(
+                provider=provider,
+                model=model,
+                input_tokens=0,  # We don't know tokens for failed calls
+                output_tokens=0,
+                tags=tags,
+                success=False,
+                error_type=type(error).__name__,
+                latency_ms=latency_ms,
+                metadata=run_info.get("metadata", {}),
+            )
+        except Exception as e:
+            logger.warning(f"Failed to record LangChain error: {e}")
+
+
+def track_chain(
+    tracker: Any,  # CostTracker
+    tags: Optional[Dict[str, str]] = None,
+) -> Callable[[F], F]:
+    """
+    Decorator to track costs for an entire LangChain chain.
+
+    Usage:
+        @track_chain(tracker, tags={"chain": "rag_pipeline"})
+        def my_rag_chain(query):
+            # Chain implementation
+            return result
+
+    Args:
+        tracker: CostTracker instance
+        tags: Tags to apply to the tracked span
+
+    Returns:
+        Decorated function
+    """
+
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            with tracker.span(func.__name__, tags=tags):
+                return func(*args, **kwargs)

+        return wrapper  # type: ignore
+
+    return decorator
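A minimal usage sketch for CostTrackingCallback, following the class docstring above. The langchain-openai ChatOpenAI model and the "project:alpha" tag are illustrative, not part of this package; the tag shows how LangChain string tags are folded into key/value pairs by the split-on-":" logic in on_llm_end.

# Hedged sketch: wire the callback into a LangChain chat model.
from langchain_openai import ChatOpenAI

from llm_cost_guard import CostTracker
from llm_cost_guard.integrations.langchain import CostTrackingCallback

tracker = CostTracker()
callback = CostTrackingCallback(tracker, tags={"team": "search"})

# "project:alpha" becomes {"project": "alpha"} in the recorded tags;
# a bare tag such as "beta" would become {"beta": "true"}.
llm = ChatOpenAI(model="gpt-4o", callbacks=[callback], tags=["project:alpha"])
result = llm.invoke("Hello!")
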
llm_cost_guard/models.py
ADDED
@@ -0,0 +1,123 @@
+"""
+Data models for LLM Cost Guard.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional
+
+
+class ModelType(str, Enum):
+    """Types of LLM models."""
+
+    CHAT = "chat"
+    EMBEDDING = "embedding"
+    IMAGE = "image"
+    AUDIO = "audio"
+    COMPLETION = "completion"
+
+
+@dataclass
+class CostRecord:
+    """Single LLM call record."""
+
+    timestamp: datetime
+    provider: str
+    model: str
+    model_type: ModelType = ModelType.CHAT
+    input_tokens: int = 0
+    output_tokens: int = 0
+    input_cost: float = 0.0
+    output_cost: float = 0.0
+    total_cost: float = 0.0
+    latency_ms: int = 0
+    tags: Dict[str, str] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    success: bool = True
+    error_type: Optional[str] = None
+    cached: bool = False
+    cache_savings: float = 0.0
+    span_id: Optional[str] = None
+
+    def __post_init__(self) -> None:
+        """Calculate total cost if not provided."""
+        if self.total_cost == 0.0 and (self.input_cost > 0 or self.output_cost > 0):
+            self.total_cost = self.input_cost + self.output_cost
+
+
+@dataclass
+class CostReport:
+    """Aggregated cost report."""
+
+    start_date: Optional[datetime] = None
+    end_date: Optional[datetime] = None
+    total_cost: float = 0.0
+    total_input_tokens: int = 0
+    total_output_tokens: int = 0
+    total_calls: int = 0
+    successful_calls: int = 0
+    failed_calls: int = 0
+    cache_hits: int = 0
+    cache_savings: float = 0.0
+    effective_cost: float = 0.0  # total_cost - cache_savings
+    records: List[CostRecord] = field(default_factory=list)
+    grouped_data: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Calculate effective cost."""
+        if self.effective_cost == 0.0:
+            self.effective_cost = self.total_cost - self.cache_savings
+
+
+@dataclass
+class HealthStatus:
+    """Health check status for the tracker."""
+
+    healthy: bool = True
+    backend_connected: bool = True
+    pricing_fresh: bool = True
+    last_record_time: Optional[datetime] = None
+    pending_records: int = 0
+    errors: List[str] = field(default_factory=list)
+    pricing_version: Optional[str] = None
+    pricing_last_updated: Optional[datetime] = None
+
+
+@dataclass
+class ModelPricing:
+    """Pricing information for a model."""
+
+    input_cost_per_1k: float
+    output_cost_per_1k: float
+    cached_input_cost_per_1k: Optional[float] = None
+    context_window: int = 128000
+    model_type: ModelType = ModelType.CHAT
+
+    # For image models
+    image_cost_per_image: Optional[float] = None
+
+    # For audio models
+    audio_cost_per_minute: Optional[float] = None
+
+    # For embedding models
+    embedding_dimensions: Optional[int] = None
+
+
+@dataclass
+class UsageData:
+    """Token usage data from an LLM call."""
+
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cached_tokens: int = 0
+    total_tokens: int = 0
+
+    # For non-text models
+    image_count: int = 0
+    audio_duration_seconds: float = 0.0
+
+    def __post_init__(self) -> None:
+        """Calculate total tokens if not provided."""
+        if self.total_tokens == 0:
+            self.total_tokens = self.input_tokens + self.output_tokens
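A quick illustration of the __post_init__ derivations in CostRecord and UsageData above; the token counts and costs are made-up example values, and the import path simply mirrors this module.

from datetime import datetime, timezone

from llm_cost_guard.models import CostRecord, UsageData

# total_cost is filled in from input_cost + output_cost when it is left at 0.0.
record = CostRecord(
    timestamp=datetime.now(timezone.utc),
    provider="anthropic",
    model="claude-3-5-haiku-20241022",
    input_tokens=1200,
    output_tokens=350,
    input_cost=0.00096,  # 1200 / 1000 * 0.0008
    output_cost=0.0014,  # 350 / 1000 * 0.004
)
assert abs(record.total_cost - 0.00236) < 1e-9

# total_tokens is filled in from input_tokens + output_tokens when left at 0.
usage = UsageData(input_tokens=1200, output_tokens=350)
assert usage.total_tokens == 1550
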
llm_cost_guard/pricing/anthropic.yaml
ADDED
@@ -0,0 +1,88 @@
+version: "2026-01-15"
+models:
+  # Claude 3.5 Sonnet
+  claude-3-5-sonnet-20241022:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    cached_input_cost_per_1k: 0.0003
+    context_window: 200000
+    model_type: chat
+
+  claude-3-5-sonnet-latest:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    cached_input_cost_per_1k: 0.0003
+    context_window: 200000
+    model_type: chat
+
+  claude-3-5-sonnet-20240620:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    cached_input_cost_per_1k: 0.0003
+    context_window: 200000
+    model_type: chat
+
+  # Claude 3.5 Haiku
+  claude-3-5-haiku-20241022:
+    input_cost_per_1k: 0.0008
+    output_cost_per_1k: 0.004
+    cached_input_cost_per_1k: 0.00008
+    context_window: 200000
+    model_type: chat
+
+  claude-3-5-haiku-latest:
+    input_cost_per_1k: 0.0008
+    output_cost_per_1k: 0.004
+    cached_input_cost_per_1k: 0.00008
+    context_window: 200000
+    model_type: chat
+
+  # Claude 3 Opus
+  claude-3-opus-20240229:
+    input_cost_per_1k: 0.015
+    output_cost_per_1k: 0.075
+    cached_input_cost_per_1k: 0.0015
+    context_window: 200000
+    model_type: chat
+
+  claude-3-opus-latest:
+    input_cost_per_1k: 0.015
+    output_cost_per_1k: 0.075
+    cached_input_cost_per_1k: 0.0015
+    context_window: 200000
+    model_type: chat
+
+  # Claude 3 Sonnet
+  claude-3-sonnet-20240229:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    context_window: 200000
+    model_type: chat
+
+  # Claude 3 Haiku
+  claude-3-haiku-20240307:
+    input_cost_per_1k: 0.00025
+    output_cost_per_1k: 0.00125
+    cached_input_cost_per_1k: 0.00003
+    context_window: 200000
+    model_type: chat
+
+  # Claude 2 (legacy)
+  claude-2.1:
+    input_cost_per_1k: 0.008
+    output_cost_per_1k: 0.024
+    context_window: 200000
+    model_type: chat
+
+  claude-2.0:
+    input_cost_per_1k: 0.008
+    output_cost_per_1k: 0.024
+    context_window: 100000
+    model_type: chat
+
+  # Claude Instant (legacy)
+  claude-instant-1.2:
+    input_cost_per_1k: 0.0008
+    output_cost_per_1k: 0.0024
+    context_window: 100000
+    model_type: chat
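The per-1k rates above turn into call costs by simple proportion. A worked sketch for claude-3-5-sonnet-20241022, written as a standalone helper rather than the package's own pricing loader:

# Hypothetical helper using the rates above: 0.003 / 0.015 per 1k tokens,
# 0.0003 per 1k cached input tokens.
def sonnet_call_cost(input_tokens: int, output_tokens: int, cached_tokens: int = 0) -> float:
    uncached_input = input_tokens - cached_tokens
    return (
        uncached_input / 1000 * 0.003
        + cached_tokens / 1000 * 0.0003
        + output_tokens / 1000 * 0.015
    )

# 2,000 input tokens (500 served from the prompt cache) and 800 output tokens:
# 1.5 * 0.003 + 0.5 * 0.0003 + 0.8 * 0.015 = 0.01665 USD
print(round(sonnet_call_cost(2000, 800, cached_tokens=500), 5))  # 0.01665
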
llm_cost_guard/pricing/bedrock.yaml
ADDED
@@ -0,0 +1,215 @@
+version: "2026-01-15"
+# AWS Bedrock pricing (us-east-1 region)
+models:
+  # Anthropic Claude on Bedrock
+  anthropic.claude-3-5-sonnet-20241022-v2:0:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-3-5-sonnet-20240620-v1:0:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-3-5-haiku-20241022-v1:0:
+    input_cost_per_1k: 0.0008
+    output_cost_per_1k: 0.004
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-3-opus-20240229-v1:0:
+    input_cost_per_1k: 0.015
+    output_cost_per_1k: 0.075
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-3-sonnet-20240229-v1:0:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-3-haiku-20240307-v1:0:
+    input_cost_per_1k: 0.00025
+    output_cost_per_1k: 0.00125
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-v2:1:
+    input_cost_per_1k: 0.008
+    output_cost_per_1k: 0.024
+    context_window: 200000
+    model_type: chat
+
+  anthropic.claude-v2:
+    input_cost_per_1k: 0.008
+    output_cost_per_1k: 0.024
+    context_window: 100000
+    model_type: chat
+
+  anthropic.claude-instant-v1:
+    input_cost_per_1k: 0.0008
+    output_cost_per_1k: 0.0024
+    context_window: 100000
+    model_type: chat
+
+  # Amazon Titan
+  amazon.titan-text-premier-v1:0:
+    input_cost_per_1k: 0.0005
+    output_cost_per_1k: 0.0015
+    context_window: 32000
+    model_type: chat
+
+  amazon.titan-text-express-v1:
+    input_cost_per_1k: 0.0002
+    output_cost_per_1k: 0.0006
+    context_window: 8000
+    model_type: chat
+
+  amazon.titan-text-lite-v1:
+    input_cost_per_1k: 0.00015
+    output_cost_per_1k: 0.0002
+    context_window: 4000
+    model_type: chat
+
+  amazon.titan-embed-text-v1:
+    input_cost_per_1k: 0.0001
+    output_cost_per_1k: 0.0
+    context_window: 8000
+    model_type: embedding
+    embedding_dimensions: 1536
+
+  amazon.titan-embed-text-v2:0:
+    input_cost_per_1k: 0.00002
+    output_cost_per_1k: 0.0
+    context_window: 8000
+    model_type: embedding
+    embedding_dimensions: 1024
+
+  # Meta Llama
+  meta.llama3-2-90b-instruct-v1:0:
+    input_cost_per_1k: 0.002
+    output_cost_per_1k: 0.002
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-2-11b-instruct-v1:0:
+    input_cost_per_1k: 0.00016
+    output_cost_per_1k: 0.00016
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-2-3b-instruct-v1:0:
+    input_cost_per_1k: 0.00015
+    output_cost_per_1k: 0.00015
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-2-1b-instruct-v1:0:
+    input_cost_per_1k: 0.0001
+    output_cost_per_1k: 0.0001
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-1-405b-instruct-v1:0:
+    input_cost_per_1k: 0.00195
+    output_cost_per_1k: 0.00256
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-1-70b-instruct-v1:0:
+    input_cost_per_1k: 0.00072
+    output_cost_per_1k: 0.00072
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-1-8b-instruct-v1:0:
+    input_cost_per_1k: 0.00022
+    output_cost_per_1k: 0.00022
+    context_window: 128000
+    model_type: chat
+
+  meta.llama3-70b-instruct-v1:0:
+    input_cost_per_1k: 0.00265
+    output_cost_per_1k: 0.0035
+    context_window: 8000
+    model_type: chat
+
+  meta.llama3-8b-instruct-v1:0:
+    input_cost_per_1k: 0.0003
+    output_cost_per_1k: 0.0006
+    context_window: 8000
+    model_type: chat
+
+  # Mistral
+  mistral.mistral-large-2407-v1:0:
+    input_cost_per_1k: 0.002
+    output_cost_per_1k: 0.006
+    context_window: 128000
+    model_type: chat
+
+  mistral.mistral-large-2402-v1:0:
+    input_cost_per_1k: 0.004
+    output_cost_per_1k: 0.012
+    context_window: 32000
+    model_type: chat
+
+  mistral.mistral-small-2402-v1:0:
+    input_cost_per_1k: 0.001
+    output_cost_per_1k: 0.003
+    context_window: 32000
+    model_type: chat
+
+  mistral.mixtral-8x7b-instruct-v0:1:
+    input_cost_per_1k: 0.00045
+    output_cost_per_1k: 0.0007
+    context_window: 32000
+    model_type: chat
+
+  mistral.mistral-7b-instruct-v0:2:
+    input_cost_per_1k: 0.00015
+    output_cost_per_1k: 0.0002
+    context_window: 32000
+    model_type: chat
+
+  # Cohere
+  cohere.command-r-plus-v1:0:
+    input_cost_per_1k: 0.003
+    output_cost_per_1k: 0.015
+    context_window: 128000
+    model_type: chat
+
+  cohere.command-r-v1:0:
+    input_cost_per_1k: 0.0005
+    output_cost_per_1k: 0.0015
+    context_window: 128000
+    model_type: chat
+
+  cohere.command-text-v14:
+    input_cost_per_1k: 0.0015
+    output_cost_per_1k: 0.002
+    context_window: 4096
+    model_type: chat
+
+  cohere.command-light-text-v14:
+    input_cost_per_1k: 0.0003
+    output_cost_per_1k: 0.0006
+    context_window: 4096
+    model_type: chat
+
+  cohere.embed-english-v3:
+    input_cost_per_1k: 0.0001
+    output_cost_per_1k: 0.0
+    context_window: 512
+    model_type: embedding
+    embedding_dimensions: 1024
+
+  cohere.embed-multilingual-v3:
+    input_cost_per_1k: 0.0001
+    output_cost_per_1k: 0.0
+    context_window: 512
+    model_type: embedding
+    embedding_dimensions: 1024
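A sketch of how a table like this could be materialised into the ModelPricing dataclass from models.py. The real loader lives in llm_cost_guard/pricing/loader.py, which is not shown in this diff, so the function below is an assumption about its behaviour rather than that module's API; it relies only on PyYAML and the ModelPricing / ModelType definitions shown above.

from typing import Dict

import yaml

from llm_cost_guard.models import ModelPricing, ModelType


def load_pricing(path: str) -> Dict[str, ModelPricing]:
    """Parse a pricing YAML file into ModelPricing objects keyed by model id."""
    with open(path, "r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    pricing: Dict[str, ModelPricing] = {}
    for model_id, spec in data.get("models", {}).items():
        pricing[model_id] = ModelPricing(
            input_cost_per_1k=spec["input_cost_per_1k"],
            output_cost_per_1k=spec["output_cost_per_1k"],
            cached_input_cost_per_1k=spec.get("cached_input_cost_per_1k"),
            context_window=spec.get("context_window", 128000),
            model_type=ModelType(spec.get("model_type", "chat")),
            embedding_dimensions=spec.get("embedding_dimensions"),
        )
    return pricing


# e.g. load_pricing("llm_cost_guard/pricing/bedrock.yaml")["amazon.titan-text-express-v1"]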