PyPI - token-limit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

token-limit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

token_limit/__init__.py +8 -0
token_limit/config.py +35 -0
token_limit/exceptions.py +5 -0
token_limit/meter.py +479 -0
token_limit/patches/__init__.py +14 -0
token_limit/patches/_base_patch.py +397 -0
token_limit/patches/anthropic_patch.py +627 -0
token_limit/patches/deepseek_patch.py +707 -0
token_limit/patches/google_patch.py +677 -0
token_limit/patches/openai_patch.py +1199 -0
token_limit/patches/openrouter_patch.py +400 -0
token_limit/transport/http_client.py +311 -0
token_limit/transport/queue.py +95 -0
token_limit/types.py +92 -0
token_limit-0.1.0.dist-info/METADATA +532 -0
token_limit-0.1.0.dist-info/RECORD +18 -0
token_limit-0.1.0.dist-info/WHEEL +4 -0
token_limit-0.1.0.dist-info/licenses/LICENSE +35 -0

token_limit/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# Package version string.
+__version__ = '0.1.0'
+# Re-export the public API so callers can import everything from `token_limit`.
+from token_limit.meter import Meter, PER_MONTH, PER_DAY
+from token_limit.exceptions import LimitExceededException
+# MeterConfig is defined in token_limit.config; token_limit.meter imports it,
+# which is why it can be pulled from there.
+from token_limit.meter import MeterConfig
+# Names exported by `from token_limit import *`.
+__all__ = ["Meter", "PER_MONTH", "PER_DAY", "LimitExceededException", "MeterConfig"]

token_limit/config.py ADDED Viewed

@@ -0,0 +1,35 @@
+from dataclasses import dataclass, field
+from typing import Callable, List, Optional
+from .transport.http_client import INGEST_URL
+from .types import LLMEvent
+@dataclass
+class MeterConfig:
+    """
+    Settings object handed to ``Meter``.
+    Only ``api_key`` has no default; every other field may be omitted.
+    """
+    # ── endpoint / credentials ────────────────────────────────────
+    api_key: str  # your SaaS API key (required, no default)
+    url: str = INGEST_URL  # POST endpoint on your backend; defaults to the SDK's ingest URL
+    # ── batching (forwarded by Meter to its EventQueue) ───────────
+    flush_interval: float = 5.0  # seconds between auto-flushes
+    max_batch_size: int = 50  # flush early if queue hits this
+    max_queue_size: int = 1000  # drop oldest if queue overflows
+    # ── limit checks ─────────────────────────────────────────────
+    limit_check_cache_ttl: float = (
+        5.0  # seconds a check_limit() result is cached per tenant
+    )
+    # ── behaviour ─────────────────────────────────────────────────
+    raise_on_error: bool = False  # if True, re-raise patch exceptions
+    debug: bool = False  # emit extra log records (patch status, captured events) via the "token_limit" logger
+    # ── hooks (optional) ─────────────────────────────────────────
+    on_event: Optional[Callable[[LLMEvent], None]] = None  # called after capture
+    on_flush_error: Optional[Callable[[Exception], None]] = None  # transport errors
+    # ── patches to install (default = all available) ──────────────
+    # default_factory gives every config instance its own list.
+    patches: List[str] = field(
+        default_factory=lambda: ["openai", "anthropic", "deepseek", "google","openrouter"]
+    )

token_limit/exceptions.py ADDED Viewed

@@ -0,0 +1,5 @@
+class LimitExceededException(Exception):
+    """Signals that a tenant has gone over its configured usage threshold."""
+    def __init__(self, message: str = "tenant_usage_limit_exceeded"):
+        # Hand the (possibly default) text to Exception so str(exc) and
+        # exc.args carry it.
+        super().__init__(message)

token_limit/meter.py ADDED Viewed

@@ -0,0 +1,479 @@
+"""
+token_limit.meter
+~~~~~~~~~~~~~~~~~~~~~
+The single entry point developers import and initialise.
+Usage
+-----
+    from token_limit import Meter, MeterConfig
+    meter = Meter(MeterConfig(
+        api_key="sk-...",
+        url="https://api.yoursaas.com/v1/ingest",
+    ))
+    meter.patch_all()   # ← one line, all providers auto-instrumented
+    # Per-request tenant override:
+    with meter.for_tenant("stripe-inc"):
+        response = openai_client.chat.completions.create(...)
+"""
+from __future__ import annotations
+import atexit
+from datetime import datetime, timezone
+import logging
+import threading
+import time
+from contextlib import contextmanager
+from contextvars import ContextVar
+from typing import Dict, Generator, List, Optional, Tuple
+try:
+    import aiohttp as _aiohttp  # optional — only needed for async limit checks
+except ImportError:  # pragma: no cover
+    _aiohttp = None  # type: ignore[assignment]
+from .exceptions import LimitExceededException
+from .config import MeterConfig
+from .patches import PATCH_REGISTRY
+from .patches._base_patch import BasePatch
+from .transport.http_client import CHECK_LIMIT_URL, HttpClient, SET_LIMIT_URL, _TIMEOUT
+from .transport.queue import EventQueue
+from .types import LLMEvent
+# Logger used by this module for all SDK diagnostics.
+logger = logging.getLogger("token_limit")
+# Thread-/async-safe tenant_id override
+# Holds the tenant id for the current context; None means no tenant is active.
+_tenant_id_var: ContextVar[Optional[str]] = ContextVar("tenant_id", default=None)
+# Frequency identifiers; PER_MONTH is the default `frequency` of Meter.set_limit().
+PER_DAY = "per_day"
+PER_MONTH = "per_month"
+class Meter:
+    """
+    Central SDK object.  One instance per application.
+    Parameters
+    ----------
+    config : MeterConfig
+        All configuration in one place.
+    """
+    def __init__(self, config: MeterConfig) -> None:
+        """
+        Build the transports, the limit-check cache and the event queue.
+        Parameters
+        ----------
+        config : MeterConfig
+            Source of the API key, ingest URL, batching settings and hooks.
+        """
+        self.config = config
+        api_key = config.api_key
+        # provider name -> patch instance installed through patch()
+        self._patches: Dict[str, BasePatch] = {}
+        # Transport whose send_batch() is used as the queue's flush function.
+        self._http = HttpClient(
+            url=config.url,
+            api_key=api_key,
+            on_error=config.on_flush_error,
+        )
+        # One long-lived client per auxiliary endpoint, created here so that
+        # check_limit() / set_limit() never have to construct a client on
+        # each call.
+        self._limit_check_http = HttpClient(url=CHECK_LIMIT_URL, api_key=api_key)
+        self._set_limit_http = HttpClient(url=SET_LIMIT_URL, api_key=api_key)
+        # aiohttp session used by async_check_limit(). Left unset here;
+        # _get_aiohttp_session() creates it on first use.
+        self._aiohttp_session: Optional["_aiohttp.ClientSession"] = None
+        # Limit-check cache, keyed by tenant_id:
+        #   {tenant_id: (time.monotonic() of the check, limit_exceeded, remaining_tokens)}
+        # An entry younger than the TTL lets check_limit() /
+        # async_check_limit() answer without a request to the backend, at
+        # the price of a verdict that can be up to TTL seconds old.
+        self._limit_cache: Dict[str, Tuple[float, bool, Optional[int]]] = {}
+        self._limit_cache_lock = threading.Lock()
+        # getattr() tolerates config objects that lack the attribute.
+        self._limit_cache_ttl: float = getattr(config, "limit_check_cache_ttl", 5.0)
+        # Tokens captured per tenant since that tenant's last stored limit
+        # result: _capture() adds to it, _store_limit_result() zeroes it.
+        # Shares _limit_cache_lock with _limit_cache.
+        self._local_token_spend: Dict[str, int] = {}
+        self._queue = EventQueue(
+            flush_fn=self._http.send_batch,
+            flush_interval=config.flush_interval,
+            max_batch_size=config.max_batch_size,
+            max_queue_size=config.max_queue_size,
+        )
+        # Shut the queue down when the interpreter exits.
+        atexit.register(self._queue.shutdown)
+    # ── patching ─────────────────────────────────────────────────
+    def patch_all(self) -> "Meter":
+        """Call patch() for each name in ``config.patches``; returns self for chaining."""
+        for provider_name in self.config.patches:
+            self.patch(provider_name)
+        return self
+    def patch(self, provider: str) -> "Meter":
+        """
+        Instantiate and apply the patch registered under ``provider``.
+        Returns self so calls can be chained.
+        Raises ValueError when ``provider`` is not a key of PATCH_REGISTRY.
+        """
+        patch_cls = PATCH_REGISTRY.get(provider)
+        if patch_cls is None:
+            available = list(PATCH_REGISTRY)
+            raise ValueError(f"Unknown provider '{provider}'. Available: {available}")
+        installed = patch_cls(self)
+        installed.patch()
+        # NOTE(review): an instance already stored for this provider is
+        # replaced without being unpatched first — confirm that the patch
+        # classes tolerate being applied twice.
+        self._patches[provider] = installed
+        if not self.config.debug:
+            return self
+        if installed.is_active:
+            status = "active"
+        else:
+            status = "skipped (not installed)"
+        logger.info("token_limit: patch '%s' %s", provider, status)
+        return self
+    def unpatch_all(self) -> "Meter":
+        """Call unpatch() on every installed patch, forget them all, return self."""
+        installed = self._patches
+        for active_patch in installed.values():
+            active_patch.unpatch()
+        installed.clear()
+        return self
+    def unpatch(self, provider: str) -> "Meter":
+        """Remove and unpatch the patch stored for ``provider``, if any; returns self."""
+        removed = self._patches.pop(provider, None)
+        if not removed:
+            # Nothing was installed under that name.
+            return self
+        removed.unpatch()
+        return self
+    # ── tenant context ─────────────────────────────────────────
+    @property
+    def current_tenant_id(self) -> Optional[str]:
+        """Tenant id stored for the current context, or None when none is set."""
+        active_tenant = _tenant_id_var.get()
+        return active_tenant
+    @contextmanager
+    def for_tenant(self, tenant_id: str) -> Generator[None, None, None]:
+        """
+        Make ``tenant_id`` the active tenant for the duration of a ``with`` block.
+        The value lives in a ContextVar; the previous value is restored when
+        the block ends, including when it ends with an exception.
+            with meter.for_tenant("acme-corp"):
+                client.chat.completions.create(...)
+        """
+        reset_token = _tenant_id_var.set(tenant_id)
+        try:
+            yield
+        finally:
+            # Put back whatever tenant was active before the block.
+            _tenant_id_var.reset(reset_token)
+    def set_tenant(self, tenant_id: str) -> None:
+        """
+        Set the active tenant without a ``with`` block.
+        Unlike for_tenant(), no reset token is kept, so the value stays in
+        the current context until it is set again.
+        """
+        _tenant_id_var.set(tenant_id)
+    def _cached_limit_entry(
+        self, tenant_id: str
+    ) -> Optional[Tuple[bool, Optional[int]]]:
+        """
+        Look up the cached limit-check verdict for ``tenant_id``.
+        Returns (limit_exceeded, remaining_tokens) while the entry is no
+        older than the cache TTL; returns None when there is no entry or
+        the entry has aged out.
+        """
+        with self._limit_cache_lock:
+            cached = self._limit_cache.get(tenant_id)
+        if cached is None:
+            return None
+        stamp, exceeded, remaining = cached
+        age = time.monotonic() - stamp
+        if age > self._limit_cache_ttl:
+            # Stale entry: report a miss. The entry itself is left in place.
+            return None
+        return exceeded, remaining
+    def _store_limit_result(
+        self,
+        tenant_id: str,
+        limit_exceeded: bool,
+        remaining_tokens: Optional[int] = None,
+    ) -> None:
+        """
+        Cache a fresh limit-check verdict for ``tenant_id``.
+        The entry is stamped with time.monotonic(), and the tenant's local
+        token counter is reset to 0 under the same lock.
+        """
+        with self._limit_cache_lock:
+            stamp = time.monotonic()
+            self._limit_cache[tenant_id] = (stamp, limit_exceeded, remaining_tokens)
+            # A new result was just recorded, so local accounting for this
+            # tenant starts over from zero.
+            self._local_token_spend[tenant_id] = 0
+    def _accumulate_local_tokens(self, data: dict) -> None:
+        """
+        Add one event's token usage to its tenant's local running total.
+        The amount is ``data["total_tokens"]`` when that is truthy, otherwise
+        ``input_tokens + output_tokens`` (missing or None counts as 0).
+        Events without a tenant_id, or with no tokens, are ignored.
+        Invoked by _capture() for every event.
+        """
+        tenant_id = data.get("tenant_id")
+        if not tenant_id:
+            return
+        spent = data.get("total_tokens")
+        if not spent:
+            prompt_side = data.get("input_tokens") or 0
+            completion_side = data.get("output_tokens") or 0
+            spent = prompt_side + completion_side
+        if not spent:
+            return
+        with self._limit_cache_lock:
+            previous = self._local_token_spend.get(tenant_id, 0)
+            self._local_token_spend[tenant_id] = previous + int(spent)
+    def check_limit(self) -> None:
+        """
+        Synchronously verify that the active tenant is still under its limit.
+        Does nothing when no tenant is active. A cached verdict that is
+        still within the TTL is used as-is; otherwise the limit-check
+        endpoint is queried and the answer is cached for the next call.
+        Raises LimitExceededException when the verdict says the limit is exceeded.
+        """
+        tenant_id = self.current_tenant_id
+        if not tenant_id:
+            return
+        # NOTE(review): neither remaining_tokens nor _local_token_spend is
+        # consulted in this method — confirm where the local trip-wire is
+        # meant to be enforced.
+        cached = self._cached_limit_entry(tenant_id)
+        if cached is None:
+            # Cache miss or expired entry: do a real sync with the backend.
+            response = self._limit_check_http._get(params={"tenant_id": tenant_id})
+            limit_exceeded = bool(response.get("limit_exceeded"))
+            self._store_limit_result(
+                tenant_id, limit_exceeded, response.get("remaining_tokens")
+            )
+        else:
+            limit_exceeded = cached[0]
+        if limit_exceeded:
+            raise LimitExceededException()
+    async def async_check_limit(self) -> None:
+        """
+        Coroutine counterpart of check_limit().
+        Does nothing when no tenant is active. Reads and writes the same
+        per-tenant TTL cache as check_limit(), so sync and async callers
+        share verdicts.
+        On a cache miss the limit-check endpoint is queried through the
+        reused aiohttp session. When ``aiohttp`` is not installed, a warning
+        is logged and the synchronous check_limit() is called instead,
+        which blocks the event loop for the duration of that call.
+        Raises LimitExceededException when the verdict says the limit is exceeded.
+        """
+        tenant_id = self.current_tenant_id
+        if not tenant_id:
+            return
+        cached = self._cached_limit_entry(tenant_id)
+        if cached is not None:
+            if cached[0]:
+                raise LimitExceededException()
+            return
+        if _aiohttp is None:
+            # No async HTTP client available: degrade to the blocking path.
+            logger.warning(
+                "token_limit: aiohttp is not installed; falling back to "
+                "synchronous limit check inside async wrapper.  "
+                "Install aiohttp to avoid blocking the event loop."
+            )
+            self.check_limit()
+            return
+        request_headers = {
+            "Authorization": f"Bearer {self.config.api_key}",
+            "Content-Type": "application/json",
+        }
+        query = {"tenant_id": tenant_id}
+        request_timeout = _aiohttp.ClientTimeout(total=_TIMEOUT)
+        http_session = self._get_aiohttp_session()
+        async with http_session.get(
+            CHECK_LIMIT_URL,
+            headers=request_headers,
+            params=query,
+            timeout=request_timeout,
+        ) as resp:
+            resp.raise_for_status()
+            body = await resp.json()
+        exceeded = bool(body.get("limit_exceeded"))
+        self._store_limit_result(tenant_id, exceeded, body.get("remaining_tokens"))
+        if exceeded:
+            raise LimitExceededException()
+    def _get_aiohttp_session(self) -> "_aiohttp.ClientSession":
+        """
+        Return the shared aiohttp.ClientSession, creating it when needed.
+        A new session is created only when none exists yet or the stored
+        one reports ``closed``; otherwise the stored session is handed
+        back, so async_check_limit() does not open a session per call.
+        Creation happens here, on first use, rather than in __init__.
+        """
+        session = self._aiohttp_session
+        if session is not None and not session.closed:
+            return session
+        session = _aiohttp.ClientSession()
+        self._aiohttp_session = session
+        return session
+    def set_limit(
+        self,
+        tenant_id: str,
+        limit_usd: float,
+        frequency: str = PER_MONTH,
+        effective_date: Optional[str] = None,
+    ) -> None:
+        """
+        Set (or update) the spend limit for a tenant.
+        Posts the threshold to the set-limit endpoint, then drops the
+        tenant's cached limit-check verdict and local token counter so the
+        next check_limit() queries the backend instead of the cache.
+        Parameters
+        ----------
+        tenant_id : str
+            Tenant the limit applies to.
+        limit_usd : float
+            Spend limit, sent as ``limit_usd``.
+        frequency : str
+            Sent as ``frequency``; defaults to PER_MONTH (PER_DAY is the
+            other constant this module defines).
+        effective_date : str, optional
+            ISO-8601 timestamp sent as ``effective_date``. When omitted,
+            the current UTC time at the moment of the call is used.
+        """
+        if effective_date is None:
+            # Resolve "now" on every call. A default written in the
+            # signature is evaluated once, when the function is defined,
+            # which would pin every later call to the import-time timestamp.
+            effective_date = datetime.now(tz=timezone.utc).isoformat()
+        self._set_limit_http._post(
+            payload={
+                "tenant_id": tenant_id,
+                "limit_usd": limit_usd,
+                "frequency": frequency,
+                "effective_date": effective_date,
+            },
+        )
+        # Invalidate any cached limit-check result so the new threshold
+        # is honored immediately rather than after the TTL expires.
+        with self._limit_cache_lock:
+            self._limit_cache.pop(tenant_id, None)
+            self._local_token_spend.pop(tenant_id, None)
+    # ── openrouter  ──────────────────────────────────────────
+    def register_openrouter_client(self, client) -> None:
+        """
+        Instrument an existing client instance through the OpenRouter patch.
+        Installs the "openrouter" patch first when it is not installed yet,
+        then passes ``client`` to that patch's patch_instance().
+        """
+        try:
+            openrouter_patch = self._patches["openrouter"]
+        except KeyError:
+            # patch() creates the instance and stores it in self._patches.
+            self.patch("openrouter")
+            openrouter_patch = self._patches["openrouter"]
+        openrouter_patch.patch_instance(client)
+    def openrouter_client(self, api_key: str, **kwargs):
+        """
+        Build an ``openai.OpenAI`` client aimed at OpenRouter and register it.
+        Extra keyword arguments are forwarded to the client constructor.
+        ``openai`` is imported only when this method is called.
+        """
+        from openai import OpenAI
+        client = OpenAI(
+            base_url="https://openrouter.ai/api/v1", api_key=api_key, **kwargs
+        )
+        self.register_openrouter_client(client)
+        return client
+    def async_openrouter_client(self, api_key: str, **kwargs):
+        """
+        Build an ``openai.AsyncOpenAI`` client aimed at OpenRouter and register it.
+        Extra keyword arguments are forwarded to the client constructor.
+        ``openai`` is imported only when this method is called.
+        """
+        from openai import AsyncOpenAI
+        client = AsyncOpenAI(
+            base_url="https://openrouter.ai/api/v1", api_key=api_key, **kwargs
+        )
+        self.register_openrouter_client(client)
+        return client
+    # ── manual tracking ──────────────────────────────────────────
+    def track_manually(
+        self,
+        *,
+        tenant_id: Optional[str] = None,
+        provider: str,
+        model: str,
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        **extra,
+    ) -> LLMEvent:
+        """
+        Record an event by hand (for unpatched providers or custom logic).
+            meter.track_manually(
+                # necessary fields
+                provider="custom",
+                model="my-model-v1",
+                input_tokens=512,
+                output_tokens=128,
+                tenant_id="user_track_manually",
+            )
+        ``total_tokens`` is derived as input_tokens + output_tokens, and the
+        tenant falls back to the active tenant when ``tenant_id`` is not given.
+        Keys in ``extra`` are merged last, so they win over the derived values.
+        Note: for a cost to be calculated, the values sent here need matching
+        'Model Pricing' entries in the dashboard; without them the event is
+        still recorded, just without a cost.
+        """
+        event_data = {
+            "provider": provider,
+            "model": model,
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": input_tokens + output_tokens,
+            "tenant_id": tenant_id or self.current_tenant_id,
+        }
+        event_data.update(extra)
+        captured = self._capture(event_data)
+        return captured
+    # ── context manager support ──────────────────────────────────
+    def __enter__(self) -> "Meter":
+        """Enter the ``with`` block; the meter itself is the managed object."""
+        return self
+    def __exit__(self, *_) -> None:
+        """
+        Leave the ``with`` block: unpatch everything, shut the queue down,
+        and make a best-effort attempt to close the aiohttp session.
+        Any exception raised while closing the session is swallowed.
+        """
+        self.unpatch_all()
+        self._queue.shutdown()
+        session = self._aiohttp_session
+        if session is None or session.closed:
+            return
+        # This method is synchronous while session.close() is a coroutine,
+        # so closing is best-effort: schedule it on a running loop, or drive
+        # it to completion on an idle one. Failures during cleanup are
+        # deliberately ignored rather than raised.
+        try:
+            import asyncio
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                dispatch = loop.create_task
+            else:
+                dispatch = loop.run_until_complete
+            dispatch(session.close())
+        except Exception:
+            pass
+    # ── internal ─────────────────────────────────────────────────
+    def _capture(self, data: dict) -> LLMEvent:
+        """
+        Build an LLMEvent from ``data`` and enqueue it.
+        Steps: add the event's tokens to the tenant's local counter, keep
+        only the keys that are LLMEvent dataclass fields, call the optional
+        ``config.on_event`` hook, then push the event onto the queue.
+        Parameters
+        ----------
+        data : dict
+            Raw event fields; keys that are not LLMEvent fields are dropped
+            (and reported at debug level when ``config.debug`` is on).
+        Returns
+        -------
+        LLMEvent
+            The event that was enqueued.
+        """
+        self._accumulate_local_tokens(data)
+        known_fields = LLMEvent.__dataclass_fields__
+        debug = self.config.debug
+        if debug:
+            dropped = [k for k in data if k not in known_fields]
+            if dropped:
+                logger.debug(
+                    "token_limit: dropping unknown event field(s) %s "
+                    "(not present on LLMEvent) — check for typos in "
+                    "track_manually()/extractor kwargs.",
+                    dropped,
+                )
+        event = LLMEvent(**{k: v for k, v in data.items() if k in known_fields})
+        if debug:
+            logger.debug("token_limit captured: %s", event)
+        on_event = self.config.on_event
+        if on_event:
+            try:
+                on_event(event)
+            except Exception:
+                # A faulty user hook must not stop the event from being
+                # recorded, but it should not fail invisibly either: log
+                # it with its traceback and carry on.
+                logger.exception(
+                    "token_limit: on_event hook raised; event is still enqueued"
+                )
+        self._queue.push(event)
+        return event

token_limit/patches/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from token_limit.patches.deepseek_patch import DeepSeekPatch
+from .anthropic_patch import AnthropicPatch
+from .openai_patch import OpenAIPatch
+from .google_patch import GooglePatch
+from .openrouter_patch import OpenRouterPatch
+# Provider name -> patch class. Meter.patch() looks providers up here, and
+# the key order is what its "Unknown provider" error message lists.
+PATCH_REGISTRY = {
+    "anthropic": AnthropicPatch,
+    "openai": OpenAIPatch,
+    "google": GooglePatch,
+    "openrouter": OpenRouterPatch,
+    "deepseek": DeepSeekPatch
+}