PyPI - tokenrail - Versions diffs - 1.0.0__py3-none-any.whl - Mend

tokenrail 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

tokenrail/__init__.py +31 -0
tokenrail/catalog.py +159 -0
tokenrail/client.py +55 -0
tokenrail/executor.py +246 -0
tokenrail/monitor.py +203 -0
tokenrail/providers/__init__.py +3 -0
tokenrail/providers/base.py +20 -0
tokenrail/providers/openai.py +218 -0
tokenrail/py.typed +0 -0
tokenrail/sinks.py +104 -0
tokenrail/types.py +219 -0
tokenrail-1.0.0.dist-info/METADATA +143 -0
tokenrail-1.0.0.dist-info/RECORD +15 -0
tokenrail-1.0.0.dist-info/WHEEL +4 -0
tokenrail-1.0.0.dist-info/licenses/LICENSE +21 -0

tokenrail/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Thin client and batch execution helpers for OpenAI Responses API workloads.
+tokenrail wraps the OpenAI Responses API with a ``client.responses.create(...)``-style
+surface and adds thread-based batch execution, client-side RPM/TPM submit throttling,
+per-model token/cost monitoring, and resumable JSONL / per-request result writing.
+"""
+from .client import RailClient
+from .executor import BatchExecutor, batch_items_from_queries
+from .monitor import RollingMetricsMonitor
+from .providers import OpenAIProvider
+from .sinks import PerRequestJsonSink, ResultsJsonlSink
+from .types import BatchItem, CostBreakdown, NormalizedResponse, StatsSnapshot, UsageBreakdown
+# Public package version.
+__version__ = "1.0.0"
+# Names exported by ``from tokenrail import *``; every entry except ``__version__``
+# is imported from a submodule above.
+__all__ = [
+    "__version__",
+    "BatchExecutor",
+    "BatchItem",
+    "CostBreakdown",
+    "NormalizedResponse",
+    "OpenAIProvider",
+    "PerRequestJsonSink",
+    "RailClient",
+    "ResultsJsonlSink",
+    "RollingMetricsMonitor",
+    "StatsSnapshot",
+    "UsageBreakdown",
+    "batch_items_from_queries",
+]

tokenrail/catalog.py ADDED Viewed

@@ -0,0 +1,159 @@
+from __future__ import annotations
+from collections.abc import Iterable
+from dataclasses import dataclass
+from decimal import Decimal
+from typing import TypeVar
+from .types import CostBreakdown, UsageBreakdown
+# Generic payload type carried by a rule-table entry (capabilities or pricing).
+_T = TypeVar("_T")
+@dataclass(frozen=True, slots=True)
+class ModelCapabilities:
+    """Immutable set of flags describing request-parameter capabilities of a model family."""
+    # Each flag is named after the request parameter it refers to.
+    # NOTE(review): how callers act on a False flag is not visible here — confirm in the provider.
+    reasoning_effort: bool
+    verbosity: bool
+    temperature: bool
+    top_p: bool
+    max_output_tokens: bool
+    response_format: bool
+@dataclass(frozen=True, slots=True)
+class ModelPricing:
+    """Immutable per-million-token prices for a model family, split by token kind."""
+    # Price for one million non-cached input tokens.
+    input_per_million: Decimal
+    # Price for one million cached input tokens; ``None`` when no cached rate is recorded.
+    cached_input_per_million: Decimal | None
+    # Price for one million output tokens.
+    output_per_million: Decimal
+    # Service tier label attached to these prices.
+    service_tier: str = "default"
+# Capability registry: each entry pairs a tuple of model-name fragments with the
+# capabilities returned when one of those fragments matches the model name.
+# NOTE(review): the individual flag values are not verifiable from this file —
+# confirm them against the provider's current parameter support.
+_CAPABILITY_RULES: list[tuple[tuple[str, ...], ModelCapabilities]] = [
+    (
+        ("gpt-5", "o1", "o3", "o4"),
+        ModelCapabilities(
+            reasoning_effort=True,
+            verbosity=True,
+            temperature=True,
+            top_p=True,
+            max_output_tokens=True,
+            response_format=True,
+        ),
+    ),
+    (
+        ("gpt-4.1", "gpt-4o"),
+        ModelCapabilities(
+            reasoning_effort=False,
+            verbosity=True,
+            temperature=True,
+            top_p=True,
+            max_output_tokens=True,
+            response_format=True,
+        ),
+    ),
+]
+# Fallback capabilities used when no capability rule matches the model name.
+_DEFAULT_CAPABILITIES = ModelCapabilities(
+    reasoning_effort=False,
+    verbosity=False,
+    temperature=True,
+    top_p=True,
+    max_output_tokens=True,
+    response_format=True,
+)
+# Pricing registry. ``ModelPricing`` is built positionally as
+# (input, cached input, output), each a price per one million tokens.
+# Rule selection prefers the longest matching fragment, so "gpt-5.4-mini" beats
+# "gpt-5.4" regardless of list position; list order only breaks ties between
+# fragments of equal length.
+_PRICING_RULES: list[tuple[tuple[str, ...], ModelPricing]] = [
+    (("gpt-5.5",), ModelPricing(Decimal("5.00"), Decimal("0.50"), Decimal("30.00"))),
+    (("gpt-5.4-mini",), ModelPricing(Decimal("0.750"), Decimal("0.075"), Decimal("4.500"))),
+    (("gpt-5.4-nano",), ModelPricing(Decimal("0.20"), Decimal("0.02"), Decimal("1.25"))),
+    (("gpt-5.4",), ModelPricing(Decimal("2.50"), Decimal("0.25"), Decimal("15.00"))),
+    (("gpt-5.2",), ModelPricing(Decimal("1.75"), Decimal("0.175"), Decimal("14.00"))),
+    (("gpt-5-mini",), ModelPricing(Decimal("0.25"), Decimal("0.025"), Decimal("2.00"))),
+    (("gpt-5-nano",), ModelPricing(Decimal("0.05"), Decimal("0.005"), Decimal("0.40"))),
+    (("gpt-5",), ModelPricing(Decimal("1.25"), Decimal("0.125"), Decimal("10.00"))),
+    (("gpt-4.1-mini",), ModelPricing(Decimal("0.40"), Decimal("0.10"), Decimal("1.60"))),
+    (("gpt-4.1-nano",), ModelPricing(Decimal("0.10"), Decimal("0.025"), Decimal("0.40"))),
+    (("gpt-4.1",), ModelPricing(Decimal("2.00"), Decimal("0.50"), Decimal("8.00"))),
+    (("gpt-4o-mini",), ModelPricing(Decimal("0.15"), Decimal("0.075"), Decimal("0.60"))),
+    (("gpt-4o",), ModelPricing(Decimal("2.50"), Decimal("1.25"), Decimal("10.00"))),
+    (("o4-mini",), ModelPricing(Decimal("1.10"), Decimal("0.275"), Decimal("4.40"))),
+    (("o3",), ModelPricing(Decimal("2.00"), Decimal("0.50"), Decimal("8.00"))),
+    (("o1",), ModelPricing(Decimal("15.00"), Decimal("7.50"), Decimal("60.00"))),
+]
+# Characters allowed to border a model-name fragment for it to count as a match.
+_MODEL_NAME_DELIMITERS = {"-", "_", "/", ":", " ", "."}
+def _is_delimited_match(model: str, candidate: str, start: int, end: int) -> bool:
+    before = model[start - 1] if start > 0 else None
+    after = model[end] if end < len(model) else None
+    before_ok = before is None or before in _MODEL_NAME_DELIMITERS
+    after_ok = after is None or after in _MODEL_NAME_DELIMITERS
+    return before_ok and after_ok
+def _match_rule(model: str, rules: Iterable[tuple[tuple[str, ...], _T]]) -> _T | None:
+    matches: list[tuple[int, int, _T]] = []
+    for rule_index, (prefixes, payload) in enumerate(rules):
+        for prefix in prefixes:
+            start = model.find(prefix)
+            while start != -1:
+                end = start + len(prefix)
+                if _is_delimited_match(model, prefix, start, end):
+                    matches.append((len(prefix), rule_index, payload))
+                    break
+                start = model.find(prefix, start + 1)
+    if not matches:
+        return None
+    matches.sort(key=lambda item: (-item[0], item[1]))
+    return matches[0][2]
+def get_model_capabilities(model: str) -> ModelCapabilities:
+    """Return the request-parameter capabilities for ``model``.
+    Matching is delimiter-aware substring matching against the checked-in
+    capability registry; unknown models fall back to a conservative default.
+    """
+    return _match_rule(model, _CAPABILITY_RULES) or _DEFAULT_CAPABILITIES
+def get_model_pricing(model: str, service_tier: str = "default") -> ModelPricing | None:
+    """Return per-million-token pricing for ``model``, or ``None`` if unknown.
+    The checked-in registry only carries default-tier prices; for other service
+    tiers the default-tier price is returned as an approximation.
+    """
+    return _match_rule(model, _PRICING_RULES)
+def calculate_cost(
+    model: str,
+    usage: UsageBreakdown,
+    payer: str | None,
+    service_tier: str = "default",
+) -> CostBreakdown | None:
+    """Compute the nominal USD cost of ``usage`` and attribute it to a payer.
+    Returns ``None`` when the model has no pricing entry. When ``payer`` is
+    ``"openai"`` the cost is attributed to OpenAI instead of the developer.
+    """
+    pricing = get_model_pricing(model, service_tier=service_tier)
+    if pricing is None:
+        return None
+    uncached_input = max(usage.input_tokens - usage.cached_tokens, 0)
+    cached_rate = pricing.cached_input_per_million or Decimal("0")
+    total = (
+        (Decimal(uncached_input) * pricing.input_per_million)
+        + (Decimal(usage.cached_tokens) * cached_rate)
+        + (Decimal(usage.output_tokens) * pricing.output_per_million)
+    ) / Decimal("1000000")
+    nominal = float(total)
+    if payer == "openai":
+        return CostBreakdown(nominal_usd=nominal, developer_usd=0.0, openai_usd=nominal, payer=payer)
+    return CostBreakdown(nominal_usd=nominal, developer_usd=nominal, openai_usd=0.0, payer=payer)

tokenrail/client.py ADDED Viewed

@@ -0,0 +1,55 @@
+from __future__ import annotations
+from typing import Any
+from .providers.base import BaseProvider
+from .providers.openai import OpenAIProvider
+from .types import NormalizedResponse
+class _ResponsesNamespace:
+    def __init__(self, provider: BaseProvider) -> None:
+        self._provider = provider
+    def create(self, **kwargs: Any) -> NormalizedResponse:
+        return self._provider.create(**kwargs)
+class RailClient:
+    """Provider-agnostic client with a ``client.responses.create(...)`` surface.
+    Wraps a :class:`~tokenrail.providers.base.BaseProvider` and exposes it through
+    a ``responses`` namespace that mirrors the OpenAI SDK call shape while
+    returning :class:`~tokenrail.types.NormalizedResponse` objects.
+    """
+    def __init__(self, provider: BaseProvider) -> None:
+        self.provider = provider
+        self.responses = _ResponsesNamespace(provider)
+    @classmethod
+    def openai(
+        cls,
+        *,
+        api_key: str | None = None,
+        organization: str | None = None,
+        timeout: float | None = None,
+        base_url: str | None = None,
+        max_retries: int = 2,
+        client: Any | None = None,
+    ) -> RailClient:
+        """Build a :class:`RailClient` backed by the OpenAI Python SDK.
+        ``max_retries`` configures the SDK's built-in retry behavior; tokenrail
+        does not add its own retry loop. Pass ``client`` to inject a pre-built
+        (or fake) OpenAI client instead of constructing one.
+        """
+        provider = OpenAIProvider(
+            client=client,
+            api_key=api_key,
+            organization=organization,
+            timeout=timeout,
+            base_url=base_url,
+            max_retries=max_retries,
+        )
+        return cls(provider=provider)

tokenrail/executor.py ADDED Viewed

@@ -0,0 +1,246 @@
+from __future__ import annotations
+import time
+from collections import deque
+from collections.abc import Callable, Sequence
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
+from typing import Any
+from .monitor import RollingMetricsMonitor
+from .sinks import ResultSink
+from .types import BatchItem, NormalizedResponse, StatsSnapshot, TimingBreakdown, UsageBreakdown
+def batch_items_from_queries(queries: dict[str, Any], **shared_request_kwargs: Any) -> list[BatchItem]:
+    """Build :class:`BatchItem` objects from an ``{id: input}`` mapping.
+    Each value becomes the request ``input``; ``shared_request_kwargs`` (e.g.
+    ``model``, ``reasoning_effort``) are applied to every item.
+    """
+    return [
+        BatchItem(id=str(item_id), request_kwargs={"input": messages, **shared_request_kwargs})
+        for item_id, messages in queries.items()
+    ]
+def _error_response(item_id: str, model: str, provider: str, error: Exception) -> NormalizedResponse:
+    """Build a placeholder response describing a failed request.
+    The result carries empty usage, zeroed timing, no output, billing or cost,
+    and an ``error`` string of the form ``"<ExceptionType>: <message>"``.
+    """
+    empty_usage = UsageBreakdown.empty()
+    zero_timing = TimingBreakdown(started_at=0.0, completed_at=0.0, latency_seconds=0.0)
+    message = f"{type(error).__name__}: {error}"
+    return NormalizedResponse(
+        id=item_id,
+        model=model,
+        provider=provider,
+        output_text=None,
+        raw_response=None,
+        usage=empty_usage,
+        billing=None,
+        cost=None,
+        timing=zero_timing,
+        error=message,
+    )
+class _SubmitRateLimiter:
+    """Bookkeeping for client-side requests-per-window and tokens-per-window submit limits.
+    The limiter tracks submit timestamps and completed-request token counts over a
+    rolling window of ``window_seconds``. It performs no locking of its own.
+    """
+    def __init__(
+        self,
+        *,
+        max_rpm: int | None,
+        max_tpm: int | None,
+        window_seconds: float = 60.0,
+        time_fn: Callable[[], float] = time.time,
+        sleep_fn: Callable[[float], None] = time.sleep,
+    ) -> None:
+        """Configure the limits; ``None`` disables the corresponding limit.
+        Raises:
+            ValueError: if ``max_rpm`` or ``max_tpm`` is given and is below 1.
+        """
+        if max_rpm is not None and max_rpm < 1:
+            raise ValueError("max_rpm must be at least 1")
+        if max_tpm is not None and max_tpm < 1:
+            raise ValueError("max_tpm must be at least 1")
+        self.max_rpm = max_rpm
+        self.max_tpm = max_tpm
+        self.window_seconds = window_seconds
+        self.time_fn = time_fn
+        self.sleep_fn = sleep_fn
+        # Submit timestamps still inside the rolling window.
+        self._submitted_at: deque[float] = deque()
+        # (completion time, total tokens) pairs still inside the rolling window.
+        self._completed_events: deque[tuple[float, int]] = deque()
+        # Token estimate recorded for each request at submit time, oldest first.
+        self._inflight_estimates: deque[int] = deque()
+        # Running sum of the entries in ``_inflight_estimates``.
+        self._inflight_estimated_tokens = 0
+        # Lifetime totals (never pruned); they drive the per-request token estimate.
+        self._completed_requests = 0
+        self._completed_tokens = 0
+    def _prune(self, now: float) -> None:
+        """Drop submit timestamps and completion events at or before ``now - window_seconds``."""
+        cutoff = now - self.window_seconds
+        while self._submitted_at and self._submitted_at[0] <= cutoff:
+            self._submitted_at.popleft()
+        while self._completed_events and self._completed_events[0][0] <= cutoff:
+            self._completed_events.popleft()
+    def _estimated_next_tokens(self) -> int:
+        """Return the average tokens per completed request, rounded up; 0 before any completion."""
+        if self._completed_requests == 0:
+            return 0
+        # Integer ceiling division over the lifetime totals.
+        return (self._completed_tokens + self._completed_requests - 1) // self._completed_requests
+    def _rolling_completed_tokens(self) -> int:
+        """Return the sum of tokens over the completion events currently retained."""
+        return sum(tokens for _, tokens in self._completed_events)
+    def can_submit(self) -> bool:
+        """Return ``True`` when one more request may be submitted right now."""
+        now = self.time_fn()
+        self._prune(now)
+        if self.max_rpm is not None and len(self._submitted_at) >= self.max_rpm:
+            return False
+        if self.max_tpm is not None:
+            # No request has completed yet, so there is no token estimate: while a
+            # submit is still inside the window, hold further submits back.
+            if self._completed_requests == 0 and self._submitted_at:
+                return False
+            estimated_next = self._estimated_next_tokens()
+            # Nothing counted in the window and nothing estimated in flight: allow the
+            # submit without comparing the estimate against the cap.
+            if not self._completed_events and self._inflight_estimated_tokens == 0:
+                return True
+            if self._rolling_completed_tokens() + self._inflight_estimated_tokens + estimated_next > self.max_tpm:
+                return False
+        return True
+    def retry_after(self) -> float | None:
+        """Return how long to wait before a submit may be allowed.
+        Returns:
+            A non-negative number of seconds when a time-based wait can be computed,
+            ``0.0`` when a submit is allowed now, or ``None`` when submitting is
+            blocked but no time-based wait is known.
+        """
+        now = self.time_fn()
+        self._prune(now)
+        waits: list[float] = []
+        if self.max_rpm is not None and len(self._submitted_at) >= self.max_rpm:
+            # Time until the oldest submit leaves the window.
+            waits.append(self._submitted_at[0] + self.window_seconds - now)
+        if self.max_tpm is not None and self._completed_events:
+            projected = (
+                self._rolling_completed_tokens() + self._inflight_estimated_tokens + self._estimated_next_tokens()
+            )
+            if projected > self.max_tpm:
+                # Time until the oldest completion event leaves the window.
+                waits.append(self._completed_events[0][0] + self.window_seconds - now)
+        if waits:
+            return max(min(waits), 0.0)
+        if not self.can_submit():
+            return None
+        return 0.0
+    def wait_until_allowed(self) -> None:
+        """Sleep repeatedly until ``can_submit`` returns ``True``."""
+        while not self.can_submit():
+            # ``retry_after`` may be None or 0.0; both fall back to a short 0.01 s sleep.
+            self.sleep_fn(self.retry_after() or 0.01)
+    def record_submit(self) -> None:
+        """Record that a request was submitted now, together with its token estimate."""
+        now = self.time_fn()
+        self._prune(now)
+        self._submitted_at.append(now)
+        # Estimates are only tracked when a token limit is configured.
+        estimated_tokens = self._estimated_next_tokens() if self.max_tpm is not None else 0
+        self._inflight_estimates.append(estimated_tokens)
+        self._inflight_estimated_tokens += estimated_tokens
+    def record_completion(self, response: NormalizedResponse) -> None:
+        """Record a finished request and the tokens it used."""
+        now = self.time_fn()
+        self._prune(now)
+        # The oldest outstanding estimate is released, whichever request actually finished.
+        if self._inflight_estimates:
+            self._inflight_estimated_tokens -= self._inflight_estimates.popleft()
+        # Fall back to input + output when ``total_tokens`` is zero or missing.
+        total_tokens = response.usage.total_tokens or (response.usage.input_tokens + response.usage.output_tokens)
+        self._completed_events.append((now, total_tokens))
+        self._completed_requests += 1
+        self._completed_tokens += total_tokens
+class BatchExecutor:
+    """Thread-based batch runner for :class:`~tokenrail.client.RailClient` requests.
+    Submits items to a thread pool while honoring optional client-side
+    ``max_rpm`` / ``max_tpm`` submit limits, writes each result to the
+    configured sinks, and records metrics on the monitor. Items whose ids are
+    already present in the first sink are skipped, which makes re-runs
+    resumable. Request errors are captured as error responses rather than
+    raised, so a single failing item does not abort the batch.
+    """
+    def __init__(
+        self,
+        *,
+        client: Any,
+        max_workers: int = 20,
+        max_rpm: int | None = None,
+        max_tpm: int | None = None,
+        sinks: Sequence[ResultSink] | None = None,
+        monitor: RollingMetricsMonitor | None = None,
+    ) -> None:
+        self.client = client
+        self.max_workers = max_workers
+        self.max_rpm = max_rpm
+        self.max_tpm = max_tpm
+        self.sinks = list(sinks or [])
+        self.monitor = monitor or RollingMetricsMonitor()
+        self._time_fn = time.time
+        self._sleep_fn = time.sleep
+    def _save(self, response: NormalizedResponse) -> None:
+        for sink in self.sinks:
+            sink.save(response)
+    def _load_done_ids(self) -> set[str]:
+        if not self.sinks:
+            return set()
+        return self.sinks[0].load_done_ids()
+    def _prepare_items(self, items: Sequence[BatchItem] | dict[str, Any]) -> list[BatchItem]:
+        if isinstance(items, dict):
+            return batch_items_from_queries(items)
+        return [BatchItem(id=str(item.id), request_kwargs=dict(item.request_kwargs)) for item in items]
+    def _request_kwargs(self, item: BatchItem) -> dict[str, Any]:
+        request_kwargs = dict(item.request_kwargs)
+        request_kwargs.setdefault("request_id", item.id)
+        return request_kwargs
+    def _call_single(self, item: BatchItem) -> NormalizedResponse:
+        request_kwargs = self._request_kwargs(item)
+        try:
+            return self.client.responses.create(**request_kwargs)
+        except Exception as exc:
+            model = str(request_kwargs.get("model") or getattr(self.client.provider, "model_id", "unknown"))
+            return _error_response(item.id, model=model, provider=self.client.provider.name, error=exc)
+    def _run_threaded(self, items: list[BatchItem]) -> None:
+        """Run ``items`` on a thread pool, throttling submits with a fresh rate limiter.
+        Each completed response is recorded on the limiter, saved to the sinks, and
+        recorded on the monitor, all from the calling thread.
+        """
+        limiter = _SubmitRateLimiter(
+            max_rpm=self.max_rpm,
+            max_tpm=self.max_tpm,
+            time_fn=self._time_fn,
+            sleep_fn=self._sleep_fn,
+        )
+        next_item = 0
+        pending: set[Future[NormalizedResponse]] = set()
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            while next_item < len(items) or pending:
+                # Submit as many items as the worker cap and the limiter currently allow.
+                while next_item < len(items) and len(pending) < self.max_workers and limiter.can_submit():
+                    limiter.record_submit()
+                    pending.add(executor.submit(self._call_single, items[next_item]))
+                    next_item += 1
+                if not pending:
+                    # Nothing in flight: if items remain, only the limiter is blocking, so
+                    # sleep until it opens and retry the submit loop.
+                    if next_item < len(items):
+                        limiter.wait_until_allowed()
+                    continue
+                # Default: block until some request finishes. If a worker slot is free and
+                # only the limiter blocks, wake up when it may open (a ``None`` hint still
+                # means waiting for a completion).
+                timeout = None
+                if next_item < len(items) and len(pending) < self.max_workers and not limiter.can_submit():
+                    timeout = limiter.retry_after()
+                done, pending = wait(pending, timeout=timeout, return_when=FIRST_COMPLETED)
+                if not done:
+                    # Timed out without a completion; go back and try submitting again.
+                    continue
+                for future in done:
+                    response = future.result()
+                    limiter.record_completion(response)
+                    self._save(response)
+                    self.monitor.record(response)
+    def run(self, items: Sequence[BatchItem] | dict[str, Any]) -> StatsSnapshot:
+        """Execute ``items`` (a sequence of :class:`BatchItem` or an ``{id: input}``
+        dict) and return the final :class:`~tokenrail.types.StatsSnapshot`."""
+        self.monitor.reset()
+        normalized_items = self._prepare_items(items)
+        done_ids = self._load_done_ids()
+        todo = [item for item in normalized_items if item.id not in done_ids]
+        skipped = len(normalized_items) - len(todo)
+        self.monitor.start(
+            total_requests=len(normalized_items),
+            todo_requests=len(todo),
+            skipped_requests=skipped,
+        )
+        self._run_threaded(todo)
+        return self.monitor.finalize(total_requests=len(normalized_items), skipped_requests=skipped)