PyPI - llm-evalgate - Versions diffs - 0.1.0__py3-none-any.whl - Mend

llm-evalgate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

llm_evalgate-0.1.0.dist-info/METADATA +188 -0
llm_evalgate-0.1.0.dist-info/RECORD +16 -0
llm_evalgate-0.1.0.dist-info/WHEEL +4 -0
llm_evalkit/__init__.py +4 -0
llm_evalkit/eval/__init__.py +4 -0
llm_evalkit/eval/dimension.py +31 -0
llm_evalkit/eval/dimensions/__init__.py +11 -0
llm_evalkit/eval/dimensions/blocklist.py +32 -0
llm_evalkit/eval/dimensions/factual.py +56 -0
llm_evalkit/eval/dimensions/readability.py +26 -0
llm_evalkit/eval/dimensions/schema.py +30 -0
llm_evalkit/eval/harness.py +50 -0
llm_evalkit/reliable/__init__.py +12 -0
llm_evalkit/reliable/circuit.py +97 -0
llm_evalkit/reliable/fallback.py +53 -0
llm_evalkit/reliable/retry.py +45 -0

llm_evalgate-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,188 @@
+Metadata-Version: 2.4
+Name: llm-evalgate
+Version: 0.1.0
+Summary: Deterministic eval gates and reliability primitives for LLM pipelines
+Project-URL: Homepage, https://github.com/LesterALeong/llm-evalkit
+Project-URL: Repository, https://github.com/LesterALeong/llm-evalkit
+Project-URL: Issues, https://github.com/LesterALeong/llm-evalkit/issues
+Author-email: Lester Leong <lester.leong89@gmail.com>
+License: MIT
+Keywords: agents,ai,eval,evaluation,llm,pipeline,reliability
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.9
+Requires-Dist: textstat>=0.7
+Provides-Extra: dev
+Requires-Dist: hatch>=1.12; extra == 'dev'
+Requires-Dist: pytest-cov>=5.0; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Description-Content-Type: text/markdown
+# llm-evalkit
+Deterministic eval gates and reliability primitives for LLM pipelines.
+[![CI](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml/badge.svg)](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml)
+[![PyPI](https://img.shields.io/pypi/v/llm-evalgate)](https://pypi.org/project/llm-evalgate/)
+[![Python](https://img.shields.io/pypi/pyversions/llm-evalgate)](https://pypi.org/project/llm-evalgate/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+---
+Most LLM eval tooling is either LLM-as-judge (non-deterministic, expensive, not CI-friendly) or a heavy enterprise suite. `llm-evalkit` is neither.
+It gives you two things:
+- **Eval gates**: code-only quality dimensions that run the same way every time. Drop them into any pipeline, run them in CI, get a pass/fail with a reason.
+- **Reliability primitives**: retry with backoff, model fallback chains, and a circuit breaker. The building blocks for LLM pipelines that hold up in production.
+## Install
+```bash
+pip install llm-evalgate
+```
+## Quickstart
+### Eval gates
+```python
+from llm_evalkit import EvalHarness
+from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension, SchemaComplianceDimension
+harness = EvalHarness([
+    BlocklistDimension(terms=["confidential", "internal use only"]),
+    ReadabilityDimension(threshold=0.3),
+    SchemaComplianceDimension(required_fields=["title:", "summary:"]),
+])
+report = harness.run(llm_output)
+if not report.passed:
+    print(report)
+    # EvalReport: FAIL
+    #   FAIL [blocklist] score=0.000 — prohibited terms found: ['confidential']
+    #   PASS [readability] score=0.612 — Flesch ease=61.2, FK grade=8.4
+    #   PASS [schema_compliance] score=1.000 — all 2 required fields present
+```
+### Custom dimension
+```python
+from llm_evalkit import Dimension, EvalHarness
+class JsonDimension(Dimension):
+    def evaluate(self, text: str) -> tuple[float, str]:
+        import json
+        try:
+            json.loads(text)
+            return 1.0, "valid JSON"
+        except json.JSONDecodeError as e:
+            return 0.0, f"invalid JSON: {e}"
+harness = EvalHarness([JsonDimension(threshold=1.0)])
+report = harness.run('{"key": "value"}')
+assert report.passed
+```
+### Retry
+```python
+from llm_evalkit.reliable import retry
+@retry(max_attempts=3, backoff=2.0)
+def call_llm(prompt: str) -> str:
+    return client.messages.create(...)
+```
+### Fallback chain
+```python
+from llm_evalkit.reliable import with_fallback, with_fallback_chain
+# two-model fallback
+result = with_fallback(
+    primary=lambda: call_model("claude-opus-4-8", prompt),
+    fallback=lambda: call_model("claude-sonnet-4-6", prompt),
+)
+# ordered chain — first success wins
+result = with_fallback_chain([
+    lambda: call_model("claude-opus-4-8", prompt),
+    lambda: call_model("claude-sonnet-4-6", prompt),
+    lambda: call_model("claude-haiku-4-5", prompt),
+])
+```
+### Circuit breaker
+```python
+from llm_evalkit.reliable import CircuitBreaker, CircuitOpenError
+breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
+try:
+    with breaker:
+        result = call_llm(prompt)
+except CircuitOpenError:
+    result = cached_response  # serve from cache while circuit is open
+```
+## Built-in dimensions
+| Dimension | What it checks | Default threshold |
+|---|---|---|
+| `BlocklistDimension` | No prohibited terms in output | 1.0 (zero tolerance) |
+| `ReadabilityDimension` | Flesch Reading Ease score | 0.3 (college-level prose) |
+| `SchemaComplianceDimension` | Required fields are present | 1.0 (all fields) |
+| `FactualGroundingDimension` | Numeric claims traceable to evidence | 0.85 |
+All dimensions follow the same interface: `evaluate(text) -> (score, detail)`. Writing a new one is ten lines.
+## Why deterministic?
+LLM-as-judge eval is useful for research. In production pipelines, you need:
+- The same input to produce the same pass/fail result every run
+- CI to catch regressions without burning tokens on every commit
+- An audit trail that doesn't depend on a model that may drift
+`llm-evalkit` eval dimensions are pure functions. No model calls, no network, no randomness.
+## Composing with a pipeline
+```python
+from llm_evalkit import EvalHarness
+from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension
+from llm_evalkit.reliable import retry, with_fallback
+harness = EvalHarness([
+    BlocklistDimension(terms=["[REDACTED]", "TODO"]),
+    ReadabilityDimension(threshold=0.2),
+])
+@retry(max_attempts=3, backoff=2.0)
+def generate(prompt: str) -> str:
+    return with_fallback(
+        primary=lambda: call_model("claude-opus-4-8", prompt),
+        fallback=lambda: call_model("claude-sonnet-4-6", prompt),
+    )
+output = generate(prompt)
+report = harness.run(output)
+if not report.passed:
+    raise ValueError(f"Output failed eval gate:\n{report}")
+```
+## License
+MIT

llm_evalgate-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+llm_evalkit/__init__.py,sha256=gnrmRzhGRj1uAlIPQIiMwUD-uKdQ-p_kbVJNXtWDCyo,178
+llm_evalkit/eval/__init__.py,sha256=LzfOrmNGsdj42fRei8sMpgKUnxHKd0kkUjvZMwB2das,168
+llm_evalkit/eval/dimension.py,sha256=Mz-GRBkxVSPjWeZzqr-qnVsOoxDUM8TmQi2xRykNiIg,918
+llm_evalkit/eval/harness.py,sha256=LAav9byP2xM1IQ9oi5mnuPHSYdDRKT3N9dQ-YtuXOR4,1585
+llm_evalkit/eval/dimensions/__init__.py,sha256=BIoF-tovjnu9ATQfeY8vjft5VAbl5-ATCVR_mHk6sIo,316
+llm_evalkit/eval/dimensions/blocklist.py,sha256=thLVxdYt17FkOnvXsJQjGcrRQu6lnjUUM90rZ1yhJXM,1016
+llm_evalkit/eval/dimensions/factual.py,sha256=fxPil-RJSfqZ2UT2GSJqglYn3NtHJSderI0MVSSf9oA,1885
+llm_evalkit/eval/dimensions/readability.py,sha256=e9ATzw6jtAnk0MwDeiO9CniIYXsuVkp9KnsUtLSvy_w,988
+llm_evalkit/eval/dimensions/schema.py,sha256=nK7RfRrC1wMHzDG8qimTQxKTNDH4BCdUNYAxRc9q16E,1001
+llm_evalkit/reliable/__init__.py,sha256=r57_TXMwWN_15kAJmCRyK6iU-xSQ47XrgCVlbdlJzwk,292
+llm_evalkit/reliable/circuit.py,sha256=k1BrQkglaIAQBE6uV_FbbiLGnWS3vlFRH8Nf__n2X0Q,2927
+llm_evalkit/reliable/fallback.py,sha256=iEdGtCCWsfGl-73-YZavjSpeyknPy1zOdojMWbOLer4,1286
+llm_evalkit/reliable/retry.py,sha256=w6YXk0ygO7w3R_7xEgLMiTnFlbB2pkbZD0jm2Ol_osY,1254
+llm_evalgate-0.1.0.dist-info/METADATA,sha256=bIDdqKcyah9fLzLd2zEXUASAjApdbt0AWxQawcN05NE,6198
+llm_evalgate-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+llm_evalgate-0.1.0.dist-info/RECORD,,

llm_evalgate-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

llm_evalkit/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .eval.dimension import Dimension, DimensionResult
+from .eval.harness import EvalHarness, EvalReport
+__all__ = ["Dimension", "DimensionResult", "EvalHarness", "EvalReport"]

llm_evalkit/eval/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .dimension import Dimension, DimensionResult
+from .harness import EvalHarness, EvalReport
+__all__ = ["Dimension", "DimensionResult", "EvalHarness", "EvalReport"]

llm_evalkit/eval/dimension.py ADDED Viewed

@@ -0,0 +1,31 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
@dataclass(frozen=True)
class DimensionResult:
    """Immutable outcome of running a single dimension against a text."""

    # Numeric score produced by the dimension's ``evaluate``.
    score: float
    # True when the score met the dimension's threshold.
    passed: bool
    # Human-readable explanation that accompanies the score.
    detail: str
class Dimension(ABC):
    """Abstract base for one eval dimension.

    Concrete dimensions implement ``evaluate``. Callers (the harness) use
    ``run``, which compares the returned score against ``threshold`` and
    packages the outcome as a ``DimensionResult``.
    """

    def __init__(self, threshold: float = 1.0, name: str | None = None) -> None:
        self.threshold = threshold
        # A falsy name (``None`` or "") falls back to the concrete class name.
        self.name = name if name else type(self).__name__

    @abstractmethod
    def evaluate(self, text: str) -> tuple[float, str]:
        """Return (score, detail). Score is in [0.0, 1.0]."""

    def run(self, text: str) -> DimensionResult:
        """Evaluate ``text``; the result passes when score >= threshold."""
        score, detail = self.evaluate(text)
        meets_threshold = score >= self.threshold
        return DimensionResult(score=score, passed=meets_threshold, detail=detail)

llm_evalkit/eval/dimensions/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from .blocklist import BlocklistDimension
+from .factual import FactualGroundingDimension
+from .readability import ReadabilityDimension
+from .schema import SchemaComplianceDimension
+__all__ = [
+    "BlocklistDimension",
+    "FactualGroundingDimension",
+    "ReadabilityDimension",
+    "SchemaComplianceDimension",
+]

llm_evalkit/eval/dimensions/blocklist.py ADDED Viewed

@@ -0,0 +1,32 @@
+from __future__ import annotations
+import re
+from ..dimension import Dimension
class BlocklistDimension(Dimension):
    """Fail if any prohibited term appears in the text.

    Terms are matched as literal substrings (they are regex-escaped), and
    matching is case-insensitive unless ``case_sensitive=True``.  Score is
    1.0 when clean, 0.0 when any term is found.  Useful for preventing
    confidential identifiers, brand names, or internal jargon from leaking
    into LLM output.

    Args:
        terms: Prohibited terms.  A single string is treated as one term
            rather than being iterated character by character.  Empty
            strings are ignored, because an empty pattern matches every
            text and would make the gate fail unconditionally.
        threshold: Minimum score required to pass.
        name: Key under which the harness reports this dimension.
        case_sensitive: Match terms with exact case when True.
    """

    def __init__(
        self,
        terms: list[str],
        threshold: float = 1.0,
        name: str = "blocklist",
        case_sensitive: bool = False,
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        if isinstance(terms, str):
            terms = [terms]
        # Snapshot the terms so that later mutation of the caller's list
        # cannot desynchronise ``_terms`` from ``_patterns``.
        self._terms = [t for t in terms if t]
        flags = 0 if case_sensitive else re.IGNORECASE
        self._patterns = [re.compile(re.escape(t), flags) for t in self._terms]

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return ``(0.0, detail)`` listing matched terms, else ``(1.0, "clean")``."""
        found = [t for t, p in zip(self._terms, self._patterns) if p.search(text)]
        if found:
            return 0.0, f"prohibited terms found: {found}"
        return 1.0, "clean"

llm_evalkit/eval/dimensions/factual.py ADDED Viewed

@@ -0,0 +1,56 @@
+from __future__ import annotations
+import re
+from ..dimension import Dimension
class FactualGroundingDimension(Dimension):
    """Check that numeric claims in LLM output are traceable to evidence.

    Every unsigned decimal number found in the text is compared with the
    evidence values; a number is grounded when it lies within
    ``rel_tolerance`` (relative to the evidence value) of at least one of
    them.  The score is the fraction of numeric claims that are grounded.

    If no evidence is supplied the dimension is skipped (returns 1.0).
    If the text contains no numbers, it also passes.

    Args:
        evidence: Known-good numeric values.  The sequence is copied.
        rel_tolerance: Allowed relative deviation from an evidence value.
        threshold: Minimum grounded fraction required to pass.
        name: Key under which the harness reports this dimension.
    """

    # Either a thousands-grouped number ("1,234,567.8") or a plain one
    # ("1234.5").  Commas that are not thousands separators (as in "1, 2"
    # or "1,2,3") split numbers instead of being merged into one value.
    _NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?")

    def __init__(
        self,
        evidence: list[float] | None = None,
        rel_tolerance: float = 0.02,
        threshold: float = 0.85,
        name: str = "factual_grounding",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        self._evidence = list(evidence) if evidence is not None else []
        self._rel_tolerance = rel_tolerance

    def _numbers_in_text(self, text: str) -> list[float]:
        """Extract every number in ``text`` as a float, dropping separators."""
        return [float(m.replace(",", "")) for m in self._NUMBER_RE.findall(text)]

    def _is_grounded(self, value: float) -> bool:
        """Return True when ``value`` is within tolerance of any evidence value."""
        # Multiplying instead of dividing keeps the same comparison for
        # non-zero evidence and lets an evidence value of 0 ground a claim
        # of exactly 0 (the division form had to skip zero evidence).
        return any(
            abs(value - ev) <= self._rel_tolerance * abs(ev)
            for ev in self._evidence
        )

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the grounded fraction and a ``grounded/total`` summary."""
        if not self._evidence:
            return 1.0, "skipped (no evidence supplied)"
        numbers = self._numbers_in_text(text)
        if not numbers:
            return 1.0, "no numeric claims found"
        grounded = [n for n in numbers if self._is_grounded(n)]
        score = len(grounded) / len(numbers)
        return score, f"{len(grounded)}/{len(numbers)} numeric claims grounded"

llm_evalkit/eval/dimensions/readability.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+import textstat
+from ..dimension import Dimension
class ReadabilityDimension(Dimension):
    """Score text by Flesch Reading Ease, normalised to [0, 1].

    The score is the Flesch Reading Ease value divided by 100 and clamped
    to [0, 1], where 1.0 = very easy and 0.0 = very difficult.  The
    dimension passes when that score is at least ``threshold``.  The
    Flesch-Kincaid grade is reported in the detail string only; it does not
    influence the score.  The default threshold of 0.3 accepts most
    professional prose up to ~college level.

    Empty or whitespace-only text scores 0.0.
    """

    def __init__(self, threshold: float = 0.3, name: str = "readability") -> None:
        super().__init__(threshold=threshold, name=name)

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the normalised ease score and a detail string with both metrics."""
        if not text.strip():
            return 0.0, "empty text"
        ease = textstat.flesch_reading_ease(text)
        # Flesch ease: 100=very easy, 0=very hard. Normalise to [0, 1].
        score = max(0.0, min(1.0, ease / 100.0))
        grade = textstat.flesch_kincaid_grade(text)
        return score, f"Flesch ease={ease:.1f}, FK grade={grade:.1f}"

llm_evalkit/eval/dimensions/schema.py ADDED Viewed

@@ -0,0 +1,30 @@
+from __future__ import annotations
+from ..dimension import Dimension
class SchemaComplianceDimension(Dimension):
    """Verify that every required field string occurs in the text.

    Intended for structured LLM outputs (JSON, YAML, markdown with
    required sections) where a missing field is a hard failure.

    ``required_fields`` lists strings that must each appear verbatim
    somewhere in the text.  The score is the fraction of them present.
    """

    def __init__(
        self,
        required_fields: list[str],
        threshold: float = 1.0,
        name: str = "schema_compliance",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        self._required = required_fields

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the present fraction plus either the missing list or a summary."""
        absent = [field for field in self._required if field not in text]
        if not absent:
            return 1.0, f"all {len(self._required)} required fields present"
        total = len(self._required)
        return 1.0 - len(absent) / total, f"missing fields: {absent}"

llm_evalkit/eval/harness.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from .dimension import Dimension, DimensionResult
@dataclass
class EvalReport:
    """Aggregate outcome of one harness run.

    ``passed`` is True only when every dimension passed, ``results`` maps
    each dimension's report key to its result in execution order, and
    ``text`` is the text that was evaluated.
    """

    passed: bool
    results: dict[str, DimensionResult]
    text: str

    @property
    def failures(self) -> dict[str, DimensionResult]:
        """Subset of ``results`` whose dimension did not pass."""
        return {name: r for name, r in self.results.items() if not r.passed}

    def __str__(self) -> str:
        lines = [f"EvalReport: {'PASS' if self.passed else 'FAIL'}"]
        for name, result in self.results.items():
            status = "PASS" if result.passed else "FAIL"
            lines.append(f"  {status} [{name}] score={result.score:.3f} — {result.detail}")
        return "\n".join(lines)


class EvalHarness:
    """Run a list of dimensions against text and produce an EvalReport.

    Usage::

        harness = EvalHarness([
            ReadabilityDimension(threshold=0.7),
            BlocklistDimension(terms=["secret", "internal"]),
        ])
        report = harness.run(text)
        if not report.passed:
            raise ValueError(str(report))

    Dimensions that share a name are all kept in the report: the first
    keeps its name and later ones are keyed ``name#2``, ``name#3``, ...

    Raises:
        ValueError: if ``dimensions`` is empty.
    """

    def __init__(self, dimensions: list[Dimension]) -> None:
        if not dimensions:
            raise ValueError("EvalHarness requires at least one dimension.")
        # Copy so later mutation of the caller's list cannot change the gate.
        self._dimensions = list(dimensions)

    @staticmethod
    def _unique_key(name: str, taken: dict[str, DimensionResult]) -> str:
        """Return ``name``, or ``name#N`` for the first N >= 2 not yet in ``taken``."""
        if name not in taken:
            return name
        suffix = 2
        while f"{name}#{suffix}" in taken:
            suffix += 1
        return f"{name}#{suffix}"

    def run(self, text: str) -> EvalReport:
        """Run every dimension on ``text`` and collect the results."""
        results: dict[str, DimensionResult] = {}
        for dim in self._dimensions:
            # Keying by the bare name would let a later same-named dimension
            # overwrite an earlier failure and turn a FAIL into a PASS.
            results[self._unique_key(dim.name, results)] = dim.run(text)
        passed = all(r.passed for r in results.values())
        return EvalReport(passed=passed, results=results, text=text)

llm_evalkit/reliable/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+from .circuit import CircuitBreaker, CircuitOpenError, CircuitState
+from .fallback import with_fallback, with_fallback_chain
+from .retry import retry
+__all__ = [
+    "retry",
+    "with_fallback",
+    "with_fallback_chain",
+    "CircuitBreaker",
+    "CircuitOpenError",
+    "CircuitState",
+]

llm_evalkit/reliable/circuit.py ADDED Viewed

@@ -0,0 +1,97 @@
+from __future__ import annotations
+import time
+from enum import Enum
class CircuitState(Enum):
    CLOSED = "closed"  # normal operation
    OPEN = "open"  # failing, rejecting calls
    HALF_OPEN = "half_open"  # probing for recovery


class CircuitOpenError(Exception):
    """Raised when a call is attempted while the circuit is open."""


class CircuitBreaker:
    """Prevent cascading failures by stopping calls to a failing service.

    Usage::

        breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)

        # context manager form
        with breaker:
            result = call_llm(prompt)

        # or call form
        result = breaker.call(lambda: call_llm(prompt))

    State transitions:

    - CLOSED -> OPEN: after ``failure_threshold`` consecutive failures
    - OPEN -> HALF_OPEN: after ``recovery_timeout`` seconds
    - HALF_OPEN -> CLOSED: on first success
    - HALF_OPEN -> OPEN: on first failure

    Only exceptions that are instances of ``exceptions`` count as failures;
    any other exception propagates without changing the breaker's state.

    Args:
        failure_threshold: Consecutive failures that trip the circuit.
        recovery_timeout: Seconds to stay open before allowing a probe.
        exceptions: Exception types counted as failures.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
        exceptions: tuple[type[Exception], ...] = (Exception,),
    ) -> None:
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.exceptions = exceptions
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None

    @property
    def state(self) -> CircuitState:
        """Current state; an OPEN circuit becomes HALF_OPEN once the timeout elapsed."""
        if (
            self._state is CircuitState.OPEN
            and self._opened_at is not None
            and time.monotonic() - self._opened_at >= self.recovery_timeout
        ):
            self._state = CircuitState.HALF_OPEN
        return self._state

    def _on_success(self) -> None:
        """Reset the failure streak and close the circuit."""
        self._failure_count = 0
        self._state = CircuitState.CLOSED
        self._opened_at = None

    def _on_failure(self) -> None:
        """Record a failure and open the circuit when warranted."""
        self._failure_count += 1
        # A failed probe re-opens the circuit on its own; it must not depend
        # on the failure count still being at or above the threshold.
        probe_failed = self._state is CircuitState.HALF_OPEN
        if probe_failed or self._failure_count >= self.failure_threshold:
            self._state = CircuitState.OPEN
            self._opened_at = time.monotonic()

    def _reject_if_open(self) -> None:
        """Raise ``CircuitOpenError`` while the circuit is open."""
        if self.state is CircuitState.OPEN:
            raise CircuitOpenError(
                f"Circuit is open. Retry after {self.recovery_timeout}s."
            )

    def call(self, fn):
        """Invoke ``fn()`` under the breaker and return its result.

        Raises:
            CircuitOpenError: if the circuit is open (``fn`` is not called).
        """
        # Reuse the context-manager protocol so both entry points share one
        # implementation of the success/failure bookkeeping.
        with self:
            return fn()

    def __enter__(self):
        self._reject_if_open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            self._on_success()
        elif issubclass(exc_type, self.exceptions):
            self._on_failure()
        # Never suppress: the caller always sees the original exception.
        return False

llm_evalkit/reliable/fallback.py ADDED Viewed

@@ -0,0 +1,53 @@
+from __future__ import annotations
+from collections.abc import Callable
+from typing import TypeVar
+T = TypeVar("T")
def with_fallback(
    primary: Callable[[], T],
    fallback: Callable[[], T],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Return ``primary()``, or ``fallback()`` if ``primary`` raises.

    Only exceptions matching ``exceptions`` trigger the fallback; anything
    else, and any error raised by ``fallback`` itself, propagates.

    Usage::

        result = with_fallback(
            primary=lambda: call_opus(prompt),
            fallback=lambda: call_sonnet(prompt),
        )
    """
    try:
        outcome = primary()
    except exceptions:
        outcome = fallback()
    return outcome
def with_fallback_chain(
    callables: list[Callable[[], T]],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Try each callable in order, returning the first success.

    Raises the last exception if all callables fail.  Exceptions that do
    not match ``exceptions`` propagate immediately.  Although annotated as
    a list, any iterable of callables is accepted; it is materialised up
    front so that an empty iterator is reported as empty.

    Usage::

        result = with_fallback_chain([
            lambda: call_opus(prompt),
            lambda: call_sonnet(prompt),
            lambda: call_haiku(prompt),
        ])

    Raises:
        ValueError: if ``callables`` yields nothing.
    """
    candidates = list(callables)
    if not candidates:
        raise ValueError("callables list is empty")
    *earlier, last = candidates
    for fn in earlier:
        try:
            return fn()
        except exceptions:
            continue
    # The final candidate runs unguarded so its exception, if any, reaches
    # the caller with its original traceback.
    return last()

llm_evalkit/reliable/retry.py ADDED Viewed

@@ -0,0 +1,45 @@
+from __future__ import annotations
+import functools
+import time
+from collections.abc import Callable
+from typing import Any, TypeVar
+F = TypeVar("F", bound=Callable[..., Any])
def retry(
    max_attempts: int = 3,
    backoff: float = 2.0,
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> Callable[[F], F]:
    """Retry a callable on failure with exponential backoff.

    The wrapped callable is attempted up to ``max_attempts`` times.  After
    a failed attempt that is not the last, the wrapper sleeps before trying
    again: ``backoff`` seconds the first time, then ``backoff`` times the
    previous delay.  Exceptions not matching ``exceptions`` are never
    retried, and the final failure is re-raised unchanged.

    Usage::

        @retry(max_attempts=3, backoff=2.0)
        def call_llm(prompt: str) -> str:
            ...

        # or without decorator syntax:
        result = retry(max_attempts=3)(call_llm)(prompt)

    Raises:
        ValueError: if ``max_attempts`` is less than 1 or ``backoff`` is
            negative.
    """
    # Validate eagerly: with zero attempts there would be no result and no
    # exception to raise, and a negative delay would only fail in sleep().
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if backoff < 0:
        raise ValueError("backoff must be non-negative")

    def decorator(fn: F) -> F:
        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = backoff
            for attempt in range(1, max_attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    if attempt == max_attempts:
                        # Out of attempts: surface the original error with
                        # its traceback intact.
                        raise
                    time.sleep(delay)
                    delay *= backoff
            raise RuntimeError("unreachable")  # loop always returns or raises

        return wrapper  # type: ignore[return-value]

    return decorator