llm-evalgate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-evalgate
3
+ Version: 0.1.0
4
+ Summary: Deterministic eval gates and reliability primitives for LLM pipelines
5
+ Project-URL: Homepage, https://github.com/LesterALeong/llm-evalkit
6
+ Project-URL: Repository, https://github.com/LesterALeong/llm-evalkit
7
+ Project-URL: Issues, https://github.com/LesterALeong/llm-evalkit/issues
8
+ Author-email: Lester Leong <lester.leong89@gmail.com>
9
+ License: MIT
10
+ Keywords: agents,ai,eval,evaluation,llm,pipeline,reliability
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.9
22
+ Requires-Dist: textstat>=0.7
23
+ Provides-Extra: dev
24
+ Requires-Dist: hatch>=1.12; extra == 'dev'
25
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
26
+ Requires-Dist: pytest>=8.0; extra == 'dev'
27
+ Requires-Dist: ruff>=0.4; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # llm-evalkit
31
+
32
+ Deterministic eval gates and reliability primitives for LLM pipelines.
33
+
34
+ [![CI](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml/badge.svg)](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml)
35
+ [![PyPI](https://img.shields.io/pypi/v/llm-evalgate)](https://pypi.org/project/llm-evalgate/)
36
+ [![Python](https://img.shields.io/pypi/pyversions/llm-evalgate)](https://pypi.org/project/llm-evalgate/)
37
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
38
+
39
+ ---
40
+
41
+ Most LLM eval tooling is either LLM-as-judge (non-deterministic, expensive, not CI-friendly) or a heavy enterprise suite. `llm-evalkit` is neither.
42
+
43
+ It gives you two things:
44
+
45
+ - **Eval gates**: code-only quality dimensions that run the same way every time. Drop them into any pipeline, run them in CI, get a pass/fail with a reason.
46
+ - **Reliability primitives**: retry with backoff, model fallback chains, and a circuit breaker. The building blocks for LLM pipelines that hold up in production.
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install llm-evalgate  # PyPI name; the import package is llm_evalkit
52
+ ```
53
+
54
+ ## Quickstart
55
+
56
+ ### Eval gates
57
+
58
+ ```python
59
+ from llm_evalkit import EvalHarness
60
+ from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension, SchemaComplianceDimension
61
+
62
+ harness = EvalHarness([
63
+ BlocklistDimension(terms=["confidential", "internal use only"]),
64
+ ReadabilityDimension(threshold=0.3),
65
+ SchemaComplianceDimension(required_fields=["title:", "summary:"]),
66
+ ])
67
+
68
+ report = harness.run(llm_output)
69
+
70
+ if not report.passed:
71
+ print(report)
72
+ # EvalReport: FAIL
73
+ # FAIL [blocklist] score=0.000 — prohibited terms found: ['confidential']
74
+ # PASS [readability] score=0.612 — Flesch ease=61.2, FK grade=8.4
75
+ # PASS [schema_compliance] score=1.000 — all 2 required fields present
76
+ ```
77
+
78
+ ### Custom dimension
79
+
80
+ ```python
81
+ from llm_evalkit import Dimension, EvalHarness
82
+
83
+ class JsonDimension(Dimension):
84
+ def evaluate(self, text: str) -> tuple[float, str]:
85
+ import json
86
+ try:
87
+ json.loads(text)
88
+ return 1.0, "valid JSON"
89
+ except json.JSONDecodeError as e:
90
+ return 0.0, f"invalid JSON: {e}"
91
+
92
+ harness = EvalHarness([JsonDimension(threshold=1.0)])
93
+ report = harness.run('{"key": "value"}')
94
+ assert report.passed
95
+ ```
96
+
97
+ ### Retry
98
+
99
+ ```python
100
+ from llm_evalkit.reliable import retry
101
+
102
+ @retry(max_attempts=3, backoff=2.0)
103
+ def call_llm(prompt: str) -> str:
104
+ return client.messages.create(...)
105
+ ```
106
+
107
+ ### Fallback chain
108
+
109
+ ```python
110
+ from llm_evalkit.reliable import with_fallback, with_fallback_chain
111
+
112
+ # two-model fallback
113
+ result = with_fallback(
114
+ primary=lambda: call_model("claude-opus-4-8", prompt),
115
+ fallback=lambda: call_model("claude-sonnet-4-6", prompt),
116
+ )
117
+
118
+ # ordered chain — first success wins
119
+ result = with_fallback_chain([
120
+ lambda: call_model("claude-opus-4-8", prompt),
121
+ lambda: call_model("claude-sonnet-4-6", prompt),
122
+ lambda: call_model("claude-haiku-4-5", prompt),
123
+ ])
124
+ ```
125
+
126
+ ### Circuit breaker
127
+
128
+ ```python
129
+ from llm_evalkit.reliable import CircuitBreaker, CircuitOpenError
130
+
131
+ breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
132
+
133
+ try:
134
+ with breaker:
135
+ result = call_llm(prompt)
136
+ except CircuitOpenError:
137
+ result = cached_response # serve from cache while circuit is open
138
+ ```
139
+
140
+ ## Built-in dimensions
141
+
142
+ | Dimension | What it checks | Default threshold |
143
+ |---|---|---|
144
+ | `BlocklistDimension` | No prohibited terms in output | 1.0 (zero tolerance) |
145
+ | `ReadabilityDimension` | Flesch Reading Ease score | 0.3 (college-level prose) |
146
+ | `SchemaComplianceDimension` | Required fields are present | 1.0 (all fields) |
147
+ | `FactualGroundingDimension` | Numeric claims traceable to evidence | 0.85 |
148
+
149
+ All dimensions follow the same interface: `evaluate(text) -> (score, detail)`. Writing a new one is ten lines.
150
+
151
+ ## Why deterministic?
152
+
153
+ LLM-as-judge eval is useful for research. In production pipelines, you need:
154
+
155
+ - The same input to produce the same pass/fail result every run
156
+ - CI to catch regressions without burning tokens on every commit
157
+ - An audit trail that doesn't depend on a model that may drift
158
+
159
+ `llm-evalkit` eval dimensions are pure functions. No model calls, no network, no randomness.
160
+
161
+ ## Composing with a pipeline
162
+
163
+ ```python
164
+ from llm_evalkit import EvalHarness
165
+ from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension
166
+ from llm_evalkit.reliable import retry, with_fallback
167
+
168
+ harness = EvalHarness([
169
+ BlocklistDimension(terms=["[REDACTED]", "TODO"]),
170
+ ReadabilityDimension(threshold=0.2),
171
+ ])
172
+
173
+ @retry(max_attempts=3, backoff=2.0)
174
+ def generate(prompt: str) -> str:
175
+ return with_fallback(
176
+ primary=lambda: call_model("claude-opus-4-8", prompt),
177
+ fallback=lambda: call_model("claude-sonnet-4-6", prompt),
178
+ )
179
+
180
+ output = generate(prompt)
181
+ report = harness.run(output)
182
+ if not report.passed:
183
+ raise ValueError(f"Output failed eval gate:\n{report}")
184
+ ```
185
+
186
+ ## License
187
+
188
+ MIT
@@ -0,0 +1,16 @@
1
+ llm_evalkit/__init__.py,sha256=gnrmRzhGRj1uAlIPQIiMwUD-uKdQ-p_kbVJNXtWDCyo,178
2
+ llm_evalkit/eval/__init__.py,sha256=LzfOrmNGsdj42fRei8sMpgKUnxHKd0kkUjvZMwB2das,168
3
+ llm_evalkit/eval/dimension.py,sha256=Mz-GRBkxVSPjWeZzqr-qnVsOoxDUM8TmQi2xRykNiIg,918
4
+ llm_evalkit/eval/harness.py,sha256=LAav9byP2xM1IQ9oi5mnuPHSYdDRKT3N9dQ-YtuXOR4,1585
5
+ llm_evalkit/eval/dimensions/__init__.py,sha256=BIoF-tovjnu9ATQfeY8vjft5VAbl5-ATCVR_mHk6sIo,316
6
+ llm_evalkit/eval/dimensions/blocklist.py,sha256=thLVxdYt17FkOnvXsJQjGcrRQu6lnjUUM90rZ1yhJXM,1016
7
+ llm_evalkit/eval/dimensions/factual.py,sha256=fxPil-RJSfqZ2UT2GSJqglYn3NtHJSderI0MVSSf9oA,1885
8
+ llm_evalkit/eval/dimensions/readability.py,sha256=e9ATzw6jtAnk0MwDeiO9CniIYXsuVkp9KnsUtLSvy_w,988
9
+ llm_evalkit/eval/dimensions/schema.py,sha256=nK7RfRrC1wMHzDG8qimTQxKTNDH4BCdUNYAxRc9q16E,1001
10
+ llm_evalkit/reliable/__init__.py,sha256=r57_TXMwWN_15kAJmCRyK6iU-xSQ47XrgCVlbdlJzwk,292
11
+ llm_evalkit/reliable/circuit.py,sha256=k1BrQkglaIAQBE6uV_FbbiLGnWS3vlFRH8Nf__n2X0Q,2927
12
+ llm_evalkit/reliable/fallback.py,sha256=iEdGtCCWsfGl-73-YZavjSpeyknPy1zOdojMWbOLer4,1286
13
+ llm_evalkit/reliable/retry.py,sha256=w6YXk0ygO7w3R_7xEgLMiTnFlbB2pkbZD0jm2Ol_osY,1254
14
+ llm_evalgate-0.1.0.dist-info/METADATA,sha256=bIDdqKcyah9fLzLd2zEXUASAjApdbt0AWxQawcN05NE,6198
15
+ llm_evalgate-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
16
+ llm_evalgate-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,4 @@
1
+ from .eval.dimension import Dimension, DimensionResult
2
+ from .eval.harness import EvalHarness, EvalReport
3
+
4
+ __all__ = ["Dimension", "DimensionResult", "EvalHarness", "EvalReport"]
@@ -0,0 +1,4 @@
1
+ from .dimension import Dimension, DimensionResult
2
+ from .harness import EvalHarness, EvalReport
3
+
4
+ __all__ = ["Dimension", "DimensionResult", "EvalHarness", "EvalReport"]
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+
6
+
7
@dataclass(frozen=True)
class DimensionResult:
    """Immutable outcome of running a single dimension against a text."""

    # Score produced by the dimension's ``evaluate`` method.
    score: float
    # True when ``score`` reached the dimension's threshold.
    passed: bool
    # Human-readable explanation that accompanies the score.
    detail: str
12
+
13
+
14
class Dimension(ABC):
    """Abstract base for one scored check applied to a piece of text.

    Concrete dimensions implement ``evaluate``. Callers such as the harness
    use ``run``, which compares the returned score against ``threshold`` and
    packages the outcome as a ``DimensionResult``.
    """

    def __init__(self, threshold: float = 1.0, name: str | None = None) -> None:
        # A missing (or empty) name falls back to the concrete class name.
        self.name = name if name else type(self).__name__
        self.threshold = threshold

    @abstractmethod
    def evaluate(self, text: str) -> tuple[float, str]:
        """Score ``text`` and return ``(score, detail)``; score is in [0.0, 1.0]."""

    def run(self, text: str) -> DimensionResult:
        """Evaluate ``text`` and mark it as passed when score >= threshold."""
        score, detail = self.evaluate(text)
        meets_threshold = score >= self.threshold
        return DimensionResult(score=score, passed=meets_threshold, detail=detail)
@@ -0,0 +1,11 @@
1
+ from .blocklist import BlocklistDimension
2
+ from .factual import FactualGroundingDimension
3
+ from .readability import ReadabilityDimension
4
+ from .schema import SchemaComplianceDimension
5
+
6
+ __all__ = [
7
+ "BlocklistDimension",
8
+ "FactualGroundingDimension",
9
+ "ReadabilityDimension",
10
+ "SchemaComplianceDimension",
11
+ ]
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from ..dimension import Dimension
6
+
7
+
8
class BlocklistDimension(Dimension):
    """Fail when any prohibited term appears in the text.

    Terms are matched as literal substrings (they are regex-escaped), and
    matching ignores case unless ``case_sensitive=True``. The score is 1.0
    when no term is found and 0.0 as soon as at least one is. Useful for
    preventing confidential identifiers, brand names, or internal jargon
    from leaking into LLM output.

    Args:
        terms: Prohibited strings. A single string is treated as one term.
            Empty strings are ignored because an empty pattern would match
            every text.
        threshold: Minimum score required to pass.
        name: Key under which the result is reported.
        case_sensitive: Match terms with exact case when True.
    """

    def __init__(
        self,
        terms: list[str],
        threshold: float = 1.0,
        name: str = "blocklist",
        case_sensitive: bool = False,
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        # A bare string would otherwise be iterated character by character,
        # turning every letter into a prohibited term.
        if isinstance(terms, str):
            terms = [terms]
        # Snapshot the terms so later mutation of the caller's list cannot
        # desynchronise ``_terms`` from the compiled ``_patterns``.
        self._terms = [t for t in terms if t]
        flags = 0 if case_sensitive else re.IGNORECASE
        self._patterns = [re.compile(re.escape(t), flags) for t in self._terms]

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return ``(0.0, ...)`` listing the terms found, else ``(1.0, "clean")``."""
        found = [t for t, p in zip(self._terms, self._patterns) if p.search(text)]
        if found:
            return 0.0, f"prohibited terms found: {found}"
        return 1.0, "clean"
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from ..dimension import Dimension
6
+
7
+
8
class FactualGroundingDimension(Dimension):
    """Check that numeric claims in LLM output are traceable to evidence.

    Every number extracted from the text is compared with the evidence
    values; a number is grounded when it lies within ``rel_tolerance``
    (relative to the evidence value) of at least one of them. The score is
    the fraction of extracted numbers that are grounded.

    If no evidence is supplied the dimension is skipped (returns 1.0).
    If the text contains no numbers, it also passes.

    Args:
        evidence: Known-good numeric values; ``None`` or empty skips the check.
        rel_tolerance: Maximum allowed ``|claim - evidence| / |evidence|``.
        threshold: Minimum grounded fraction required to pass.
        name: Key under which the result is reported.
    """

    # Either a correctly grouped thousands number ("1,234,567.89") or a plain
    # digit run, each with an optional decimal part. Requiring real
    # three-digit groups keeps comma-separated lists such as "1,2,3" from
    # being fused into a single number.
    _NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?")

    def __init__(
        self,
        evidence: list[float] | None = None,
        rel_tolerance: float = 0.02,
        threshold: float = 0.85,
        name: str = "factual_grounding",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        # Copy so that later changes to the caller's list do not alter results.
        self._evidence = list(evidence) if evidence is not None else []
        self._rel_tolerance = rel_tolerance

    def _numbers_in_text(self, text: str) -> list[float]:
        """Extract every number in ``text``, stripping thousands separators."""
        return [float(m.replace(",", "")) for m in self._NUMBER_RE.findall(text)]

    def _is_grounded(self, value: float) -> bool:
        """Return True when ``value`` matches at least one evidence value."""
        for ev in self._evidence:
            if ev == 0:
                # Relative error is undefined against zero, so only an exact
                # zero claim can be grounded by zero evidence.
                if value == 0:
                    return True
                continue
            if abs(value - ev) / abs(ev) <= self._rel_tolerance:
                return True
        return False

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the grounded fraction and an ``n/m`` summary."""
        if not self._evidence:
            return 1.0, "skipped (no evidence supplied)"
        numbers = self._numbers_in_text(text)
        if not numbers:
            return 1.0, "no numeric claims found"
        grounded = [n for n in numbers if self._is_grounded(n)]
        score = len(grounded) / len(numbers)
        return score, f"{len(grounded)}/{len(numbers)} numeric claims grounded"
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ import textstat
4
+
5
+ from ..dimension import Dimension
6
+
7
+
8
class ReadabilityDimension(Dimension):
    """Score text by its Flesch Reading Ease, normalised to [0, 1].

    The ease value reported by ``textstat`` is divided by 100 and clamped,
    so 1.0 means very easy and 0.0 very difficult. ``threshold`` is compared
    against that normalised score. The Flesch-Kincaid grade only appears in
    the detail string and does not influence the score. Blank text scores
    0.0.
    """

    def __init__(self, threshold: float = 0.3, name: str = "readability") -> None:
        super().__init__(threshold=threshold, name=name)

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the normalised ease score with both readability metrics."""
        if text.strip() == "":
            return 0.0, "empty text"
        flesch = textstat.flesch_reading_ease(text)
        fk_grade = textstat.flesch_kincaid_grade(text)
        # Flesch ease runs from roughly 0 (hard) to 100 (easy); clamp after
        # scaling so out-of-range values still land in [0, 1].
        capped = min(1.0, flesch / 100.0)
        normalised = max(0.0, capped)
        return normalised, f"Flesch ease={flesch:.1f}, FK grade={fk_grade:.1f}"
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from ..dimension import Dimension
4
+
5
+
6
class SchemaComplianceDimension(Dimension):
    """Verify that every required field string occurs in the text.

    Intended for structured LLM outputs (JSON, YAML, markdown with required
    sections) where a missing field is a hard failure.

    ``required_fields`` lists strings that must each appear verbatim
    somewhere in the text; the score is the fraction of them that do.
    """

    def __init__(
        self,
        required_fields: list[str],
        threshold: float = 1.0,
        name: str = "schema_compliance",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        self._required = required_fields

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the share of required fields present and what is missing."""
        absent = []
        for field in self._required:
            if field not in text:
                absent.append(field)
        if not absent:
            return 1.0, f"all {len(self._required)} required fields present"
        missing_fraction = len(absent) / len(self._required)
        return 1.0 - missing_fraction, f"missing fields: {absent}"
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from .dimension import Dimension, DimensionResult
6
+
7
+
8
@dataclass
class EvalReport:
    """Aggregate outcome of one harness run over a single text."""

    # True only when every dimension passed.
    passed: bool
    # Per-dimension results keyed by dimension name, in execution order.
    results: dict[str, DimensionResult]
    # The text that was evaluated.
    text: str

    @property
    def failures(self) -> dict[str, DimensionResult]:
        """Return only the results whose dimension did not pass."""
        return {name: r for name, r in self.results.items() if not r.passed}

    def __str__(self) -> str:
        """Render a header line followed by one line per dimension."""
        lines = [f"EvalReport: {'PASS' if self.passed else 'FAIL'}"]
        for name, result in self.results.items():
            status = "PASS" if result.passed else "FAIL"
            lines.append(f" {status} [{name}] score={result.score:.3f} — {result.detail}")
        return "\n".join(lines)


class EvalHarness:
    """Run a list of dimensions against text and produce an EvalReport.

    Usage::

        harness = EvalHarness([
            ReadabilityDimension(threshold=0.7),
            BlocklistDimension(terms=["secret", "internal"]),
        ])
        report = harness.run(text)
        if not report.passed:
            raise ValueError(str(report))

    Dimensions that share a name are all reported: the second and later
    occurrences are keyed ``"<name>#2"``, ``"<name>#3"``, ... so that no
    result can silently overwrite another.

    Raises:
        ValueError: If ``dimensions`` is empty.
    """

    def __init__(self, dimensions: list[Dimension]) -> None:
        # Snapshot the sequence so later mutation by the caller has no effect.
        dims = list(dimensions)
        if not dims:
            raise ValueError("EvalHarness requires at least one dimension.")
        self._dimensions = dims

    @staticmethod
    def _unique_key(name: str, taken: dict[str, DimensionResult]) -> str:
        """Return ``name``, or ``name#k`` with the smallest free k >= 2."""
        if name not in taken:
            return name
        suffix = 2
        while f"{name}#{suffix}" in taken:
            suffix += 1
        return f"{name}#{suffix}"

    def run(self, text: str) -> EvalReport:
        """Evaluate ``text`` with every dimension and aggregate the results."""
        results: dict[str, DimensionResult] = {}
        for dim in self._dimensions:
            # Keying by the raw name would let a later dimension replace an
            # earlier (possibly failing) result with the same name.
            results[self._unique_key(dim.name, results)] = dim.run(text)
        passed = all(r.passed for r in results.values())
        return EvalReport(passed=passed, results=results, text=text)
@@ -0,0 +1,12 @@
1
+ from .circuit import CircuitBreaker, CircuitOpenError, CircuitState
2
+ from .fallback import with_fallback, with_fallback_chain
3
+ from .retry import retry
4
+
5
+ __all__ = [
6
+ "retry",
7
+ "with_fallback",
8
+ "with_fallback_chain",
9
+ "CircuitBreaker",
10
+ "CircuitOpenError",
11
+ "CircuitState",
12
+ ]
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from enum import Enum
5
+
6
+
7
class CircuitState(Enum):
    """Operating modes of a :class:`CircuitBreaker`."""

    CLOSED = "closed"  # normal operation
    OPEN = "open"  # failing, rejecting calls
    HALF_OPEN = "half_open"  # probing for recovery


class CircuitOpenError(Exception):
    """Raised when a call is attempted while the circuit is open."""


class CircuitBreaker:
    """Prevent cascading failures by stopping calls to a failing service.

    Usage::

        breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)

        # context manager form
        with breaker:
            result = call_llm(prompt)

        # or call form
        result = breaker.call(lambda: call_llm(prompt))

    State transitions:
    - CLOSED -> OPEN: after ``failure_threshold`` consecutive failures
    - OPEN -> HALF_OPEN: after ``recovery_timeout`` seconds
    - HALF_OPEN -> CLOSED: on first success
    - HALF_OPEN -> OPEN: on first failure

    Only exceptions matching ``exceptions`` count as failures; any other
    exception propagates without changing the breaker's state.

    Args:
        failure_threshold: Consecutive failures that open the circuit.
        recovery_timeout: Seconds the circuit stays open before a probe
            call is allowed.
        exceptions: Exception types that are counted as failures.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
        exceptions: tuple[type[Exception], ...] = (Exception,),
    ) -> None:
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.exceptions = exceptions

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None

    @property
    def state(self) -> CircuitState:
        """Current state; an expired OPEN circuit is promoted to HALF_OPEN."""
        opened_at = self._opened_at
        if self._state is CircuitState.OPEN and opened_at is not None:
            if time.monotonic() - opened_at >= self.recovery_timeout:
                self._state = CircuitState.HALF_OPEN
        return self._state

    def _trip(self) -> None:
        """Open the circuit and record when it was opened."""
        self._state = CircuitState.OPEN
        self._opened_at = time.monotonic()

    def _on_success(self) -> None:
        """Reset the failure count and close the circuit."""
        self._failure_count = 0
        self._state = CircuitState.CLOSED
        self._opened_at = None

    def _on_failure(self) -> None:
        """Count a failure and open the circuit when warranted."""
        self._failure_count += 1
        # A failed probe reopens the circuit immediately, independent of the
        # counter, so the HALF_OPEN -> OPEN rule holds even if
        # ``failure_threshold`` was raised after the circuit first opened.
        if (
            self._state is CircuitState.HALF_OPEN
            or self._failure_count >= self.failure_threshold
        ):
            self._trip()

    def _reject_if_open(self) -> None:
        """Raise ``CircuitOpenError`` while the circuit is open."""
        if self.state is CircuitState.OPEN:
            raise CircuitOpenError(
                f"Circuit is open. Retry after {self.recovery_timeout}s."
            )

    def call(self, fn):
        """Invoke ``fn()`` through the breaker and return its result.

        Raises:
            CircuitOpenError: If the circuit is open; ``fn`` is not called.
        """
        self._reject_if_open()
        try:
            result = fn()
        except self.exceptions:
            self._on_failure()
            raise  # bare raise keeps the original traceback intact
        self._on_success()
        return result

    def __enter__(self) -> CircuitBreaker:
        """Enter the guarded block, refusing entry while the circuit is open."""
        self._reject_if_open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Record the block's outcome; exceptions are never suppressed."""
        if exc_type is None:
            self._on_success()
        elif issubclass(exc_type, self.exceptions):
            self._on_failure()
        return False
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable
4
+ from typing import TypeVar
5
+
6
+ T = TypeVar("T")
7
+
8
+
9
def with_fallback(
    primary: Callable[[], T],
    fallback: Callable[[], T],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Return ``primary()``, or ``fallback()`` if ``primary`` raises.

    Only exceptions matching ``exceptions`` trigger the fallback; anything
    else, and anything raised by ``fallback`` itself, propagates.

    Usage::

        result = with_fallback(
            primary=lambda: call_opus(prompt),
            fallback=lambda: call_sonnet(prompt),
        )
    """
    try:
        outcome = primary()
    except exceptions:
        outcome = fallback()
    return outcome
27
+
28
+
29
def with_fallback_chain(
    callables: list[Callable[[], T]],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Try each callable in order, returning the first success.

    Any iterable of callables is accepted; it is consumed once up front.
    Exceptions matching ``exceptions`` move on to the next callable, and any
    other exception propagates immediately. If every callable fails, the
    exception raised by the last one propagates.

    Usage::

        result = with_fallback_chain([
            lambda: call_opus(prompt),
            lambda: call_sonnet(prompt),
            lambda: call_haiku(prompt),
        ])

    Raises:
        ValueError: If ``callables`` is empty.
    """
    # Materialise first: a generator is always truthy, so an emptiness check
    # on the raw argument would let an empty one through.
    candidates = list(callables)
    if not candidates:
        raise ValueError("callables list is empty")
    *earlier, final = candidates
    for fn in earlier:
        try:
            return fn()
        except exceptions:
            continue
    # Nothing is left to fall back to, so the last callable's result or
    # exception is passed straight to the caller.
    return final()
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import time
5
+ from collections.abc import Callable
6
+ from typing import Any, TypeVar
7
+
8
+ F = TypeVar("F", bound=Callable[..., Any])
9
+
10
+
11
def retry(
    max_attempts: int = 3,
    backoff: float = 2.0,
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> Callable[[F], F]:
    """Retry a callable on failure with exponential backoff.

    The wrapped function is called up to ``max_attempts`` times. After a
    failed attempt that is not the last, the wrapper sleeps: ``backoff``
    seconds the first time, multiplied by ``backoff`` for each later wait
    (2.0 gives 2s, 4s, 8s, ...). Only exceptions matching ``exceptions`` are
    retried; the exception from the final attempt propagates unchanged.

    Usage::

        @retry(max_attempts=3, backoff=2.0)
        def call_llm(prompt: str) -> str:
            ...

        # or without decorator syntax:
        result = retry(max_attempts=3)(call_llm)(prompt)

    Raises:
        ValueError: If ``max_attempts`` is below 1 or ``backoff`` is negative.
    """
    # Validate eagerly: with zero attempts the function would never run and
    # there would be no exception to re-raise.
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if backoff < 0:
        raise ValueError("backoff must be non-negative")

    def decorator(fn: F) -> F:
        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = backoff
            # Every attempt except the last is guarded and followed by a wait.
            for _ in range(max_attempts - 1):
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    time.sleep(delay)
                    delay *= backoff
            # Final attempt: its result or exception goes to the caller.
            return fn(*args, **kwargs)

        return wrapper  # type: ignore[return-value]

    return decorator