recension 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recension-0.5.0/.gitignore +33 -0
- recension-0.5.0/CHANGELOG.md +103 -0
- recension-0.5.0/LICENSE +21 -0
- recension-0.5.0/PKG-INFO +150 -0
- recension-0.5.0/README.md +108 -0
- recension-0.5.0/pyproject.toml +83 -0
- recension-0.5.0/recension/__init__.py +74 -0
- recension-0.5.0/recension/artifact.py +337 -0
- recension-0.5.0/recension/budget.py +52 -0
- recension-0.5.0/recension/cli.py +374 -0
- recension-0.5.0/recension/evalset.py +166 -0
- recension-0.5.0/recension/exceptions.py +69 -0
- recension-0.5.0/recension/leakage.py +154 -0
- recension-0.5.0/recension/models/__init__.py +11 -0
- recension-0.5.0/recension/models/anthropic.py +136 -0
- recension-0.5.0/recension/models/base.py +85 -0
- recension-0.5.0/recension/models/mock.py +81 -0
- recension-0.5.0/recension/objective.py +214 -0
- recension-0.5.0/recension/optimizer.py +772 -0
- recension-0.5.0/recension/proposer.py +294 -0
- recension-0.5.0/recension/py.typed +0 -0
- recension-0.5.0/recension/record.py +453 -0
- recension-0.5.0/recension/report.py +180 -0
- recension-0.5.0/recension/stats.py +96 -0
- recension-0.5.0/tests/test_artifact.py +164 -0
- recension-0.5.0/tests/test_budget.py +34 -0
- recension-0.5.0/tests/test_cli.py +228 -0
- recension-0.5.0/tests/test_evalset.py +118 -0
- recension-0.5.0/tests/test_leakage.py +75 -0
- recension-0.5.0/tests/test_models.py +109 -0
- recension-0.5.0/tests/test_objective.py +121 -0
- recension-0.5.0/tests/test_optimizer.py +622 -0
- recension-0.5.0/tests/test_proposer.py +193 -0
- recension-0.5.0/tests/test_record.py +183 -0
- recension-0.5.0/tests/test_report.py +47 -0
- recension-0.5.0/tests/test_stats.py +51 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
|
|
10
|
+
# Tooling caches
|
|
11
|
+
.mypy_cache/
|
|
12
|
+
.pytest_cache/
|
|
13
|
+
.ruff_cache/
|
|
14
|
+
.coverage
|
|
15
|
+
|
|
16
|
+
# Docs / examples build output (regenerated, never committed)
|
|
17
|
+
site/
|
|
18
|
+
docs/examples/generated/
|
|
19
|
+
examples/output/
|
|
20
|
+
|
|
21
|
+
# Run records written by the CLI / examples
|
|
22
|
+
run_record.json
|
|
23
|
+
optimized_prompt.txt
|
|
24
|
+
|
|
25
|
+
# Internal build/spec/maintainer docs, kept local, not published (end-user markdown only)
|
|
26
|
+
CLAUDE.md
|
|
27
|
+
recension_PRD.md
|
|
28
|
+
RELEASING.md
|
|
29
|
+
|
|
30
|
+
# OS / editor
|
|
31
|
+
.DS_Store
|
|
32
|
+
.idea/
|
|
33
|
+
.vscode/
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.5.0] - 2026-06-11
|
|
8
|
+
|
|
9
|
+
Shareable audits and an ecosystem seam.
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- `recension report <record.json> -o report.html` renders a run record into a single standalone HTML
|
|
14
|
+
audit page (inline CSS, no assets, no network): baseline and final scores, the locked test estimate
|
|
15
|
+
and overfit flag, every round's diagnosis and candidates with significance/guard/leakage detail, the
|
|
16
|
+
accepted diff, the per-slice breakdown, the token ledger, and the integrity status. New public
|
|
17
|
+
`render_report`.
|
|
18
|
+
- A pluggable `Proposer` interface (`ReflectiveOptimizer(proposer=...)`) so candidate generation can be
|
|
19
|
+
supplied by an external optimizer (DSPy, GEPA, or your own) while recension keeps owning versioning,
|
|
20
|
+
held-out measurement, leakage detection, and the audit record. Ships `DefaultProposer` (the built-in
|
|
21
|
+
reflective loop) and `CallableProposer` (wrap plain functions). New "Bring your own optimizer" docs.
|
|
22
|
+
|
|
23
|
+
## [0.4.0] - 2026-06-11
|
|
24
|
+
|
|
25
|
+
Multi-dimensional evaluation: one number hides regressions. All additions are opt-in.
|
|
26
|
+
|
|
27
|
+
### Added
|
|
28
|
+
|
|
29
|
+
- Per-slice reporting: set `slice_by` to an `Example.metadata` key and the record carries per-subgroup
|
|
30
|
+
baseline-vs-final scores (`SliceScore`), so a run that improves overall while regressing a segment is
|
|
31
|
+
visible. `slice_tolerance` controls when a slice is announced as regressed.
|
|
32
|
+
- Guarded acceptance: `guards=[...]` of secondary objectives that must not regress. A candidate that
|
|
33
|
+
improves the primary metric but lowers a guard beyond `guard_tolerance` is rejected, with the
|
|
34
|
+
incumbent-vs-candidate guard scores recorded (`GuardScore`). Ships the `MaxLength` guard objective.
|
|
35
|
+
- Cost ledger: an optional model-usage capability (`SupportsUsage` / `TokenUsage`). `MockModel` reports
|
|
36
|
+
synthetic deterministic counts; `AnthropicModel` reads `response.usage`. The record carries per-round
|
|
37
|
+
and total input/output tokens. Models without usage report zeros.
|
|
38
|
+
- CLI config keys: `slice_by`, `slice_tolerance`, `guards`, `guard_tolerance`.
|
|
39
|
+
|
|
40
|
+
## [0.3.0] - 2026-06-11
|
|
41
|
+
|
|
42
|
+
Prompts as tested, tamper-evident artifacts.
|
|
43
|
+
|
|
44
|
+
### Added
|
|
45
|
+
|
|
46
|
+
- `recension check`: a prompt regression guard for CI. Scores the current artifact against a baseline
|
|
47
|
+
(a prior record's score or a literal) on the validation or test split and exits non-zero on a
|
|
48
|
+
regression, so a prompt change that hurts your eval set fails the build. Backed by the new public
|
|
49
|
+
`score_artifact` helper. Docs include a GitHub Actions recipe.
|
|
50
|
+
- Tamper-evident records: `RunRecord.verify()` checks the embedded artifact's content-addressed
|
|
51
|
+
version chain (catching edited text/ids with no external reference), `RunRecord.fingerprint()` is a
|
|
52
|
+
deterministic content hash, and `sign()` / `verify_signature()` add optional HMAC signing
|
|
53
|
+
(`RECENSION_SIGNING_KEY`). New `recension verify` command; `recension show` prints an integrity line.
|
|
54
|
+
`TextArtifact.verify()` exposes the version-chain check directly.
|
|
55
|
+
|
|
56
|
+
## [0.2.0] - 2026-06-10
|
|
57
|
+
|
|
58
|
+
Honest measurement. Both additions are opt-in; 0.1.0 code is unaffected.
|
|
59
|
+
|
|
60
|
+
### Added
|
|
61
|
+
|
|
62
|
+
- Optional locked `test` split on `EvalSet` (records with `split: "test"`). The optimizer scores the
|
|
63
|
+
final incumbent on it exactly once and records `final_test_score`, the `validation`/`test` gap, and
|
|
64
|
+
a `validation_overfit` flag when the gap exceeds `overfit_gap` (default 0.1). This gives an unbiased
|
|
65
|
+
final estimate, which repeated selection on `validation` cannot provide.
|
|
66
|
+
- Significance-based acceptance (`accept_significant`, with `alpha` and `bootstrap_resamples`). When
|
|
67
|
+
on, a candidate is accepted only if its validation gain is statistically significant (a seeded
|
|
68
|
+
paired-bootstrap confidence interval excluding 0), not merely above `min_improvement`. The bootstrap
|
|
69
|
+
is recorded on the round's best candidate. New stdlib-only `recension.stats` module; new
|
|
70
|
+
`SignificanceRecord`.
|
|
71
|
+
- CLI config keys for the above (`accept_significant`, `alpha`, `bootstrap_resamples`, `overfit_gap`)
|
|
72
|
+
and a `test` split in `examples/cli/`.
|
|
73
|
+
|
|
74
|
+
## [0.1.0] - 2026-06-10
|
|
75
|
+
|
|
76
|
+
Initial release.
|
|
77
|
+
|
|
78
|
+
### Added
|
|
79
|
+
|
|
80
|
+
- `TextArtifact`: versioned text with content-addressed version ids, unified
|
|
81
|
+
diffs (stdlib `difflib`), append-only rollback, and full `Provenance` on
|
|
82
|
+
every accepted version (diagnosis, scores, rejected sibling candidates, diff).
|
|
83
|
+
- `EvalSet` / `Example` with an enforced train/validation split; loaders
|
|
84
|
+
`from_records` and `from_jsonl`. Split-integrity violations raise
|
|
85
|
+
`DegenerateEvalError`.
|
|
86
|
+
- Objectives: `ExactMatch`, token-level `F1`, and model-graded `LLMJudge`
|
|
87
|
+
(flagged as model-graded in run records).
|
|
88
|
+
- `ReflectiveOptimizer`: the propose/test/accept loop, with failures diagnosed on
|
|
89
|
+
train, distinct candidates generated and scored on validation, acceptance
|
|
90
|
+
gated on `min_improvement` plus leakage checks.
|
|
91
|
+
- `Budget`: caller-controlled candidates per round, rounds, diagnosis depth,
|
|
92
|
+
and a hard `max_model_calls` ceiling (`BudgetExceeded` carries the partial
|
|
93
|
+
audit record).
|
|
94
|
+
- Leakage heuristics: verbatim validation spans and implausible
|
|
95
|
+
validation-vs-train gains; surfaced as flags, or raised via strict mode.
|
|
96
|
+
- `RunRecord` / `RoundRecord` / `CandidateRecord`: a complete, serializable
|
|
97
|
+
audit record of every run, with the full artifact embedded.
|
|
98
|
+
- Model layer: provider-agnostic `Model` protocol, deterministic `MockModel`
|
|
99
|
+
(the entire test suite runs offline), and an optional Anthropic backend
|
|
100
|
+
(`recension[anthropic]`; API key from the environment only).
|
|
101
|
+
- CLI: `recension run --config run.yaml`, `recension show`, `recension diff`.
|
|
102
|
+
- Three reproducible worked examples and a docs site (MkDocs + API reference)
|
|
103
|
+
with pages regenerated from real offline runs.
|
recension-0.5.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anthony Nyström
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
recension-0.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: recension
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Measured optimization of the text layer around a language model: versioned artifacts, held-out evaluation, leakage detection, and a complete audit record.
|
|
5
|
+
Project-URL: Homepage, https://github.com/AnthonyNystrom/recension
|
|
6
|
+
Project-URL: Documentation, https://anthonynystrom.github.io/recension/
|
|
7
|
+
Project-URL: Repository, https://github.com/AnthonyNystrom/recension
|
|
8
|
+
Project-URL: Changelog, https://github.com/AnthonyNystrom/recension/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Anthony Nyström <nystrom.anthony@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: auditability,evaluation,llm,prompt-optimization,provenance
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.12
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Provides-Extra: anthropic
|
|
27
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
30
|
+
Requires-Dist: mypy>=1.13; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
33
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: types-pyyaml; extra == 'dev'
|
|
35
|
+
Provides-Extra: docs
|
|
36
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
37
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
38
|
+
Requires-Dist: mkdocstrings[python]>=0.27; extra == 'docs'
|
|
39
|
+
Provides-Extra: webapp
|
|
40
|
+
Requires-Dist: flask>=3.0; extra == 'webapp'
|
|
41
|
+
Description-Content-Type: text/markdown
|
|
42
|
+
|
|
43
|
+
<p align="center">
|
|
44
|
+
<picture>
|
|
45
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/AnthonyNystrom/recension/main/docs/assets/recension-logo-dark.svg">
|
|
46
|
+
<img src="https://raw.githubusercontent.com/AnthonyNystrom/recension/main/docs/assets/recension-logo.svg" alt="recension" width="340">
|
|
47
|
+
</picture>
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
# recension
|
|
51
|
+
|
|
52
|
+
Measured optimization of the text layer around a language model (prompts, context templates, skill and instruction files) with the rigor normally reserved for weight training: a held-out objective, a baseline, versioned artifacts, and a complete audit trail.
|
|
53
|
+
|
|
54
|
+
The name comes from textual criticism. A *recension* is the revision of a text by collating variant readings and keeping the best-supported one. That is the loop this library runs: propose multiple candidate edits, test each against held-out evidence, commit only what measurably improves, and record why.
|
|
55
|
+
|
|
56
|
+
## Why
|
|
57
|
+
|
|
58
|
+
The usual way to improve a prompt is to edit it, eyeball a few outputs, and ship. There is no held-out measurement, no record of why a change was made, and no defense against overfitting to the handful of cases you inspected. `recension` replaces that loop with a measured one:
|
|
59
|
+
|
|
60
|
+
- **No edit is accepted without a held-out score that beats the incumbent.** Failures are diagnosed on a train split; acceptance happens only on a validation split, and can require the gain to be *statistically significant* rather than above an epsilon. An optional locked test split gives an unbiased final estimate.
|
|
61
|
+
- **Every accepted version carries provenance**: the failures that motivated it, the diagnosis, every sibling candidate considered (with scores), and the diff. A reviewer who didn't run the optimization can reconstruct every decision.
|
|
62
|
+
- **One number doesn't hide regressions.** Optional per-slice scores, non-regression guard objectives, and a token-cost ledger surface what an aggregate averages away.
|
|
63
|
+
- **Leakage is checked, not assumed away.** Heuristics flag candidates that embed validation content or show implausible validation gains.
|
|
64
|
+
- **Records are built to be acted on.** Gate a prompt in CI with `recension check`, detect tampering with `recension verify`, and share a standalone HTML audit with `recension report`.
|
|
65
|
+
- **Compute is a dial.** Candidates per round, rounds, diagnosis depth, and a hard ceiling on model calls are all caller-controlled.
|
|
66
|
+
|
|
67
|
+
## Real-world use cases
|
|
68
|
+
|
|
69
|
+
- **Production classification and extraction** (support-ticket triage, invoice fields, moderation): improve a labeling prompt on labeled data with measured, regression-safe edits.
|
|
70
|
+
- **RAG context templates**: tune how retrieved chunks are assembled into the prompt with the model held fixed, so the metric move is attributable to the text.
|
|
71
|
+
- **Agent and skill instructions**: optimize longer instruction files judged by an `LLMJudge` rubric when there is no gold answer.
|
|
72
|
+
- **Governance and audit**: ship a replayable, tamper-evident `RunRecord` for every prompt change; gate merges in CI with `recension check`, and hand reviewers a standalone HTML audit with `recension report`.
|
|
73
|
+
|
|
74
|
+
Full write-ups, plus a "how it works" walkthrough, are on the [documentation site](https://anthonynystrom.github.io/recension/).
|
|
75
|
+
|
|
76
|
+
## Prior art, honestly
|
|
77
|
+
|
|
78
|
+
DSPy and GEPA own the optimization mechanics this library's `ReflectiveOptimizer` performs; if you want state-of-the-art prompt optimization algorithms, look there. `recension`'s contribution is the **measurement and governance shell** around a text artifact: versioned artifacts with provenance, leakage detection, the complete audit record, and budgeted update-time compute. That delegation is a real seam, not a promise: the `Proposer` protocol lets an external engine supply the candidate edits while recension keeps owning the artifact, the measurement, and the record (see the [Bring your own optimizer](https://anthonynystrom.github.io/recension/ecosystem/) guide).
|
|
79
|
+
|
|
80
|
+
## Install
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install recension # core: zero provider dependencies
|
|
84
|
+
pip install "recension[anthropic]" # adds the Anthropic backend
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Python 3.12+. The core (and the whole test suite) runs against a deterministic `MockModel` with no API key and no network.
|
|
88
|
+
|
|
89
|
+
## Quickstart
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from recension import (
|
|
93
|
+
Budget, EvalSet, ExactMatch, MockModel, ReflectiveOptimizer, TextArtifact,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
artifact = TextArtifact.from_text("Label the sentiment of the message.")
|
|
97
|
+
|
|
98
|
+
# Evaluation examples, split into train (for diagnosis) and validation (for
|
|
99
|
+
# acceptance). Load from a JSONL file with EvalSet.from_jsonl(path) instead.
|
|
100
|
+
evalset = EvalSet.from_records([
|
|
101
|
+
{"id": "t1", "input": "Absolutely love this", "expected": "positive", "split": "train"},
|
|
102
|
+
{"id": "t2", "input": "Broke after a day", "expected": "negative", "split": "train"},
|
|
103
|
+
{"id": "v1", "input": "Terrible support", "expected": "negative", "split": "validation"},
|
|
104
|
+
{"id": "v2", "input": "Exceeded expectations", "expected": "positive", "split": "validation"},
|
|
105
|
+
])
|
|
106
|
+
|
|
107
|
+
optimizer = ReflectiveOptimizer(
|
|
108
|
+
artifact=artifact,
|
|
109
|
+
evalset=evalset,
|
|
110
|
+
objective=ExactMatch(),
|
|
111
|
+
model=MockModel(), # offline mock; see below for the real backend
|
|
112
|
+
budget=Budget(candidates_per_round=4, rounds=3, max_model_calls=200),
|
|
113
|
+
seed=7,
|
|
114
|
+
)
|
|
115
|
+
record = optimizer.run()
|
|
116
|
+
|
|
117
|
+
print(record.summary()) # baseline → accepted versions → final score
|
|
118
|
+
record.save("run_record.json") # the complete audit artifact
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
To run against a real model, install the extra (`pip install "recension[anthropic]"`), set `ANTHROPIC_API_KEY` in your environment, and use the Anthropic backend:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from recension.models.anthropic import AnthropicModel # kept off the top-level import so the core needs no provider deps
|
|
125
|
+
|
|
126
|
+
optimizer = ReflectiveOptimizer(..., model=AnthropicModel())
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
API keys are read from the environment only, never from code or config.
|
|
130
|
+
|
|
131
|
+
## CLI
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
recension run --config run.yaml # execute an optimization, write the record
|
|
135
|
+
recension show run_record.json # baseline, accepted diffs, score progression, integrity
|
|
136
|
+
recension diff run_record.json vA vB # diff between two artifact versions
|
|
137
|
+
recension check --config run.yaml --baseline run_record.json # CI guard: exit non-zero on regression
|
|
138
|
+
recension verify run_record.json # detect tampering (content-addressed version chain)
|
|
139
|
+
recension report run_record.json -o report.html # standalone HTML audit page
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
A runnable, fully commented config and dataset live in [`examples/cli/`](https://github.com/AnthonyNystrom/recension/tree/main/examples/cli) (`recension run --config examples/cli/run.yaml`); the config schema and a GitHub Actions recipe for `recension check` are in the [CLI guide](https://anthonynystrom.github.io/recension/cli/).
|
|
143
|
+
|
|
144
|
+
## Documentation
|
|
145
|
+
|
|
146
|
+
Full docs, API reference, and three worked examples (each reproducible offline against `MockModel`): **https://anthonynystrom.github.io/recension/**
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
MIT
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/AnthonyNystrom/recension/main/docs/assets/recension-logo-dark.svg">
|
|
4
|
+
<img src="https://raw.githubusercontent.com/AnthonyNystrom/recension/main/docs/assets/recension-logo.svg" alt="recension" width="340">
|
|
5
|
+
</picture>
|
|
6
|
+
</p>
|
|
7
|
+
|
|
8
|
+
# recension
|
|
9
|
+
|
|
10
|
+
Measured optimization of the text layer around a language model (prompts, context templates, skill and instruction files) with the rigor normally reserved for weight training: a held-out objective, a baseline, versioned artifacts, and a complete audit trail.
|
|
11
|
+
|
|
12
|
+
The name comes from textual criticism. A *recension* is the revision of a text by collating variant readings and keeping the best-supported one. That is the loop this library runs: propose multiple candidate edits, test each against held-out evidence, commit only what measurably improves, and record why.
|
|
13
|
+
|
|
14
|
+
## Why
|
|
15
|
+
|
|
16
|
+
The usual way to improve a prompt is to edit it, eyeball a few outputs, and ship. There is no held-out measurement, no record of why a change was made, and no defense against overfitting to the handful of cases you inspected. `recension` replaces that loop with a measured one:
|
|
17
|
+
|
|
18
|
+
- **No edit is accepted without a held-out score that beats the incumbent.** Failures are diagnosed on a train split; acceptance happens only on a validation split, and can require the gain to be *statistically significant* rather than above an epsilon. An optional locked test split gives an unbiased final estimate.
|
|
19
|
+
- **Every accepted version carries provenance**: the failures that motivated it, the diagnosis, every sibling candidate considered (with scores), and the diff. A reviewer who didn't run the optimization can reconstruct every decision.
|
|
20
|
+
- **One number doesn't hide regressions.** Optional per-slice scores, non-regression guard objectives, and a token-cost ledger surface what an aggregate averages away.
|
|
21
|
+
- **Leakage is checked, not assumed away.** Heuristics flag candidates that embed validation content or show implausible validation gains.
|
|
22
|
+
- **Records are built to be acted on.** Gate a prompt in CI with `recension check`, detect tampering with `recension verify`, and share a standalone HTML audit with `recension report`.
|
|
23
|
+
- **Compute is a dial.** Candidates per round, rounds, diagnosis depth, and a hard ceiling on model calls are all caller-controlled.
|
|
24
|
+
|
|
25
|
+
## Real-world use cases
|
|
26
|
+
|
|
27
|
+
- **Production classification and extraction** (support-ticket triage, invoice fields, moderation): improve a labeling prompt on labeled data with measured, regression-safe edits.
|
|
28
|
+
- **RAG context templates**: tune how retrieved chunks are assembled into the prompt with the model held fixed, so the metric move is attributable to the text.
|
|
29
|
+
- **Agent and skill instructions**: optimize longer instruction files judged by an `LLMJudge` rubric when there is no gold answer.
|
|
30
|
+
- **Governance and audit**: ship a replayable, tamper-evident `RunRecord` for every prompt change; gate merges in CI with `recension check`, and hand reviewers a standalone HTML audit with `recension report`.
|
|
31
|
+
|
|
32
|
+
Full write-ups, plus a "how it works" walkthrough, are on the [documentation site](https://anthonynystrom.github.io/recension/).
|
|
33
|
+
|
|
34
|
+
## Prior art, honestly
|
|
35
|
+
|
|
36
|
+
DSPy and GEPA own the optimization mechanics this library's `ReflectiveOptimizer` performs; if you want state-of-the-art prompt optimization algorithms, look there. `recension`'s contribution is the **measurement and governance shell** around a text artifact: versioned artifacts with provenance, leakage detection, the complete audit record, and budgeted update-time compute. That delegation is a real seam, not a promise: the `Proposer` protocol lets an external engine supply the candidate edits while recension keeps owning the artifact, the measurement, and the record (see the [Bring your own optimizer](https://anthonynystrom.github.io/recension/ecosystem/) guide).
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install recension # core: zero provider dependencies
|
|
42
|
+
pip install "recension[anthropic]" # adds the Anthropic backend
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Python 3.12+. The core (and the whole test suite) runs against a deterministic `MockModel` with no API key and no network.
|
|
46
|
+
|
|
47
|
+
## Quickstart
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from recension import (
|
|
51
|
+
Budget, EvalSet, ExactMatch, MockModel, ReflectiveOptimizer, TextArtifact,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
artifact = TextArtifact.from_text("Label the sentiment of the message.")
|
|
55
|
+
|
|
56
|
+
# Evaluation examples, split into train (for diagnosis) and validation (for
|
|
57
|
+
# acceptance). Load from a JSONL file with EvalSet.from_jsonl(path) instead.
|
|
58
|
+
evalset = EvalSet.from_records([
|
|
59
|
+
{"id": "t1", "input": "Absolutely love this", "expected": "positive", "split": "train"},
|
|
60
|
+
{"id": "t2", "input": "Broke after a day", "expected": "negative", "split": "train"},
|
|
61
|
+
{"id": "v1", "input": "Terrible support", "expected": "negative", "split": "validation"},
|
|
62
|
+
{"id": "v2", "input": "Exceeded expectations", "expected": "positive", "split": "validation"},
|
|
63
|
+
])
|
|
64
|
+
|
|
65
|
+
optimizer = ReflectiveOptimizer(
|
|
66
|
+
artifact=artifact,
|
|
67
|
+
evalset=evalset,
|
|
68
|
+
objective=ExactMatch(),
|
|
69
|
+
model=MockModel(), # offline mock; see below for the real backend
|
|
70
|
+
budget=Budget(candidates_per_round=4, rounds=3, max_model_calls=200),
|
|
71
|
+
seed=7,
|
|
72
|
+
)
|
|
73
|
+
record = optimizer.run()
|
|
74
|
+
|
|
75
|
+
print(record.summary()) # baseline → accepted versions → final score
|
|
76
|
+
record.save("run_record.json") # the complete audit artifact
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
To run against a real model, install the extra (`pip install "recension[anthropic]"`), set `ANTHROPIC_API_KEY` in your environment, and use the Anthropic backend:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from recension.models.anthropic import AnthropicModel # kept off the top-level import so the core needs no provider deps
|
|
83
|
+
|
|
84
|
+
optimizer = ReflectiveOptimizer(..., model=AnthropicModel())
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
API keys are read from the environment only, never from code or config.
|
|
88
|
+
|
|
89
|
+
## CLI
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
recension run --config run.yaml # execute an optimization, write the record
|
|
93
|
+
recension show run_record.json # baseline, accepted diffs, score progression, integrity
|
|
94
|
+
recension diff run_record.json vA vB # diff between two artifact versions
|
|
95
|
+
recension check --config run.yaml --baseline run_record.json # CI guard: exit non-zero on regression
|
|
96
|
+
recension verify run_record.json # detect tampering (content-addressed version chain)
|
|
97
|
+
recension report run_record.json -o report.html # standalone HTML audit page
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
A runnable, fully commented config and dataset live in [`examples/cli/`](https://github.com/AnthonyNystrom/recension/tree/main/examples/cli) (`recension run --config examples/cli/run.yaml`); the config schema and a GitHub Actions recipe for `recension check` are in the [CLI guide](https://anthonynystrom.github.io/recension/cli/).
|
|
101
|
+
|
|
102
|
+
## Documentation
|
|
103
|
+
|
|
104
|
+
Full docs, API reference, and three worked examples (each reproducible offline against `MockModel`): **https://anthonynystrom.github.io/recension/**
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "recension"
|
|
7
|
+
version = "0.5.0"
|
|
8
|
+
description = "Measured optimization of the text layer around a language model: versioned artifacts, held-out evaluation, leakage detection, and a complete audit record."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Anthony Nyström", email = "nystrom.anthony@gmail.com" }]
|
|
13
|
+
keywords = ["prompt-optimization", "llm", "evaluation", "provenance", "auditability"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Programming Language :: Python :: 3.14",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
"Typing :: Typed",
|
|
26
|
+
]
|
|
27
|
+
# DESIGN NOTE: PyYAML is the single runtime dependency, required only because the
|
|
28
|
+
# PRD specifies YAML config files for the CLI. Everything else is stdlib.
|
|
29
|
+
dependencies = ["pyyaml>=6.0"]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
anthropic = ["anthropic>=0.40"]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=8.0",
|
|
35
|
+
"mypy>=1.13",
|
|
36
|
+
"ruff>=0.8",
|
|
37
|
+
"build>=1.2",
|
|
38
|
+
"twine>=5.0",
|
|
39
|
+
"types-PyYAML",
|
|
40
|
+
]
|
|
41
|
+
docs = [
|
|
42
|
+
"mkdocs>=1.6",
|
|
43
|
+
"mkdocs-material>=9.5",
|
|
44
|
+
"mkdocstrings[python]>=0.27",
|
|
45
|
+
]
|
|
46
|
+
# DESIGN NOTE: Flask powers only the optional `examples/webapp` live demo; it is
|
|
47
|
+
# never imported by the library, so it stays out of the core and docs groups.
|
|
48
|
+
webapp = ["flask>=3.0"]
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
recension = "recension.cli:main"
|
|
52
|
+
|
|
53
|
+
[project.urls]
|
|
54
|
+
Homepage = "https://github.com/AnthonyNystrom/recension"
|
|
55
|
+
Documentation = "https://anthonynystrom.github.io/recension/"
|
|
56
|
+
Repository = "https://github.com/AnthonyNystrom/recension"
|
|
57
|
+
Changelog = "https://github.com/AnthonyNystrom/recension/blob/main/CHANGELOG.md"
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = ["recension"]
|
|
61
|
+
|
|
62
|
+
[tool.hatch.build.targets.sdist]
|
|
63
|
+
include = ["recension", "tests", "README.md", "CHANGELOG.md", "LICENSE"]
|
|
64
|
+
|
|
65
|
+
[tool.ruff]
|
|
66
|
+
line-length = 100
|
|
67
|
+
target-version = "py312"
|
|
68
|
+
|
|
69
|
+
[tool.ruff.lint]
|
|
70
|
+
select = ["E", "F", "W", "I", "UP", "B"]
|
|
71
|
+
|
|
72
|
+
[tool.mypy]
|
|
73
|
+
strict = true
|
|
74
|
+
python_version = "3.12"
|
|
75
|
+
files = ["recension", "tests"]
|
|
76
|
+
|
|
77
|
+
[[tool.mypy.overrides]]
|
|
78
|
+
module = "anthropic.*"
|
|
79
|
+
ignore_missing_imports = true
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
testpaths = ["tests"]
|
|
83
|
+
addopts = "-q"
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""recension: measured optimization of the text layer around a language model.
|
|
2
|
+
|
|
3
|
+
Versioned artifacts with provenance, held-out evaluation, leakage detection,
|
|
4
|
+
and a complete audit record. See the README for the prior-art boundary:
|
|
5
|
+
optimization mechanics are well covered by DSPy and GEPA; recension's
|
|
6
|
+
contribution is the measurement-and-governance shell around a text artifact.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .artifact import Provenance, RejectedCandidate, TextArtifact, Version
|
|
12
|
+
from .budget import Budget
|
|
13
|
+
from .evalset import EvalSet, Example
|
|
14
|
+
from .exceptions import (
|
|
15
|
+
ArtifactError,
|
|
16
|
+
BudgetExceeded,
|
|
17
|
+
ConfigError,
|
|
18
|
+
DegenerateEvalError,
|
|
19
|
+
LeakageDetected,
|
|
20
|
+
RecensionError,
|
|
21
|
+
)
|
|
22
|
+
from .models import Message, MockModel, Model
|
|
23
|
+
from .objective import F1, ExactMatch, LLMJudge, MaxLength, Objective
|
|
24
|
+
from .optimizer import ReflectiveOptimizer, score_artifact
|
|
25
|
+
from .proposer import CallableProposer, DefaultProposer, FailureCase, Proposer
|
|
26
|
+
from .record import (
|
|
27
|
+
CandidateRecord,
|
|
28
|
+
GuardScore,
|
|
29
|
+
RoundRecord,
|
|
30
|
+
RunRecord,
|
|
31
|
+
SignificanceRecord,
|
|
32
|
+
SliceScore,
|
|
33
|
+
)
|
|
34
|
+
from .report import render_report
|
|
35
|
+
|
|
36
|
+
__version__ = "0.5.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"ArtifactError",
|
|
40
|
+
"Budget",
|
|
41
|
+
"BudgetExceeded",
|
|
42
|
+
"CallableProposer",
|
|
43
|
+
"CandidateRecord",
|
|
44
|
+
"ConfigError",
|
|
45
|
+
"DefaultProposer",
|
|
46
|
+
"DegenerateEvalError",
|
|
47
|
+
"EvalSet",
|
|
48
|
+
"ExactMatch",
|
|
49
|
+
"Example",
|
|
50
|
+
"F1",
|
|
51
|
+
"FailureCase",
|
|
52
|
+
"GuardScore",
|
|
53
|
+
"LLMJudge",
|
|
54
|
+
"MaxLength",
|
|
55
|
+
"LeakageDetected",
|
|
56
|
+
"Message",
|
|
57
|
+
"MockModel",
|
|
58
|
+
"Model",
|
|
59
|
+
"Objective",
|
|
60
|
+
"Proposer",
|
|
61
|
+
"Provenance",
|
|
62
|
+
"RecensionError",
|
|
63
|
+
"ReflectiveOptimizer",
|
|
64
|
+
"RejectedCandidate",
|
|
65
|
+
"RoundRecord",
|
|
66
|
+
"RunRecord",
|
|
67
|
+
"SignificanceRecord",
|
|
68
|
+
"SliceScore",
|
|
69
|
+
"TextArtifact",
|
|
70
|
+
"render_report",
|
|
71
|
+
"score_artifact",
|
|
72
|
+
"Version",
|
|
73
|
+
"__version__",
|
|
74
|
+
]
|