recension 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. recension-0.5.0/.gitignore +33 -0
  2. recension-0.5.0/CHANGELOG.md +103 -0
  3. recension-0.5.0/LICENSE +21 -0
  4. recension-0.5.0/PKG-INFO +150 -0
  5. recension-0.5.0/README.md +108 -0
  6. recension-0.5.0/pyproject.toml +83 -0
  7. recension-0.5.0/recension/__init__.py +74 -0
  8. recension-0.5.0/recension/artifact.py +337 -0
  9. recension-0.5.0/recension/budget.py +52 -0
  10. recension-0.5.0/recension/cli.py +374 -0
  11. recension-0.5.0/recension/evalset.py +166 -0
  12. recension-0.5.0/recension/exceptions.py +69 -0
  13. recension-0.5.0/recension/leakage.py +154 -0
  14. recension-0.5.0/recension/models/__init__.py +11 -0
  15. recension-0.5.0/recension/models/anthropic.py +136 -0
  16. recension-0.5.0/recension/models/base.py +85 -0
  17. recension-0.5.0/recension/models/mock.py +81 -0
  18. recension-0.5.0/recension/objective.py +214 -0
  19. recension-0.5.0/recension/optimizer.py +772 -0
  20. recension-0.5.0/recension/proposer.py +294 -0
  21. recension-0.5.0/recension/py.typed +0 -0
  22. recension-0.5.0/recension/record.py +453 -0
  23. recension-0.5.0/recension/report.py +180 -0
  24. recension-0.5.0/recension/stats.py +96 -0
  25. recension-0.5.0/tests/test_artifact.py +164 -0
  26. recension-0.5.0/tests/test_budget.py +34 -0
  27. recension-0.5.0/tests/test_cli.py +228 -0
  28. recension-0.5.0/tests/test_evalset.py +118 -0
  29. recension-0.5.0/tests/test_leakage.py +75 -0
  30. recension-0.5.0/tests/test_models.py +109 -0
  31. recension-0.5.0/tests/test_objective.py +121 -0
  32. recension-0.5.0/tests/test_optimizer.py +622 -0
  33. recension-0.5.0/tests/test_proposer.py +193 -0
  34. recension-0.5.0/tests/test_record.py +183 -0
  35. recension-0.5.0/tests/test_report.py +47 -0
  36. recension-0.5.0/tests/test_stats.py +51 -0
@@ -0,0 +1,33 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+
10
+ # Tooling caches
11
+ .mypy_cache/
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .coverage
15
+
16
+ # Docs / examples build output (regenerated, never committed)
17
+ site/
18
+ docs/examples/generated/
19
+ examples/output/
20
+
21
+ # Run outputs written by the CLI / examples
22
+ run_record.json
23
+ optimized_prompt.txt
24
+
25
+ # Internal build/spec/maintainer docs: kept local and never published (only end-user markdown ships)
26
+ CLAUDE.md
27
+ recension_PRD.md
28
+ RELEASING.md
29
+
30
+ # OS / editor
31
+ .DS_Store
32
+ .idea/
33
+ .vscode/
@@ -0,0 +1,103 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
5
+ adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.5.0] - 2026-06-11
8
+
9
+ Shareable audits and an ecosystem seam.
10
+
11
+ ### Added
12
+
13
+ - `recension report <record.json> -o report.html` renders a run record into a single standalone HTML
14
+ audit page (inline CSS, no assets, no network): baseline and final scores, the locked test estimate
15
+ and overfit flag, every round's diagnosis and candidates with significance/guard/leakage detail, the
16
+ accepted diff, the per-slice breakdown, the token ledger, and the integrity status. New public
17
+ `render_report`.
18
+ - A pluggable `Proposer` interface (`ReflectiveOptimizer(proposer=...)`) so candidate generation can be
19
+ supplied by an external optimizer (DSPy, GEPA, or your own) while recension keeps owning versioning,
20
+ held-out measurement, leakage detection, and the audit record. Ships `DefaultProposer` (the built-in
21
+ reflective loop) and `CallableProposer` (wrap plain functions). New "Bring your own optimizer" docs.
22
+
23
+ ## [0.4.0] - 2026-06-11
24
+
25
+ Multi-dimensional evaluation: one number hides regressions. All additions are opt-in.
26
+
27
+ ### Added
28
+
29
+ - Per-slice reporting: set `slice_by` to an `Example.metadata` key and the record carries per-subgroup
30
+ baseline-vs-final scores (`SliceScore`), so a run that improves overall while regressing a segment is
31
+ visible. `slice_tolerance` controls when a slice is announced as regressed.
32
+ - Guarded acceptance: `guards=[...]` of secondary objectives that must not regress. A candidate that
33
+ improves the primary metric but lowers a guard beyond `guard_tolerance` is rejected, with the
34
+ incumbent-vs-candidate guard scores recorded (`GuardScore`). Ships the `MaxLength` guard objective.
35
+ - Cost ledger: an optional model-usage capability (`SupportsUsage` / `TokenUsage`). `MockModel` reports
36
+ synthetic deterministic counts; `AnthropicModel` reads `response.usage`. The record carries per-round
37
+ and total input/output tokens. Models without usage report zeros.
38
+ - CLI config keys: `slice_by`, `slice_tolerance`, `guards`, `guard_tolerance`.
39
+
40
+ ## [0.3.0] - 2026-06-11
41
+
42
+ Prompts as tested, tamper-evident artifacts.
43
+
44
+ ### Added
45
+
46
+ - `recension check`: a prompt regression guard for CI. Scores the current artifact against a baseline
47
+ (a prior record's score or a literal) on the validation or test split and exits non-zero on a
48
+ regression, so a prompt change that hurts your eval set fails the build. Backed by the new public
49
+ `score_artifact` helper. Docs include a GitHub Actions recipe.
50
+ - Tamper-evident records: `RunRecord.verify()` checks the embedded artifact's content-addressed
51
+ version chain (catching edited text/ids with no external reference), `RunRecord.fingerprint()` is a
52
+ deterministic content hash, and `sign()` / `verify_signature()` add optional HMAC signing
53
+ (`RECENSION_SIGNING_KEY`). New `recension verify` command; `recension show` prints an integrity line.
54
+ `TextArtifact.verify()` exposes the version-chain check directly.
55
+
56
+ ## [0.2.0] - 2026-06-10
57
+
58
+ Honest measurement. Both additions are opt-in; 0.1.0 code is unaffected.
59
+
60
+ ### Added
61
+
62
+ - Optional locked `test` split on `EvalSet` (records with `split: "test"`). The optimizer scores the
63
+ final incumbent on it exactly once and records `final_test_score`, the `validation`/`test` gap, and
64
+ a `validation_overfit` flag when the gap exceeds `overfit_gap` (default 0.1). This gives an unbiased
65
+ final estimate, which a `validation` score subject to repeated selection cannot provide.
66
+ - Significance-based acceptance (`accept_significant`, with `alpha` and `bootstrap_resamples`). When
67
+ on, a candidate is accepted only if its validation gain is statistically significant (a seeded
68
+ paired-bootstrap confidence interval excluding 0), not merely above `min_improvement`. The bootstrap
69
+ is recorded on the round's best candidate. New stdlib-only `recension.stats` module; new
70
+ `SignificanceRecord`.
71
+ - CLI config keys for the above (`accept_significant`, `alpha`, `bootstrap_resamples`, `overfit_gap`)
72
+ and a `test` split in `examples/cli/`.
73
+
74
+ ## [0.1.0] - 2026-06-10
75
+
76
+ Initial release.
77
+
78
+ ### Added
79
+
80
+ - `TextArtifact`: versioned text with content-addressed version ids, unified
81
+ diffs (stdlib `difflib`), append-only rollback, and full `Provenance` on
82
+ every accepted version (diagnosis, scores, rejected sibling candidates, diff).
83
+ - `EvalSet` / `Example` with an enforced train/validation split; loaders
84
+ `from_records` and `from_jsonl`. Split-integrity violations raise
85
+ `DegenerateEvalError`.
86
+ - Objectives: `ExactMatch`, token-level `F1`, and model-graded `LLMJudge`
87
+ (flagged as model-graded in run records).
88
+ - `ReflectiveOptimizer`: the propose/test/accept loop, with failures diagnosed on
89
+ train, distinct candidates generated and scored on validation, acceptance
90
+ gated on `min_improvement` plus leakage checks.
91
+ - `Budget`: caller-controlled candidates per round, rounds, diagnosis depth,
92
+ and a hard `max_model_calls` ceiling (`BudgetExceeded` carries the partial
93
+ audit record).
94
+ - Leakage heuristics: verbatim validation spans and implausible
95
+ validation-vs-train gains; surfaced as flags, or raised via strict mode.
96
+ - `RunRecord` / `RoundRecord` / `CandidateRecord`: a complete, serializable
97
+ audit record of every run, with the full artifact embedded.
98
+ - Model layer: provider-agnostic `Model` protocol, deterministic `MockModel`
99
+ (the entire test suite runs offline), and an optional Anthropic backend
100
+ (`recension[anthropic]`; API key from the environment only).
101
+ - CLI: `recension run --config run.yaml`, `recension show`, `recension diff`.
102
+ - Three reproducible worked examples and a docs site (MkDocs + API reference)
103
+ with pages regenerated from real offline runs.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anthony Nyström
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: recension
3
+ Version: 0.5.0
4
+ Summary: Measured optimization of the text layer around a language model: versioned artifacts, held-out evaluation, leakage detection, and a complete audit record.
5
+ Project-URL: Homepage, https://github.com/AnthonyNystrom/recension
6
+ Project-URL: Documentation, https://anthonynystrom.github.io/recension/
7
+ Project-URL: Repository, https://github.com/AnthonyNystrom/recension
8
+ Project-URL: Changelog, https://github.com/AnthonyNystrom/recension/blob/main/CHANGELOG.md
9
+ Author-email: Anthony Nyström <nystrom.anthony@gmail.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: auditability,evaluation,llm,prompt-optimization,provenance
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.12
25
+ Requires-Dist: pyyaml>=6.0
26
+ Provides-Extra: anthropic
27
+ Requires-Dist: anthropic>=0.40; extra == 'anthropic'
28
+ Provides-Extra: dev
29
+ Requires-Dist: build>=1.2; extra == 'dev'
30
+ Requires-Dist: mypy>=1.13; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.8; extra == 'dev'
33
+ Requires-Dist: twine>=5.0; extra == 'dev'
34
+ Requires-Dist: types-pyyaml; extra == 'dev'
35
+ Provides-Extra: docs
36
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
37
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
38
+ Requires-Dist: mkdocstrings[python]>=0.27; extra == 'docs'
39
+ Provides-Extra: webapp
40
+ Requires-Dist: flask>=3.0; extra == 'webapp'
41
+ Description-Content-Type: text/markdown
42
+
43
+ <p align="center">
44
+ <picture>
45
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/recension-logo-dark.svg">
46
+ <img src="docs/assets/recension-logo.svg" alt="recension" width="340">
47
+ </picture>
48
+ </p>
49
+
50
+ # recension
51
+
52
+ Measured optimization of the text layer around a language model (prompts, context templates, skill and instruction files) with the rigor normally reserved for weight training: a held-out objective, a baseline, versioned artifacts, and a complete audit trail.
53
+
54
+ The name comes from textual criticism. A *recension* is the revision of a text by collating variant readings and keeping the best-supported one. That is the loop this library runs: propose multiple candidate edits, test each against held-out evidence, commit only what measurably improves, and record why.
55
+
56
+ ## Why
57
+
58
+ The usual way to improve a prompt is to edit it, eyeball a few outputs, and ship. There is no held-out measurement, no record of why a change was made, and no defense against overfitting to the handful of cases you inspected. `recension` replaces that loop with a measured one:
59
+
60
+ - **No edit is accepted without a held-out score that beats the incumbent.** Failures are diagnosed on a train split; acceptance happens only on a validation split, and can require the gain to be *statistically significant* rather than merely above a fixed threshold (`min_improvement`). An optional locked test split gives an unbiased final estimate.
61
+ - **Every accepted version carries provenance**: the failures that motivated it, the diagnosis, every sibling candidate considered (with scores), and the diff. A reviewer who didn't run the optimization can reconstruct every decision.
62
+ - **One number doesn't hide regressions.** Optional per-slice scores, non-regression guard objectives, and a token-cost ledger surface what an aggregate averages away.
63
+ - **Leakage is checked, not assumed away.** Heuristics flag candidates that embed validation content or show implausible validation gains.
64
+ - **Records are built to be acted on.** Gate a prompt in CI with `recension check`, detect tampering with `recension verify`, and share a standalone HTML audit with `recension report`.
65
+ - **Compute is a dial.** Candidates per round, rounds, diagnosis depth, and a hard ceiling on model calls are all caller-controlled.
66
+
67
+ ## Real-world use cases
68
+
69
+ - **Production classification and extraction** (support-ticket triage, invoice fields, moderation): improve a labeling prompt on labeled data with measured, regression-safe edits.
70
+ - **RAG context templates**: tune how retrieved chunks are assembled into the prompt with the model held fixed, so the metric move is attributable to the text.
71
+ - **Agent and skill instructions**: optimize longer instruction files judged by an `LLMJudge` rubric when there is no gold answer.
72
+ - **Governance and audit**: ship a replayable, tamper-evident `RunRecord` for every prompt change; gate merges in CI with `recension check`, and hand reviewers a standalone HTML audit with `recension report`.
73
+
74
+ Full write-ups, plus a "how it works" walkthrough, are on the [documentation site](https://anthonynystrom.github.io/recension/).
75
+
76
+ ## Prior art, honestly
77
+
78
+ DSPy and GEPA own the optimization mechanics this library's `ReflectiveOptimizer` performs; if you want state-of-the-art prompt optimization algorithms, look there. `recension`'s contribution is the **measurement and governance shell** around a text artifact: versioned artifacts with provenance, leakage detection, the complete audit record, and budgeted update-time compute. That delegation is a real seam, not a promise: the `Proposer` protocol lets an external engine supply the candidate edits while recension keeps owning the artifact, the measurement, and the record (see the [Bring your own optimizer](https://anthonynystrom.github.io/recension/ecosystem/) guide).
79
+
80
+ ## Install
81
+
82
+ ```bash
83
+ pip install recension # core: zero provider dependencies
84
+ pip install "recension[anthropic]" # adds the Anthropic backend
85
+ ```
86
+
87
+ Python 3.12+. The core (and the whole test suite) runs against a deterministic `MockModel` with no API key and no network.
88
+
89
+ ## Quickstart
90
+
91
+ ```python
92
+ from recension import (
93
+ Budget, EvalSet, ExactMatch, MockModel, ReflectiveOptimizer, TextArtifact,
94
+ )
95
+
96
+ artifact = TextArtifact.from_text("Label the sentiment of the message.")
97
+
98
+ # Labeled examples, split into train (for diagnosis) and held-out validation (for
99
+ # acceptance). Load from a JSONL file with EvalSet.from_jsonl(path) instead.
100
+ evalset = EvalSet.from_records([
101
+ {"id": "t1", "input": "Absolutely love this", "expected": "positive", "split": "train"},
102
+ {"id": "t2", "input": "Broke after a day", "expected": "negative", "split": "train"},
103
+ {"id": "v1", "input": "Terrible support", "expected": "negative", "split": "validation"},
104
+ {"id": "v2", "input": "Exceeded expectations", "expected": "positive", "split": "validation"},
105
+ ])
106
+
107
+ optimizer = ReflectiveOptimizer(
108
+ artifact=artifact,
109
+ evalset=evalset,
110
+ objective=ExactMatch(),
111
+ model=MockModel(), # offline mock; see below for the real backend
112
+ budget=Budget(candidates_per_round=4, rounds=3, max_model_calls=200),
113
+ seed=7,
114
+ )
115
+ record = optimizer.run()
116
+
117
+ print(record.summary()) # baseline → accepted versions → final score
118
+ record.save("run_record.json") # the complete audit artifact
119
+ ```
120
+
121
+ To run against a real model, install the extra (`pip install "recension[anthropic]"`), set `ANTHROPIC_API_KEY` in your environment, and use the Anthropic backend:
122
+
123
+ ```python
124
+ from recension.models.anthropic import AnthropicModel # kept off the top-level import so the core needs no provider deps
125
+
126
+ optimizer = ReflectiveOptimizer(..., model=AnthropicModel())
127
+ ```
128
+
129
+ API keys are read from the environment only, never from code or config.
130
+
131
+ ## CLI
132
+
133
+ ```bash
134
+ recension run --config run.yaml # execute an optimization, write the record
135
+ recension show run_record.json # baseline, accepted diffs, score progression, integrity
136
+ recension diff run_record.json vA vB # diff between two artifact versions
137
+ recension check --config run.yaml --baseline run_record.json # CI guard: exit non-zero on regression
138
+ recension verify run_record.json # detect tampering (content-addressed version chain)
139
+ recension report run_record.json -o report.html # standalone HTML audit page
140
+ ```
141
+
142
+ A runnable, fully commented config and dataset live in [`examples/cli/`](examples/cli) (`recension run --config examples/cli/run.yaml`); the config schema and a GitHub Actions recipe for `recension check` are in the [CLI guide](https://anthonynystrom.github.io/recension/cli/).
143
+
144
+ ## Documentation
145
+
146
+ Full docs, API reference, and three worked examples (each reproducible offline against `MockModel`): **https://anthonynystrom.github.io/recension/**
147
+
148
+ ## License
149
+
150
+ MIT
@@ -0,0 +1,108 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/recension-logo-dark.svg">
4
+ <img src="docs/assets/recension-logo.svg" alt="recension" width="340">
5
+ </picture>
6
+ </p>
7
+
8
+ # recension
9
+
10
+ Measured optimization of the text layer around a language model (prompts, context templates, skill and instruction files) with the rigor normally reserved for weight training: a held-out objective, a baseline, versioned artifacts, and a complete audit trail.
11
+
12
+ The name comes from textual criticism. A *recension* is the revision of a text by collating variant readings and keeping the best-supported one. That is the loop this library runs: propose multiple candidate edits, test each against held-out evidence, commit only what measurably improves, and record why.
13
+
14
+ ## Why
15
+
16
+ The usual way to improve a prompt is to edit it, eyeball a few outputs, and ship. There is no held-out measurement, no record of why a change was made, and no defense against overfitting to the handful of cases you inspected. `recension` replaces that loop with a measured one:
17
+
18
+ - **No edit is accepted without a held-out score that beats the incumbent.** Failures are diagnosed on a train split; acceptance happens only on a validation split, and can require the gain to be *statistically significant* rather than merely above a fixed threshold (`min_improvement`). An optional locked test split gives an unbiased final estimate.
19
+ - **Every accepted version carries provenance**: the failures that motivated it, the diagnosis, every sibling candidate considered (with scores), and the diff. A reviewer who didn't run the optimization can reconstruct every decision.
20
+ - **One number doesn't hide regressions.** Optional per-slice scores, non-regression guard objectives, and a token-cost ledger surface what an aggregate averages away.
21
+ - **Leakage is checked, not assumed away.** Heuristics flag candidates that embed validation content or show implausible validation gains.
22
+ - **Records are built to be acted on.** Gate a prompt in CI with `recension check`, detect tampering with `recension verify`, and share a standalone HTML audit with `recension report`.
23
+ - **Compute is a dial.** Candidates per round, rounds, diagnosis depth, and a hard ceiling on model calls are all caller-controlled.
24
+
25
+ ## Real-world use cases
26
+
27
+ - **Production classification and extraction** (support-ticket triage, invoice fields, moderation): improve a labeling prompt on labeled data with measured, regression-safe edits.
28
+ - **RAG context templates**: tune how retrieved chunks are assembled into the prompt with the model held fixed, so the metric move is attributable to the text.
29
+ - **Agent and skill instructions**: optimize longer instruction files judged by an `LLMJudge` rubric when there is no gold answer.
30
+ - **Governance and audit**: ship a replayable, tamper-evident `RunRecord` for every prompt change; gate merges in CI with `recension check`, and hand reviewers a standalone HTML audit with `recension report`.
31
+
32
+ Full write-ups, plus a "how it works" walkthrough, are on the [documentation site](https://anthonynystrom.github.io/recension/).
33
+
34
+ ## Prior art, honestly
35
+
36
+ DSPy and GEPA own the optimization mechanics this library's `ReflectiveOptimizer` performs; if you want state-of-the-art prompt optimization algorithms, look there. `recension`'s contribution is the **measurement and governance shell** around a text artifact: versioned artifacts with provenance, leakage detection, the complete audit record, and budgeted update-time compute. That delegation is a real seam, not a promise: the `Proposer` protocol lets an external engine supply the candidate edits while recension keeps owning the artifact, the measurement, and the record (see the [Bring your own optimizer](https://anthonynystrom.github.io/recension/ecosystem/) guide).
37
+
38
+ ## Install
39
+
40
+ ```bash
41
+ pip install recension # core: zero provider dependencies
42
+ pip install "recension[anthropic]" # adds the Anthropic backend
43
+ ```
44
+
45
+ Python 3.12+. The core (and the whole test suite) runs against a deterministic `MockModel` with no API key and no network.
46
+
47
+ ## Quickstart
48
+
49
+ ```python
50
+ from recension import (
51
+ Budget, EvalSet, ExactMatch, MockModel, ReflectiveOptimizer, TextArtifact,
52
+ )
53
+
54
+ artifact = TextArtifact.from_text("Label the sentiment of the message.")
55
+
56
+ # Labeled examples, split into train (for diagnosis) and held-out validation (for
57
+ # acceptance). Load from a JSONL file with EvalSet.from_jsonl(path) instead.
58
+ evalset = EvalSet.from_records([
59
+ {"id": "t1", "input": "Absolutely love this", "expected": "positive", "split": "train"},
60
+ {"id": "t2", "input": "Broke after a day", "expected": "negative", "split": "train"},
61
+ {"id": "v1", "input": "Terrible support", "expected": "negative", "split": "validation"},
62
+ {"id": "v2", "input": "Exceeded expectations", "expected": "positive", "split": "validation"},
63
+ ])
64
+
65
+ optimizer = ReflectiveOptimizer(
66
+ artifact=artifact,
67
+ evalset=evalset,
68
+ objective=ExactMatch(),
69
+ model=MockModel(), # offline mock; see below for the real backend
70
+ budget=Budget(candidates_per_round=4, rounds=3, max_model_calls=200),
71
+ seed=7,
72
+ )
73
+ record = optimizer.run()
74
+
75
+ print(record.summary()) # baseline → accepted versions → final score
76
+ record.save("run_record.json") # the complete audit artifact
77
+ ```
78
+
79
+ To run against a real model, install the extra (`pip install "recension[anthropic]"`), set `ANTHROPIC_API_KEY` in your environment, and use the Anthropic backend:
80
+
81
+ ```python
82
+ from recension.models.anthropic import AnthropicModel # kept off the top-level import so the core needs no provider deps
83
+
84
+ optimizer = ReflectiveOptimizer(..., model=AnthropicModel())
85
+ ```
86
+
87
+ API keys are read from the environment only, never from code or config.
88
+
89
+ ## CLI
90
+
91
+ ```bash
92
+ recension run --config run.yaml # execute an optimization, write the record
93
+ recension show run_record.json # baseline, accepted diffs, score progression, integrity
94
+ recension diff run_record.json vA vB # diff between two artifact versions
95
+ recension check --config run.yaml --baseline run_record.json # CI guard: exit non-zero on regression
96
+ recension verify run_record.json # detect tampering (content-addressed version chain)
97
+ recension report run_record.json -o report.html # standalone HTML audit page
98
+ ```
99
+
100
+ A runnable, fully commented config and dataset live in [`examples/cli/`](examples/cli) (`recension run --config examples/cli/run.yaml`); the config schema and a GitHub Actions recipe for `recension check` are in the [CLI guide](https://anthonynystrom.github.io/recension/cli/).
101
+
102
+ ## Documentation
103
+
104
+ Full docs, API reference, and three worked examples (each reproducible offline against `MockModel`): **https://anthonynystrom.github.io/recension/**
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,83 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "recension"
7
+ version = "0.5.0"
8
+ description = "Measured optimization of the text layer around a language model: versioned artifacts, held-out evaluation, leakage detection, and a complete audit record."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Anthony Nyström", email = "nystrom.anthony@gmail.com" }]
13
+ keywords = ["prompt-optimization", "llm", "evaluation", "provenance", "auditability"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: OS Independent",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Programming Language :: Python :: 3.14",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Typing :: Typed",
26
+ ]
27
+ # DESIGN NOTE: PyYAML is the single runtime dependency, required only because the
28
+ # PRD specifies YAML config files for the CLI. Everything else is stdlib.
29
+ dependencies = ["pyyaml>=6.0"]
30
+
31
+ [project.optional-dependencies]
32
+ anthropic = ["anthropic>=0.40"]
33
+ dev = [
34
+ "pytest>=8.0",
35
+ "mypy>=1.13",
36
+ "ruff>=0.8",
37
+ "build>=1.2",
38
+ "twine>=5.0",
39
+ "types-PyYAML",
40
+ ]
41
+ docs = [
42
+ "mkdocs>=1.6",
43
+ "mkdocs-material>=9.5",
44
+ "mkdocstrings[python]>=0.27",
45
+ ]
46
+ # DESIGN NOTE: Flask powers only the optional `examples/webapp` live demo; it is
47
+ # never imported by the library, so it stays out of the core and docs groups.
48
+ webapp = ["flask>=3.0"]
49
+
50
+ [project.scripts]
51
+ recension = "recension.cli:main"
52
+
53
+ [project.urls]
54
+ Homepage = "https://github.com/AnthonyNystrom/recension"
55
+ Documentation = "https://anthonynystrom.github.io/recension/"
56
+ Repository = "https://github.com/AnthonyNystrom/recension"
57
+ Changelog = "https://github.com/AnthonyNystrom/recension/blob/main/CHANGELOG.md"
58
+
59
+ [tool.hatch.build.targets.wheel]
60
+ packages = ["recension"]
61
+
62
+ [tool.hatch.build.targets.sdist]
63
+ include = ["recension", "tests", "README.md", "CHANGELOG.md", "LICENSE"]
64
+
65
+ [tool.ruff]
66
+ line-length = 100
67
+ target-version = "py312"
68
+
69
+ [tool.ruff.lint]
70
+ select = ["E", "F", "W", "I", "UP", "B"]
71
+
72
+ [tool.mypy]
73
+ strict = true
74
+ python_version = "3.12"
75
+ files = ["recension", "tests"]
76
+
77
+ [[tool.mypy.overrides]]
78
+ module = "anthropic.*"
79
+ ignore_missing_imports = true
80
+
81
+ [tool.pytest.ini_options]
82
+ testpaths = ["tests"]
83
+ addopts = "-q"
@@ -0,0 +1,74 @@
1
+ """recension: measured optimization of the text layer around a language model.
2
+
3
+ Versioned artifacts with provenance, held-out evaluation, leakage detection,
4
+ and a complete audit record. See the README for the prior-art boundary:
5
+ optimization mechanics are well covered by DSPy and GEPA; recension's
6
+ contribution is the measurement-and-governance shell around a text artifact.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .artifact import Provenance, RejectedCandidate, TextArtifact, Version
12
+ from .budget import Budget
13
+ from .evalset import EvalSet, Example
14
+ from .exceptions import (
15
+ ArtifactError,
16
+ BudgetExceeded,
17
+ ConfigError,
18
+ DegenerateEvalError,
19
+ LeakageDetected,
20
+ RecensionError,
21
+ )
22
+ from .models import Message, MockModel, Model
23
+ from .objective import F1, ExactMatch, LLMJudge, MaxLength, Objective
24
+ from .optimizer import ReflectiveOptimizer, score_artifact
25
+ from .proposer import CallableProposer, DefaultProposer, FailureCase, Proposer
26
+ from .record import (
27
+ CandidateRecord,
28
+ GuardScore,
29
+ RoundRecord,
30
+ RunRecord,
31
+ SignificanceRecord,
32
+ SliceScore,
33
+ )
34
+ from .report import render_report
35
+
36
+ __version__ = "0.5.0"
37
+
38
+ __all__ = [
39
+ "ArtifactError",
40
+ "Budget",
41
+ "BudgetExceeded",
42
+ "CallableProposer",
43
+ "CandidateRecord",
44
+ "ConfigError",
45
+ "DefaultProposer",
46
+ "DegenerateEvalError",
47
+ "EvalSet",
48
+ "ExactMatch",
49
+ "Example",
50
+ "F1",
51
+ "FailureCase",
52
+ "GuardScore",
53
+ "LLMJudge",
54
+ "MaxLength",
55
+ "LeakageDetected",
56
+ "Message",
57
+ "MockModel",
58
+ "Model",
59
+ "Objective",
60
+ "Proposer",
61
+ "Provenance",
62
+ "RecensionError",
63
+ "ReflectiveOptimizer",
64
+ "RejectedCandidate",
65
+ "RoundRecord",
66
+ "RunRecord",
67
+ "SignificanceRecord",
68
+ "SliceScore",
69
+ "TextArtifact",
70
+ "render_report",
71
+ "score_artifact",
72
+ "Version",
73
+ "__version__",
74
+ ]