falsify 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {falsify-0.2.0 → falsify-0.3.0}/NOTICE +1 -1
  2. {falsify-0.2.0/falsify.egg-info → falsify-0.3.0}/PKG-INFO +67 -45
  3. {falsify-0.2.0 → falsify-0.3.0}/README.md +65 -43
  4. {falsify-0.2.0 → falsify-0.3.0/falsify.egg-info}/PKG-INFO +67 -45
  5. {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/SOURCES.txt +3 -1
  6. falsify-0.3.0/falsify.egg-info/entry_points.txt +3 -0
  7. {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/top_level.txt +1 -0
  8. falsify-0.3.0/falsify_prml.py +321 -0
  9. {falsify-0.2.0 → falsify-0.3.0}/pyproject.toml +5 -4
  10. falsify-0.2.0/tests/test_juju_sample.py → falsify-0.3.0/tests/test_calibration_sample.py +10 -10
  11. {falsify-0.2.0 → falsify-0.3.0}/tests/test_ci_workflow.py +2 -2
  12. {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_script.py +1 -1
  13. falsify-0.3.0/tests/test_prml_cli.py +73 -0
  14. {falsify-0.2.0 → falsify-0.3.0}/tests/test_pyproject.py +10 -6
  15. {falsify-0.2.0 → falsify-0.3.0}/tests/test_readme.py +4 -2
  16. falsify-0.2.0/falsify.egg-info/entry_points.txt +0 -2
  17. {falsify-0.2.0 → falsify-0.3.0}/LICENSE +0 -0
  18. {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/dependency_links.txt +0 -0
  19. {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/requires.txt +0 -0
  20. {falsify-0.2.0 → falsify-0.3.0}/falsify.py +0 -0
  21. {falsify-0.2.0 → falsify-0.3.0}/mcp_server/__init__.py +0 -0
  22. {falsify-0.2.0 → falsify-0.3.0}/mcp_server/__main__.py +0 -0
  23. {falsify-0.2.0 → falsify-0.3.0}/mcp_server/server.py +0 -0
  24. {falsify-0.2.0 → falsify-0.3.0}/setup.cfg +0 -0
  25. {falsify-0.2.0 → falsify-0.3.0}/tests/test_adversarial_doc.py +0 -0
  26. {falsify-0.2.0 → falsify-0.3.0}/tests/test_agent_claim_auditor.py +0 -0
  27. {falsify-0.2.0 → falsify-0.3.0}/tests/test_agent_verdict_refresher.py +0 -0
  28. {falsify-0.2.0 → falsify-0.3.0}/tests/test_architecture.py +0 -0
  29. {falsify-0.2.0 → falsify-0.3.0}/tests/test_bench.py +0 -0
  30. {falsify-0.2.0 → falsify-0.3.0}/tests/test_case_studies_doc.py +0 -0
  31. {falsify-0.2.0 → falsify-0.3.0}/tests/test_changelog.py +0 -0
  32. {falsify-0.2.0 → falsify-0.3.0}/tests/test_claude_md.py +0 -0
  33. {falsify-0.2.0 → falsify-0.3.0}/tests/test_code_of_conduct.py +0 -0
  34. {falsify-0.2.0 → falsify-0.3.0}/tests/test_comparison_doc.py +0 -0
  35. {falsify-0.2.0 → falsify-0.3.0}/tests/test_contributing.py +0 -0
  36. {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_script_doc.py +0 -0
  37. {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_shot_list.py +0 -0
  38. {falsify-0.2.0 → falsify-0.3.0}/tests/test_diff.py +0 -0
  39. {falsify-0.2.0 → falsify-0.3.0}/tests/test_docker.py +0 -0
  40. {falsify-0.2.0 → falsify-0.3.0}/tests/test_doctor.py +0 -0
  41. {falsify-0.2.0 → falsify-0.3.0}/tests/test_editorconfig.py +0 -0
  42. {falsify-0.2.0 → falsify-0.3.0}/tests/test_examples_doc.py +0 -0
  43. {falsify-0.2.0 → falsify-0.3.0}/tests/test_export.py +0 -0
  44. {falsify-0.2.0 → falsify-0.3.0}/tests/test_faq.py +0 -0
  45. {falsify-0.2.0 → falsify-0.3.0}/tests/test_github_repo_maturity.py +0 -0
  46. {falsify-0.2.0 → falsify-0.3.0}/tests/test_github_templates.py +0 -0
  47. {falsify-0.2.0 → falsify-0.3.0}/tests/test_gitignore.py +0 -0
  48. {falsify-0.2.0 → falsify-0.3.0}/tests/test_glossary_doc.py +0 -0
  49. {falsify-0.2.0 → falsify-0.3.0}/tests/test_guard.py +0 -0
  50. {falsify-0.2.0 → falsify-0.3.0}/tests/test_hook_install.py +0 -0
  51. {falsify-0.2.0 → falsify-0.3.0}/tests/test_init.py +0 -0
  52. {falsify-0.2.0 → falsify-0.3.0}/tests/test_init_templates.py +0 -0
  53. {falsify-0.2.0 → falsify-0.3.0}/tests/test_integration_e2e.py +0 -0
  54. {falsify-0.2.0 → falsify-0.3.0}/tests/test_list.py +0 -0
  55. {falsify-0.2.0 → falsify-0.3.0}/tests/test_lock.py +0 -0
  56. {falsify-0.2.0 → falsify-0.3.0}/tests/test_makefile.py +0 -0
  57. {falsify-0.2.0 → falsify-0.3.0}/tests/test_managed_agents.py +0 -0
  58. {falsify-0.2.0 → falsify-0.3.0}/tests/test_mcp.py +0 -0
  59. {falsify-0.2.0 → falsify-0.3.0}/tests/test_mcp_server.py +0 -0
  60. {falsify-0.2.0 → falsify-0.3.0}/tests/test_pitch.py +0 -0
  61. {falsify-0.2.0 → falsify-0.3.0}/tests/test_pre_commit.py +0 -0
  62. {falsify-0.2.0 → falsify-0.3.0}/tests/test_prml_v02_candidates.py +0 -0
  63. {falsify-0.2.0 → falsify-0.3.0}/tests/test_prml_vectors.py +0 -0
  64. {falsify-0.2.0 → falsify-0.3.0}/tests/test_release_check.py +0 -0
  65. {falsify-0.2.0 → falsify-0.3.0}/tests/test_release_workflow.py +0 -0
  66. {falsify-0.2.0 → falsify-0.3.0}/tests/test_replay.py +0 -0
  67. {falsify-0.2.0 → falsify-0.3.0}/tests/test_roadmap.py +0 -0
  68. {falsify-0.2.0 → falsify-0.3.0}/tests/test_run.py +0 -0
  69. {falsify-0.2.0 → falsify-0.3.0}/tests/test_score.py +0 -0
  70. {falsify-0.2.0 → falsify-0.3.0}/tests/test_self_dogfood.py +0 -0
  71. {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_author.py +0 -0
  72. {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_ci_doctor.py +0 -0
  73. {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_claim_audit.py +0 -0
  74. {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_claim_review.py +0 -0
  75. {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_falsify.py +0 -0
  76. {falsify-0.2.0 → falsify-0.3.0}/tests/test_slash_commands.py +0 -0
  77. {falsify-0.2.0 → falsify-0.3.0}/tests/test_stats.py +0 -0
  78. {falsify-0.2.0 → falsify-0.3.0}/tests/test_stats_html.py +0 -0
  79. {falsify-0.2.0 → falsify-0.3.0}/tests/test_submission.py +0 -0
  80. {falsify-0.2.0 → falsify-0.3.0}/tests/test_submission_md.py +0 -0
  81. {falsify-0.2.0 → falsify-0.3.0}/tests/test_trend.py +0 -0
  82. {falsify-0.2.0 → falsify-0.3.0}/tests/test_tutorial.py +0 -0
  83. {falsify-0.2.0 → falsify-0.3.0}/tests/test_verdict.py +0 -0
  84. {falsify-0.2.0 → falsify-0.3.0}/tests/test_verify.py +0 -0
  85. {falsify-0.2.0 → falsify-0.3.0}/tests/test_version.py +0 -0
  86. {falsify-0.2.0 → falsify-0.3.0}/tests/test_why.py +0 -0
@@ -32,7 +32,7 @@ Teams deploying falsify in production as part of a commercial service
32
32
  are encouraged — but not required by the MIT License — to contact the
33
33
  author about support, SLAs, and enterprise features:
34
34
 
35
- hello@studio-11.co
35
+ hello@falsify.dev
36
36
 
37
37
  See docs/COMMERCIAL.md for details.
38
38
 
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: falsify
3
- Version: 0.2.0
4
- Summary: Pre-registration and CI for AI-agent claims — deterministic PASS or FAIL.
3
+ Version: 0.3.0
4
+ Summary: PRML reference CLI — pre-register an ML evaluation claim as a SHA-256 manifest; verify PASS/FAIL/TAMPERED.
5
5
  Author: Cüneyt Öztürk
6
6
  License: MIT
7
7
  Project-URL: Homepage, https://falsify.dev
@@ -34,21 +34,25 @@ Dynamic: license-file
34
34
 
35
35
  **ML evaluation claims should be locked before the experiment runs, not reported after.**
36
36
 
37
- `falsify` commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
37
+ PRML commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
38
38
 
39
39
  ```bash
40
- $ falsify lock claim.yaml
41
- locked: sha256:a3f9...c821
40
+ $ pip install falsify
41
+ $ falsify lock claim.prml.yaml
42
+ locked: claim.prml.yaml
43
+ sha256: c30dba8e0f566d1beebf4f8d468e6e07c821f0c72562dfb64ddf6596796f7797
42
44
 
43
- $ falsify verdict claim.yaml
44
- PASS accuracy 0.934 >= 0.90 (hash verified)
45
+ $ falsify verify claim.prml.yaml --observed 0.934
46
+ PASS metric=accuracy observed=0.934 >= threshold=0.9
45
47
 
46
- # tampered:
47
- $ falsify verdict claim.yaml
48
- TAMPERED sha256 mismatch — spec modified after locking (exit 3)
48
+ # spec edited after locking → hash no longer matches:
49
+ $ falsify verify claim.prml.yaml --observed 0.934
50
+ TAMPERED (exit 3)
49
51
  ```
50
52
 
51
- 4 reference implementations — Python, JavaScript, Go, Rust — byte-equivalent on the 12 v0.1 conformance vectors (8 v0.2 candidates ship alongside, full 20-vector parity targeted for v0.2 freeze 2026-05-22). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
53
+ No install? Verify any manifest in-browser at [registry.falsify.dev](https://registry.falsify.dev). Byte-equivalent reference CLIs also ship for JS (`npm i -g falsify-js`), Go, and Rust.
54
+
55
+ 4 reference implementations (Python, JavaScript, Go, Rust) byte-equivalent on all 20 conformance vectors (12 v0.1 stable + 8 v0.2). PRML v0.2 frozen 2026-05-22. The same day, Lock #2 (a public hypothesis on the spec's own distribution, target ≥3 external contributors in 14 days) resolved at 0/3. The mechanism worked, the post-mortem is at [falsify.dev/notes/lock-2-postmortem](https://falsify.dev/notes/lock-2-postmortem/). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
52
56
 
53
57
  > **Pre-registration + CI for AI-agent claims.** Lock the claim and threshold with SHA-256 *before* running the experiment — or the result doesn't count.
54
58
 
@@ -81,10 +85,6 @@ TAMPERED sha256 mismatch — spec modified after locking (exit 3)
81
85
 
82
86
  ---
83
87
 
84
- > **Latest — 2026-05-14** · v0.1 published on Zenodo: citable DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema [merged into SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (2026-05-11) by Mads Kristensen (Microsoft) — `.prml.yaml` files now autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. [OECD.AI Catalogue submission](https://oecd.ai/en/catalogue/tools) filed, vetting in progress. NIST AI 800-2 late comment archived. JTC 21 routed via Dr. Sebastian Hallensleben. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`. **v0.1.4 released** ([release notes](https://github.com/studio-11-co/falsify/releases/tag/v0.1.4) · `pip install falsify==0.1.4`). PRML v0.1 specification published with **four reference implementations** (Python · [JavaScript](impl/js/) · [Go](impl/go/) · [Rust](impl/rust/)) all reproducing the [12 v0.1 vectors](spec/test-vectors/v0.1/) and [8 v0.2 candidate vectors](spec/test-vectors/v0.2/) byte-for-byte (20 vectors total). [14-page arXiv preprint](spec/paper/) and [v0.2 RFC](https://spec.falsify.dev/v0.2-rfc) (freeze 2026-05-22) open for public review.
85
-
86
- ---
87
-
88
88
  ## The problem
89
89
 
90
90
  Your team claims the model hits **94% accuracy**. You ship it. Three weeks later a customer proves the real number is **71%**.
@@ -97,9 +97,9 @@ PRML does not prove an ML result is true. It proves that a specific evaluation c
97
97
 
98
98
  **Falsify fixes this with a single idea from science:** you must pre-register the claim *before* you run the experiment. If you change the spec after seeing the data, the hash changes, the audit trail breaks, and CI fails with exit code 3.
99
99
 
100
- $ falsify lock accuracy_claim # SHA-256 the spec
101
- $ falsify run accuracy_claim # reproducible experiment
102
- $ falsify verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
100
+ $ falsify-engine lock accuracy_claim # SHA-256 the spec
101
+ $ falsify-engine run accuracy_claim # reproducible experiment
102
+ $ falsify-engine verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
103
103
 
104
104
  Deterministic exit codes are the API. CI gates on them. Humans read the audit trail. The claim either survives contact with the data or it doesn't.
105
105
 
@@ -128,14 +128,14 @@ See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete adoption sto
128
128
 
129
129
  ---
130
130
 
131
- **Current version:** 0.1.0 — run `python3 falsify.py --version`.
131
+ **Current version:** falsify 0.3.0 (PRML CLI) · falsify-engine 0.2.0 — `falsify --version`.
132
132
  **Working with Claude Code?** See [CLAUDE.md](CLAUDE.md).
133
133
 
134
134
  ---
135
135
 
136
136
  ## Specification artifacts
137
137
 
138
- Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, and adjacent documents live under `spec/`:
138
+ This repository is the home of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, reference implementations (`impl/`, JS/Go/Rust + a Python reference target), and adjacent documents live under `spec/`:
139
139
 
140
140
  - **[`spec/PRML-v0.1.md`](spec/PRML-v0.1.md)** — the spec (RFC-style, CC BY 4.0)
141
141
  - **[`spec/test-vectors/v0.1/`](spec/test-vectors/v0.1/)** — 12 conformance vectors with locked SHA-256 digests
@@ -153,6 +153,15 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
153
153
  - **[NIST AI RMF 1.0 crosswalk](https://spec.falsify.dev/nist-ai-rmf/)** — GOVERN / MAP / MEASURE / MANAGE subcategory map (incl. AI 600-1 GenAI Profile)
154
154
  - **[ISO/IEC 42001:2023 crosswalk](https://spec.falsify.dev/iso-42001/)** — AIMS clause-by-clause evidence map (Clauses 7-9 + Annex A controls)
155
155
 
156
+ **Long-form working notes** (2026-05-23, written for compliance leads, AI governance officers, and notified body assessors preparing for the 2 August 2026 deadline; CC BY 4.0):
157
+
158
+ - **[EU AI Act readiness assessment](https://falsify.dev/eu-ai-act-readiness/)** — six binding articles, ten-question gap check, evidence shape per obligation
159
+ - **[2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/)** — three application dates, Article 99 penalty structure, ten-week plan
160
+ - **[Article 12 logging checklist](https://falsify.dev/article-12-checklist/)** — ten closeable questions, six event categories, printable single-page summary
161
+ - **[Notified body evidence](https://falsify.dev/notified-body-evidence/)** — Annex VI vs Annex VII conformity assessment, six artefact families
162
+ - **[ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/)** — seven clauses, EU AI Act Article 17 overlap, twelve-month certification path
163
+ - **[Lock #2 post-mortem](https://falsify.dev/notes/lock-2-postmortem/)** — field report on running a falsifiable spec in public
164
+
156
165
  **Reference implementations** (four languages, 12 v0.1 + 8 v0.2 candidate vectors = 20 total; multi-lang CI runs all 20 byte-for-byte per push and daily at 04:00 UTC):
157
166
 
158
167
  - **Python:** [`falsify.py`](falsify.py) — original reference, uses PyYAML
@@ -160,7 +169,7 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
160
169
  - **Go:** [`impl/go/`](impl/go/) — third reference, ~450 LOC, hand-rolled, stdlib only
161
170
  - **Rust:** [`impl/rust/`](impl/rust/) — fourth reference, ~600 LOC, hand-rolled, two deps (`serde_json`, `sha2`)
162
171
 
163
- Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@studio-11.co`.
172
+ Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@falsify.dev`.
164
173
 
165
174
  **Companion projects** (separate repos under `studio-11-co`, each MIT or CC0 licensed):
166
175
 
@@ -216,6 +225,8 @@ is at <https://falsify.dev>, and the project page is at
216
225
 
217
226
  Requires Python **3.11+**.
218
227
 
228
+ > **Two commands, one install.** `falsify` is the **PRML manifest CLI** — `lock` / `verify` / `hash` / `init` / `test-vectors` on a `*.prml.yaml` manifest (shown at the top). `falsify-engine` is the separate **pre-registration workflow engine** — the `init` → `lock` → `run` → `verdict` / `guard` loop over `.falsify/<name>/` specs. The workflow sections further down use `falsify-engine`; substitute it for `falsify` there. No install needed to verify a manifest: paste it at [registry.falsify.dev](https://registry.falsify.dev).
229
+
219
230
  ### Development install (from the repo)
220
231
 
221
232
  ```bash
@@ -257,16 +268,25 @@ exported hooks and how this repo eats its own dog food.
257
268
  ## Quickstart
258
269
 
259
270
  ```bash
260
- ./demo.sh # auto-narrated: PASS → tamper → FAIL → guard block
271
+ # The falsify PRML CLI — lock a manifest, run your eval, verify.
272
+ falsify init accuracy.prml.yaml # writes a skeleton manifest
273
+ # edit accuracy.prml.yaml: metric, comparator, threshold, dataset.hash, seed, producer
274
+ falsify lock accuracy.prml.yaml # canonicalize + SHA-256 + write sidecar
275
+ # ... run your eval, get the observed value ...
276
+ falsify verify accuracy.prml.yaml --observed 0.934
277
+ # PASS (exit 0) · FAIL below threshold (exit 10) · TAMPERED if the spec changed (exit 3)
278
+ ```
279
+
280
+ The pre-registration **workflow engine** (claim/falsification specs, `init` → `lock` → `run` → `verdict` → `guard` over `.falsify/<name>/`) ships in the same install as the `falsify-engine` command:
261
281
 
262
- # Either form works — `falsify` is the installed entry point,
263
- # `python3 falsify.py` is the uninstalled fallback.
264
- falsify init my_claim
282
+ ```bash
283
+ ./demo.sh # auto-narrated engine demo (PASS → tamper → FAIL → guard)
284
+ falsify-engine init my_claim
265
285
  # edit .falsify/my_claim/spec.yaml to fill in the template
266
- falsify lock my_claim
267
- falsify run my_claim
268
- falsify verdict my_claim
269
- falsify hook install # enable the commit-msg guard
286
+ falsify-engine lock my_claim
287
+ falsify-engine run my_claim
288
+ falsify-engine verdict my_claim
289
+ falsify-engine hook install # enable the commit-msg guard
270
290
  ```
271
291
 
272
292
  Exit code `0` on PASS, `10` on FAIL. Everything else is documented
@@ -279,8 +299,8 @@ New to pre-registration? Walk through [TUTORIAL.md](TUTORIAL.md) — 15 minutes,
279
299
  ```bash
280
300
  falsify init --template accuracy
281
301
  falsify lock accuracy
282
- falsify run accuracy
283
- falsify verdict accuracy
302
+ falsify-engine run accuracy
303
+ falsify-engine verdict accuracy
284
304
  ```
285
305
 
286
306
  Five templates ship with a runnable spec + metric + dataset:
@@ -302,7 +322,7 @@ or the directory with `--dir`.
302
322
  make install # pip install pyyaml
303
323
  make test # run unittest suite
304
324
  make smoke # run tests/smoke_test.sh
305
- make demo # JUJU end-to-end (lock → run → verdict)
325
+ make demo # calibration end-to-end (lock → run → verdict)
306
326
  ```
307
327
 
308
328
  See [Makefile](Makefile) for all targets (`make help`).
@@ -314,18 +334,18 @@ Feature matrix vs adjacent tools: [docs/COMPARISON.md](docs/COMPARISON.md).
314
334
 
315
335
  ### Explain any claim
316
336
 
317
- `falsify why <name>` is the human-friendly companion to `verdict`
337
+ `falsify-engine why <name>` is the human-friendly companion to `verdict`
318
338
  — it always exits `0` and tells you exactly what the next honest
319
339
  move is:
320
340
 
321
341
  ```
322
- claim: juju
342
+ claim: calibration
323
343
  state: STALE
324
344
  reasoning: the spec has been edited (sha256:1038219d75a8) but no run
325
345
  exists against this hash. Last run was against sha256:164f619d4860.
326
346
  locked: yes (sha256:164f619d4860, 2h ago)
327
347
  last run: 2026-04-22T02:10:17+00:00 (2h ago)
328
- next action: `falsify run <name>` to produce a fresh verdict against
348
+ next action: `falsify-engine run <name>` to produce a fresh verdict against
329
349
  the current spec.
330
350
  ```
331
351
 
@@ -334,13 +354,13 @@ and the last five runs.
334
354
 
335
355
  ### Spot drift with a sparkline
336
356
 
337
- `falsify trend <name>` draws an ASCII sparkline of the metric
357
+ `falsify-engine trend <name>` draws an ASCII sparkline of the metric
338
358
  across its recorded runs, marks the threshold line, and classifies
339
359
  the trajectory as **improving**, **degrading**, **flat**, or
340
360
  **mixed**.
341
361
 
342
362
  ```
343
- claim: juju
363
+ claim: calibration
344
364
  threshold: 0.25 (direction: below)
345
365
  runs: 20 shown (of 20)
346
366
 
@@ -362,14 +382,14 @@ trend: degrading
362
382
 
363
383
  ### Measure the CLI itself
364
384
 
365
- `falsify bench` spawns each subcommand under a fresh temporary
385
+ `falsify-engine bench` spawns each subcommand under a fresh temporary
366
386
  directory and records per-command latency (min / median / p95 /
367
387
  max / mean / stddev). Useful as a sanity check before a release
368
388
  or when investigating a suspected startup-time regression.
369
389
 
370
390
  ```bash
371
- falsify bench --runs 5 --commands "--help,list,stats,score"
372
- falsify bench --runs 5 --json # machine-readable output
391
+ falsify-engine bench --runs 5 --commands "--help,list,stats,score"
392
+ falsify-engine bench --runs 5 --json # machine-readable output
373
393
  ```
374
394
 
375
395
  `--runs <N>` sets the timed-iteration count (default 5, capped at
@@ -427,7 +447,7 @@ compose the skills and CLI.
427
447
 
428
448
  **CI** (`.github/workflows/falsify.yml`) — on every push and PR,
429
449
  the workflow runs the unittest suite, `tests/smoke_test.sh`, the
430
- JUJU end-to-end (`lock` → `run` → `verdict`), a guard self-check,
450
+ calibration end-to-end (`lock` → `run` → `verdict`), a guard self-check,
431
451
  and a skill-lint pass over every SKILL.md and agent file.
432
452
 
433
453
  ## Demo
@@ -495,7 +515,7 @@ ln -sf "$(pwd)/hooks/commit-msg" .git/hooks/commit-msg
495
515
  - `hypothesis.schema.yaml` — spec schema (claim, falsification,
496
516
  experiment, environment, artifacts).
497
517
  - `examples/hello_claim/` — tiny smoke-test fixture.
498
- - `examples/juju_sample/` — anonymized 20-row prediction ledger
518
+ - `examples/calibration_sample/` — anonymized 20-row prediction ledger
499
519
  for the Brier score demo.
500
520
  - `hooks/commit-msg` — the guard hook.
501
521
  - `tests/` — `unittest` suite plus `smoke_test.sh` end-to-end driver.
@@ -519,14 +539,16 @@ Run `make dogfood` to re-verify. CI runs these on every PR.
519
539
 
520
540
  See [CHANGELOG.md](CHANGELOG.md) for release history.
521
541
 
542
+ > **Latest — 2026-05-23** · **PRML v0.2 frozen** with all 20 conformance vectors (12 v0.1 stable + 8 v0.2) passing byte-for-byte across the four reference implementations. Lock #2 (public hypothesis on spec's own distribution) resolved at **0/3 external contributors**, mechanism worked, [post-mortem published](https://falsify.dev/notes/lock-2-postmortem/). **`mlflow-falsify` v0.2.0** shipped with `MLFLOW_FALSIFY_TAG_SCOPE=experiment` for HPO sweeps; [MLflow community plugin showcase PR](https://github.com/mlflow/mlflow/pull/23569) is live and under review. Five long-form working notes published for EU AI Act readiness: [readiness assessment](https://falsify.dev/eu-ai-act-readiness/), [2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/), [Article 12 ten-item checklist](https://falsify.dev/article-12-checklist/), [notified body evidence](https://falsify.dev/notified-body-evidence/), [ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/). DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema in [SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (Mads Kristensen / Microsoft) — `.prml.yaml` files autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`.
543
+
522
544
  ## Roadmap
523
545
 
524
546
  Two roadmaps run alongside each other:
525
547
 
526
- - **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. CLI v0.2 targeted 2026-06-15.
527
- - **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. Spec v0.2 freeze 2026-05-22.
548
+ - **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. **CLI v0.2.0 shipped 2026-05-22.** v0.3 features tracked alongside the v0.3 spec backlog.
549
+ - **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. **Spec v0.2 frozen 2026-05-22.** v0.3 design backlog open under [`spec/v0.3-backlog/`](spec/v0.3-backlog/) (claim trees, suite manifests, selective-disclosure resistance via `leaves_total`).
528
550
 
529
- The CLI is downstream of the spec: when spec v0.2 freezes, CLI v0.2 follows about three weeks later. CLI v0.3 is loosely scoped for Q4 2026.
551
+ The CLI is downstream of the spec: spec v0.2 frozen 2026-05-22, CLI v0.2.0 shipped to PyPI the same week. CLI v0.3 is loosely scoped for Q4 2026, tracking the v0.3 spec backlog.
530
552
 
531
553
  ## Trust model
532
554
 
@@ -2,21 +2,25 @@
2
2
 
3
3
  **ML evaluation claims should be locked before the experiment runs, not reported after.**
4
4
 
5
- `falsify` commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
5
+ PRML commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
6
6
 
7
7
  ```bash
8
- $ falsify lock claim.yaml
9
- locked: sha256:a3f9...c821
8
+ $ pip install falsify
9
+ $ falsify lock claim.prml.yaml
10
+ locked: claim.prml.yaml
11
+ sha256: c30dba8e0f566d1beebf4f8d468e6e07c821f0c72562dfb64ddf6596796f7797
10
12
 
11
- $ falsify verdict claim.yaml
12
- PASS accuracy 0.934 >= 0.90 (hash verified)
13
+ $ falsify verify claim.prml.yaml --observed 0.934
14
+ PASS metric=accuracy observed=0.934 >= threshold=0.9
13
15
 
14
- # tampered:
15
- $ falsify verdict claim.yaml
16
- TAMPERED sha256 mismatch — spec modified after locking (exit 3)
16
+ # spec edited after locking → hash no longer matches:
17
+ $ falsify verify claim.prml.yaml --observed 0.934
18
+ TAMPERED (exit 3)
17
19
  ```
18
20
 
19
- 4 reference implementations — Python, JavaScript, Go, Rust — byte-equivalent on the 12 v0.1 conformance vectors (8 v0.2 candidates ship alongside, full 20-vector parity targeted for v0.2 freeze 2026-05-22). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
21
+ No install? Verify any manifest in-browser at [registry.falsify.dev](https://registry.falsify.dev). Byte-equivalent reference CLIs also ship for JS (`npm i -g falsify-js`), Go, and Rust.
22
+
23
+ 4 reference implementations (Python, JavaScript, Go, Rust) byte-equivalent on all 20 conformance vectors (12 v0.1 stable + 8 v0.2). PRML v0.2 frozen 2026-05-22. The same day, Lock #2 (a public hypothesis on the spec's own distribution, target ≥3 external contributors in 14 days) resolved at 0/3. The mechanism worked, the post-mortem is at [falsify.dev/notes/lock-2-postmortem](https://falsify.dev/notes/lock-2-postmortem/). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
20
24
 
21
25
  > **Pre-registration + CI for AI-agent claims.** Lock the claim and threshold with SHA-256 *before* running the experiment — or the result doesn't count.
22
26
 
@@ -49,10 +53,6 @@ TAMPERED sha256 mismatch — spec modified after locking (exit 3)
49
53
 
50
54
  ---
51
55
 
52
- > **Latest — 2026-05-14** · v0.1 published on Zenodo: citable DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema [merged into SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (2026-05-11) by Mads Kristensen (Microsoft) — `.prml.yaml` files now autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. [OECD.AI Catalogue submission](https://oecd.ai/en/catalogue/tools) filed, vetting in progress. NIST AI 800-2 late comment archived. JTC 21 routed via Dr. Sebastian Hallensleben. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`. **v0.1.4 released** ([release notes](https://github.com/studio-11-co/falsify/releases/tag/v0.1.4) · `pip install falsify==0.1.4`). PRML v0.1 specification published with **four reference implementations** (Python · [JavaScript](impl/js/) · [Go](impl/go/) · [Rust](impl/rust/)) all reproducing the [12 v0.1 vectors](spec/test-vectors/v0.1/) and [8 v0.2 candidate vectors](spec/test-vectors/v0.2/) byte-for-byte (20 vectors total). [14-page arXiv preprint](spec/paper/) and [v0.2 RFC](https://spec.falsify.dev/v0.2-rfc) (freeze 2026-05-22) open for public review.
53
-
54
- ---
55
-
56
56
  ## The problem
57
57
 
58
58
  Your team claims the model hits **94% accuracy**. You ship it. Three weeks later a customer proves the real number is **71%**.
@@ -65,9 +65,9 @@ PRML does not prove an ML result is true. It proves that a specific evaluation c
65
65
 
66
66
  **Falsify fixes this with a single idea from science:** you must pre-register the claim *before* you run the experiment. If you change the spec after seeing the data, the hash changes, the audit trail breaks, and CI fails with exit code 3.
67
67
 
68
- $ falsify lock accuracy_claim # SHA-256 the spec
69
- $ falsify run accuracy_claim # reproducible experiment
70
- $ falsify verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
68
+ $ falsify-engine lock accuracy_claim # SHA-256 the spec
69
+ $ falsify-engine run accuracy_claim # reproducible experiment
70
+ $ falsify-engine verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
71
71
 
72
72
  Deterministic exit codes are the API. CI gates on them. Humans read the audit trail. The claim either survives contact with the data or it doesn't.
73
73
 
@@ -96,14 +96,14 @@ See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete adoption sto
96
96
 
97
97
  ---
98
98
 
99
- **Current version:** 0.1.0 — run `python3 falsify.py --version`.
99
+ **Current version:** falsify 0.3.0 (PRML CLI) · falsify-engine 0.2.0 — `falsify --version`.
100
100
  **Working with Claude Code?** See [CLAUDE.md](CLAUDE.md).
101
101
 
102
102
  ---
103
103
 
104
104
  ## Specification artifacts
105
105
 
106
- Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, and adjacent documents live under `spec/`:
106
+ This repository is the home of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, reference implementations (`impl/`, JS/Go/Rust + a Python reference target), and adjacent documents live under `spec/`:
107
107
 
108
108
  - **[`spec/PRML-v0.1.md`](spec/PRML-v0.1.md)** — the spec (RFC-style, CC BY 4.0)
109
109
  - **[`spec/test-vectors/v0.1/`](spec/test-vectors/v0.1/)** — 12 conformance vectors with locked SHA-256 digests
@@ -121,6 +121,15 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
121
121
  - **[NIST AI RMF 1.0 crosswalk](https://spec.falsify.dev/nist-ai-rmf/)** — GOVERN / MAP / MEASURE / MANAGE subcategory map (incl. AI 600-1 GenAI Profile)
122
122
  - **[ISO/IEC 42001:2023 crosswalk](https://spec.falsify.dev/iso-42001/)** — AIMS clause-by-clause evidence map (Clauses 7-9 + Annex A controls)
123
123
 
124
+ **Long-form working notes** (2026-05-23, written for compliance leads, AI governance officers, and notified body assessors preparing for the 2 August 2026 deadline; CC BY 4.0):
125
+
126
+ - **[EU AI Act readiness assessment](https://falsify.dev/eu-ai-act-readiness/)** — six binding articles, ten-question gap check, evidence shape per obligation
127
+ - **[2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/)** — three application dates, Article 99 penalty structure, ten-week plan
128
+ - **[Article 12 logging checklist](https://falsify.dev/article-12-checklist/)** — ten closeable questions, six event categories, printable single-page summary
129
+ - **[Notified body evidence](https://falsify.dev/notified-body-evidence/)** — Annex VI vs Annex VII conformity assessment, six artefact families
130
+ - **[ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/)** — seven clauses, EU AI Act Article 17 overlap, twelve-month certification path
131
+ - **[Lock #2 post-mortem](https://falsify.dev/notes/lock-2-postmortem/)** — field report on running a falsifiable spec in public
132
+
124
133
  **Reference implementations** (four languages, 12 v0.1 + 8 v0.2 candidate vectors = 20 total; multi-lang CI runs all 20 byte-for-byte per push and daily at 04:00 UTC):
125
134
 
126
135
  - **Python:** [`falsify.py`](falsify.py) — original reference, uses PyYAML
@@ -128,7 +137,7 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
128
137
  - **Go:** [`impl/go/`](impl/go/) — third reference, ~450 LOC, hand-rolled, stdlib only
129
138
  - **Rust:** [`impl/rust/`](impl/rust/) — fourth reference, ~600 LOC, hand-rolled, two deps (`serde_json`, `sha2`)
130
139
 
131
- Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@studio-11.co`.
140
+ Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@falsify.dev`.
132
141
 
133
142
  **Companion projects** (separate repos under `studio-11-co`, each MIT or CC0 licensed):
134
143
 
@@ -184,6 +193,8 @@ is at <https://falsify.dev>, and the project page is at
184
193
 
185
194
  Requires Python **3.11+**.
186
195
 
196
+ > **Two commands, one install.** `falsify` is the **PRML manifest CLI** — `lock` / `verify` / `hash` / `init` / `test-vectors` on a `*.prml.yaml` manifest (shown at the top). `falsify-engine` is the separate **pre-registration workflow engine** — the `init` → `lock` → `run` → `verdict` / `guard` loop over `.falsify/<name>/` specs. The workflow sections further down are for `falsify-engine`; wherever an example there still says `falsify`, run `falsify-engine` instead. No install needed to verify a manifest: paste it at [registry.falsify.dev](https://registry.falsify.dev).
197
+
187
198
  ### Development install (from the repo)
188
199
 
189
200
  ```bash
@@ -225,16 +236,25 @@ exported hooks and how this repo eats its own dog food.
225
236
  ## Quickstart
226
237
 
227
238
  ```bash
228
- ./demo.sh # auto-narrated: PASS tamper FAIL guard block
239
+ # The falsify PRML CLI: lock a manifest, run your eval, verify.
240
+ falsify init accuracy.prml.yaml # writes a skeleton manifest
241
+ # edit accuracy.prml.yaml: metric, comparator, threshold, dataset.hash, seed, producer
242
+ falsify lock accuracy.prml.yaml # canonicalize + SHA-256 + write sidecar
243
+ # ... run your eval, get the observed value ...
244
+ falsify verify accuracy.prml.yaml --observed 0.934
245
+ # PASS (exit 0) · FAIL below threshold (exit 10) · TAMPERED if the spec changed (exit 3)
246
+ ```
247
+
248
+ The pre-registration **workflow engine** (claim/falsification specs, `init` → `lock` → `run` → `verdict` → `guard` over `.falsify/<name>/`) ships in the same install as the `falsify-engine` command:
229
249
 
230
- # Either form works — `falsify` is the installed entry point,
231
- # `python3 falsify.py` is the uninstalled fallback.
232
- falsify init my_claim
250
+ ```bash
251
+ ./demo.sh # auto-narrated engine demo (PASS → tamper → FAIL → guard)
252
+ falsify-engine init my_claim
233
253
  # edit .falsify/my_claim/spec.yaml to fill in the template
234
- falsify lock my_claim
235
- falsify run my_claim
236
- falsify verdict my_claim
237
- falsify hook install # enable the commit-msg guard
254
+ falsify-engine lock my_claim
255
+ falsify-engine run my_claim
256
+ falsify-engine verdict my_claim
257
+ falsify-engine hook install # enable the commit-msg guard
238
258
  ```
239
259
 
240
260
  Exit code `0` on PASS, `10` on FAIL. Everything else is documented
@@ -247,8 +267,8 @@ New to pre-registration? Walk through [TUTORIAL.md](TUTORIAL.md) — 15 minutes,
247
267
  ```bash
248
268
  falsify-engine init --template accuracy
249
269
  falsify-engine lock accuracy
250
- falsify run accuracy
251
- falsify verdict accuracy
270
+ falsify-engine run accuracy
271
+ falsify-engine verdict accuracy
252
272
  ```
253
273
 
254
274
  Five templates ship with a runnable spec + metric + dataset:
@@ -270,7 +290,7 @@ or the directory with `--dir`.
270
290
  make install # pip install pyyaml
271
291
  make test # run unittest suite
272
292
  make smoke # run tests/smoke_test.sh
273
- make demo # JUJU end-to-end (lock → run → verdict)
293
+ make demo # calibration end-to-end (lock → run → verdict)
274
294
  ```
275
295
 
276
296
  See [Makefile](Makefile) for all targets (`make help`).
@@ -282,18 +302,18 @@ Feature matrix vs adjacent tools: [docs/COMPARISON.md](docs/COMPARISON.md).
282
302
 
283
303
  ### Explain any claim
284
304
 
285
- `falsify why <name>` is the human-friendly companion to `verdict`
305
+ `falsify-engine why <name>` is the human-friendly companion to `verdict`
286
306
  — it always exits `0` and tells you exactly what the next honest
287
307
  move is:
288
308
 
289
309
  ```
290
- claim: juju
310
+ claim: calibration
291
311
  state: STALE
292
312
  reasoning: the spec has been edited (sha256:1038219d75a8) but no run
293
313
  exists against this hash. Last run was against sha256:164f619d4860.
294
314
  locked: yes (sha256:164f619d4860, 2h ago)
295
315
  last run: 2026-04-22T02:10:17+00:00 (2h ago)
296
- next action: `falsify run <name>` to produce a fresh verdict against
316
+ next action: `falsify-engine run <name>` to produce a fresh verdict against
297
317
  the current spec.
298
318
  ```
299
319
 
@@ -302,13 +322,13 @@ and the last five runs.
302
322
 
303
323
  ### Spot drift with a sparkline
304
324
 
305
- `falsify trend <name>` draws an ASCII sparkline of the metric
325
+ `falsify-engine trend <name>` draws an ASCII sparkline of the metric
306
326
  across its recorded runs, marks the threshold line, and classifies
307
327
  the trajectory as **improving**, **degrading**, **flat**, or
308
328
  **mixed**.
309
329
 
310
330
  ```
311
- claim: juju
331
+ claim: calibration
312
332
  threshold: 0.25 (direction: below)
313
333
  runs: 20 shown (of 20)
314
334
 
@@ -330,14 +350,14 @@ trend: degrading
330
350
 
331
351
  ### Measure the CLI itself
332
352
 
333
- `falsify bench` spawns each subcommand under a fresh temporary
353
+ `falsify-engine bench` spawns each subcommand under a fresh temporary
334
354
  directory and records per-command latency (min / median / p95 /
335
355
  max / mean / stddev). Useful as a sanity check before a release
336
356
  or when investigating a suspected startup-time regression.
337
357
 
338
358
  ```bash
339
- falsify bench --runs 5 --commands "--help,list,stats,score"
340
- falsify bench --runs 5 --json # machine-readable output
359
+ falsify-engine bench --runs 5 --commands "--help,list,stats,score"
360
+ falsify-engine bench --runs 5 --json # machine-readable output
341
361
  ```
342
362
 
343
363
  `--runs <N>` sets the timed-iteration count (default 5, capped at
@@ -395,7 +415,7 @@ compose the skills and CLI.
395
415
 
396
416
  **CI** (`.github/workflows/falsify.yml`) — on every push and PR,
397
417
  the workflow runs the unittest suite, `tests/smoke_test.sh`, the
398
- JUJU end-to-end (`lock` → `run` → `verdict`), a guard self-check,
418
+ calibration end-to-end (`lock` → `run` → `verdict`), a guard self-check,
399
419
  and a skill-lint pass over every SKILL.md and agent file.
400
420
 
401
421
  ## Demo
@@ -463,7 +483,7 @@ ln -sf "$(pwd)/hooks/commit-msg" .git/hooks/commit-msg
463
483
  - `hypothesis.schema.yaml` — spec schema (claim, falsification,
464
484
  experiment, environment, artifacts).
465
485
  - `examples/hello_claim/` — tiny smoke-test fixture.
466
- - `examples/juju_sample/` — anonymized 20-row prediction ledger
486
+ - `examples/calibration_sample/` — anonymized 20-row prediction ledger
467
487
  for the Brier score demo.
468
488
  - `hooks/commit-msg` — the guard hook.
469
489
  - `tests/` — `unittest` suite plus `smoke_test.sh` end-to-end driver.
@@ -487,14 +507,16 @@ Run `make dogfood` to re-verify. CI runs these on every PR.
487
507
 
488
508
  See [CHANGELOG.md](CHANGELOG.md) for release history.
489
509
 
510
+ > **Latest — 2026-05-23** · **PRML v0.2 frozen** with all 20 conformance vectors (12 v0.1 stable + 8 v0.2) passing byte-for-byte across the four reference implementations. Lock #2 (public hypothesis on the spec's own distribution) resolved at **0/3 external contributors**; the mechanism worked, and the [post-mortem is published](https://falsify.dev/notes/lock-2-postmortem/). **`mlflow-falsify` v0.2.0** shipped with `MLFLOW_FALSIFY_TAG_SCOPE=experiment` for HPO sweeps; [MLflow community plugin showcase PR](https://github.com/mlflow/mlflow/pull/23569) is live and under review. Five long-form working notes published for EU AI Act readiness: [readiness assessment](https://falsify.dev/eu-ai-act-readiness/), [2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/), [Article 12 ten-item checklist](https://falsify.dev/article-12-checklist/), [notified body evidence](https://falsify.dev/notified-body-evidence/), [ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/). DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema in [SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (Mads Kristensen / Microsoft) — `.prml.yaml` files autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`.
511
+
490
512
  ## Roadmap
491
513
 
492
514
  Two roadmaps run alongside each other:
493
515
 
494
- - **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. CLI v0.2 targeted 2026-06-15.
495
- - **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. Spec v0.2 freeze 2026-05-22.
516
+ - **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. **CLI v0.2.0 shipped 2026-05-22.** v0.3 features tracked alongside the v0.3 spec backlog.
517
+ - **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. **Spec v0.2 frozen 2026-05-22.** v0.3 design backlog open under [`spec/v0.3-backlog/`](spec/v0.3-backlog/) (claim trees, suite manifests, selective-disclosure resistance via `leaves_total`).
496
518
 
497
- The CLI is downstream of the spec: when spec v0.2 freezes, CLI v0.2 follows about three weeks later. CLI v0.3 is loosely scoped for Q4 2026.
519
+ The CLI is downstream of the spec: spec v0.2 was frozen on 2026-05-22, and CLI v0.2.0 shipped to PyPI the same week. CLI v0.3 is loosely scoped for Q4 2026, tracking the v0.3 spec backlog.
498
520
 
499
521
  ## Trust model
500
522