falsify 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {falsify-0.2.0 → falsify-0.3.0}/NOTICE +1 -1
- {falsify-0.2.0/falsify.egg-info → falsify-0.3.0}/PKG-INFO +67 -45
- {falsify-0.2.0 → falsify-0.3.0}/README.md +65 -43
- {falsify-0.2.0 → falsify-0.3.0/falsify.egg-info}/PKG-INFO +67 -45
- {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/SOURCES.txt +3 -1
- falsify-0.3.0/falsify.egg-info/entry_points.txt +3 -0
- {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/top_level.txt +1 -0
- falsify-0.3.0/falsify_prml.py +321 -0
- {falsify-0.2.0 → falsify-0.3.0}/pyproject.toml +5 -4
- falsify-0.2.0/tests/test_juju_sample.py → falsify-0.3.0/tests/test_calibration_sample.py +10 -10
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_ci_workflow.py +2 -2
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_script.py +1 -1
- falsify-0.3.0/tests/test_prml_cli.py +73 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_pyproject.py +10 -6
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_readme.py +4 -2
- falsify-0.2.0/falsify.egg-info/entry_points.txt +0 -2
- {falsify-0.2.0 → falsify-0.3.0}/LICENSE +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/dependency_links.txt +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/falsify.egg-info/requires.txt +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/falsify.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/mcp_server/__init__.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/mcp_server/__main__.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/mcp_server/server.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/setup.cfg +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_adversarial_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_agent_claim_auditor.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_agent_verdict_refresher.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_architecture.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_bench.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_case_studies_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_changelog.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_claude_md.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_code_of_conduct.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_comparison_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_contributing.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_script_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_demo_shot_list.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_diff.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_docker.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_doctor.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_editorconfig.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_examples_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_export.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_faq.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_github_repo_maturity.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_github_templates.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_gitignore.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_glossary_doc.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_guard.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_hook_install.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_init.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_init_templates.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_integration_e2e.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_list.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_lock.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_makefile.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_managed_agents.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_mcp.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_mcp_server.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_pitch.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_pre_commit.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_prml_v02_candidates.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_prml_vectors.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_release_check.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_release_workflow.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_replay.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_roadmap.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_run.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_score.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_self_dogfood.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_author.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_ci_doctor.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_claim_audit.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_claim_review.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_skill_falsify.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_slash_commands.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_stats.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_stats_html.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_submission.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_submission_md.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_trend.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_tutorial.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_verdict.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_verify.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_version.py +0 -0
- {falsify-0.2.0 → falsify-0.3.0}/tests/test_why.py +0 -0
|
@@ -32,7 +32,7 @@ Teams deploying falsify in production as part of a commercial service
|
|
|
32
32
|
are encouraged — but not required by the MIT License — to contact the
|
|
33
33
|
author about support, SLAs, and enterprise features:
|
|
34
34
|
|
|
35
|
-
hello@
|
|
35
|
+
hello@falsify.dev
|
|
36
36
|
|
|
37
37
|
See docs/COMMERCIAL.md for details.
|
|
38
38
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: falsify
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: PRML reference CLI — pre-register an ML evaluation claim as a SHA-256 manifest; verify PASS/FAIL/TAMPERED.
|
|
5
5
|
Author: Cüneyt Öztürk
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://falsify.dev
|
|
@@ -34,21 +34,25 @@ Dynamic: license-file
|
|
|
34
34
|
|
|
35
35
|
**ML evaluation claims should be locked before the experiment runs, not reported after.**
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
PRML commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
|
|
38
38
|
|
|
39
39
|
```bash
|
|
40
|
-
$
|
|
41
|
-
|
|
40
|
+
$ pip install falsify
|
|
41
|
+
$ falsify lock claim.prml.yaml
|
|
42
|
+
locked: claim.prml.yaml
|
|
43
|
+
sha256: c30dba8e0f566d1beebf4f8d468e6e07c821f0c72562dfb64ddf6596796f7797
|
|
42
44
|
|
|
43
|
-
$ falsify
|
|
44
|
-
PASS accuracy
|
|
45
|
+
$ falsify verify claim.prml.yaml --observed 0.934
|
|
46
|
+
PASS metric=accuracy observed=0.934 >= threshold=0.9
|
|
45
47
|
|
|
46
|
-
#
|
|
47
|
-
$ falsify
|
|
48
|
-
TAMPERED
|
|
48
|
+
# spec edited after locking → hash no longer matches:
|
|
49
|
+
$ falsify verify claim.prml.yaml --observed 0.934
|
|
50
|
+
TAMPERED (exit 3)
|
|
49
51
|
```
|
|
50
52
|
|
|
51
|
-
|
|
53
|
+
No install? Verify any manifest in-browser at [registry.falsify.dev](https://registry.falsify.dev). Byte-equivalent reference CLIs also ship for JS (`npm i -g falsify-js`), Go, and Rust.
|
|
54
|
+
|
|
55
|
+
4 reference implementations (Python, JavaScript, Go, Rust) byte-equivalent on all 20 conformance vectors (12 v0.1 stable + 8 v0.2). PRML v0.2 frozen 2026-05-22. The same day, Lock #2 (a public hypothesis on the spec's own distribution, target ≥3 external contributors in 14 days) resolved at 0/3. The mechanism worked; the post-mortem is at [falsify.dev/notes/lock-2-postmortem](https://falsify.dev/notes/lock-2-postmortem/). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
|
|
52
56
|
|
|
53
57
|
> **Pre-registration + CI for AI-agent claims.** Lock the claim and threshold with SHA-256 *before* running the experiment — or the result doesn't count.
|
|
54
58
|
|
|
@@ -81,10 +85,6 @@ TAMPERED sha256 mismatch — spec modified after locking (exit 3)
|
|
|
81
85
|
|
|
82
86
|
---
|
|
83
87
|
|
|
84
|
-
> **Latest — 2026-05-14** · v0.1 published on Zenodo: citable DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema [merged into SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (2026-05-11) by Mads Kristensen (Microsoft) — `.prml.yaml` files now autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. [OECD.AI Catalogue submission](https://oecd.ai/en/catalogue/tools) filed, vetting in progress. NIST AI 800-2 late comment archived. JTC 21 routed via Dr. Sebastian Hallensleben. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`. **v0.1.4 released** ([release notes](https://github.com/studio-11-co/falsify/releases/tag/v0.1.4) · `pip install falsify==0.1.4`). PRML v0.1 specification published with **four reference implementations** (Python · [JavaScript](impl/js/) · [Go](impl/go/) · [Rust](impl/rust/)) all reproducing the [12 v0.1 vectors](spec/test-vectors/v0.1/) and [8 v0.2 candidate vectors](spec/test-vectors/v0.2/) byte-for-byte (20 vectors total). [14-page arXiv preprint](spec/paper/) and [v0.2 RFC](https://spec.falsify.dev/v0.2-rfc) (freeze 2026-05-22) open for public review.
|
|
85
|
-
|
|
86
|
-
---
|
|
87
|
-
|
|
88
88
|
## The problem
|
|
89
89
|
|
|
90
90
|
Your team claims the model hits **94% accuracy**. You ship it. Three weeks later a customer proves the real number is **71%**.
|
|
@@ -97,9 +97,9 @@ PRML does not prove an ML result is true. It proves that a specific evaluation c
|
|
|
97
97
|
|
|
98
98
|
**Falsify fixes this with a single idea from science:** you must pre-register the claim *before* you run the experiment. If you change the spec after seeing the data, the hash changes, the audit trail breaks, and CI fails with exit code 3.
|
|
99
99
|
|
|
100
|
-
$ falsify lock accuracy_claim # SHA-256 the spec
|
|
101
|
-
$ falsify run accuracy_claim # reproducible experiment
|
|
102
|
-
$ falsify verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
|
|
100
|
+
$ falsify-engine lock accuracy_claim # SHA-256 the spec
|
|
101
|
+
$ falsify-engine run accuracy_claim # reproducible experiment
|
|
102
|
+
$ falsify-engine verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
|
|
103
103
|
|
|
104
104
|
Deterministic exit codes are the API. CI gates on them. Humans read the audit trail. The claim either survives contact with the data or it doesn't.
|
|
105
105
|
|
|
@@ -128,14 +128,14 @@ See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete adoption sto
|
|
|
128
128
|
|
|
129
129
|
---
|
|
130
130
|
|
|
131
|
-
**Current version:** 0.
|
|
131
|
+
**Current version:** falsify 0.3.0 (PRML CLI) · falsify-engine 0.2.0 — `falsify --version`.
|
|
132
132
|
**Working with Claude Code?** See [CLAUDE.md](CLAUDE.md).
|
|
133
133
|
|
|
134
134
|
---
|
|
135
135
|
|
|
136
136
|
## Specification artifacts
|
|
137
137
|
|
|
138
|
-
|
|
138
|
+
This repository is the home of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, reference implementations (`impl/`, JS/Go/Rust + a Python reference target), and adjacent documents live under `spec/`:
|
|
139
139
|
|
|
140
140
|
- **[`spec/PRML-v0.1.md`](spec/PRML-v0.1.md)** — the spec (RFC-style, CC BY 4.0)
|
|
141
141
|
- **[`spec/test-vectors/v0.1/`](spec/test-vectors/v0.1/)** — 12 conformance vectors with locked SHA-256 digests
|
|
@@ -153,6 +153,15 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
|
|
|
153
153
|
- **[NIST AI RMF 1.0 crosswalk](https://spec.falsify.dev/nist-ai-rmf/)** — GOVERN / MAP / MEASURE / MANAGE subcategory map (incl. AI 600-1 GenAI Profile)
|
|
154
154
|
- **[ISO/IEC 42001:2023 crosswalk](https://spec.falsify.dev/iso-42001/)** — AIMS clause-by-clause evidence map (Clauses 7-9 + Annex A controls)
|
|
155
155
|
|
|
156
|
+
**Long-form working notes** (2026-05-23, written for compliance leads, AI governance officers, and notified body assessors preparing for the 2 August 2026 deadline; CC BY 4.0):
|
|
157
|
+
|
|
158
|
+
- **[EU AI Act readiness assessment](https://falsify.dev/eu-ai-act-readiness/)** — six binding articles, ten-question gap check, evidence shape per obligation
|
|
159
|
+
- **[2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/)** — three application dates, Article 99 penalty structure, ten-week plan
|
|
160
|
+
- **[Article 12 logging checklist](https://falsify.dev/article-12-checklist/)** — ten closeable questions, six event categories, printable single-page summary
|
|
161
|
+
- **[Notified body evidence](https://falsify.dev/notified-body-evidence/)** — Annex VI vs Annex VII conformity assessment, six artefact families
|
|
162
|
+
- **[ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/)** — seven clauses, EU AI Act Article 17 overlap, twelve-month certification path
|
|
163
|
+
- **[Lock #2 post-mortem](https://falsify.dev/notes/lock-2-postmortem/)** — field report on running a falsifiable spec in public
|
|
164
|
+
|
|
156
165
|
**Reference implementations** (four languages, 12 v0.1 + 8 v0.2 candidate vectors = 20 total; multi-lang CI runs all 20 byte-for-byte per push and daily at 04:00 UTC):
|
|
157
166
|
|
|
158
167
|
- **Python:** [`falsify.py`](falsify.py) — original reference, uses PyYAML
|
|
@@ -160,7 +169,7 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
|
|
|
160
169
|
- **Go:** [`impl/go/`](impl/go/) — third reference, ~450 LOC, hand-rolled, stdlib only
|
|
161
170
|
- **Rust:** [`impl/rust/`](impl/rust/) — fourth reference, ~600 LOC, hand-rolled, two deps (`serde_json`, `sha2`)
|
|
162
171
|
|
|
163
|
-
Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@
|
|
172
|
+
Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@falsify.dev`.
|
|
164
173
|
|
|
165
174
|
**Companion projects** (separate repos under `studio-11-co`, each MIT or CC0 licensed):
|
|
166
175
|
|
|
@@ -216,6 +225,8 @@ is at <https://falsify.dev>, and the project page is at
|
|
|
216
225
|
|
|
217
226
|
Requires Python **3.11+**.
|
|
218
227
|
|
|
228
|
+
> **Two commands, one install.** `falsify` is the **PRML manifest CLI** — `lock` / `verify` / `hash` / `init` / `test-vectors` on a `*.prml.yaml` manifest (shown at the top). `falsify-engine` is the separate **pre-registration workflow engine** — the `init` → `lock` → `run` → `verdict` / `guard` loop over `.falsify/<name>/` specs. The workflow sections further down use `falsify-engine`; substitute it for `falsify` there. No install needed to verify a manifest: paste it at [registry.falsify.dev](https://registry.falsify.dev).
|
|
229
|
+
|
|
219
230
|
### Development install (from the repo)
|
|
220
231
|
|
|
221
232
|
```bash
|
|
@@ -257,16 +268,25 @@ exported hooks and how this repo eats its own dog food.
|
|
|
257
268
|
## Quickstart
|
|
258
269
|
|
|
259
270
|
```bash
|
|
260
|
-
|
|
271
|
+
# The falsify PRML CLI — lock a manifest, run your eval, verify.
|
|
272
|
+
falsify init accuracy.prml.yaml # writes a skeleton manifest
|
|
273
|
+
# edit accuracy.prml.yaml: metric, comparator, threshold, dataset.hash, seed, producer
|
|
274
|
+
falsify lock accuracy.prml.yaml # canonicalize + SHA-256 + write sidecar
|
|
275
|
+
# ... run your eval, get the observed value ...
|
|
276
|
+
falsify verify accuracy.prml.yaml --observed 0.934
|
|
277
|
+
# PASS (exit 0) · FAIL below threshold (exit 10) · TAMPERED if the spec changed (exit 3)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
The pre-registration **workflow engine** (claim/falsification specs, `init` → `lock` → `run` → `verdict` → `guard` over `.falsify/<name>/`) ships in the same install as the `falsify-engine` command:
|
|
261
281
|
|
|
262
|
-
|
|
263
|
-
#
|
|
264
|
-
falsify init my_claim
|
|
282
|
+
```bash
|
|
283
|
+
./demo.sh # auto-narrated engine demo (PASS → tamper → FAIL → guard)
|
|
284
|
+
falsify-engine init my_claim
|
|
265
285
|
# edit .falsify/my_claim/spec.yaml to fill in the template
|
|
266
|
-
falsify lock my_claim
|
|
267
|
-
falsify run my_claim
|
|
268
|
-
falsify verdict my_claim
|
|
269
|
-
falsify hook install # enable the commit-msg guard
|
|
286
|
+
falsify-engine lock my_claim
|
|
287
|
+
falsify-engine run my_claim
|
|
288
|
+
falsify-engine verdict my_claim
|
|
289
|
+
falsify-engine hook install # enable the commit-msg guard
|
|
270
290
|
```
|
|
271
291
|
|
|
272
292
|
Exit code `0` on PASS, `10` on FAIL. Everything else is documented
|
|
@@ -279,8 +299,8 @@ New to pre-registration? Walk through [TUTORIAL.md](TUTORIAL.md) — 15 minutes,
|
|
|
279
299
|
```bash
|
|
280
300
|
falsify init --template accuracy
|
|
281
301
|
falsify lock accuracy
|
|
282
|
-
falsify run accuracy
|
|
283
|
-
falsify verdict accuracy
|
|
302
|
+
falsify-engine run accuracy
|
|
303
|
+
falsify-engine verdict accuracy
|
|
284
304
|
```
|
|
285
305
|
|
|
286
306
|
Five templates ship with a runnable spec + metric + dataset:
|
|
@@ -302,7 +322,7 @@ or the directory with `--dir`.
|
|
|
302
322
|
make install # pip install pyyaml
|
|
303
323
|
make test # run unittest suite
|
|
304
324
|
make smoke # run tests/smoke_test.sh
|
|
305
|
-
make demo #
|
|
325
|
+
make demo # calibration end-to-end (lock → run → verdict)
|
|
306
326
|
```
|
|
307
327
|
|
|
308
328
|
See [Makefile](Makefile) for all targets (`make help`).
|
|
@@ -314,18 +334,18 @@ Feature matrix vs adjacent tools: [docs/COMPARISON.md](docs/COMPARISON.md).
|
|
|
314
334
|
|
|
315
335
|
### Explain any claim
|
|
316
336
|
|
|
317
|
-
`falsify why <name>` is the human-friendly companion to `verdict`
|
|
337
|
+
`falsify-engine why <name>` is the human-friendly companion to `verdict`
|
|
318
338
|
— it always exits `0` and tells you exactly what the next honest
|
|
319
339
|
move is:
|
|
320
340
|
|
|
321
341
|
```
|
|
322
|
-
claim:
|
|
342
|
+
claim: calibration
|
|
323
343
|
state: STALE
|
|
324
344
|
reasoning: the spec has been edited (sha256:1038219d75a8) but no run
|
|
325
345
|
exists against this hash. Last run was against sha256:164f619d4860.
|
|
326
346
|
locked: yes (sha256:164f619d4860, 2h ago)
|
|
327
347
|
last run: 2026-04-22T02:10:17+00:00 (2h ago)
|
|
328
|
-
next action: `falsify run <name>` to produce a fresh verdict against
|
|
348
|
+
next action: `falsify-engine run <name>` to produce a fresh verdict against
|
|
329
349
|
the current spec.
|
|
330
350
|
```
|
|
331
351
|
|
|
@@ -334,13 +354,13 @@ and the last five runs.
|
|
|
334
354
|
|
|
335
355
|
### Spot drift with a sparkline
|
|
336
356
|
|
|
337
|
-
`falsify trend <name>` draws an ASCII sparkline of the metric
|
|
357
|
+
`falsify-engine trend <name>` draws an ASCII sparkline of the metric
|
|
338
358
|
across its recorded runs, marks the threshold line, and classifies
|
|
339
359
|
the trajectory as **improving**, **degrading**, **flat**, or
|
|
340
360
|
**mixed**.
|
|
341
361
|
|
|
342
362
|
```
|
|
343
|
-
claim:
|
|
363
|
+
claim: calibration
|
|
344
364
|
threshold: 0.25 (direction: below)
|
|
345
365
|
runs: 20 shown (of 20)
|
|
346
366
|
|
|
@@ -362,14 +382,14 @@ trend: degrading
|
|
|
362
382
|
|
|
363
383
|
### Measure the CLI itself
|
|
364
384
|
|
|
365
|
-
`falsify bench` spawns each subcommand under a fresh temporary
|
|
385
|
+
`falsify-engine bench` spawns each subcommand under a fresh temporary
|
|
366
386
|
directory and records per-command latency (min / median / p95 /
|
|
367
387
|
max / mean / stddev). Useful as a sanity check before a release
|
|
368
388
|
or when investigating a suspected startup-time regression.
|
|
369
389
|
|
|
370
390
|
```bash
|
|
371
|
-
falsify bench --runs 5 --commands "--help,list,stats,score"
|
|
372
|
-
falsify bench --runs 5 --json # machine-readable output
|
|
391
|
+
falsify-engine bench --runs 5 --commands "--help,list,stats,score"
|
|
392
|
+
falsify-engine bench --runs 5 --json # machine-readable output
|
|
373
393
|
```
|
|
374
394
|
|
|
375
395
|
`--runs <N>` sets the timed-iteration count (default 5, capped at
|
|
@@ -427,7 +447,7 @@ compose the skills and CLI.
|
|
|
427
447
|
|
|
428
448
|
**CI** (`.github/workflows/falsify.yml`) — on every push and PR,
|
|
429
449
|
the workflow runs the unittest suite, `tests/smoke_test.sh`, the
|
|
430
|
-
|
|
450
|
+
calibration end-to-end (`lock` → `run` → `verdict`), a guard self-check,
|
|
431
451
|
and a skill-lint pass over every SKILL.md and agent file.
|
|
432
452
|
|
|
433
453
|
## Demo
|
|
@@ -495,7 +515,7 @@ ln -sf "$(pwd)/hooks/commit-msg" .git/hooks/commit-msg
|
|
|
495
515
|
- `hypothesis.schema.yaml` — spec schema (claim, falsification,
|
|
496
516
|
experiment, environment, artifacts).
|
|
497
517
|
- `examples/hello_claim/` — tiny smoke-test fixture.
|
|
498
|
-
- `examples/
|
|
518
|
+
- `examples/calibration_sample/` — anonymized 20-row prediction ledger
|
|
499
519
|
for the Brier score demo.
|
|
500
520
|
- `hooks/commit-msg` — the guard hook.
|
|
501
521
|
- `tests/` — `unittest` suite plus `smoke_test.sh` end-to-end driver.
|
|
@@ -519,14 +539,16 @@ Run `make dogfood` to re-verify. CI runs these on every PR.
|
|
|
519
539
|
|
|
520
540
|
See [CHANGELOG.md](CHANGELOG.md) for release history.
|
|
521
541
|
|
|
542
|
+
> **Latest — 2026-05-23** · **PRML v0.2 frozen** with all 20 conformance vectors (12 v0.1 stable + 8 v0.2) passing byte-for-byte across the four reference implementations. Lock #2 (public hypothesis on spec's own distribution) resolved at **0/3 external contributors**, mechanism worked, [post-mortem published](https://falsify.dev/notes/lock-2-postmortem/). **`mlflow-falsify` v0.2.0** shipped with `MLFLOW_FALSIFY_TAG_SCOPE=experiment` for HPO sweeps; [MLflow community plugin showcase PR](https://github.com/mlflow/mlflow/pull/23569) is live and under review. Five long-form working notes published for EU AI Act readiness: [readiness assessment](https://falsify.dev/eu-ai-act-readiness/), [2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/), [Article 12 ten-item checklist](https://falsify.dev/article-12-checklist/), [notified body evidence](https://falsify.dev/notified-body-evidence/), [ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/). DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema in [SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (Mads Kristensen / Microsoft) — `.prml.yaml` files autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`.
|
|
543
|
+
|
|
522
544
|
## Roadmap
|
|
523
545
|
|
|
524
546
|
Two roadmaps run alongside each other:
|
|
525
547
|
|
|
526
|
-
- **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. CLI v0.2
|
|
527
|
-
- **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. Spec v0.2
|
|
548
|
+
- **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. **CLI v0.2.0 shipped 2026-05-22.** v0.3 features tracked alongside the v0.3 spec backlog.
|
|
549
|
+
- **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. **Spec v0.2 frozen 2026-05-22.** v0.3 design backlog open under [`spec/v0.3-backlog/`](spec/v0.3-backlog/) (claim trees, suite manifests, selective-disclosure resistance via `leaves_total`).
|
|
528
550
|
|
|
529
|
-
The CLI is downstream of the spec:
|
|
551
|
+
The CLI is downstream of the spec: spec v0.2 frozen 2026-05-22, CLI v0.2.0 shipped to PyPI the same week. CLI v0.3 is loosely scoped for Q4 2026, tracking the v0.3 spec backlog.
|
|
530
552
|
|
|
531
553
|
## Trust model
|
|
532
554
|
|
|
@@ -2,21 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
**ML evaluation claims should be locked before the experiment runs, not reported after.**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
PRML commits a claim — metric, threshold, dataset hash, seed — as a SHA-256 manifest. Run the eval. The hash either matches or it doesn't.
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
|
-
$
|
|
9
|
-
|
|
8
|
+
$ pip install falsify
|
|
9
|
+
$ falsify lock claim.prml.yaml
|
|
10
|
+
locked: claim.prml.yaml
|
|
11
|
+
sha256: c30dba8e0f566d1beebf4f8d468e6e07c821f0c72562dfb64ddf6596796f7797
|
|
10
12
|
|
|
11
|
-
$ falsify
|
|
12
|
-
PASS accuracy
|
|
13
|
+
$ falsify verify claim.prml.yaml --observed 0.934
|
|
14
|
+
PASS metric=accuracy observed=0.934 >= threshold=0.9
|
|
13
15
|
|
|
14
|
-
#
|
|
15
|
-
$ falsify
|
|
16
|
-
TAMPERED
|
|
16
|
+
# spec edited after locking → hash no longer matches:
|
|
17
|
+
$ falsify verify claim.prml.yaml --observed 0.934
|
|
18
|
+
TAMPERED (exit 3)
|
|
17
19
|
```
|
|
18
20
|
|
|
19
|
-
|
|
21
|
+
No install? Verify any manifest in-browser at [registry.falsify.dev](https://registry.falsify.dev). Byte-equivalent reference CLIs also ship for JS (`npm i -g falsify-js`), Go, and Rust.
|
|
22
|
+
|
|
23
|
+
4 reference implementations (Python, JavaScript, Go, Rust) byte-equivalent on all 20 conformance vectors (12 v0.1 stable + 8 v0.2). PRML v0.2 frozen 2026-05-22. The same day, Lock #2 (a public hypothesis on the spec's own distribution, target ≥3 external contributors in 14 days) resolved at 0/3. The mechanism worked; the post-mortem is at [falsify.dev/notes/lock-2-postmortem](https://falsify.dev/notes/lock-2-postmortem/). Designed for ML eval rigor. Maps to EU AI Act Article 12 evidence as a side effect.
|
|
20
24
|
|
|
21
25
|
> **Pre-registration + CI for AI-agent claims.** Lock the claim and threshold with SHA-256 *before* running the experiment — or the result doesn't count.
|
|
22
26
|
|
|
@@ -49,10 +53,6 @@ TAMPERED sha256 mismatch — spec modified after locking (exit 3)
|
|
|
49
53
|
|
|
50
54
|
---
|
|
51
55
|
|
|
52
|
-
> **Latest — 2026-05-14** · v0.1 published on Zenodo: citable DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema [merged into SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (2026-05-11) by Mads Kristensen (Microsoft) — `.prml.yaml` files now autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. [OECD.AI Catalogue submission](https://oecd.ai/en/catalogue/tools) filed, vetting in progress. NIST AI 800-2 late comment archived. JTC 21 routed via Dr. Sebastian Hallensleben. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`. **v0.1.4 released** ([release notes](https://github.com/studio-11-co/falsify/releases/tag/v0.1.4) · `pip install falsify==0.1.4`). PRML v0.1 specification published with **four reference implementations** (Python · [JavaScript](impl/js/) · [Go](impl/go/) · [Rust](impl/rust/)) all reproducing the [12 v0.1 vectors](spec/test-vectors/v0.1/) and [8 v0.2 candidate vectors](spec/test-vectors/v0.2/) byte-for-byte (20 vectors total). [14-page arXiv preprint](spec/paper/) and [v0.2 RFC](https://spec.falsify.dev/v0.2-rfc) (freeze 2026-05-22) open for public review.
|
|
53
|
-
|
|
54
|
-
---
|
|
55
|
-
|
|
56
56
|
## The problem
|
|
57
57
|
|
|
58
58
|
Your team claims the model hits **94% accuracy**. You ship it. Three weeks later a customer proves the real number is **71%**.
|
|
@@ -65,9 +65,9 @@ PRML does not prove an ML result is true. It proves that a specific evaluation c
|
|
|
65
65
|
|
|
66
66
|
**Falsify fixes this with a single idea from science:** you must pre-register the claim *before* you run the experiment. If you change the spec after seeing the data, the hash changes, the audit trail breaks, and CI fails with exit code 3.
|
|
67
67
|
|
|
68
|
-
$ falsify lock accuracy_claim # SHA-256 the spec
|
|
69
|
-
$ falsify run accuracy_claim # reproducible experiment
|
|
70
|
-
$ falsify verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
|
|
68
|
+
$ falsify-engine lock accuracy_claim # SHA-256 the spec
|
|
69
|
+
$ falsify-engine run accuracy_claim # reproducible experiment
|
|
70
|
+
$ falsify-engine verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
|
|
71
71
|
|
|
72
72
|
Deterministic exit codes are the API. CI gates on them. Humans read the audit trail. The claim either survives contact with the data or it doesn't.
|
|
73
73
|
|
|
@@ -96,14 +96,14 @@ See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete adoption sto
|
|
|
96
96
|
|
|
97
97
|
---
|
|
98
98
|
|
|
99
|
-
**Current version:** 0.
|
|
99
|
+
**Current version:** falsify 0.3.0 (PRML CLI) · falsify-engine 0.2.0 — `falsify --version`.
|
|
100
100
|
**Working with Claude Code?** See [CLAUDE.md](CLAUDE.md).
|
|
101
101
|
|
|
102
102
|
---
|
|
103
103
|
|
|
104
104
|
## Specification artifacts
|
|
105
105
|
|
|
106
|
-
|
|
106
|
+
This repository is the home of **PRML v0.1** — Pre-Registered ML Manifest Specification. The spec, conformance suite, reference implementations (`impl/`, JS/Go/Rust + a Python reference target), and adjacent documents live under `spec/`:
|
|
107
107
|
|
|
108
108
|
- **[`spec/PRML-v0.1.md`](spec/PRML-v0.1.md)** — the spec (RFC-style, CC BY 4.0)
|
|
109
109
|
- **[`spec/test-vectors/v0.1/`](spec/test-vectors/v0.1/)** — 12 conformance vectors with locked SHA-256 digests
|
|
@@ -121,6 +121,15 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
|
|
|
121
121
|
- **[NIST AI RMF 1.0 crosswalk](https://spec.falsify.dev/nist-ai-rmf/)** — GOVERN / MAP / MEASURE / MANAGE subcategory map (incl. AI 600-1 GenAI Profile)
|
|
122
122
|
- **[ISO/IEC 42001:2023 crosswalk](https://spec.falsify.dev/iso-42001/)** — AIMS clause-by-clause evidence map (Clauses 7-9 + Annex A controls)
|
|
123
123
|
|
|
124
|
+
**Long-form working notes** (2026-05-23, written for compliance leads, AI governance officers, and notified body assessors preparing for the 2 August 2026 deadline; CC BY 4.0):
|
|
125
|
+
|
|
126
|
+
- **[EU AI Act readiness assessment](https://falsify.dev/eu-ai-act-readiness/)** — six binding articles, ten-question gap check, evidence shape per obligation
|
|
127
|
+
- **[2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/)** — three application dates, Article 99 penalty structure, ten-week plan
|
|
128
|
+
- **[Article 12 logging checklist](https://falsify.dev/article-12-checklist/)** — ten closeable questions, six event categories, printable single-page summary
|
|
129
|
+
- **[Notified body evidence](https://falsify.dev/notified-body-evidence/)** — Annex VI vs Annex VII conformity assessment, six artefact families
|
|
130
|
+
- **[ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/)** — seven clauses, EU AI Act Article 17 overlap, twelve-month certification path
|
|
131
|
+
- **[Lock #2 post-mortem](https://falsify.dev/notes/lock-2-postmortem/)** — field report on running a falsifiable spec in public
|
|
132
|
+
|
|
124
133
|
**Reference implementations** (four languages, 12 v0.1 + 8 v0.2 candidate vectors = 20 total; multi-lang CI runs all 20 byte-for-byte per push and daily at 04:00 UTC):
|
|
125
134
|
|
|
126
135
|
- **Python:** [`falsify.py`](falsify.py) — original reference, uses PyYAML
|
|
@@ -128,7 +137,7 @@ Falsify is the reference implementation of **PRML v0.1** — Pre-Registered ML M
|
|
|
128
137
|
- **Go:** [`impl/go/`](impl/go/) — third reference, ~450 LOC, hand-rolled, stdlib only
|
|
129
138
|
- **Rust:** [`impl/rust/`](impl/rust/) — fourth reference, ~600 LOC, hand-rolled, two deps (`serde_json`, `sha2`)
|
|
130
139
|
|
|
131
|
-
Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@
|
|
140
|
+
Hosted spec at [spec.falsify.dev/v0.1](https://spec.falsify.dev/v0.1). Public review thread at [GitHub Discussion #6](https://github.com/studio-11-co/falsify/discussions/6). Comments via `hello@falsify.dev`.
|
|
132
141
|
|
|
133
142
|
**Companion projects** (separate repos under `studio-11-co`, each MIT or CC0 licensed):
|
|
134
143
|
|
|
@@ -184,6 +193,8 @@ is at <https://falsify.dev>, and the project page is at
|
|
|
184
193
|
|
|
185
194
|
Requires Python **3.11+**.
|
|
186
195
|
|
|
196
|
+
> **Two commands, one install.** `falsify` is the **PRML manifest CLI** — `lock` / `verify` / `hash` / `init` / `test-vectors` on a `*.prml.yaml` manifest (shown at the top). `falsify-engine` is the separate **pre-registration workflow engine** — the `init` → `lock` → `run` → `verdict` / `guard` loop over `.falsify/<name>/` specs. The workflow sections further down are `falsify-engine` commands; wherever an example there still shows a bare `falsify`, run it as `falsify-engine` instead. No install needed to verify a manifest: paste it at [registry.falsify.dev](https://registry.falsify.dev).
|
|
197
|
+
|
|
187
198
|
### Development install (from the repo)
|
|
188
199
|
|
|
189
200
|
```bash
|
|
@@ -225,16 +236,25 @@ exported hooks and how this repo eats its own dog food.
|
|
|
225
236
|
## Quickstart
|
|
226
237
|
|
|
227
238
|
```bash
|
|
228
|
-
|
|
239
|
+
# The falsify PRML CLI — lock a manifest, run your eval, verify.
|
|
240
|
+
falsify init accuracy.prml.yaml # writes a skeleton manifest
|
|
241
|
+
# edit accuracy.prml.yaml: metric, comparator, threshold, dataset.hash, seed, producer
|
|
242
|
+
falsify lock accuracy.prml.yaml # canonicalize + SHA-256 + write sidecar
|
|
243
|
+
# ... run your eval, get the observed value ...
|
|
244
|
+
falsify verify accuracy.prml.yaml --observed 0.934
|
|
245
|
+
# PASS (exit 0) · FAIL below threshold (exit 10) · TAMPERED if the spec changed (exit 3)
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
The pre-registration **workflow engine** (claim/falsification specs, `init` → `lock` → `run` → `verdict` → `guard` over `.falsify/<name>/`) ships in the same install as the `falsify-engine` command:
|
|
229
249
|
|
|
230
|
-
|
|
231
|
-
#
|
|
232
|
-
falsify init my_claim
|
|
250
|
+
```bash
|
|
251
|
+
./demo.sh # auto-narrated engine demo (PASS → tamper → FAIL → guard)
|
|
252
|
+
falsify-engine init my_claim
|
|
233
253
|
# edit .falsify/my_claim/spec.yaml to fill in the template
|
|
234
|
-
falsify lock my_claim
|
|
235
|
-
falsify run my_claim
|
|
236
|
-
falsify verdict my_claim
|
|
237
|
-
falsify hook install # enable the commit-msg guard
|
|
254
|
+
falsify-engine lock my_claim
|
|
255
|
+
falsify-engine run my_claim
|
|
256
|
+
falsify-engine verdict my_claim
|
|
257
|
+
falsify-engine hook install # enable the commit-msg guard
|
|
238
258
|
```
|
|
239
259
|
|
|
240
260
|
Exit code `0` on PASS, `10` on FAIL. Everything else is documented
|
|
@@ -247,8 +267,8 @@ New to pre-registration? Walk through [TUTORIAL.md](TUTORIAL.md) — 15 minutes,
|
|
|
247
267
|
```bash
|
|
248
268
|
falsify-engine init --template accuracy
|
|
249
269
|
falsify-engine lock accuracy
|
|
250
|
-
falsify run accuracy
|
|
251
|
-
falsify verdict accuracy
|
|
270
|
+
falsify-engine run accuracy
|
|
271
|
+
falsify-engine verdict accuracy
|
|
252
272
|
```
|
|
253
273
|
|
|
254
274
|
Five templates ship with a runnable spec + metric + dataset:
|
|
@@ -270,7 +290,7 @@ or the directory with `--dir`.
|
|
|
270
290
|
make install # pip install pyyaml
|
|
271
291
|
make test # run unittest suite
|
|
272
292
|
make smoke # run tests/smoke_test.sh
|
|
273
|
-
make demo #
|
|
293
|
+
make demo # calibration end-to-end (lock → run → verdict)
|
|
274
294
|
```
|
|
275
295
|
|
|
276
296
|
See [Makefile](Makefile) for all targets (`make help`).
|
|
@@ -282,18 +302,18 @@ Feature matrix vs adjacent tools: [docs/COMPARISON.md](docs/COMPARISON.md).
|
|
|
282
302
|
|
|
283
303
|
### Explain any claim
|
|
284
304
|
|
|
285
|
-
`falsify why <name>` is the human-friendly companion to `verdict`
|
|
305
|
+
`falsify-engine why <name>` is the human-friendly companion to `verdict`
|
|
286
306
|
— it always exits `0` and tells you exactly what the next honest
|
|
287
307
|
move is:
|
|
288
308
|
|
|
289
309
|
```
|
|
290
|
-
claim:
|
|
310
|
+
claim: calibration
|
|
291
311
|
state: STALE
|
|
292
312
|
reasoning: the spec has been edited (sha256:1038219d75a8) but no run
|
|
293
313
|
exists against this hash. Last run was against sha256:164f619d4860.
|
|
294
314
|
locked: yes (sha256:164f619d4860, 2h ago)
|
|
295
315
|
last run: 2026-04-22T02:10:17+00:00 (2h ago)
|
|
296
|
-
next action: `falsify run <name>` to produce a fresh verdict against
|
|
316
|
+
next action: `falsify-engine run <name>` to produce a fresh verdict against
|
|
297
317
|
the current spec.
|
|
298
318
|
```
|
|
299
319
|
|
|
@@ -302,13 +322,13 @@ and the last five runs.
|
|
|
302
322
|
|
|
303
323
|
### Spot drift with a sparkline
|
|
304
324
|
|
|
305
|
-
`falsify trend <name>` draws an ASCII sparkline of the metric
|
|
325
|
+
`falsify-engine trend <name>` draws an ASCII sparkline of the metric
|
|
306
326
|
across its recorded runs, marks the threshold line, and classifies
|
|
307
327
|
the trajectory as **improving**, **degrading**, **flat**, or
|
|
308
328
|
**mixed**.
|
|
309
329
|
|
|
310
330
|
```
|
|
311
|
-
claim:
|
|
331
|
+
claim: calibration
|
|
312
332
|
threshold: 0.25 (direction: below)
|
|
313
333
|
runs: 20 shown (of 20)
|
|
314
334
|
|
|
@@ -330,14 +350,14 @@ trend: degrading
|
|
|
330
350
|
|
|
331
351
|
### Measure the CLI itself
|
|
332
352
|
|
|
333
|
-
`falsify bench` spawns each subcommand under a fresh temporary
|
|
353
|
+
`falsify-engine bench` spawns each subcommand under a fresh temporary
|
|
334
354
|
directory and records per-command latency (min / median / p95 /
|
|
335
355
|
max / mean / stddev). Useful as a sanity check before a release
|
|
336
356
|
or when investigating a suspected startup-time regression.
|
|
337
357
|
|
|
338
358
|
```bash
|
|
339
|
-
falsify bench --runs 5 --commands "--help,list,stats,score"
|
|
340
|
-
falsify bench --runs 5 --json # machine-readable output
|
|
359
|
+
falsify-engine bench --runs 5 --commands "--help,list,stats,score"
|
|
360
|
+
falsify-engine bench --runs 5 --json # machine-readable output
|
|
341
361
|
```
|
|
342
362
|
|
|
343
363
|
`--runs <N>` sets the timed-iteration count (default 5, capped at
|
|
@@ -395,7 +415,7 @@ compose the skills and CLI.
|
|
|
395
415
|
|
|
396
416
|
**CI** (`.github/workflows/falsify.yml`) — on every push and PR,
|
|
397
417
|
the workflow runs the unittest suite, `tests/smoke_test.sh`, the
|
|
398
|
-
|
|
418
|
+
calibration end-to-end (`lock` → `run` → `verdict`), a guard self-check,
|
|
399
419
|
and a skill-lint pass over every SKILL.md and agent file.
|
|
400
420
|
|
|
401
421
|
## Demo
|
|
@@ -463,7 +483,7 @@ ln -sf "$(pwd)/hooks/commit-msg" .git/hooks/commit-msg
|
|
|
463
483
|
- `hypothesis.schema.yaml` — spec schema (claim, falsification,
|
|
464
484
|
experiment, environment, artifacts).
|
|
465
485
|
- `examples/hello_claim/` — tiny smoke-test fixture.
|
|
466
|
-
- `examples/
|
|
486
|
+
- `examples/calibration_sample/` — anonymized 20-row prediction ledger
|
|
467
487
|
for the Brier score demo.
|
|
468
488
|
- `hooks/commit-msg` — the guard hook.
|
|
469
489
|
- `tests/` — `unittest` suite plus `smoke_test.sh` end-to-end driver.
|
|
@@ -487,14 +507,16 @@ Run `make dogfood` to re-verify. CI runs these on every PR.
|
|
|
487
507
|
|
|
488
508
|
See [CHANGELOG.md](CHANGELOG.md) for release history.
|
|
489
509
|
|
|
510
|
+
> **Latest — 2026-05-23** · **PRML v0.2 frozen** with all 20 conformance vectors (12 v0.1 stable + 8 v0.2) passing byte-for-byte across the four reference implementations. Lock #2 (a public hypothesis on the spec's own distribution) resolved at **0/3 external contributors**; the mechanism worked, and the [post-mortem is published](https://falsify.dev/notes/lock-2-postmortem/). **`mlflow-falsify` v0.2.0** shipped with `MLFLOW_FALSIFY_TAG_SCOPE=experiment` for HPO sweeps; [MLflow community plugin showcase PR](https://github.com/mlflow/mlflow/pull/23569) is live and under review. Five long-form working notes published for EU AI Act readiness: [readiness assessment](https://falsify.dev/eu-ai-act-readiness/), [2 August 2026 deadline](https://falsify.dev/ai-act-deadline-august-2026/), [Article 12 ten-item checklist](https://falsify.dev/article-12-checklist/), [notified body evidence](https://falsify.dev/notified-body-evidence/), [ISO/IEC 42001 readiness](https://falsify.dev/iso-42001-readiness/). DOI [10.5281/zenodo.20177839](https://doi.org/10.5281/zenodo.20177839). PRML JSON Schema in [SchemaStore](https://github.com/SchemaStore/schemastore/pull/5673) (Mads Kristensen / Microsoft) — `.prml.yaml` files autocomplete in VS Code, JetBrains, Helix, Zed, and Cursor. `registry.falsify.dev` live with README badges at `registry.falsify.dev/badge/<hash>.svg`.
|
|
511
|
+
|
|
490
512
|
## Roadmap
|
|
491
513
|
|
|
492
514
|
Two roadmaps run alongside each other:
|
|
493
515
|
|
|
494
|
-
- **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. CLI v0.2
|
|
495
|
-
- **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. Spec v0.2
|
|
516
|
+
- **CLI tool roadmap:** [ROADMAP.md](ROADMAP.md) — `falsify` features, integrations, dependencies. **CLI v0.2.0 shipped 2026-05-22.** v0.3 features tracked alongside the v0.3 spec backlog.
|
|
517
|
+
- **Specification roadmap:** [spec/v0.2/ROADMAP.md](spec/v0.2/ROADMAP.md) — PRML format evolution, canonicalization grammar, conformance. **Spec v0.2 frozen 2026-05-22.** v0.3 design backlog open under [`spec/v0.3-backlog/`](spec/v0.3-backlog/) (claim trees, suite manifests, selective-disclosure resistance via `leaves_total`).
|
|
496
518
|
|
|
497
|
-
The CLI is downstream of the spec:
|
|
519
|
+
The CLI is downstream of the spec: spec v0.2 was frozen on 2026-05-22, and CLI v0.2.0 shipped to PyPI the same week. CLI v0.3 is loosely scoped for Q4 2026, tracking the v0.3 spec backlog.
|
|
498
520
|
|
|
499
521
|
## Trust model
|
|
500
522
|
|