falsify 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- falsify-0.1.1/LICENSE +21 -0
- falsify-0.1.1/NOTICE +48 -0
- falsify-0.1.1/PKG-INFO +439 -0
- falsify-0.1.1/README.md +410 -0
- falsify-0.1.1/falsify.egg-info/PKG-INFO +439 -0
- falsify-0.1.1/falsify.egg-info/SOURCES.txt +79 -0
- falsify-0.1.1/falsify.egg-info/dependency_links.txt +1 -0
- falsify-0.1.1/falsify.egg-info/entry_points.txt +2 -0
- falsify-0.1.1/falsify.egg-info/requires.txt +4 -0
- falsify-0.1.1/falsify.egg-info/top_level.txt +2 -0
- falsify-0.1.1/falsify.py +3961 -0
- falsify-0.1.1/mcp_server/__init__.py +24 -0
- falsify-0.1.1/mcp_server/__main__.py +6 -0
- falsify-0.1.1/mcp_server/server.py +315 -0
- falsify-0.1.1/pyproject.toml +43 -0
- falsify-0.1.1/setup.cfg +4 -0
- falsify-0.1.1/tests/test_adversarial_doc.py +103 -0
- falsify-0.1.1/tests/test_agent_claim_auditor.py +72 -0
- falsify-0.1.1/tests/test_agent_verdict_refresher.py +85 -0
- falsify-0.1.1/tests/test_architecture.py +46 -0
- falsify-0.1.1/tests/test_bench.py +198 -0
- falsify-0.1.1/tests/test_case_studies_doc.py +125 -0
- falsify-0.1.1/tests/test_changelog.py +69 -0
- falsify-0.1.1/tests/test_ci_workflow.py +46 -0
- falsify-0.1.1/tests/test_claude_md.py +102 -0
- falsify-0.1.1/tests/test_code_of_conduct.py +38 -0
- falsify-0.1.1/tests/test_comparison_doc.py +137 -0
- falsify-0.1.1/tests/test_contributing.py +54 -0
- falsify-0.1.1/tests/test_demo_script.py +78 -0
- falsify-0.1.1/tests/test_demo_script_doc.py +127 -0
- falsify-0.1.1/tests/test_demo_shot_list.py +62 -0
- falsify-0.1.1/tests/test_diff.py +140 -0
- falsify-0.1.1/tests/test_docker.py +97 -0
- falsify-0.1.1/tests/test_doctor.py +146 -0
- falsify-0.1.1/tests/test_editorconfig.py +58 -0
- falsify-0.1.1/tests/test_examples_doc.py +63 -0
- falsify-0.1.1/tests/test_export.py +230 -0
- falsify-0.1.1/tests/test_faq.py +97 -0
- falsify-0.1.1/tests/test_github_repo_maturity.py +84 -0
- falsify-0.1.1/tests/test_github_templates.py +58 -0
- falsify-0.1.1/tests/test_gitignore.py +46 -0
- falsify-0.1.1/tests/test_glossary_doc.py +125 -0
- falsify-0.1.1/tests/test_guard.py +158 -0
- falsify-0.1.1/tests/test_hook_install.py +118 -0
- falsify-0.1.1/tests/test_init.py +53 -0
- falsify-0.1.1/tests/test_init_templates.py +146 -0
- falsify-0.1.1/tests/test_integration_e2e.py +285 -0
- falsify-0.1.1/tests/test_juju_sample.py +72 -0
- falsify-0.1.1/tests/test_list.py +109 -0
- falsify-0.1.1/tests/test_lock.py +151 -0
- falsify-0.1.1/tests/test_makefile.py +45 -0
- falsify-0.1.1/tests/test_managed_agents.py +82 -0
- falsify-0.1.1/tests/test_mcp.py +178 -0
- falsify-0.1.1/tests/test_mcp_server.py +87 -0
- falsify-0.1.1/tests/test_pitch.py +52 -0
- falsify-0.1.1/tests/test_pre_commit.py +98 -0
- falsify-0.1.1/tests/test_pyproject.py +84 -0
- falsify-0.1.1/tests/test_readme.py +51 -0
- falsify-0.1.1/tests/test_release_check.py +85 -0
- falsify-0.1.1/tests/test_release_workflow.py +54 -0
- falsify-0.1.1/tests/test_replay.py +183 -0
- falsify-0.1.1/tests/test_roadmap.py +56 -0
- falsify-0.1.1/tests/test_run.py +108 -0
- falsify-0.1.1/tests/test_score.py +264 -0
- falsify-0.1.1/tests/test_self_dogfood.py +108 -0
- falsify-0.1.1/tests/test_skill_author.py +58 -0
- falsify-0.1.1/tests/test_skill_ci_doctor.py +129 -0
- falsify-0.1.1/tests/test_skill_claim_audit.py +93 -0
- falsify-0.1.1/tests/test_skill_claim_review.py +84 -0
- falsify-0.1.1/tests/test_skill_falsify.py +88 -0
- falsify-0.1.1/tests/test_slash_commands.py +148 -0
- falsify-0.1.1/tests/test_stats.py +126 -0
- falsify-0.1.1/tests/test_stats_html.py +172 -0
- falsify-0.1.1/tests/test_submission.py +87 -0
- falsify-0.1.1/tests/test_submission_md.py +109 -0
- falsify-0.1.1/tests/test_trend.py +260 -0
- falsify-0.1.1/tests/test_tutorial.py +77 -0
- falsify-0.1.1/tests/test_verdict.py +140 -0
- falsify-0.1.1/tests/test_verify.py +210 -0
- falsify-0.1.1/tests/test_version.py +69 -0
- falsify-0.1.1/tests/test_why.py +195 -0
falsify-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Cüneyt Öztürk
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
falsify-0.1.1/NOTICE
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
falsify
|
|
2
|
+
Copyright © 2026 Cüneyt Öztürk. All rights reserved.
|
|
3
|
+
|
|
4
|
+
This product includes software developed by Cüneyt Öztürk.
|
|
5
|
+
|
|
6
|
+
The falsify source code is released under the MIT License (see LICENSE).
|
|
7
|
+
|
|
8
|
+
──────────────────────────────────────────────────────────────────────────
|
|
9
|
+
Trademark notice
|
|
10
|
+
──────────────────────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
"FALSIFY" and the falsify chevron mark ( ⟐ falsify ) are trademarks of
|
|
13
|
+
Cüneyt Öztürk. The MIT License grants rights to the underlying source
|
|
14
|
+
code; it does NOT grant any right to use the "FALSIFY" name, the
|
|
15
|
+
chevron logo, or any confusingly similar mark in connection with:
|
|
16
|
+
|
|
17
|
+
• a competing or derivative product,
|
|
18
|
+
• a commercial service,
|
|
19
|
+
• marketing, advertising, or promotional material,
|
|
20
|
+
|
|
21
|
+
without prior written permission. Forks, derivatives, and modified
|
|
22
|
+
versions must use a distinct name and logo.
|
|
23
|
+
|
|
24
|
+
──────────────────────────────────────────────────────────────────────────
|
|
25
|
+
Commercial use
|
|
26
|
+
──────────────────────────────────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
Individual developers, researchers, and open-source projects may use
|
|
29
|
+
falsify freely under the MIT License.
|
|
30
|
+
|
|
31
|
+
Teams deploying falsify in production as part of a commercial service
|
|
32
|
+
are encouraged — but not required by the MIT License — to contact the
|
|
33
|
+
author about support, SLAs, and enterprise features:
|
|
34
|
+
|
|
35
|
+
hello@studio-11.co
|
|
36
|
+
|
|
37
|
+
See docs/COMMERCIAL.md for details.
|
|
38
|
+
|
|
39
|
+
──────────────────────────────────────────────────────────────────────────
|
|
40
|
+
Contributions
|
|
41
|
+
──────────────────────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
By submitting a pull request, you agree that your contribution is
|
|
44
|
+
licensed under the MIT License and you grant Cüneyt Öztürk a perpetual,
|
|
45
|
+
irrevocable license to re-license your contribution under any terms,
|
|
46
|
+
including future commercial or dual-license arrangements.
|
|
47
|
+
|
|
48
|
+
See CONTRIBUTING.md for the full contributor terms.
|
falsify-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: falsify
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Pre-registration and CI for AI-agent claims — deterministic PASS or FAIL.
|
|
5
|
+
Author: Cüneyt Öztürk
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://falsify.dev
|
|
8
|
+
Project-URL: Repository, https://github.com/sk8ordie84/falsify
|
|
9
|
+
Project-URL: Issues, https://github.com/sk8ordie84/falsify/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/sk8ordie84/falsify/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: falsifiability,pre-registration,AI,CI,claude,hackathon
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
License-File: NOTICE
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Provides-Extra: mcp
|
|
27
|
+
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
<img src="brand/lockup.svg" alt="falsify" width="320">
|
|
31
|
+
|
|
32
|
+
> **Pre-registration + CI for AI-agent claims.** Lock the claim and threshold with SHA-256 *before* running the experiment — or the result doesn't count.
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+

|
|
36
|
+

|
|
37
|
+

|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
> Code: MIT. "FALSIFY" name and chevron logo: ™ reserved. See [NOTICE](NOTICE) · [docs/COMMERCIAL.md](docs/COMMERCIAL.md).
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## The problem
|
|
45
|
+
|
|
46
|
+
Your team claims the model hits **94% accuracy**. You ship it. Three weeks later a customer proves the real number is **71%**.
|
|
47
|
+
|
|
48
|
+
The claim was never *falsifiable*. Nobody wrote down — cryptographically, before the experiment ran — what "94%" meant, which dataset, which metric, which threshold. So when the number changed, nobody could say whether the claim was wrong, the data drifted, or the metric got silently relaxed.
|
|
49
|
+
|
|
50
|
+
**Falsify fixes this with a single idea from science:** you must pre-register the claim *before* you run the experiment. If you change the spec after seeing the data, the hash changes, the audit trail breaks, and CI fails with exit code 3.
|
|
51
|
+
|
|
52
|
+
$ falsify lock accuracy_claim # SHA-256 the spec
|
|
53
|
+
$ falsify run accuracy_claim # reproducible experiment
|
|
54
|
+
$ falsify verdict accuracy_claim # exit 0 = PASS, 10 = FAIL, 3 = tampered
|
|
55
|
+
|
|
56
|
+
Deterministic exit codes are the API. CI gates on them. Humans read the audit trail. The claim either survives contact with the data or it doesn't.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## 90-second demo
|
|
61
|
+
|
|
62
|
+
[**▶ Watch the 90-second demo on YouTube**](https://youtu.be/vVZTNeak5PA)
|
|
63
|
+
|
|
64
|
+
Lock a claim, run it, watch it PASS. Then tamper with the threshold and watch CI refuse to run. Full storyboard in [`docs/DEMO_SCRIPT.md`](docs/DEMO_SCRIPT.md).
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Why this matters
|
|
69
|
+
|
|
70
|
+
Every week another paper, blog post, or product launch claims an AI metric that quietly evaporates under scrutiny. It's not usually malice — it's that the claim was never structured to be falsifiable. Falsify is the smallest possible tool that forces that structure.
|
|
71
|
+
|
|
72
|
+
- **ML teams** — gate deploys on pre-registered accuracy / NDCG / recall
|
|
73
|
+
- **DevOps** — treat p95 latency claims the same way you treat tests
|
|
74
|
+
- **LLM pipelines** — pin prompt + eval + threshold so "it works" means something
|
|
75
|
+
- **Research** — replicate a paper by running its spec.lock.json
|
|
76
|
+
|
|
77
|
+
See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete adoption stories.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
**Current version:** 0.1.1 — run `python3 falsify.py --version`.
|
|
82
|
+
**Working with Claude Code?** See [CLAUDE.md](CLAUDE.md).
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Why
|
|
87
|
+
|
|
88
|
+
AI agents make empirical claims all day — *"accuracy is up"*, *"the
|
|
89
|
+
new retriever is faster"*, *"this filter catches every edge case"*.
|
|
90
|
+
We rarely pin down the threshold, the metric, or the stopping rule
|
|
91
|
+
before the data arrives.
|
|
92
|
+
|
|
93
|
+
Without pre-registration, every verdict is post-hoc rationalization:
|
|
94
|
+
the goalposts move a little, the sample is chosen a little, the
|
|
95
|
+
winning explanation is kept.
|
|
96
|
+
|
|
97
|
+
Falsify forces scientific discipline onto that loop.
|
|
98
|
+
You declare the test, lock the spec with a cryptographic hash, run
|
|
99
|
+
the experiment, and read the exit code. PASS or FAIL is mechanical,
|
|
100
|
+
not rhetorical — and CI enforces it on every push.
|
|
101
|
+
|
|
102
|
+
## What you get
|
|
103
|
+
|
|
104
|
+
- A single-file CLI (`falsify`) with **18 subcommands**: `init`,
|
|
105
|
+
`lock`, `run`, `verdict`, `guard`, `list`, `stats`, `diff`, `hook`,
|
|
106
|
+
`doctor`, `version`, `export`, `verify`, `replay`, `why`, `trend`,
|
|
107
|
+
`score`, `bench`.
|
|
108
|
+
- A `commit-msg` git hook that blocks commits whose messages
|
|
109
|
+
contradict a locked verdict.
|
|
110
|
+
- A GitHub Actions workflow that re-verdicts every push and PR
|
|
111
|
+
across Python 3.11 and 3.12.
|
|
112
|
+
- **Five Claude Code skills** and **two forked-context subagents**
|
|
113
|
+
that draft specs, audit arbitrary text against the verdict log,
|
|
114
|
+
review PR diffs for honesty violations, and keep the log itself
|
|
115
|
+
fresh.
|
|
116
|
+
|
|
117
|
+
## Install
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pip install -e .
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
After install, `falsify` is available as a command on your `PATH`
|
|
124
|
+
— no `python3 falsify.py` prefix needed. The `-e` editable form is
|
|
125
|
+
handy during development; drop the flag for a regular install.
|
|
126
|
+
|
|
127
|
+
### Docker
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
docker build -t falsify-demo . && docker run --rm -it falsify-demo
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Runs the auto-demo in a clean container. See
|
|
134
|
+
[docs/DOCKER.md](docs/DOCKER.md) for interactive and repo-mount
|
|
135
|
+
modes.
|
|
136
|
+
|
|
137
|
+
### pre-commit integration
|
|
138
|
+
|
|
139
|
+
Consume falsify's hooks from your own repo:
|
|
140
|
+
|
|
141
|
+
```yaml
|
|
142
|
+
repos:
|
|
143
|
+
- repo: https://github.com/sk8ordie84/falsify
|
|
144
|
+
rev: main # pin a tag (e.g. v0.1.1) once releases start
|
|
145
|
+
hooks:
|
|
146
|
+
- id: falsify-guard
|
|
147
|
+
- id: falsify-doctor
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Then `pre-commit install && pre-commit install --hook-type commit-msg`.
|
|
151
|
+
See [docs/PRE_COMMIT.md](docs/PRE_COMMIT.md) for the full list of
|
|
152
|
+
exported hooks and how this repo eats its own dog food.
|
|
153
|
+
|
|
154
|
+
## Quickstart
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
./demo.sh # auto-narrated: PASS → tamper → FAIL → guard block
|
|
158
|
+
|
|
159
|
+
# Either form works — `falsify` is the installed entry point,
|
|
160
|
+
# `python3 falsify.py` is the uninstalled fallback.
|
|
161
|
+
falsify init my_claim
|
|
162
|
+
# edit .falsify/my_claim/spec.yaml to fill in the template
|
|
163
|
+
falsify lock my_claim
|
|
164
|
+
falsify run my_claim
|
|
165
|
+
falsify verdict my_claim
|
|
166
|
+
falsify hook install # enable the commit-msg guard
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Exit code `0` on PASS, `10` on FAIL. Everything else is documented
|
|
170
|
+
below.
|
|
171
|
+
|
|
172
|
+
New to pre-registration? Walk through [TUTORIAL.md](TUTORIAL.md) — 15 minutes, zero to first locked claim.
|
|
173
|
+
|
|
174
|
+
### Start from a template
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
falsify init --template accuracy
|
|
178
|
+
falsify lock accuracy
|
|
179
|
+
falsify run accuracy
|
|
180
|
+
falsify verdict accuracy
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Five templates ship with a runnable spec + metric + dataset:
|
|
184
|
+
|
|
185
|
+
- `accuracy` — classifier holdout accuracy ≥ 0.80
|
|
186
|
+
- `latency` — p95 request latency ≤ 200 ms
|
|
187
|
+
- `brier` — probabilistic calibration Brier ≤ 0.25
|
|
188
|
+
- `llm-judge` — LLM-judge agreement rate ≥ 0.75
|
|
189
|
+
- `ab` — A/B test absolute lift ≥ 0.05
|
|
190
|
+
|
|
191
|
+
Each scaffolds into `claims/<name>/` (sources) and mirrors
|
|
192
|
+
`spec.yaml` into `.falsify/<name>/` so the CLI runtime works
|
|
193
|
+
without further setup. Override the default name with `--name`
|
|
194
|
+
or the directory with `--dir`.
|
|
195
|
+
|
|
196
|
+
### Developer commands
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
make install # pip install pyyaml
|
|
200
|
+
make test # run unittest suite
|
|
201
|
+
make smoke # run tests/smoke_test.sh
|
|
202
|
+
make demo # JUJU end-to-end (lock → run → verdict)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
See [Makefile](Makefile) for all targets (`make help`).
|
|
206
|
+
|
|
207
|
+
Questions and objections? See [docs/FAQ.md](docs/FAQ.md) — 15
|
|
208
|
+
direct answers to "why not just X?" questions.
|
|
209
|
+
|
|
210
|
+
Feature matrix vs adjacent tools: [docs/COMPARISON.md](docs/COMPARISON.md).
|
|
211
|
+
|
|
212
|
+
### Explain any claim
|
|
213
|
+
|
|
214
|
+
`falsify why <name>` is the human-friendly companion to `verdict`
|
|
215
|
+
— it always exits `0` and tells you exactly what the next honest
|
|
216
|
+
move is:
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
claim: juju
|
|
220
|
+
state: STALE
|
|
221
|
+
reasoning: the spec has been edited (sha256:1038219d75a8) but no run
|
|
222
|
+
exists against this hash. Last run was against sha256:164f619d4860.
|
|
223
|
+
locked: yes (sha256:164f619d4860, 2h ago)
|
|
224
|
+
last run: 2026-04-22T02:10:17+00:00 (2h ago)
|
|
225
|
+
next action: `falsify run <name>` to produce a fresh verdict against
|
|
226
|
+
the current spec.
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Add `--json` for a scripted pipeline, `--verbose` for full hashes
|
|
230
|
+
and the last five runs.
|
|
231
|
+
|
|
232
|
+
### Spot drift with a sparkline
|
|
233
|
+
|
|
234
|
+
`falsify trend <name>` draws an ASCII sparkline of the metric
|
|
235
|
+
across its recorded runs, marks the threshold line, and classifies
|
|
236
|
+
the trajectory as **improving**, **degrading**, **flat**, or
|
|
237
|
+
**mixed**.
|
|
238
|
+
|
|
239
|
+
```
|
|
240
|
+
claim: juju
|
|
241
|
+
threshold: 0.25 (direction: below)
|
|
242
|
+
runs: 20 shown (of 20)
|
|
243
|
+
|
|
244
|
+
▁▂▂▃▃▄▄▅▅▆▆▆▇▇████
|
|
245
|
+
TT
|
|
246
|
+
threshold=0.25 (shown)
|
|
247
|
+
|
|
248
|
+
first: 0.12 @ ... (PASS)
|
|
249
|
+
last: 0.23 @ ... (PASS)
|
|
250
|
+
min: 0.09
|
|
251
|
+
max: 0.23
|
|
252
|
+
mean: 0.17
|
|
253
|
+
latest verdict: PASS
|
|
254
|
+
trend: degrading
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
`--ascii` swaps in `_.oO#`; `--width` resizes the sparkline;
|
|
258
|
+
`--last` caps history (default 20, max 200).
|
|
259
|
+
|
|
260
|
+
### Measure the CLI itself
|
|
261
|
+
|
|
262
|
+
`falsify bench` spawns each subcommand under a fresh temporary
|
|
263
|
+
directory and records per-command latency (min / median / p95 /
|
|
264
|
+
max / mean / stddev). Useful as a sanity check before a release
|
|
265
|
+
or when investigating a suspected startup-time regression.
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
falsify bench --runs 5 --commands "--help,list,stats,score"
|
|
269
|
+
falsify bench --runs 5 --json # machine-readable output
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
`--runs <N>` sets the timed-iteration count (default 5, capped at
|
|
273
|
+
100); `--warmup <N>` discards the first N spawns so JIT / import
|
|
274
|
+
caches stabilize before timing (default 1).
|
|
275
|
+
|
|
276
|
+
## Exit codes
|
|
277
|
+
|
|
278
|
+
| Code | Meaning |
|
|
279
|
+
|------|-----------------------------------------------|
|
|
280
|
+
| 0 | PASS |
|
|
281
|
+
| 10 | FAIL |
|
|
282
|
+
| 2 | Bad spec / INCONCLUSIVE |
|
|
283
|
+
| 3 | Hash mismatch (spec tampered) |
|
|
284
|
+
| 11 | Guard violation (commit blocked) |
|
|
285
|
+
|
|
286
|
+
## The Opus 4.7 layers
|
|
287
|
+
|
|
288
|
+
**Skills** (`.claude/skills/`) — in-session helpers that fire on
|
|
289
|
+
trigger phrases.
|
|
290
|
+
- `hypothesis-author` walks the user through a 5-question dialogue
|
|
291
|
+
and writes a falsifiable `spec.yaml`.
|
|
292
|
+
- `falsify` is the orchestrator: routes any empirical claim to the
|
|
293
|
+
right place in the init → lock → run → verdict pipeline.
|
|
294
|
+
- `claim-audit` runs a fast keyword+regex audit over pasted text
|
|
295
|
+
and escalates to the `claim-auditor` subagent when paraphrases or
|
|
296
|
+
>2 claims show up.
|
|
297
|
+
- `claim-review` reads a PR diff and flags unlocked specs, silent
|
|
298
|
+
threshold edits, and `metric_fn` references to missing modules —
|
|
299
|
+
runs in PR CI, exits `1` on any CRITICAL finding. See
|
|
300
|
+
[`docs/PR_REVIEW.md`](docs/PR_REVIEW.md).
|
|
301
|
+
- `falsify-ci-doctor` ingests `make release-check` output and
|
|
302
|
+
maps each FAIL gate to a likely cause and an exact fix command
|
|
303
|
+
— one-shot triage when CI is red.
|
|
304
|
+
|
|
305
|
+
**Subagents** (`.claude/agents/`) — forked-context agents invoked
|
|
306
|
+
via the `Task` tool for heavier work.
|
|
307
|
+
- `claim-auditor` does the semantic cross-reference that the
|
|
308
|
+
keyword-pass `claim-audit` skill deliberately skips; used on PR
|
|
309
|
+
bodies, release notes, and README edits.
|
|
310
|
+
- `verdict-refresher` scans `.falsify/*/` for STALE, INCONCLUSIVE,
|
|
311
|
+
or UNRUN verdicts and re-runs them through the CLI — keeping
|
|
312
|
+
`guard` decisions trustworthy.
|
|
313
|
+
|
|
314
|
+
**Slash commands** (`.claude/commands/`) — in-IDE shortcuts that
|
|
315
|
+
compose the skills and CLI.
|
|
316
|
+
- `/new-claim <template> [name]` — guided scaffold → lock → run →
|
|
317
|
+
verdict for one of the five templates.
|
|
318
|
+
- `/audit-claims` — repo-wide semantic audit; merges
|
|
319
|
+
`list`/`stats`/`score` with findings from the `claim-audit`
|
|
320
|
+
skill into a single markdown report.
|
|
321
|
+
- `/ship-verdict <name>` — four-gate release check (verdict,
|
|
322
|
+
freshness, replay, audit-chain). Exits non-zero on any gate
|
|
323
|
+
failure. Does not ship; only verifies.
|
|
324
|
+
|
|
325
|
+
**CI** (`.github/workflows/falsify.yml`) — on every push and PR,
|
|
326
|
+
the workflow runs the unittest suite, `tests/smoke_test.sh`, the
|
|
327
|
+
JUJU end-to-end (`lock` → `run` → `verdict`), a guard self-check,
|
|
328
|
+
and a skill-lint pass over every SKILL.md and agent file.
|
|
329
|
+
|
|
330
|
+
## Demo
|
|
331
|
+
|
|
332
|
+
- Walk through the pipeline in 5 runnable steps: [DEMO.md](DEMO.md).
|
|
333
|
+
- Second-by-second shooting script for the 3-minute video:
|
|
334
|
+
[docs/DEMO_SHOT_LIST.md](docs/DEMO_SHOT_LIST.md).
|
|
335
|
+
- Five more claim types (accuracy regression, latency gate,
|
|
336
|
+
prediction calibration, LLM agreement, AB test):
|
|
337
|
+
[docs/EXAMPLES.md](docs/EXAMPLES.md).
|
|
338
|
+
|
|
339
|
+
## MCP integration
|
|
340
|
+
|
|
341
|
+
Expose the verdict store to Claude Desktop / Claude Code via
|
|
342
|
+
Model Context Protocol with four read-only tools (`list_verdicts`,
|
|
343
|
+
`get_verdict`, `get_stats`, `check_claim`) and three resource URIs.
|
|
344
|
+
|
|
345
|
+
```bash
|
|
346
|
+
pip install -e '.[mcp]'
|
|
347
|
+
python -m mcp_server # speaks MCP over stdio
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
Then merge the snippet in
|
|
351
|
+
[`mcp_server/claude_desktop_config.example.json`](mcp_server/claude_desktop_config.example.json)
|
|
352
|
+
into your Claude Desktop config, pointing `cwd` at your local
|
|
353
|
+
clone. Every Claude session in your org can now query live
|
|
354
|
+
verdicts — no more *"I think the latency claim still passes"*;
|
|
355
|
+
Claude just asks the MCP server. Falsify itself runs without the
|
|
356
|
+
SDK; if `mcp` isn't installed, `python -m mcp_server` exits 2 with
|
|
357
|
+
a clear install hint. Full surface in
|
|
358
|
+
[`mcp_server/README.md`](mcp_server/README.md).
|
|
359
|
+
|
|
360
|
+
### Managed Agents (optional)
|
|
361
|
+
|
|
362
|
+
Deploy the two subagents (`verdict-refresher`, `claim-auditor`)
|
|
363
|
+
to Anthropic Console for scheduled and on-demand execution.
|
|
364
|
+
See [docs/MANAGED_AGENTS.md](docs/MANAGED_AGENTS.md) for the
|
|
365
|
+
setup recipe and manifests under
|
|
366
|
+
[`managed_agents/`](managed_agents/).
|
|
367
|
+
|
|
368
|
+
## Install the git hook
|
|
369
|
+
|
|
370
|
+
```bash
|
|
371
|
+
cp hooks/commit-msg .git/hooks/commit-msg
|
|
372
|
+
chmod +x .git/hooks/commit-msg
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
Or, as a symlink so hook updates propagate automatically:
|
|
376
|
+
|
|
377
|
+
```bash
|
|
378
|
+
ln -sf "$(pwd)/hooks/commit-msg" .git/hooks/commit-msg
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
## Repository layout
|
|
382
|
+
|
|
383
|
+
- `falsify.py` — single-file CLI, stdlib + pyyaml only.
|
|
384
|
+
- `hypothesis.schema.yaml` — spec schema (claim, falsification,
|
|
385
|
+
experiment, environment, artifacts).
|
|
386
|
+
- `examples/hello_claim/` — tiny smoke-test fixture.
|
|
387
|
+
- `examples/juju_sample/` — anonymized 20-row prediction ledger
|
|
388
|
+
for the Brier score demo.
|
|
389
|
+
- `hooks/commit-msg` — the guard hook.
|
|
390
|
+
- `tests/` — `unittest` suite plus `smoke_test.sh` end-to-end driver.
|
|
391
|
+
- `.claude/skills/` — the five in-session skills.
|
|
392
|
+
- `.claude/agents/` — the two forked-context subagents.
|
|
393
|
+
- `.claude/commands/` — the three slash commands.
|
|
394
|
+
- `.github/workflows/` — CI.
|
|
395
|
+
|
|
396
|
+
## Self-dogfooding
|
|
397
|
+
|
|
398
|
+
Falsify uses itself. Three real claims about this codebase live
|
|
399
|
+
under `claims/self/`:
|
|
400
|
+
|
|
401
|
+
- `cli_startup` — CLI startup stays under 500ms median
|
|
402
|
+
- `test_coverage_count` — test suite has more than 400 test methods
|
|
403
|
+
- `claude_surface` — Claude integration ships more than 8 artifacts
|
|
404
|
+
|
|
405
|
+
Run `make dogfood` to re-verify. CI runs these on every PR.
|
|
406
|
+
|
|
407
|
+
## Changelog
|
|
408
|
+
|
|
409
|
+
See [CHANGELOG.md](CHANGELOG.md) for release history.
|
|
410
|
+
|
|
411
|
+
## Roadmap
|
|
412
|
+
|
|
413
|
+
See [ROADMAP.md](ROADMAP.md) for the post-hackathon direction.
|
|
414
|
+
|
|
415
|
+
## Trust model
|
|
416
|
+
|
|
417
|
+
Falsify is a discipline tool, not a zero-trust system. For a full
|
|
418
|
+
enumeration of attacks defended and NOT defended, with the exact
|
|
419
|
+
exit code or command that catches each, see
|
|
420
|
+
[docs/ADVERSARIAL.md](docs/ADVERSARIAL.md). For private disclosure
|
|
421
|
+
of invariant breaks, see [.github/SECURITY.md](.github/SECURITY.md).
|
|
422
|
+
|
|
423
|
+
## License
|
|
424
|
+
|
|
425
|
+
MIT. See [LICENSE](LICENSE).
|
|
426
|
+
|
|
427
|
+
See [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) for community standards.
|
|
428
|
+
See [.github/CODEOWNERS](.github/CODEOWNERS) for module-level
|
|
429
|
+
reviewers and [.github/dependabot.yml](.github/dependabot.yml) for
|
|
430
|
+
automated dependency updates.
|
|
431
|
+
See [docs/GLOSSARY.md](docs/GLOSSARY.md) for definitions of every
|
|
432
|
+
term used across the docs.
|
|
433
|
+
See [docs/CASE_STUDIES.md](docs/CASE_STUDIES.md) for three concrete
|
|
434
|
+
adoption scenarios: ML team, DevOps team, research group.
|
|
435
|
+
|
|
436
|
+
## Built with
|
|
437
|
+
|
|
438
|
+
Claude Opus 4.7 (1M context), in three days, for the Anthropic
|
|
439
|
+
Built with Opus 4.7 hackathon.
|