falsifyai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. falsifyai-0.1.0/.claude/CLAUDE.md +144 -0
  2. falsifyai-0.1.0/.claude/settings.json +23 -0
  3. falsifyai-0.1.0/.claude/skills/pr-review/SKILL.md +93 -0
  4. falsifyai-0.1.0/.env.example +56 -0
  5. falsifyai-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +52 -0
  6. falsifyai-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +47 -0
  7. falsifyai-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +60 -0
  8. falsifyai-0.1.0/.github/workflows/ci.yml +42 -0
  9. falsifyai-0.1.0/.gitignore +72 -0
  10. falsifyai-0.1.0/.python-version +1 -0
  11. falsifyai-0.1.0/CHANGELOG.md +153 -0
  12. falsifyai-0.1.0/CONTRIBUTING.md +207 -0
  13. falsifyai-0.1.0/LICENSE +201 -0
  14. falsifyai-0.1.0/PKG-INFO +398 -0
  15. falsifyai-0.1.0/README.md +359 -0
  16. falsifyai-0.1.0/dev_notes/PHILOSOPHY.md +345 -0
  17. falsifyai-0.1.0/dev_notes/README.md +25 -0
  18. falsifyai-0.1.0/dev_notes/STRUCTURE.md +151 -0
  19. falsifyai-0.1.0/dev_notes/plans/PR-11-real-verdict-resolver.md +299 -0
  20. falsifyai-0.1.0/dev_notes/plans/PR-13-falsifyai-replay-cli.md +211 -0
  21. falsifyai-0.1.0/dev_notes/plans/PR-14-falsifyai-diff-cli.md +265 -0
  22. falsifyai-0.1.0/dev_notes/plans/PR-15-launch-readiness.md +315 -0
  23. falsifyai-0.1.0/dev_notes/plans/PR-2-spec-loader-execution-adapter.md +129 -0
  24. falsifyai-0.1.0/dev_notes/plans/PR-3-perturbation-runtime.md +104 -0
  25. falsifyai-0.1.0/dev_notes/plans/PR-4-spec-materializer.md +206 -0
  26. falsifyai-0.1.0/dev_notes/plans/PR-5-invariant-runtime.md +202 -0
  27. falsifyai-0.1.0/dev_notes/plans/PR-6-replay-store.md +312 -0
  28. falsifyai-0.1.0/dev_notes/plans/PR-8-falsifyai-run-cli.md +323 -0
  29. falsifyai-0.1.0/dev_notes/plans/PR-9-dogfooded-examples.md +193 -0
  30. falsifyai-0.1.0/dev_notes/plans/README.md +31 -0
  31. falsifyai-0.1.0/dev_notes/research/repo-pressure-extraction.md +694 -0
  32. falsifyai-0.1.0/dev_notes/summaries/PR-11-real-verdict-resolver.md +96 -0
  33. falsifyai-0.1.0/dev_notes/summaries/PR-13-falsifyai-replay-cli.md +90 -0
  34. falsifyai-0.1.0/dev_notes/summaries/PR-14-falsifyai-diff-cli.md +97 -0
  35. falsifyai-0.1.0/dev_notes/summaries/PR-15-launch-readiness.md +105 -0
  36. falsifyai-0.1.0/dev_notes/summaries/PR-2-spec-loader-execution-adapter.md +73 -0
  37. falsifyai-0.1.0/dev_notes/summaries/PR-3-perturbation-runtime.md +59 -0
  38. falsifyai-0.1.0/dev_notes/summaries/PR-4-spec-materializer.md +61 -0
  39. falsifyai-0.1.0/dev_notes/summaries/PR-5-invariant-runtime.md +65 -0
  40. falsifyai-0.1.0/dev_notes/summaries/PR-6-replay-store.md +77 -0
  41. falsifyai-0.1.0/dev_notes/summaries/PR-8-falsifyai-run-cli.md +76 -0
  42. falsifyai-0.1.0/dev_notes/summaries/PR-9-dogfooded-examples.md +74 -0
  43. falsifyai-0.1.0/dev_notes/summaries/README.md +30 -0
  44. falsifyai-0.1.0/dev_notes/walkthroughs/PR-11-real-verdict-resolver.md +613 -0
  45. falsifyai-0.1.0/dev_notes/walkthroughs/PR-13-falsifyai-replay-cli.md +475 -0
  46. falsifyai-0.1.0/dev_notes/walkthroughs/PR-14-falsifyai-diff-cli.md +570 -0
  47. falsifyai-0.1.0/dev_notes/walkthroughs/PR-15-launch-readiness.md +89 -0
  48. falsifyai-0.1.0/dev_notes/walkthroughs/PR-2-spec-loader-execution-adapter.md +368 -0
  49. falsifyai-0.1.0/dev_notes/walkthroughs/PR-3-perturbation-runtime.md +455 -0
  50. falsifyai-0.1.0/dev_notes/walkthroughs/PR-4-spec-materializer.md +383 -0
  51. falsifyai-0.1.0/dev_notes/walkthroughs/PR-5-invariant-runtime.md +470 -0
  52. falsifyai-0.1.0/dev_notes/walkthroughs/PR-6-replay-store.md +573 -0
  53. falsifyai-0.1.0/dev_notes/walkthroughs/PR-8-falsifyai-run-cli.md +455 -0
  54. falsifyai-0.1.0/dev_notes/walkthroughs/PR-9-dogfooded-examples.md +465 -0
  55. falsifyai-0.1.0/dev_notes/walkthroughs/README.md +41 -0
  56. falsifyai-0.1.0/docs/ARCHITECTURE.md +320 -0
  57. falsifyai-0.1.0/docs/DEMO.md +183 -0
  58. falsifyai-0.1.0/docs/RELEASE.md +157 -0
  59. falsifyai-0.1.0/examples/README.md +76 -0
  60. falsifyai-0.1.0/examples/consistently_wrong.yaml +38 -0
  61. falsifyai-0.1.0/examples/fragile.yaml +32 -0
  62. falsifyai-0.1.0/examples/model_migration.yaml +99 -0
  63. falsifyai-0.1.0/examples/stable.yaml +46 -0
  64. falsifyai-0.1.0/falsifyai/__init__.py +3 -0
  65. falsifyai-0.1.0/falsifyai/cli/__init__.py +8 -0
  66. falsifyai-0.1.0/falsifyai/cli/diff.py +237 -0
  67. falsifyai-0.1.0/falsifyai/cli/errors.py +30 -0
  68. falsifyai-0.1.0/falsifyai/cli/main.py +105 -0
  69. falsifyai-0.1.0/falsifyai/cli/render.py +196 -0
  70. falsifyai-0.1.0/falsifyai/cli/replay.py +76 -0
  71. falsifyai-0.1.0/falsifyai/cli/run.py +191 -0
  72. falsifyai-0.1.0/falsifyai/differential/__init__.py +0 -0
  73. falsifyai-0.1.0/falsifyai/execution/__init__.py +19 -0
  74. falsifyai-0.1.0/falsifyai/execution/adapter.py +16 -0
  75. falsifyai-0.1.0/falsifyai/execution/cache.py +38 -0
  76. falsifyai-0.1.0/falsifyai/execution/engine.py +35 -0
  77. falsifyai-0.1.0/falsifyai/execution/errors.py +10 -0
  78. falsifyai-0.1.0/falsifyai/execution/litellm_adapter.py +57 -0
  79. falsifyai-0.1.0/falsifyai/execution/models.py +52 -0
  80. falsifyai-0.1.0/falsifyai/falsifiability/__init__.py +9 -0
  81. falsifyai-0.1.0/falsifyai/falsifiability/score.py +49 -0
  82. falsifyai-0.1.0/falsifyai/invariants/__init__.py +35 -0
  83. falsifyai-0.1.0/falsifyai/invariants/base.py +99 -0
  84. falsifyai-0.1.0/falsifyai/invariants/contains.py +65 -0
  85. falsifyai-0.1.0/falsifyai/invariants/registry.py +35 -0
  86. falsifyai-0.1.0/falsifyai/invariants/semantic.py +110 -0
  87. falsifyai-0.1.0/falsifyai/oracles/__init__.py +0 -0
  88. falsifyai-0.1.0/falsifyai/perturbation/__init__.py +25 -0
  89. falsifyai-0.1.0/falsifyai/perturbation/base.py +91 -0
  90. falsifyai-0.1.0/falsifyai/perturbation/casing_variant.py +79 -0
  91. falsifyai-0.1.0/falsifyai/perturbation/registry.py +28 -0
  92. falsifyai-0.1.0/falsifyai/perturbation/typo_noise.py +158 -0
  93. falsifyai-0.1.0/falsifyai/replay/__init__.py +38 -0
  94. falsifyai-0.1.0/falsifyai/replay/in_memory_store.py +78 -0
  95. falsifyai-0.1.0/falsifyai/replay/models.py +114 -0
  96. falsifyai-0.1.0/falsifyai/replay/protocol.py +52 -0
  97. falsifyai-0.1.0/falsifyai/replay/serialize.py +218 -0
  98. falsifyai-0.1.0/falsifyai/replay/sqlite_store.py +191 -0
  99. falsifyai-0.1.0/falsifyai/reporting/__init__.py +0 -0
  100. falsifyai-0.1.0/falsifyai/session/__init__.py +0 -0
  101. falsifyai-0.1.0/falsifyai/spec/__init__.py +19 -0
  102. falsifyai-0.1.0/falsifyai/spec/errors.py +27 -0
  103. falsifyai-0.1.0/falsifyai/spec/loader.py +40 -0
  104. falsifyai-0.1.0/falsifyai/spec/materializer.py +172 -0
  105. falsifyai-0.1.0/falsifyai/spec/models.py +135 -0
  106. falsifyai-0.1.0/falsifyai/statistical/__init__.py +0 -0
  107. falsifyai-0.1.0/falsifyai/verdict/__init__.py +8 -0
  108. falsifyai-0.1.0/falsifyai/verdict/consistency.py +62 -0
  109. falsifyai-0.1.0/falsifyai/verdict/models.py +36 -0
  110. falsifyai-0.1.0/falsifyai/verdict/resolver.py +174 -0
  111. falsifyai-0.1.0/falsifyai/verdict/stratify.py +93 -0
  112. falsifyai-0.1.0/plan.md +1742 -0
  113. falsifyai-0.1.0/pyproject.toml +116 -0
  114. falsifyai-0.1.0/scripts/scaffold_dev_notes.py +306 -0
  115. falsifyai-0.1.0/tests/__init__.py +0 -0
  116. falsifyai-0.1.0/tests/fixtures/__init__.py +0 -0
  117. falsifyai-0.1.0/tests/fixtures/build_artifact.py +204 -0
  118. falsifyai-0.1.0/tests/fixtures/mock_adapter.py +38 -0
  119. falsifyai-0.1.0/tests/fixtures/mock_embedder.py +64 -0
  120. falsifyai-0.1.0/tests/fixtures/specs/full.yaml +46 -0
  121. falsifyai-0.1.0/tests/fixtures/specs/malformed.yaml +3 -0
  122. falsifyai-0.1.0/tests/fixtures/specs/minimal.yaml +20 -0
  123. falsifyai-0.1.0/tests/fixtures/specs/missing_cases.yaml +12 -0
  124. falsifyai-0.1.0/tests/fixtures/specs/missing_seed.yaml +19 -0
  125. falsifyai-0.1.0/tests/fixtures/specs/missing_threshold.yaml +19 -0
  126. falsifyai-0.1.0/tests/fixtures/specs/run_smoke.yaml +24 -0
  127. falsifyai-0.1.0/tests/fixtures/specs/unknown_field.yaml +22 -0
  128. falsifyai-0.1.0/tests/fixtures/specs/unknown_perturbation_type.yaml +20 -0
  129. falsifyai-0.1.0/tests/integration/__init__.py +0 -0
  130. falsifyai-0.1.0/tests/integration/test_diff_end_to_end.py +104 -0
  131. falsifyai-0.1.0/tests/integration/test_examples.py +250 -0
  132. falsifyai-0.1.0/tests/integration/test_replay_end_to_end.py +79 -0
  133. falsifyai-0.1.0/tests/integration/test_run_end_to_end.py +101 -0
  134. falsifyai-0.1.0/tests/meta/__init__.py +0 -0
  135. falsifyai-0.1.0/tests/unit/__init__.py +0 -0
  136. falsifyai-0.1.0/tests/unit/test_casing_variant.py +99 -0
  137. falsifyai-0.1.0/tests/unit/test_cli_diff.py +287 -0
  138. falsifyai-0.1.0/tests/unit/test_cli_main.py +56 -0
  139. falsifyai-0.1.0/tests/unit/test_cli_render.py +282 -0
  140. falsifyai-0.1.0/tests/unit/test_cli_replay.py +145 -0
  141. falsifyai-0.1.0/tests/unit/test_contains_invariant.py +91 -0
  142. falsifyai-0.1.0/tests/unit/test_execution_cache.py +59 -0
  143. falsifyai-0.1.0/tests/unit/test_execution_engine.py +68 -0
  144. falsifyai-0.1.0/tests/unit/test_execution_models.py +76 -0
  145. falsifyai-0.1.0/tests/unit/test_falsifiability_score.py +48 -0
  146. falsifyai-0.1.0/tests/unit/test_invariant_base.py +68 -0
  147. falsifyai-0.1.0/tests/unit/test_invariant_registry.py +56 -0
  148. falsifyai-0.1.0/tests/unit/test_litellm_adapter.py +129 -0
  149. falsifyai-0.1.0/tests/unit/test_materializer.py +261 -0
  150. falsifyai-0.1.0/tests/unit/test_perturbation_base.py +71 -0
  151. falsifyai-0.1.0/tests/unit/test_perturbation_registry.py +41 -0
  152. falsifyai-0.1.0/tests/unit/test_render_output_schema.py +165 -0
  153. falsifyai-0.1.0/tests/unit/test_replay_models.py +224 -0
  154. falsifyai-0.1.0/tests/unit/test_replay_serialize.py +89 -0
  155. falsifyai-0.1.0/tests/unit/test_replay_store_contract.py +160 -0
  156. falsifyai-0.1.0/tests/unit/test_semantic_equivalence_invariant.py +185 -0
  157. falsifyai-0.1.0/tests/unit/test_smoke.py +13 -0
  158. falsifyai-0.1.0/tests/unit/test_spec_loader.py +59 -0
  159. falsifyai-0.1.0/tests/unit/test_spec_models.py +200 -0
  160. falsifyai-0.1.0/tests/unit/test_sqlite_store.py +109 -0
  161. falsifyai-0.1.0/tests/unit/test_typo_noise.py +120 -0
  162. falsifyai-0.1.0/tests/unit/test_verdict_consistency.py +83 -0
  163. falsifyai-0.1.0/tests/unit/test_verdict_models.py +47 -0
  164. falsifyai-0.1.0/tests/unit/test_verdict_resolver.py +338 -0
  165. falsifyai-0.1.0/tests/unit/test_verdict_stratify.py +201 -0
  166. falsifyai-0.1.0/tests/unit/test_version.py +19 -0
  167. falsifyai-0.1.0/uv.lock +2058 -0
@@ -0,0 +1,144 @@
1
+ # FalsifyAI — Project Context for Claude
2
+
3
+ > Project-scoped instructions. Extends, does not replace, user-global `~/.claude/CLAUDE.md`.
4
+
5
+ ## What this project is
6
+
7
+ **FalsifyAI** is a falsification-first reliability testing framework for AI systems. Status: **active Phase 0 implementation toward `falsifyai==0.1.0`**. Core pipeline is shipped (spec → materialize → execute → judge → save → CLI) with two dogfooded examples; remaining Phase 0 work in [plan.md §22.1](../plan.md).
8
+
9
+ ## Design philosophy (load-bearing)
10
+
11
+ FalsifyAI optimizes for **evidence density over evidence volume**.
12
+
13
+ ```
14
+ minimal meaningful evidence
15
+ + high evidence quality per cognitive load
16
+ + diverse perturbation categories
17
+ + replayable proof
18
+ = better falsification of AI / LLM systems
19
+ ```
20
+
21
+ The goal is **maximum useful signal**, not maximum data. More evidence is not inherently better evidence.
22
+
23
+ ### Four pillars
24
+
25
+ - **Minimal meaningful evidence.** Run the smallest experiment that meaningfully increases confidence in a verdict — no more. Adaptive evidence collection is the long-term ideal.
26
+ - **High evidence quality per cognitive load.** Every line / artifact a user sees has to earn its real estate against: *would removing this make the engineer's decision worse?*
27
+ - **Diverse perturbation categories (orthogonal pressure).** The admission criterion for a new perturbation family is *what new failure mode does this expose?* — not breadth. `typo_noise_v2` ≠ a new family; `paraphrase` is.
28
+ - **Replayable proof.** Replay artifacts are the system's promise that claims are inspectable evidence, not anecdotes. CLI compresses; artifact preserves.
29
+
30
+ ### How this shapes decisions
31
+
32
+ - **CLI output.** One row per case + one-line summary. Not a dashboard.
33
+ - **Verdict design.** Compress evidence into actionable conclusions; don't enumerate it.
34
+ - **Perturbation families.** Each must contribute orthogonal reliability information, not duplicate noise.
35
+ - **Replay artifacts.** Self-contained; carry the full materialized spec so they outlive the YAML file on disk.
36
+ - **MVP scope.** 2 perturbation families, 2 invariants, 5 verdicts — locked in [plan.md §22.1](../plan.md) because *that is enough to tell the story*.
37
+ - **Three-layer architectural separation.** *Evidence generation* (perturbation / materialization / execution) is architecturally distinct from *evidence interpretation* (invariants / verdict resolver / CLI compression), and both are distinct from *evidence preservation* (replay artifacts / stores). New work belongs in exactly one layer; don't let interpretation leak into generation under pressure.
38
+ - **Resolver complexity is bounded.** The verdict resolver is the epistemic authority of the framework; its priority chain must stay compressible and predictable. Expand the consumer surface (replay / diff / future tools) when adding interpretation features, not the verdict logic. The trust test for any resolver change: *a competent user should be able to predict the resolver output from the inputs.*
39
+
40
+ ### Anti-goals / anti-entropy infrastructure
41
+
42
+ FalsifyAI is **not** optimizing for any of these. When pressure pulls toward them, resist:
43
+
44
+ - Maximal perturbation volume
45
+ - Maximal telemetry / metrics
46
+ - Dashboard density
47
+ - Benchmark quantity
48
+ - Metric proliferation
49
+ - Exhaustive output verbosity
50
+ - Configuration knobs for every behavior
51
+ - **Resolver inflation** — accreting heuristics, thresholds, verdict types, or confidence semantics into the verdict resolver. Each addition seems reasonable; cumulative effect destroys predictability.
52
+
53
+ The signal to watch: *does this addition help an engineer make a better decision, or does it crowd the surface where the actual decision lives?* If the latter, defer or rework.
54
+
55
+ ## Naming (locked — do not change without confirmation)
56
+
57
+ | Layer | Value |
58
+ |---|---|
59
+ | PyPI package | `falsifyai` |
60
+ | Python import | `import falsifyai` |
61
+ | CLI binary | `falsifyai` (e.g. `falsifyai run eval.yaml`) |
62
+ | Brand / prose name | "FalsifyAI" |
63
+ | Repo / folder | `falsifyai` |
64
+ | Plugin entry-point groups | `falsifyai.perturbations`, `falsifyai.invariants`, `falsifyai.oracles`, `falsifyai.adapters`, `falsifyai.reporters`, `falsifyai.stores` |
65
+ | Replay cache dir | `.falsifyai/` (matches CLI name, like `.git` / `.pytest_cache`) |
66
+
67
+ **Background on the rename**: the original plan used `falsify` for the CLI binary, the `.falsify/` cache dir, and "Falsify" in prose. That collided with the existing `studio-11-co/falsify` project in the AI eval space. Renamed to `falsifyai` / `.falsifyai/` / "FalsifyAI" for full namespace consistency before any public release.
68
+
69
+ ## Toolchain
70
+
71
+ - **Python:** 3.13+ (locked in `.python-version` and `pyproject.toml`)
72
+ - **Package manager:** `uv` (not pip directly)
73
+ - **Build backend:** `hatchling`
74
+ - **Test:** `pytest` + `pytest-cov`
75
+ - **Lint/format:** `ruff` (line-length 100, target py313)
76
+ - **License:** Apache-2.0
77
+
78
+ The `uv` binary lives at `C:\Users\Eric\AppData\Roaming\Python\Python313\Scripts\uv.exe`. PATH is configured. If a shell can't find `uv`, prepend that directory to `$env:PATH`.
79
+
80
+ ## Branch workflow
81
+
82
+ - **Active development branch is `dev`.** Do not commit directly to `main`.
83
+ - `main` is reserved for tagged releases and merged work. CI is gated on PRs to `main`.
84
+ - Feature commits land on `dev` (or topic branches off `dev`); promote to `main` via PR when a milestone ships.
85
+ - If you find yourself on `main` mid-session, switch to `dev` before staging changes.
86
+
87
+ ## Common commands
88
+
89
+ ```bash
90
+ uv sync --extra dev # install runtime + dev deps into .venv
91
+ uv run pytest # run tests
92
+ uv run ruff check . # lint
93
+ uv run ruff format . # format
94
+ uv run python -c "import falsifyai; print(falsifyai.__version__)"
95
+ ```
96
+
97
+ ## Layout (flat, not src/)
98
+
99
+ Package directory is at repo root, not under `src/`. See [plan.md §4](../plan.md). When the plan says `falsifyai/cli/main.py`, that means `<repo>/falsifyai/cli/main.py`.
100
+
101
+ ```
102
+ falsifyai/ ← repo root
103
+ ├── pyproject.toml
104
+ ├── falsifyai/ ← Python package
105
+ │ ├── cli/ spec/ session/ perturbation/ execution/
106
+ │ ├── invariants/ oracles/ statistical/ falsifiability/
107
+ │ ├── verdict/ replay/ differential/ reporting/
108
+ ├── tests/
109
+ │ ├── unit/ integration/ fixtures/ meta/
110
+ └── examples/
111
+ ```
112
+
113
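+ Subpackages that are not implemented yet (`session/`, `oracles/`, `statistical/`, `differential/`, `reporting/`) contain only an empty `__init__.py`; the other subpackages hold the shipped pipeline code.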
114
+
115
+ ## Design anchors (when implementing, do not reinvent)
116
+
117
+ - **8 verdicts in 2D space:** `STABLE`, `INFORMATION_PRESENT`, `CONSISTENTLY_WRONG`, `ADVERSARIALLY_VULNERABLE`, `FRAGILE`, `INFORMATION_NULL`, `AMBIGUOUS`, `INVALID_EVAL` — see [plan.md §2](../plan.md).
118
+ - **Worst-case stratified stability**, not aggregate — see [plan.md §12](../plan.md).
119
+ - **Spec materialization** separates intention (YAML) from instance (realized perturbations) — see [plan.md §8](../plan.md).
120
+ - **Meta-oracle is the sole source of `INVALID_EVAL`** — see [plan.md §11.2](../plan.md).
121
+ - **Perturbation validity is required** (bidirectional NLI default) — see [plan.md §9.3](../plan.md).
122
+ - **`falsifyai diff` is a Phase 1 deliverable**, not Phase 2 — see [plan.md §14](../plan.md).
123
+ - **Storage behind `ReplayStore` protocol** — SQLite default, no SQLite-specific code in core — see [plan.md §18](../plan.md).
124
+ - **Falsifiability scoring is required** for every invariant — see [plan.md §15](../plan.md).
125
+
126
+ ## Scope discipline
127
+
128
+ - **Phase 0 MVP is locked**: 3 weeks, single launch as `falsifyai==0.1.0`. See [plan.md §22.1](../plan.md). Includes `falsifyai diff`, `CONSISTENTLY_WRONG`, falsifiability scoring, and dogfooding from Week 1. Compression around the differentiator, not expansion of timeline.
129
+ - **MVP verdict set**: `STABLE`, `FRAGILE`, `CONSISTENTLY_WRONG`, `INSUFFICIENT`, `INVALID_EVAL` (5 verdicts; full 8 in Phase 1).
130
+ - **MVP perturbations**: `typo_noise` + `casing_variant` only (2 families — required for honest bootstrap CI).
131
+ - **MVP invariants**: `contains` + `semantic_equivalence`.
132
+ - **8-item acceptance gate** ([plan.md §22.1.1](../plan.md)) must pass before tagging 0.1.0. PyPI publication is deployment, not validation.
133
+ - Do not add features beyond what the spec demands. Do not invent abstractions for hypothetical extensions.
134
+ - Do not change naming without explicit user confirmation.
135
+ - Do not deviate from the flat package layout without asking.
136
+ - Features deliberately cut from the MVP that may be tempting to add back: rich/colored terminal output (defer), heavyweight NLI for ConsistencyOracle (use embeddings for MVP, NLI in Phase 1), full 8-verdict resolver (5 verdicts for MVP).
137
+
138
+ ## What to NOT do
139
+
140
+ - Don't add `src/` layout.
141
+ - Don't add a `setup.py` or `setup.cfg`. `pyproject.toml` is the only build config.
142
+ - Don't install pytest/ruff via pip directly — use `uv add --dev`.
143
+ - Don't pre-create files for sections of the plan that aren't being implemented yet. For a subpackage that has not been implemented yet, an empty `__init__.py` is the correct state.
144
+ - Don't enable the CLI script entry-point in `pyproject.toml` until `falsifyai/cli/main.py` actually exists.
@@ -0,0 +1,23 @@
1
+ {
2
+ "$schema": "https://json.schemastore.org/claude-code-settings.json",
3
+ "permissions": {
4
+ "allow": [
5
+ "PowerShell(uv:*)",
6
+ "PowerShell(uv sync*)",
7
+ "PowerShell(uv run:*)",
8
+ "PowerShell(uv add:*)",
9
+ "PowerShell(uv lock*)",
10
+ "PowerShell(uv tree*)",
11
+ "PowerShell(uv pip list*)",
12
+ "PowerShell(py -m uv:*)",
13
+ "PowerShell(py --version)",
14
+ "PowerShell(py -c:*)",
15
+ "Bash(uv:*)",
16
+ "Bash(uv sync*)",
17
+ "Bash(uv run:*)",
18
+ "Bash(uv add:*)",
19
+ "Bash(uv lock*)",
20
+ "Bash(uv tree*)"
21
+ ]
22
+ }
23
+ }
@@ -0,0 +1,93 @@
1
+ ---
2
+ name: pr-review
3
+ description: Use this skill before committing, pushing, opening a PR, merging, or starting implementation from a locked plan in the FalsifyAI repo. It performs a pre-flight self-review against the three-layer architecture, evidence-density principle, resolver-inflation guardrail, replay-preservation expectations, dogfood/example requirements, and release/readiness gates.
4
+ ---
5
+
6
+ # pr-review — FalsifyAI pre-flight self-review
7
+
8
+ This skill runs **before** a destination-bound action: a commit, a push, opening a PR, merging a PR, or starting implementation from a locked plan. It is not a code-review pass on someone else's work — it is *your* checklist for whether the change you're about to ship clears FalsifyAI's architectural gates.
9
+
10
+ ## STOP clause (load-bearing)
11
+
12
+ **If any gate below fails, stop.** Do not commit. Do not push. Do not continue implementation. Surface the failing gate to the user verbatim and ask whether to:
13
+
14
+ 1. **Split** the change (most common — usually means the PR is touching multiple layers),
15
+ 2. **Revise** the change to clear the gate, or
16
+ 3. **Explicitly accept the risk** (rare; requires the user to name what they're accepting).
17
+
18
+ A skill that does not stop on failure is decoration. This one stops.
19
+
20
+ ## When to invoke
21
+
22
+ Auto-invoke when the immediate intent is clearly:
23
+
24
+ - about to `git commit` or `git push`
25
+ - about to open a PR (`gh pr create`)
26
+ - about to merge a PR
27
+ - about to start implementation from a plan the user has approved
28
+
29
+ Do **not** auto-invoke for:
30
+
31
+ - casual discussion, brainstorming, or design exploration
32
+ - README copy edits, typo fixes, doc-only formatting
33
+ - exploratory reads / Q&A about the codebase
34
+
35
+ ## The six gates
36
+
37
+ For each gate: state the answer in one sentence. If the answer is unclear, or it shows the gate is not satisfied, that is a failure — stop and surface it.
38
+
39
+ ### Gate 1 — Which layer does this touch?
40
+
41
+ FalsifyAI separates **evidence generation** (perturbation / materialization / execution) from **evidence interpretation** (invariants / verdict resolver / CLI compression) from **evidence preservation** (replay artifacts / stores). See [`docs/ARCHITECTURE.md`](../../../docs/ARCHITECTURE.md) and [`.claude/CLAUDE.md`](../../CLAUDE.md#design-philosophy-load-bearing).
42
+
43
+ **Answer with one of**: generation, interpretation, preservation, consumer surface (CLI / diff / replay), contributor infrastructure, maintainer infrastructure.
44
+
45
+ **Precision note** — these classifications are load-bearing; don't blur them:
46
+
47
+ - *Preservation* is the **replay artifact / store** system only. It is not "things that get committed and persist."
48
+ - README / CHANGELOG / CONTRIBUTING / `.github/` templates → **consumer surface** (what a user or contributor reads on arrival).
49
+ - `docs/ARCHITECTURE.md` / `docs/RELEASE.md` / `docs/DEMO.md` / `dev_notes/*` → **contributor infrastructure** (read by people who change the code).
50
+ - `.claude/skills/*` / `.claude/CLAUDE.md` / maintainer tooling → **maintainer infrastructure** (read by future-you while operating the project).
51
+
52
+ ### Gate 2 — Does it touch more than one layer?
53
+
54
+ If yes: should it be split into separate commits or PRs? Cross-layer changes are the most common source of architectural drift. The default answer is *split it*; the exception requires a one-line justification.
55
+
56
+ ### Gate 3 — Does it inflate the resolver?
57
+
58
+ The verdict resolver is the epistemic authority of the framework. Its priority chain must stay compressible and predictable.
59
+
60
+ **Trust test** (authoritative copy in [`CONTRIBUTING.md`](../../../CONTRIBUTING.md)): *A competent user should be able to predict the resolver output from the inputs.*
61
+
62
+ If this change adds heuristics, thresholds, new verdict types, new confidence semantics, new knobs, or new metrics that the resolver consults — the trust test must still pass after the change. If it does not, the work belongs in the **consumer surface** (replay, diff, future tools), not the resolver.
63
+
64
+ ### Gate 4 — Evidence density or evidence volume?
65
+
66
+ FalsifyAI optimizes for **evidence density**, not volume. See the four pillars in [`.claude/CLAUDE.md`](../../CLAUDE.md#four-pillars).
67
+
68
+ Two sub-checks, in order:
69
+
70
+ 1. **Would removing this output / field / row make the engineer's decision worse?** If no, the addition is volume — cut it.
71
+ 2. **Does this addition crowd the decision surface, or does it sit behind it?** The *decision surface* is where the user actually makes a call (CLI output, exit codes, the README hook, the verdict table). Volume that lives *behind* the decision surface (architecture docs, release runbook, internal notes) is acceptable when each item earns its keep alone. Volume that *crowds* the decision surface is not. When a new doc / row / field is borderline, ask which side it lives on.
72
+
73
+ ### Gate 5 — Are replay artifacts preserved, not recomputed?
74
+
75
+ Replay is read-only. Verdicts shown by `falsifyai replay` are the ones assigned at run time and never re-resolved. If this change reads from a stored artifact and then re-judges, re-resolves, or recomputes a verdict — that is a preservation violation. Move the logic to *run-time* (write path) or to a new consumer surface command that is explicitly not `replay`.
76
+
77
+ ### Gate 6 — Examples dogfooded if user-facing behavior changed?
78
+
79
+ If this change alters CLI behavior, spec language, verdict semantics, or output shape: at least one example under [`examples/`](../../../examples/) and the matching dogfood test in [`tests/integration/test_examples.py`](../../../tests/integration/test_examples.py) must demonstrate the new behavior end-to-end. Examples are the canonical user-facing spec surface — if the parser or resolver no longer accepts something the examples use, CI must fail immediately.
80
+
81
+ ## How to surface a failing gate
82
+
83
+ When stopping, write one short paragraph in this shape:
84
+
85
+ > **Stopping before [commit/push/PR/implementation].** Gate N (*one-line gate name*) failed: *one sentence on why*. Options: split / revise / accept-risk. Which?
86
+
87
+ Do not enumerate every gate that passed. Surface only the failing one. The user already knows what the gates are.
88
+
89
+ ## Scope notes
90
+
91
+ - This skill is a pre-flight self-check, not a substitute for the `code-reviewer` agent on substantive changes — invoke that separately when the change is non-trivial.
92
+ - This skill is project-specific to FalsifyAI. Generic code-quality concerns (file size, naming, error handling) are covered by user-global rules and are not duplicated here.
93
+ - Authoritative philosophy lives in [`.claude/CLAUDE.md`](../../CLAUDE.md), authoritative architecture in [`docs/ARCHITECTURE.md`](../../../docs/ARCHITECTURE.md), authoritative resolver trust test in [`CONTRIBUTING.md`](../../../CONTRIBUTING.md). If a gate here drifts from those docs, the docs win — update the skill, not the doc.
@@ -0,0 +1,56 @@
1
+ # FalsifyAI environment variables (template)
2
+ #
3
+ # Copy this file to `.env` or `.env.local` and fill in the values for the
4
+ # provider(s) you actually use. Both `.env` and `.env.local` are gitignored.
5
+ #
6
+ # FalsifyAI does NOT auto-load these files. LiteLLM (the model adapter
7
+ # layer) reads from process environment variables directly. To use this
8
+ # template, load the values into your shell environment first:
9
+ #
10
+ # ─── bash / zsh ─────────────────────────────────────────────────────────
11
+ #
12
+ # cp .env.example .env.local
13
+ # # edit .env.local with real values
14
+ # set -a; source .env.local; set +a
15
+ # falsifyai run examples/model_migration.yaml
16
+ #
17
+ # ─── PowerShell ─────────────────────────────────────────────────────────
18
+ #
19
+ # cp .env.example .env.local
20
+ # # edit .env.local with real values
21
+ # Get-Content .env.local | ForEach-Object {
22
+ # if ($_ -match '^([A-Z_][A-Z0-9_]*)=(.*)$') {
23
+ # Set-Item "env:$($Matches[1])" $Matches[2]
24
+ # }
25
+ # }
26
+ # falsifyai run examples/model_migration.yaml
27
+ #
28
+ # ─── Or just set inline (no .env.local needed) ──────────────────────────
29
+ #
30
+ # # bash/zsh
31
+ # OPENAI_API_KEY=sk-... falsifyai run examples/model_migration.yaml
32
+ #
33
+ # # PowerShell
34
+ # $env:OPENAI_API_KEY = "sk-..."; falsifyai run examples/model_migration.yaml
35
+ #
36
+ # Or use `direnv` (Unix) for automatic per-directory loading.
37
+
38
+ # ─────────────────────────────────────────────────────────────────────────
39
+ # OpenAI (used by the README walkthrough + most examples)
40
+ # ─────────────────────────────────────────────────────────────────────────
41
+ OPENAI_API_KEY=
42
+
43
+ # Optional: override the default model used in spec files.
44
+ # OPENAI_MODEL=gpt-4o-mini
45
+
46
+ # ─────────────────────────────────────────────────────────────────────────
47
+ # Anthropic
48
+ # ─────────────────────────────────────────────────────────────────────────
49
+ # ANTHROPIC_API_KEY=
50
+
51
+ # ─────────────────────────────────────────────────────────────────────────
52
+ # Other providers
53
+ # ─────────────────────────────────────────────────────────────────────────
54
+ # LiteLLM supports 100+ providers. See https://docs.litellm.ai/docs/providers
55
+ # for the full list and the env var name each one expects (e.g.,
56
+ # GEMINI_API_KEY for Gemini, COHERE_API_KEY for Cohere, GROQ_API_KEY, etc.).
@@ -0,0 +1,52 @@
1
+ ---
2
+ name: Bug report
3
+ about: Something FalsifyAI did unexpectedly. Include a replay session id if possible.
4
+ title: '[bug] '
5
+ labels: bug
6
+ ---
7
+
8
+ ## What happened
9
+
10
+ <!-- Brief description of the unexpected behavior. -->
11
+
12
+ ## Expected behavior
13
+
14
+ <!-- What you thought should happen instead. -->
15
+
16
+ ## Reproduction
17
+
18
+ <!-- Minimal spec / command sequence that triggers the issue.
19
+ If possible, paste the YAML spec inline. -->
20
+
21
+ ```yaml
22
+ # your spec here
23
+ ```
24
+
25
+ ```bash
26
+ $ falsifyai run ...
27
+ # observed output
28
+ ```
29
+
30
+ ## Replay session id (high-signal!)
31
+
32
+ <!-- The unique value-add: if the bug shows up in a real run, the saved
33
+ replay session contains EVERYTHING needed to reproduce. -->
34
+
35
+ - **Session id:** `<paste the session_id printed at the end of "falsifyai run">`
36
+ - **Store path:** `<usually .falsifyai/replays.db>`
37
+ - Confirm you're OK sharing the artifact contents (model outputs may
38
+ contain sensitive prompts/responses).
39
+
40
+ If you can attach the `.falsifyai/replays.db` file (or a sanitized copy),
41
+ add it to the issue. That's the deepest reproduction we can ask for.
42
+
43
+ ## Environment
44
+
45
+ - **FalsifyAI version:** <`falsifyai --version` or `python -c "import falsifyai; print(falsifyai.__version__)"`>
46
+ - **Python version:** <`python --version`>
47
+ - **OS:** <macOS / Linux / Windows + version>
48
+ - **Model provider + model:** <e.g., openai/gpt-4o-mini>
49
+
50
+ ## Additional context
51
+
52
+ <!-- Anything else: workarounds you tried, related issues, etc. -->
@@ -0,0 +1,47 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest a new feature, perturbation, invariant, or workflow.
4
+ title: '[feature] '
5
+ labels: enhancement
6
+ ---
7
+
8
+ ## Use case
9
+
10
+ <!-- What you're trying to do that FalsifyAI doesn't currently support.
11
+ Concrete scenarios beat abstract requests. -->
12
+
13
+ ## Why current FalsifyAI doesn't cover it
14
+
15
+ <!-- Briefly: which existing feature is closest, and why doesn't it fit? -->
16
+
17
+ ## Proposed surface (if you have one in mind)
18
+
19
+ <!-- CLI command, spec field, output format, etc. Rough is fine. -->
20
+
21
+ ```bash
22
+ # example invocation
23
+ falsifyai ...
24
+ ```
25
+
26
+ ```yaml
27
+ # example spec extension
28
+ ```
29
+
30
+ ## Alternatives considered
31
+
32
+ <!-- Other approaches you thought about and rejected, with brief reasons.
33
+ This is especially useful for resolver / verdict changes (see
34
+ CONTRIBUTING.md on why the resolver complexity is bounded). -->
35
+
36
+ ## Layer
37
+
38
+ <!-- Which architectural layer would this touch?
39
+ - generation (perturbation / materialize / execute)
40
+ - interpretation (invariants / verdict / falsifiability / render)
41
+ - preservation (replay / artifact / store)
42
+ - consumer (new CLI subcommand reading existing data)
43
+ If it touches more than one, decomposition might be in order. -->
44
+
45
+ ## Additional context
46
+
47
+ <!-- Links to similar features in other tools, prior discussion, etc. -->
@@ -0,0 +1,60 @@
1
+ <!--
2
+ Thanks for the PR! This template mirrors FalsifyAI's local dev_notes
3
+ summary format. Fill out what's relevant; delete what's not.
4
+
5
+ For non-trivial changes, see CONTRIBUTING.md for the architectural
6
+ constraints (especially: resolver complexity is bounded; three-layer
7
+ separation is non-negotiable).
8
+ -->
9
+
10
+ ## Headline
11
+
12
+ <!-- One sentence: what does this PR do? -->
13
+
14
+ ## Problem pressure
15
+
16
+ <!-- 1-2 sentences: what gap does this close? Why now? -->
17
+
18
+ ## Abstraction shipped
19
+
20
+ <!-- The new contract / Protocol / module / behavior, named explicitly. -->
21
+
22
+ ## Alternatives rejected
23
+
24
+ <!-- Bullet list, one line each, with one-line reasoning per alternative.
25
+ High-signal for future engineers who hit the same decision fork. -->
26
+
27
+ -
28
+
29
+ ## Architectural invariants
30
+
31
+ <!-- System-level contracts this PR establishes or preserves. NOT coding
32
+ style. If this PR touches the verdict resolver, include an
33
+ explicit answer to the trust test from CONTRIBUTING.md:
34
+ "Can a competent user still predict the resolver output from the
35
+ inputs?" -->
36
+
37
+ -
38
+
39
+ ## Test plan
40
+
41
+ <!-- - [x] specific tests added
42
+ - [ ] manual smoke
43
+ - [ ] `uv run pytest` passes
44
+ - [ ] `uv run ruff check . && uv run ruff format --check .` clean
45
+ - [ ] CI green on `dev`
46
+ - [ ] CI green on PR target `main`
47
+ -->
48
+
49
+ - [ ] `uv run pytest` passes
50
+ - [ ] `uv run ruff check .` clean
51
+ - [ ] `uv run ruff format --check .` clean
52
+
53
+ ## Architectural fit (self-check)
54
+
55
+ - [ ] Touches exactly **one** of the three layers (generation /
56
+ interpretation / preservation), or is a pure consumer.
57
+ - [ ] If touching `falsifyai/verdict/resolver.py`: the trust test still
58
+ passes (a competent user can predict the output from the inputs).
59
+ - [ ] Does not introduce new spec language fields, verdict types, or
60
+ configurable thresholds without a separate architectural conversation.
@@ -0,0 +1,42 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, dev]
6
+ pull_request:
7
+ branches: [main, dev]
8
+
9
+ concurrency:
10
+ group: ${{ github.workflow }}-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ lint-and-test:
15
+ name: Lint + test (Python 3.13, Linux)
16
+ runs-on: ubuntu-latest
17
+ timeout-minutes: 15
18
+ steps:
19
+ - name: Checkout
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v3
24
+ with:
25
+ version: "0.11.15"
26
+ enable-cache: true
27
+ cache-dependency-glob: "uv.lock"
28
+
29
+ - name: Set up Python 3.13
30
+ run: uv python install 3.13
31
+
32
+ - name: Sync dependencies
33
+ run: uv sync --extra dev --frozen
34
+
35
+ - name: Ruff lint
36
+ run: uv run ruff check .
37
+
38
+ - name: Ruff format check
39
+ run: uv run ruff format --check .
40
+
41
+ - name: Pytest
42
+ run: uv run pytest -v
@@ -0,0 +1,72 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # Virtual environments
26
+ .venv/
27
+ venv/
28
+ env/
29
+ ENV/
30
+
31
+ # Testing / coverage
32
+ .pytest_cache/
33
+ .coverage
34
+ .coverage.*
35
+ htmlcov/
36
+ .tox/
37
+ .nox/
38
+ coverage.xml
39
+ *.cover
40
+ *.py,cover
41
+
42
+ # Type checking
43
+ .mypy_cache/
44
+ .pyright/
45
+ .pytype/
46
+
47
+ # Linting
48
+ .ruff_cache/
49
+
50
+ # FalsifyAI replay artifacts
51
+ .falsifyai/
52
+
53
+ # IDE
54
+ .vscode/
55
+ .idea/
56
+ *.swp
57
+ *.swo
58
+ *~
59
+
60
+ # OS
61
+ .DS_Store
62
+ Thumbs.db
63
+ desktop.ini
64
+
65
+ # Secrets / env
66
+ .env
67
+ .env.local
68
+ *.pem
69
+ *.key
70
+
71
+ # Notebooks
72
+ .ipynb_checkpoints/
@@ -0,0 +1 @@
1
+ 3.13