causal-worlds 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- causal_worlds-0.7.0/.claude/settings.json +26 -0
- causal_worlds-0.7.0/.claude/skills/causal-worlds-conventions/SKILL.md +65 -0
- causal_worlds-0.7.0/.editorconfig +19 -0
- causal_worlds-0.7.0/.github/workflows/ci.yml +32 -0
- causal_worlds-0.7.0/.github/workflows/publish.yml +22 -0
- causal_worlds-0.7.0/.gitignore +35 -0
- causal_worlds-0.7.0/.pre-commit-config.yaml +19 -0
- causal_worlds-0.7.0/CHANGELOG.md +182 -0
- causal_worlds-0.7.0/CLAUDE.md +34 -0
- causal_worlds-0.7.0/LICENSE +21 -0
- causal_worlds-0.7.0/Makefile +26 -0
- causal_worlds-0.7.0/PKG-INFO +197 -0
- causal_worlds-0.7.0/README.md +164 -0
- causal_worlds-0.7.0/RELEASING.md +40 -0
- causal_worlds-0.7.0/benchmark/README.md +30 -0
- causal_worlds-0.7.0/benchmark/prompts.txt +19 -0
- causal_worlds-0.7.0/benchmark/v0.2/index.json +122 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_00/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_00/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_00/manifest.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_00/spec.json +156 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_01/answer_key.json +34 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_01/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_01/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_01/spec.json +114 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_02/answer_key.json +50 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_02/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_02/manifest.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_02/spec.json +139 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_03/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_03/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_03/manifest.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_03/spec.json +147 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_04/answer_key.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_04/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_04/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_04/spec.json +121 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_05/answer_key.json +38 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_05/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_05/manifest.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_05/spec.json +119 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_06/answer_key.json +38 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_06/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_06/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_06/spec.json +126 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_07/answer_key.json +54 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_07/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_07/manifest.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_07/spec.json +155 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_08/answer_key.json +50 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_08/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_08/manifest.json +34 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_08/spec.json +152 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_09/answer_key.json +34 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_09/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_09/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_09/spec.json +110 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_10/answer_key.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_10/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_10/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_10/spec.json +138 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_11/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_11/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_11/manifest.json +32 -0
- causal_worlds-0.7.0/benchmark/v0.2/world_11/spec.json +126 -0
- causal_worlds-0.7.0/benchmark/v0.5/README.md +17 -0
- causal_worlds-0.7.0/benchmark/v0.5/index.json +428 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_01/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_01/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_01/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_01/spec.json +127 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_02/answer_key.json +50 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_02/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_02/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_02/spec.json +180 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_03/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_03/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_03/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_03/spec.json +105 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_04/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_04/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_04/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_04/spec.json +130 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_05/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_05/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_05/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_05/spec.json +171 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_06/answer_key.json +25 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_06/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_06/manifest.json +39 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_06/spec.json +75 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_07/answer_key.json +45 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_07/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_07/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_07/spec.json +138 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_08/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_08/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_08/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_08/spec.json +153 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_09/answer_key.json +29 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_09/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_09/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_09/spec.json +92 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_10/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_10/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_10/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_10/spec.json +143 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_11/answer_key.json +54 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_11/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_11/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_11/spec.json +168 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_12/answer_key.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_12/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_12/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_12/spec.json +101 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_13/answer_key.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_13/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_13/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_13/spec.json +127 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_14/answer_key.json +38 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_14/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_14/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_14/spec.json +137 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_15/answer_key.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_15/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_15/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_15/spec.json +109 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_16/answer_key.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_16/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_16/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_16/spec.json +110 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_17/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_17/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_17/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_17/spec.json +157 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_18/answer_key.json +29 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_18/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_18/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_18/spec.json +97 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_19/answer_key.json +45 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_19/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_19/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_19/spec.json +160 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_20/answer_key.json +45 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_20/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_20/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_20/spec.json +199 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_21/answer_key.json +29 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_21/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_21/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_21/spec.json +84 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_22/answer_key.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_22/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_22/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_22/spec.json +123 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_23/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_23/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_23/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_23/spec.json +157 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_24/answer_key.json +29 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_24/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_24/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_24/spec.json +92 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_25/answer_key.json +50 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_25/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_25/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_25/spec.json +148 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_26/answer_key.json +58 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_26/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_26/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_26/spec.json +190 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_27/answer_key.json +37 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_27/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_27/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_27/spec.json +105 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_28/answer_key.json +38 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_28/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_28/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_28/spec.json +115 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_29/answer_key.json +50 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_29/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_29/manifest.json +42 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_29/spec.json +178 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_30/answer_key.json +29 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_30/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_30/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_30/spec.json +92 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_31/answer_key.json +34 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_31/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_31/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_31/spec.json +110 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_32/answer_key.json +38 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_32/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_32/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_32/spec.json +177 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_33/answer_key.json +25 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_33/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_33/manifest.json +40 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_33/spec.json +80 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_34/answer_key.json +33 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_34/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_34/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_34/spec.json +111 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_35/answer_key.json +46 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_35/data.npz +0 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_35/manifest.json +41 -0
- causal_worlds-0.7.0/benchmark/v0.5/world_35/spec.json +169 -0
- causal_worlds-0.7.0/docs/architecture.md +129 -0
- causal_worlds-0.7.0/docs/blog-the-decisive-experiment.md +80 -0
- causal_worlds-0.7.0/docs/engineering.md +182 -0
- causal_worlds-0.7.0/docs/getting-started.md +121 -0
- causal_worlds-0.7.0/docs/hld.md +305 -0
- causal_worlds-0.7.0/docs/lld.md +169 -0
- causal_worlds-0.7.0/docs/scope.md +130 -0
- causal_worlds-0.7.0/docs/validation.md +68 -0
- causal_worlds-0.7.0/evals/author-model-bakeoff/README.md +12 -0
- causal_worlds-0.7.0/evals/author-model-bakeoff/report.json +190 -0
- causal_worlds-0.7.0/evals/baseline-crossover/README.md +31 -0
- causal_worlds-0.7.0/evals/baseline-crossover/report.json +466 -0
- causal_worlds-0.7.0/evals/baseline-crossover/run_crossover.py +206 -0
- causal_worlds-0.7.0/evals/baseline-crossover/v0.5/README.md +15 -0
- causal_worlds-0.7.0/evals/baseline-crossover/v0.5/report.json +1225 -0
- causal_worlds-0.7.0/evals/run_author_bakeoff.py +155 -0
- causal_worlds-0.7.0/evals/scale/generate_set.py +116 -0
- causal_worlds-0.7.0/evals/structural-difficulty/README.md +23 -0
- causal_worlds-0.7.0/evals/structural-difficulty/report.json +149 -0
- causal_worlds-0.7.0/evals/structural-difficulty/run_analysis.py +113 -0
- causal_worlds-0.7.0/evals/structural-difficulty/v0.5/README.md +12 -0
- causal_worlds-0.7.0/evals/structural-difficulty/v0.5/report.json +402 -0
- causal_worlds-0.7.0/examples/01_grade_your_discoverer.py +45 -0
- causal_worlds-0.7.0/examples/02_inspect_a_bundle.py +31 -0
- causal_worlds-0.7.0/examples/03_author_a_world.py +38 -0
- causal_worlds-0.7.0/examples/README.md +15 -0
- causal_worlds-0.7.0/paper/README.md +80 -0
- causal_worlds-0.7.0/pyproject.toml +128 -0
- causal_worlds-0.7.0/spikes/smoke_live.py +40 -0
- causal_worlds-0.7.0/spikes/spike_author.py +267 -0
- causal_worlds-0.7.0/spikes/spike_coffee.py +161 -0
- causal_worlds-0.7.0/spikes/spike_coffee_general.py +142 -0
- causal_worlds-0.7.0/spikes/spike_grader.py +101 -0
- causal_worlds-0.7.0/spikes/spike_loop.py +93 -0
- causal_worlds-0.7.0/src/causal_worlds/__init__.py +106 -0
- causal_worlds-0.7.0/src/causal_worlds/_version.py +3 -0
- causal_worlds-0.7.0/src/causal_worlds/artifact.py +127 -0
- causal_worlds-0.7.0/src/causal_worlds/author.py +117 -0
- causal_worlds-0.7.0/src/causal_worlds/baselines.py +199 -0
- causal_worlds-0.7.0/src/causal_worlds/bench.py +38 -0
- causal_worlds-0.7.0/src/causal_worlds/cli.py +191 -0
- causal_worlds-0.7.0/src/causal_worlds/config.py +30 -0
- causal_worlds-0.7.0/src/causal_worlds/container.py +47 -0
- causal_worlds-0.7.0/src/causal_worlds/difficulty.py +77 -0
- causal_worlds-0.7.0/src/causal_worlds/discover/__init__.py +8 -0
- causal_worlds-0.7.0/src/causal_worlds/discover/interventional.py +177 -0
- causal_worlds-0.7.0/src/causal_worlds/errors.py +13 -0
- causal_worlds-0.7.0/src/causal_worlds/evaluation.py +73 -0
- causal_worlds-0.7.0/src/causal_worlds/fakes.py +50 -0
- causal_worlds-0.7.0/src/causal_worlds/gates.py +164 -0
- causal_worlds-0.7.0/src/causal_worlds/generate.py +134 -0
- causal_worlds-0.7.0/src/causal_worlds/judge.py +114 -0
- causal_worlds-0.7.0/src/causal_worlds/obs.py +28 -0
- causal_worlds-0.7.0/src/causal_worlds/protocols.py +77 -0
- causal_worlds-0.7.0/src/causal_worlds/py.typed +0 -0
- causal_worlds-0.7.0/src/causal_worlds/sample/__init__.py +5 -0
- causal_worlds-0.7.0/src/causal_worlds/sample/substrate.py +223 -0
- causal_worlds-0.7.0/src/causal_worlds/schema.py +252 -0
- causal_worlds-0.7.0/src/causal_worlds/serde.py +135 -0
- causal_worlds-0.7.0/src/causal_worlds/worlds.py +131 -0
- causal_worlds-0.7.0/tests/test_artifact.py +52 -0
- causal_worlds-0.7.0/tests/test_baselines.py +58 -0
- causal_worlds-0.7.0/tests/test_bench.py +62 -0
- causal_worlds-0.7.0/tests/test_cli.py +101 -0
- causal_worlds-0.7.0/tests/test_difficulty.py +30 -0
- causal_worlds-0.7.0/tests/test_discover.py +44 -0
- causal_worlds-0.7.0/tests/test_evaluation.py +55 -0
- causal_worlds-0.7.0/tests/test_gates.py +67 -0
- causal_worlds-0.7.0/tests/test_generate.py +83 -0
- causal_worlds-0.7.0/tests/test_llm_adapters.py +86 -0
- causal_worlds-0.7.0/tests/test_sample.py +59 -0
- causal_worlds-0.7.0/tests/test_schema.py +116 -0
- causal_worlds-0.7.0/tests/test_serde.py +23 -0
- causal_worlds-0.7.0/tests/test_temporal.py +92 -0
- causal_worlds-0.7.0/tests/test_worlds.py +22 -0
- causal_worlds-0.7.0/uv.lock +2485 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"hooks": {
|
|
3
|
+
"PreToolUse": [
|
|
4
|
+
{
|
|
5
|
+
"matcher": "Edit|Write|MultiEdit",
|
|
6
|
+
"hooks": [
|
|
7
|
+
{
|
|
8
|
+
"type": "command",
|
|
9
|
+
"command": "f=$(jq -r '.tool_input.file_path // empty'); case \"$f\" in *.env|*/.env|*.env.*) echo 'Refusing to edit secrets (.env) via Claude.' >&2; exit 2;; uv.lock|*/uv.lock) echo 'uv.lock is managed by uv — run `uv lock`, do not hand-edit.' >&2; exit 2;; esac"
|
|
10
|
+
}
|
|
11
|
+
]
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"PostToolUse": [
|
|
15
|
+
{
|
|
16
|
+
"matcher": "Edit|Write|MultiEdit",
|
|
17
|
+
"hooks": [
|
|
18
|
+
{
|
|
19
|
+
"type": "command",
|
|
20
|
+
"command": "f=$(jq -r '.tool_input.file_path // empty'); case \"$f\" in *.py) ( cd \"$CLAUDE_PROJECT_DIR\" && uv run ruff format --force-exclude \"$f\" >/dev/null 2>&1; uv run ruff check --fix --force-exclude \"$f\" >/dev/null 2>&1 ) || true ;; esac"
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: causal-worlds-conventions
|
|
3
|
+
description: >-
|
|
4
|
+
Engineering + research conventions for the causal-worlds Python package. Load and apply BEFORE writing,
|
|
5
|
+
refactoring, or reviewing ANY code in this repo (src/, tests/, cli, the package), and before adding a dependency
|
|
6
|
+
or a design pattern. Encodes Clean Code (Uncle Bob), SOLID-via-Protocols, the earned-patterns rule, the uv/ruff/
|
|
7
|
+
mypy/pytest/CI toolchain, src-layout, and the research discipline. Full detail in docs/engineering.md.
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# causal-worlds conventions
|
|
11
|
+
|
|
12
|
+
Apply these when touching this package. Full reference: [docs/engineering.md](../../../docs/engineering.md).
|
|
13
|
+
This package mixes engineering and research; it is CLI-first (typer); Gemini is an *independent* judge (a different model family from the author model).
|
|
14
|
+
|
|
15
|
+
## Before you write code
|
|
16
|
+
- **Clean Code (all of it), NOT Clean Architecture.** Small functions that do one thing at one level of
|
|
17
|
+
abstraction; intention-revealing names; **≤2 args** (bundle into a value object), **no flag args**, **no hidden
|
|
18
|
+
side effects**, **Command-Query Separation**. **Exceptions, not error codes; never return/pass `null`/`None`-as-
|
|
19
|
+
error.** No commented-out code (delete it). No magic numbers (named constants). Respect the Law of Demeter.
|
|
20
|
+
- **SOLID via `typing.Protocol`.** The four variation points get Protocols + injected impls:
|
|
21
|
+
`Discoverer` (grader), `Judge` (LLM), `Substrate`/`World`, `Gate`. Depend on the Protocol; **never** let
|
|
22
|
+
`causal-learn`/`gies`/Gemini types leak past an **adapter**.
|
|
23
|
+
- **Patterns are earned** (Strategy/Adapter for discoverer·judge·substrate; Pipeline for gates). **No speculative
|
|
24
|
+
abstraction.** If you add a pattern, name it and justify it in the PR.
|
|
25
|
+
- **Separate construction from use:** build/inject dependencies at the edge (`cli`, factories); the core never
|
|
26
|
+
constructs ("news up") its own collaborators.
|
|
27
|
+
- **Structure:** `src/causal_worlds/<feature>/`; tests mirror; third-party imports only inside a feature's adapter.
|
|
28
|
+
- **Docstrings:** Google style. **Type everything** (mypy strict).
|
|
29
|
+
- **`from __future__ import annotations` only when needed** (TYPE_CHECKING-only annotation imports / forward refs);
|
|
30
|
+
**never** in typer CLI or pydantic modules (they read annotations at runtime). Python 3.13 needs no future import for
|
|
31
|
+
`X | None` / `tuple[...]`.
|
|
32
|
+
|
|
33
|
+
## Before you commit — run the gate (it must be green)
|
|
34
|
+
```bash
|
|
35
|
+
make validate # or:
|
|
36
|
+
uv run ruff format --check . && uv run ruff check . && uv run mypy && uv run pytest
|
|
37
|
+
```
|
|
38
|
+
CI runs the same and **fails** on any violation; CI-green is the merge gate. Conventional Commits, atomic, **no
|
|
39
|
+
`Co-Authored-By` trailer**. Push/PR only when asked.
|
|
40
|
+
|
|
41
|
+
## Tests (F.I.R.S.T.)
|
|
42
|
+
Fast · Independent · Repeatable · Self-validating · Timely. One concept per test. Prefer **Hypothesis property
|
|
43
|
+
tests** for invariants (acyclicity, interventions break the right edges, seed→determinism) over fixed-output tests.
|
|
44
|
+
|
|
45
|
+
## Research code (`spikes/`, `experiments/`)
|
|
46
|
+
NOT shipped; lint/type/coverage-exempt. Held to "**is the finding real and honestly reported**," not production
|
|
47
|
+
polish. **Measured, not asserted:** every claim has a runnable script that prints the evidence. Reproducible via
|
|
48
|
+
**seed + `uv.lock` + pinned model ids** (e.g. `gemini-3.5-flash`). Report honest negatives. Use an **independent
|
|
49
|
+
judge** for LLM-output quality (don't grade a model with itself). A proven spike **graduates** into `src/` rebuilt
|
|
50
|
+
to the standards above — the spike is the proof, not the implementation.
|
|
51
|
+
|
|
52
|
+
## Boundaries, LLM I/O & observability
|
|
53
|
+
- **Data models per use-case:** frozen `@dataclass` in the pure core (valid-by-construction; parse-don't-validate);
|
|
54
|
+
**pydantic v2** only at boundaries (LLM output, CLI, config) — convert the pydantic boundary model into the
|
|
55
|
+
dataclass core IR at the edge.
|
|
56
|
+
- **LLM structured output:** use **instructor** (pydantic models, **bounded** re-ask on validation failure, then
|
|
57
|
+
raise — never fabricate) behind the `Judge`/author adapter; Gemini is the independent judge.
|
|
58
|
+
- **Observability from day 1:** **Langfuse (OTEL-based)** spans around LLM calls + each pipeline stage, behind a
|
|
59
|
+
thin tracing seam (optional at runtime). Three channels, never conflated: logs (shell), traces (Langfuse/OTel),
|
|
60
|
+
exceptions (control flow). The pure core stays silent.
|
|
61
|
+
- **Errors & logging:** root `CausalWorldsError` + domain subclasses; **fail loud**; library logs to
|
|
62
|
+
`getLogger("causal_worlds")` + `NullHandler` (the app/CLI owns handlers); **never log secrets.**
|
|
63
|
+
|
|
64
|
+
## Adding a dependency
|
|
65
|
+
Justify it; pin via `uv`; wrap it behind a Protocol+adapter; prefer the standard library and reuse over new deps.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
root = true
|
|
2
|
+
|
|
3
|
+
[*]
|
|
4
|
+
end_of_line = lf
|
|
5
|
+
charset = utf-8
|
|
6
|
+
trim_trailing_whitespace = true
|
|
7
|
+
insert_final_newline = true
|
|
8
|
+
|
|
9
|
+
[*.py]
|
|
10
|
+
indent_style = space
|
|
11
|
+
indent_size = 4
|
|
12
|
+
max_line_length = 100
|
|
13
|
+
|
|
14
|
+
[*.{json,yml,yaml,toml}]
|
|
15
|
+
indent_style = space
|
|
16
|
+
indent_size = 2
|
|
17
|
+
|
|
18
|
+
[Makefile]
|
|
19
|
+
indent_style = tab
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
quality:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.13"
|
|
18
|
+
|
|
19
|
+
- name: Sync (deps + dev tools)
|
|
20
|
+
run: uv sync
|
|
21
|
+
|
|
22
|
+
- name: Format check (ruff)
|
|
23
|
+
run: uv run ruff format --check .
|
|
24
|
+
|
|
25
|
+
- name: Lint (ruff)
|
|
26
|
+
run: uv run ruff check .
|
|
27
|
+
|
|
28
|
+
- name: Type check (mypy strict)
|
|
29
|
+
run: uv run mypy
|
|
30
|
+
|
|
31
|
+
- name: Tests + coverage floor
|
|
32
|
+
run: uv run pytest
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes on a GitHub Release via PyPI Trusted Publishing (OIDC) — no API token is stored anywhere.
|
|
4
|
+
# One-time setup on PyPI is required first; see RELEASING.md.
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
release:
|
|
8
|
+
types: [published]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
permissions:
|
|
14
|
+
id-token: write # required for trusted publishing (OIDC)
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v5
|
|
19
|
+
- name: Build sdist + wheel
|
|
20
|
+
run: uv build
|
|
21
|
+
- name: Publish to PyPI
|
|
22
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
env/
|
|
11
|
+
|
|
12
|
+
# Env / secrets
|
|
13
|
+
.env
|
|
14
|
+
.env.*
|
|
15
|
+
!.env.example
|
|
16
|
+
|
|
17
|
+
# Tooling caches
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.mypy_cache/
|
|
20
|
+
.ruff_cache/
|
|
21
|
+
.ipynb_checkpoints/
|
|
22
|
+
|
|
23
|
+
# OS / editor
|
|
24
|
+
.DS_Store
|
|
25
|
+
*.swp
|
|
26
|
+
.idea/
|
|
27
|
+
.vscode/
|
|
28
|
+
|
|
29
|
+
# Generated worlds / scratch / logs
|
|
30
|
+
/out/
|
|
31
|
+
/scratch/
|
|
32
|
+
*.log
|
|
33
|
+
|
|
34
|
+
# coverage data
|
|
35
|
+
.coverage
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- id: check-merge-conflict
|
|
10
|
+
- id: check-added-large-files
|
|
11
|
+
args: ["--maxkb=1024"]
|
|
12
|
+
- id: detect-private-key
|
|
13
|
+
|
|
14
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
15
|
+
rev: v0.8.4
|
|
16
|
+
hooks:
|
|
17
|
+
- id: ruff
|
|
18
|
+
args: ["--fix"]
|
|
19
|
+
- id: ruff-format
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to causal-worlds are documented here. Format: [Keep a Changelog](https://keepachangelog.com/);
|
|
4
|
+
this project follows [Semantic Versioning](https://semver.org/).
|
|
5
|
+
|
|
6
|
+
## [0.7.0] — 2026-06-23
|
|
7
|
+
|
|
8
|
+
**Temporal worlds (foundation).** Worlds can now carry *time* — lagged edges and autoregression —
|
|
9
|
+
not just cross-sectional structure. (Time-series *grading* + baselines land next.)
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Lagged IR** (`schema`): `Term.lag` (default 0). Only the contemporaneous (lag-0) subgraph must be
|
|
13
|
+
acyclic; lagged edges — including autoregressive self-loops — are valid (they read the past).
|
|
14
|
+
- **Temporal substrate** (`sample`): when any lag is present, sampling becomes sequential over
|
|
15
|
+
timesteps with a burn-in (near-stationary); `do()` interventions hold across time. Cross-sectional
|
|
16
|
+
worlds keep the original vectorized i.i.d. path unchanged.
|
|
17
|
+
- **`temporal_answer_key(spec)`** → lagged ground truth `(src, dst, lag)` incl. autoregression; the
|
|
18
|
+
summary `answer_key` now collapses lags and drops self-loops, so existing tooling is unaffected.
|
|
19
|
+
- **Built-in `supply`** — a temporal world (autoregressive lead time + inventory, a hidden logistics
|
|
20
|
+
confounder), in a separate registry (`worlds.temporal_names()`) so the still-contemporaneous CLI
|
|
21
|
+
`grade`/`gate` don't mis-score it.
|
|
22
|
+
|
|
23
|
+
[0.7.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.7.0
|
|
24
|
+
|
|
25
|
+
## [0.6.1] — 2026-06-23
|
|
26
|
+
|
|
27
|
+
### Docs
|
|
28
|
+
- Rewrote the README around a getting-started flow (honest shipped-vs-roadmap; the gym/temporal/
|
|
29
|
+
counterfactual claims are now roadmap, not overclaims), with a lead example, the measured crossover
|
|
30
|
+
result, install/extras, concepts, and a roadmap.
|
|
31
|
+
- Added a guided [`docs/getting-started.md`](docs/getting-started.md) and runnable
|
|
32
|
+
[`examples/`](examples/) (grade-your-discoverer, inspect-a-bundle — keyless — and author-a-world).
|
|
33
|
+
|
|
34
|
+
[0.6.1]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.6.1
|
|
35
|
+
|
|
36
|
+
## [0.6.0] — 2026-06-23
|
|
37
|
+
|
|
38
|
+
**Use the benchmark.** Grading your own discoverer against a shipped world is now a first-class,
|
|
39
|
+
typed, tested feature — the package's whole purpose.
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
- **`bench`**: `grade_spec(spec, discoverer)` and `grade_bundle(bundle_dir, discoverer)` → a `Report`
|
|
43
|
+
scoring any `Discoverer` against a world's declared answer-key (defaults to the reference grader).
|
|
44
|
+
- **CLI `score`**: `causal-worlds score <bundle> [--discoverer module:Class]` grades a discoverer
|
|
45
|
+
(the reference by default, or any importable one) on a persisted world.
|
|
46
|
+
- **Typed distribution**: ship a PEP 561 `py.typed` marker, plus PyPI metadata (classifiers,
|
|
47
|
+
keywords, project URLs).
|
|
48
|
+
|
|
49
|
+
[0.6.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.6.0
|
|
50
|
+
|
|
51
|
+
## [0.5.0] — 2026-06-23
|
|
52
|
+
|
|
53
|
+
**Scale resolves the difficulty question.** A 36-world set across an easy→hard complexity spread gives
|
|
54
|
+
the analyses real range — and structural difficulty turns out to predict the observational collapse.
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
- **Author complexity knob** (`author`): `ClaudeAuthor(..., complexity="easy"|"standard"|"hard")`
|
|
58
|
+
varies how many hidden confounders / regime sign-flips to inject, spreading structural difficulty.
|
|
59
|
+
Recorded per world in the manifest (`Provenance.complexity`).
|
|
60
|
+
- **Scaled benchmark** (`benchmark/v0.5`): 35/36 admitted across complexity levels — mean structural
|
|
61
|
+
difficulty by level 0.0 / 1.4 / 3.0; reference-grader SHD 0.36 / 1.75 / 2.33.
|
|
62
|
+
- **Parameterized evals**: the crossover and structural-difficulty harnesses take a benchmark dir;
|
|
63
|
+
results nest under `evals/*/v0.5/`.
|
|
64
|
+
|
|
65
|
+
### Findings (powered, n=35)
|
|
66
|
+
- **Crossover strengthens**: the interventional-CI grader keeps **confounded-kept = 0** (never reports
|
|
67
|
+
a hidden-confounded pair as causal) at SHD 1.47 / F1 0.91, while PC/FCI/GIES keep 8–17 and post SHD
|
|
68
|
+
2.7–6.7.
|
|
69
|
+
- **Structural difficulty predicts observational error (corr +0.62)** where name-guessability does not
|
|
70
|
+
(+0.14) — the hardness is structural (confounders + sign-flips), resolving v0.4's open question and
|
|
71
|
+
turning difficulty into a usable instrument.
|
|
72
|
+
|
|
73
|
+
[0.5.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.5.0
|
|
74
|
+
|
|
75
|
+
## [0.4.0] — 2026-06-23
|
|
76
|
+
|
|
77
|
+
**A structural-difficulty axis.** v0.3 showed name-guessability difficulty doesn't predict discovery
|
|
78
|
+
error — the hardness is structural. This adds that axis and tests it honestly.
|
|
79
|
+
|
|
80
|
+
### Added
|
|
81
|
+
- **Structural difficulty** (`difficulty`): `structural_difficulty(spec)` scores discovery-hardness
|
|
82
|
+
from the structure — hidden confounders, confounded pairs, regime **sign-flips**, edge density — with
|
|
83
|
+
a headline trap-count `score`. Pure, deterministic, unit-tested.
|
|
84
|
+
- Structural difficulty is now recorded in every admitted world's `manifest.json`.
|
|
85
|
+
- **Re-analysis** (`evals/structural-difficulty`): reuses the crossover report (no new runs) to test
|
|
86
|
+
whether structural difficulty predicts the collapse.
|
|
87
|
+
|
|
88
|
+
### Findings (honest)
|
|
89
|
+
- At n=12 with a narrow difficulty range, **neither** name-guessability nor structural difficulty
|
|
90
|
+
cleanly predicts the *magnitude* of error (correlations −0.39…+0.14) — a statistical-power problem,
|
|
91
|
+
not a refutation. The v0.3 crossover (standard methods collapse, grader holds) is unaffected.
|
|
92
|
+
Resolving difficulty-predicts-error is deferred to the scaled set (v0.5).
|
|
93
|
+
|
|
94
|
+
[0.4.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.4.0
|
|
95
|
+
|
|
96
|
+
## [0.3.0] — 2026-06-23
|
|
97
|
+
|
|
98
|
+
**The decisive experiment.** Proves the benchmark's central claim beyond the single `coffee` world:
|
|
99
|
+
standard discovery collapses on our worlds where the reference interventional-CI grader holds.
|
|
100
|
+
|
|
101
|
+
### Added
|
|
102
|
+
- **Baseline suite** (`baselines`): PC, GES, FCI (`causal-learn`) and GIES (`gies`) wrapped behind the
|
|
103
|
+
`Discoverer` Protocol as adapters — lazy-imported (the `discover` extra), so the package imports and
|
|
104
|
+
CI run without them; graph-parsing logic is pure and unit-tested. `BaselineResult` carries directed
|
|
105
|
+
edges, bidirected (confounding) marks, and the skeleton for a fair cross-method comparison.
|
|
106
|
+
- **Crossover eval** (`evals/baseline-crossover`): every benchmark world vs every method across seeds →
|
|
107
|
+
skeleton-SHD, directed F1, and *confounded-pair-kept-as-causal* (the trap). **Result (n=12): GO.**
|
|
108
|
+
Standard methods keep the hidden-confounded pair as causal in 7.3–10.0 of 12 worlds (PC/FCI/GIES) and
|
|
109
|
+
post 2–4× the skeleton error; the interventional grader stays at confounded-kept 0.33, SHD 1.31,
|
|
110
|
+
F1 0.91.
|
|
111
|
+
- **Difficulty-vs-error analysis** — *honest negative*: name-guessability difficulty does not yet
|
|
112
|
+
predict discovery error (corr ~0.1); the hardness is structural (confounder+regime). Sharpens v0.4.
|
|
113
|
+
- **Publication artifacts**: a technical blog post (`docs/blog-the-decisive-experiment.md`) and a
|
|
114
|
+
Framing-B paper skeleton (`paper/`).
|
|
115
|
+
|
|
116
|
+
### Notes
|
|
117
|
+
- `causal-learn`'s GES is numpy-2 incompatible (errors on every world) — reported, not hidden.
|
|
118
|
+
|
|
119
|
+
[0.3.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.3.0
|
|
120
|
+
|
|
121
|
+
## [0.2.0] — 2026-06-23
|
|
122
|
+
|
|
123
|
+
Closes the generative loop: **natural language in, an admitted causal world out**, plus persistence
|
|
124
|
+
and a shipped benchmark set. The LLM seams are real but isolated — the package still imports and CI
|
|
125
|
+
still runs with no API key (the adapters are unit-tested against fakes).
|
|
126
|
+
|
|
127
|
+
### Added
|
|
128
|
+
- **NL author** (`author`): `ClaudeAuthor` turns a plain-language operation into a `WorldSpec` via
|
|
129
|
+
`instructor` (bounded re-ask), steered toward recoverable, anti-cliché worlds (a hidden confounder
|
|
130
|
+
+ a regime sign-flip). Behind the `Author` Protocol; provider SDK lazy-imported.
|
|
131
|
+
- **Independent judge** (`judge`): `GeminiJudge` guesses the structure from names/roles alone (the
|
|
132
|
+
anti-cliché signal) and scores faithfulness — the judge uses a *different model family* than the author.
|
|
133
|
+
- **T4 anti-cliché gate** (`gates`): with a judge + prose, rejects unfaithful or guess-from-priors
|
|
134
|
+
worlds and records a `difficulty` score (`1 - F1(judge_prior, truth)`).
|
|
135
|
+
- **The loop** (`generate`): `generate` (author→gate→admit with feedback-driven re-author) and
|
|
136
|
+
`generate_many` (never-raising batch) → `AdmittedWorld`.
|
|
137
|
+
- **Artifact persistence** (`artifact`): self-describing on-disk bundle (`spec.json` / `data.npz` /
|
|
138
|
+
`answer_key.json` / `manifest.json`) with full provenance (models, grader version, seed, grade).
|
|
139
|
+
- **Boundary model** (`serde`): one pydantic `WorldSpecModel` — the author's output target and the
|
|
140
|
+
persisted JSON shape — converting to/from the frozen core IR.
|
|
141
|
+
- **CLI**: `generate <prompt> <out>` and `benchmark <prompts_file> <out>`; author/judge resolved
|
|
142
|
+
through the DI container.
|
|
143
|
+
- **Author-model bake-off** (`evals/author-model-bakeoff`): a reproducible, judged comparison that
|
|
144
|
+
picks the default author model with numbers, not assertion — shipped with the release.
|
|
145
|
+
- **Benchmark set** (`benchmark/v0.2`): 12 authored, admitted worlds across distinct operations —
|
|
146
|
+
mean difficulty 0.28, faithfulness 1.00, reference-grader directed SHD 1.25 / F1 0.92.
|
|
147
|
+
|
|
148
|
+
### Changed
|
|
149
|
+
- Version is single-sourced from `_version.py` (hatchling dynamic). `.coverage` is no longer tracked.
|
|
150
|
+
|
|
151
|
+
[0.2.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.2.0
|
|
152
|
+
|
|
153
|
+
## [0.1.0] — 2026-06-22
|
|
154
|
+
|
|
155
|
+
First release: **the deterministic benchmark engine**. Generate (programmatically specified) fictional causal
|
|
156
|
+
worlds with a ground-truth answer-key, sample them, grade a causal-discovery method, and score it — runnable as a
|
|
157
|
+
library and a CLI, with no LLM or API key required.
|
|
158
|
+
|
|
159
|
+
### Added
|
|
160
|
+
- **Schema / IR** (`schema`): `WorldSpec` as the single source of truth (variables incl. hidden confounders +
|
|
161
|
+
generative `Mechanism`s with regime-switching); the `AnswerKey` (observed edges + confounded pairs) is *derived*,
|
|
162
|
+
never stored; `validate()` static gate.
|
|
163
|
+
- **SCM substrate** (`sample`): a deterministic, seeded executable world; `do()` interventions (constant or
|
|
164
|
+
per-row array). The functional core.
|
|
165
|
+
- **Reference grader** (`discover`): `InterventionalCiDiscoverer` — a spec-blind interventional-CI discoverer that
|
|
166
|
+
recovers the confounder + regime-flip trap (directed SHD 0) where standard observational/score-based methods
|
|
167
|
+
(PC, GES, GIES, FCI) fail.
|
|
168
|
+
- **Scoring** (`evaluation`): directed/skeleton SHD, F1, and `confounded_reported` (flags a causal edge claimed for
|
|
169
|
+
a hidden-confounded pair); `Report`.
|
|
170
|
+
- **Validity gates** (`gates`): `run_gates` → T1 (validity) · T2 (sample-sanity) · T3 (non-triviality vs a
|
|
171
|
+
per-world random-graph null). Admits only if all pass.
|
|
172
|
+
- **Built-in worlds** (`worlds`): `coffee` (the confounder + regime-flip trap) and `ecommerce` (easy control).
|
|
173
|
+
- **CLI** (`causal-worlds`): `version` · `worlds` · `grade <world>` · `gate <world>`.
|
|
174
|
+
- **Wiring**: pydantic-settings `config`, a small DI `container`, and a no-op `Tracer` observability seam.
|
|
175
|
+
- **Quality**: uv + ruff (`select=ALL`) + mypy `strict` + pytest with a coverage floor, enforced by CI.
|
|
176
|
+
|
|
177
|
+
### Not yet (tracked as v0.2 issues)
|
|
178
|
+
NL/`WorldBrief` → spec **author**, the independent **Gemini judge** + the T4 anti-cliché gate, conversational
|
|
179
|
+
**elicitation**, the **Langfuse (OTEL)** tracing adapter, **artifact/manifest persistence**, grader **hardening**
|
|
180
|
+
(FCI-with-interventions) + world-diversity sweep + knob calibration, and more built-in/temporal worlds.
|
|
181
|
+
|
|
182
|
+
[0.1.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.1.0
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# causal-worlds — AI working agreement
|
|
2
|
+
|
|
3
|
+
Short, binding agreement for working in this repo. Keep it short; depth lives in
|
|
4
|
+
[docs/engineering.md](docs/engineering.md), auto-applied via the skill
|
|
5
|
+
[.claude/skills/causal-worlds-conventions/](.claude/skills/causal-worlds-conventions/SKILL.md).
|
|
6
|
+
|
|
7
|
+
## What this is
|
|
8
|
+
A public (MIT) Python package: **generate a fictional-but-coherent causal *operation* from a natural-language
|
|
9
|
+
description** — an executable simulator, the time-series it emits, and a **declared ground-truth causal structure
|
|
10
|
+
(the answer-key)** — for benchmarking causal-discovery agents. A **mix of engineering and research**. CLI-first
|
|
11
|
+
(typer). Consumes **Gemini** as an *independent* LLM judge (its model family must differ from every author model's). Concept &
|
|
12
|
+
approach are **validated** (see [docs/validation.md](docs/validation.md)); this is the production build.
|
|
13
|
+
|
|
14
|
+
## Non-negotiables (full detail in docs/engineering.md)
|
|
15
|
+
- **Clean Code (Uncle Bob) — all of it, NOT Clean Architecture.** **SOLID** via Python `Protocol`s.
|
|
16
|
+
- **Design patterns only at proven variation points** (Strategy/Adapter for discoverer·judge·substrate, Pipeline of
|
|
17
|
+
gates). **No abstraction for hypothetical futures.** Reuse over fork.
|
|
18
|
+
- **Wrap every third-party lib** (`causal-learn`, `gies`, Gemini) **behind our own Protocol + adapter.**
|
|
19
|
+
- **Tooling:** `uv` · `ruff` (`select=ALL` + curated ignores, line 100, `ruff format`) · `mypy strict` · `pytest`
|
|
20
|
+
with a coverage floor · pre-commit · **CI that fails**. `src`-layout, feature/capability modules. Google docstrings.
|
|
21
|
+
- **Run the gate before committing:** `make validate` (or `uv run ruff format --check . && uv run ruff check . &&
|
|
22
|
+
uv run mypy && uv run pytest`). **CI green is a merge gate** — that's how we avoid leaving the same review comment twice.
|
|
23
|
+
- **Measured, not asserted.** Every behavioral claim is backed by a runnable script/test.
|
|
24
|
+
- **`spikes/` and `experiments/` are research, NOT shipped** (lint/type/coverage-exempt); reproducible via seed +
|
|
25
|
+
`uv.lock` + pinned model ids; honest negatives.
|
|
26
|
+
- **Commits:** Conventional Commits, atomic, **no `Co-Authored-By` trailer**. Push/PR only on explicit request.
|
|
27
|
+
|
|
28
|
+
## Map
|
|
29
|
+
- [docs/scope.md](docs/scope.md) · [docs/hld.md](docs/hld.md) · [docs/lld.md](docs/lld.md) ·
|
|
30
|
+
[docs/validation.md](docs/validation.md) — product/design + the validation evidence.
|
|
31
|
+
- [docs/architecture.md](docs/architecture.md) — the finalized system design (pipeline, seams, data contract, DI,
|
|
32
|
+
config, artifact, elicitation).
|
|
33
|
+
- [docs/engineering.md](docs/engineering.md) — the binding code-quality + research guidelines.
|
|
34
|
+
- `spikes/` — the validation spikes (research; the proof, not the implementation).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Noumenal AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
.PHONY: install fmt lint type test check validate hooks
|
|
2
|
+
|
|
3
|
+
install: ## sync env + dev tools
|
|
4
|
+
uv sync
|
|
5
|
+
|
|
6
|
+
fmt: ## auto-format
|
|
7
|
+
uv run ruff format .
|
|
8
|
+
|
|
9
|
+
lint: ## lint (with autofix)
|
|
10
|
+
uv run ruff check --fix .
|
|
11
|
+
|
|
12
|
+
type: ## strict type-check
|
|
13
|
+
uv run mypy
|
|
14
|
+
|
|
15
|
+
test: ## tests + coverage floor
|
|
16
|
+
uv run pytest
|
|
17
|
+
|
|
18
|
+
check: ## the read-only gate (what CI runs, minus tests)
|
|
19
|
+
uv run ruff format --check .
|
|
20
|
+
uv run ruff check .
|
|
21
|
+
uv run mypy
|
|
22
|
+
|
|
23
|
+
validate: check test ## the full gate — run before every commit
|
|
24
|
+
|
|
25
|
+
hooks: ## install pre-commit hooks
|
|
26
|
+
uv run pre-commit install
|