causal-worlds 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283) hide show
  1. causal_worlds-0.7.0/.claude/settings.json +26 -0
  2. causal_worlds-0.7.0/.claude/skills/causal-worlds-conventions/SKILL.md +65 -0
  3. causal_worlds-0.7.0/.editorconfig +19 -0
  4. causal_worlds-0.7.0/.github/workflows/ci.yml +32 -0
  5. causal_worlds-0.7.0/.github/workflows/publish.yml +22 -0
  6. causal_worlds-0.7.0/.gitignore +35 -0
  7. causal_worlds-0.7.0/.pre-commit-config.yaml +19 -0
  8. causal_worlds-0.7.0/CHANGELOG.md +182 -0
  9. causal_worlds-0.7.0/CLAUDE.md +34 -0
  10. causal_worlds-0.7.0/LICENSE +21 -0
  11. causal_worlds-0.7.0/Makefile +26 -0
  12. causal_worlds-0.7.0/PKG-INFO +197 -0
  13. causal_worlds-0.7.0/README.md +164 -0
  14. causal_worlds-0.7.0/RELEASING.md +40 -0
  15. causal_worlds-0.7.0/benchmark/README.md +30 -0
  16. causal_worlds-0.7.0/benchmark/prompts.txt +19 -0
  17. causal_worlds-0.7.0/benchmark/v0.2/index.json +122 -0
  18. causal_worlds-0.7.0/benchmark/v0.2/world_00/answer_key.json +46 -0
  19. causal_worlds-0.7.0/benchmark/v0.2/world_00/data.npz +0 -0
  20. causal_worlds-0.7.0/benchmark/v0.2/world_00/manifest.json +33 -0
  21. causal_worlds-0.7.0/benchmark/v0.2/world_00/spec.json +156 -0
  22. causal_worlds-0.7.0/benchmark/v0.2/world_01/answer_key.json +34 -0
  23. causal_worlds-0.7.0/benchmark/v0.2/world_01/data.npz +0 -0
  24. causal_worlds-0.7.0/benchmark/v0.2/world_01/manifest.json +32 -0
  25. causal_worlds-0.7.0/benchmark/v0.2/world_01/spec.json +114 -0
  26. causal_worlds-0.7.0/benchmark/v0.2/world_02/answer_key.json +50 -0
  27. causal_worlds-0.7.0/benchmark/v0.2/world_02/data.npz +0 -0
  28. causal_worlds-0.7.0/benchmark/v0.2/world_02/manifest.json +33 -0
  29. causal_worlds-0.7.0/benchmark/v0.2/world_02/spec.json +139 -0
  30. causal_worlds-0.7.0/benchmark/v0.2/world_03/answer_key.json +46 -0
  31. causal_worlds-0.7.0/benchmark/v0.2/world_03/data.npz +0 -0
  32. causal_worlds-0.7.0/benchmark/v0.2/world_03/manifest.json +33 -0
  33. causal_worlds-0.7.0/benchmark/v0.2/world_03/spec.json +147 -0
  34. causal_worlds-0.7.0/benchmark/v0.2/world_04/answer_key.json +42 -0
  35. causal_worlds-0.7.0/benchmark/v0.2/world_04/data.npz +0 -0
  36. causal_worlds-0.7.0/benchmark/v0.2/world_04/manifest.json +32 -0
  37. causal_worlds-0.7.0/benchmark/v0.2/world_04/spec.json +121 -0
  38. causal_worlds-0.7.0/benchmark/v0.2/world_05/answer_key.json +38 -0
  39. causal_worlds-0.7.0/benchmark/v0.2/world_05/data.npz +0 -0
  40. causal_worlds-0.7.0/benchmark/v0.2/world_05/manifest.json +33 -0
  41. causal_worlds-0.7.0/benchmark/v0.2/world_05/spec.json +119 -0
  42. causal_worlds-0.7.0/benchmark/v0.2/world_06/answer_key.json +38 -0
  43. causal_worlds-0.7.0/benchmark/v0.2/world_06/data.npz +0 -0
  44. causal_worlds-0.7.0/benchmark/v0.2/world_06/manifest.json +32 -0
  45. causal_worlds-0.7.0/benchmark/v0.2/world_06/spec.json +126 -0
  46. causal_worlds-0.7.0/benchmark/v0.2/world_07/answer_key.json +54 -0
  47. causal_worlds-0.7.0/benchmark/v0.2/world_07/data.npz +0 -0
  48. causal_worlds-0.7.0/benchmark/v0.2/world_07/manifest.json +33 -0
  49. causal_worlds-0.7.0/benchmark/v0.2/world_07/spec.json +155 -0
  50. causal_worlds-0.7.0/benchmark/v0.2/world_08/answer_key.json +50 -0
  51. causal_worlds-0.7.0/benchmark/v0.2/world_08/data.npz +0 -0
  52. causal_worlds-0.7.0/benchmark/v0.2/world_08/manifest.json +34 -0
  53. causal_worlds-0.7.0/benchmark/v0.2/world_08/spec.json +152 -0
  54. causal_worlds-0.7.0/benchmark/v0.2/world_09/answer_key.json +34 -0
  55. causal_worlds-0.7.0/benchmark/v0.2/world_09/data.npz +0 -0
  56. causal_worlds-0.7.0/benchmark/v0.2/world_09/manifest.json +32 -0
  57. causal_worlds-0.7.0/benchmark/v0.2/world_09/spec.json +110 -0
  58. causal_worlds-0.7.0/benchmark/v0.2/world_10/answer_key.json +33 -0
  59. causal_worlds-0.7.0/benchmark/v0.2/world_10/data.npz +0 -0
  60. causal_worlds-0.7.0/benchmark/v0.2/world_10/manifest.json +32 -0
  61. causal_worlds-0.7.0/benchmark/v0.2/world_10/spec.json +138 -0
  62. causal_worlds-0.7.0/benchmark/v0.2/world_11/answer_key.json +37 -0
  63. causal_worlds-0.7.0/benchmark/v0.2/world_11/data.npz +0 -0
  64. causal_worlds-0.7.0/benchmark/v0.2/world_11/manifest.json +32 -0
  65. causal_worlds-0.7.0/benchmark/v0.2/world_11/spec.json +126 -0
  66. causal_worlds-0.7.0/benchmark/v0.5/README.md +17 -0
  67. causal_worlds-0.7.0/benchmark/v0.5/index.json +428 -0
  68. causal_worlds-0.7.0/benchmark/v0.5/world_01/answer_key.json +37 -0
  69. causal_worlds-0.7.0/benchmark/v0.5/world_01/data.npz +0 -0
  70. causal_worlds-0.7.0/benchmark/v0.5/world_01/manifest.json +41 -0
  71. causal_worlds-0.7.0/benchmark/v0.5/world_01/spec.json +127 -0
  72. causal_worlds-0.7.0/benchmark/v0.5/world_02/answer_key.json +50 -0
  73. causal_worlds-0.7.0/benchmark/v0.5/world_02/data.npz +0 -0
  74. causal_worlds-0.7.0/benchmark/v0.5/world_02/manifest.json +42 -0
  75. causal_worlds-0.7.0/benchmark/v0.5/world_02/spec.json +180 -0
  76. causal_worlds-0.7.0/benchmark/v0.5/world_03/answer_key.json +37 -0
  77. causal_worlds-0.7.0/benchmark/v0.5/world_03/data.npz +0 -0
  78. causal_worlds-0.7.0/benchmark/v0.5/world_03/manifest.json +41 -0
  79. causal_worlds-0.7.0/benchmark/v0.5/world_03/spec.json +105 -0
  80. causal_worlds-0.7.0/benchmark/v0.5/world_04/answer_key.json +46 -0
  81. causal_worlds-0.7.0/benchmark/v0.5/world_04/data.npz +0 -0
  82. causal_worlds-0.7.0/benchmark/v0.5/world_04/manifest.json +41 -0
  83. causal_worlds-0.7.0/benchmark/v0.5/world_04/spec.json +130 -0
  84. causal_worlds-0.7.0/benchmark/v0.5/world_05/answer_key.json +46 -0
  85. causal_worlds-0.7.0/benchmark/v0.5/world_05/data.npz +0 -0
  86. causal_worlds-0.7.0/benchmark/v0.5/world_05/manifest.json +41 -0
  87. causal_worlds-0.7.0/benchmark/v0.5/world_05/spec.json +171 -0
  88. causal_worlds-0.7.0/benchmark/v0.5/world_06/answer_key.json +25 -0
  89. causal_worlds-0.7.0/benchmark/v0.5/world_06/data.npz +0 -0
  90. causal_worlds-0.7.0/benchmark/v0.5/world_06/manifest.json +39 -0
  91. causal_worlds-0.7.0/benchmark/v0.5/world_06/spec.json +75 -0
  92. causal_worlds-0.7.0/benchmark/v0.5/world_07/answer_key.json +45 -0
  93. causal_worlds-0.7.0/benchmark/v0.5/world_07/data.npz +0 -0
  94. causal_worlds-0.7.0/benchmark/v0.5/world_07/manifest.json +40 -0
  95. causal_worlds-0.7.0/benchmark/v0.5/world_07/spec.json +138 -0
  96. causal_worlds-0.7.0/benchmark/v0.5/world_08/answer_key.json +46 -0
  97. causal_worlds-0.7.0/benchmark/v0.5/world_08/data.npz +0 -0
  98. causal_worlds-0.7.0/benchmark/v0.5/world_08/manifest.json +41 -0
  99. causal_worlds-0.7.0/benchmark/v0.5/world_08/spec.json +153 -0
  100. causal_worlds-0.7.0/benchmark/v0.5/world_09/answer_key.json +29 -0
  101. causal_worlds-0.7.0/benchmark/v0.5/world_09/data.npz +0 -0
  102. causal_worlds-0.7.0/benchmark/v0.5/world_09/manifest.json +40 -0
  103. causal_worlds-0.7.0/benchmark/v0.5/world_09/spec.json +92 -0
  104. causal_worlds-0.7.0/benchmark/v0.5/world_10/answer_key.json +37 -0
  105. causal_worlds-0.7.0/benchmark/v0.5/world_10/data.npz +0 -0
  106. causal_worlds-0.7.0/benchmark/v0.5/world_10/manifest.json +41 -0
  107. causal_worlds-0.7.0/benchmark/v0.5/world_10/spec.json +143 -0
  108. causal_worlds-0.7.0/benchmark/v0.5/world_11/answer_key.json +54 -0
  109. causal_worlds-0.7.0/benchmark/v0.5/world_11/data.npz +0 -0
  110. causal_worlds-0.7.0/benchmark/v0.5/world_11/manifest.json +42 -0
  111. causal_worlds-0.7.0/benchmark/v0.5/world_11/spec.json +168 -0
  112. causal_worlds-0.7.0/benchmark/v0.5/world_12/answer_key.json +33 -0
  113. causal_worlds-0.7.0/benchmark/v0.5/world_12/data.npz +0 -0
  114. causal_worlds-0.7.0/benchmark/v0.5/world_12/manifest.json +41 -0
  115. causal_worlds-0.7.0/benchmark/v0.5/world_12/spec.json +101 -0
  116. causal_worlds-0.7.0/benchmark/v0.5/world_13/answer_key.json +41 -0
  117. causal_worlds-0.7.0/benchmark/v0.5/world_13/data.npz +0 -0
  118. causal_worlds-0.7.0/benchmark/v0.5/world_13/manifest.json +41 -0
  119. causal_worlds-0.7.0/benchmark/v0.5/world_13/spec.json +127 -0
  120. causal_worlds-0.7.0/benchmark/v0.5/world_14/answer_key.json +38 -0
  121. causal_worlds-0.7.0/benchmark/v0.5/world_14/data.npz +0 -0
  122. causal_worlds-0.7.0/benchmark/v0.5/world_14/manifest.json +41 -0
  123. causal_worlds-0.7.0/benchmark/v0.5/world_14/spec.json +137 -0
  124. causal_worlds-0.7.0/benchmark/v0.5/world_15/answer_key.json +33 -0
  125. causal_worlds-0.7.0/benchmark/v0.5/world_15/data.npz +0 -0
  126. causal_worlds-0.7.0/benchmark/v0.5/world_15/manifest.json +40 -0
  127. causal_worlds-0.7.0/benchmark/v0.5/world_15/spec.json +109 -0
  128. causal_worlds-0.7.0/benchmark/v0.5/world_16/answer_key.json +41 -0
  129. causal_worlds-0.7.0/benchmark/v0.5/world_16/data.npz +0 -0
  130. causal_worlds-0.7.0/benchmark/v0.5/world_16/manifest.json +41 -0
  131. causal_worlds-0.7.0/benchmark/v0.5/world_16/spec.json +110 -0
  132. causal_worlds-0.7.0/benchmark/v0.5/world_17/answer_key.json +46 -0
  133. causal_worlds-0.7.0/benchmark/v0.5/world_17/data.npz +0 -0
  134. causal_worlds-0.7.0/benchmark/v0.5/world_17/manifest.json +41 -0
  135. causal_worlds-0.7.0/benchmark/v0.5/world_17/spec.json +157 -0
  136. causal_worlds-0.7.0/benchmark/v0.5/world_18/answer_key.json +29 -0
  137. causal_worlds-0.7.0/benchmark/v0.5/world_18/data.npz +0 -0
  138. causal_worlds-0.7.0/benchmark/v0.5/world_18/manifest.json +41 -0
  139. causal_worlds-0.7.0/benchmark/v0.5/world_18/spec.json +97 -0
  140. causal_worlds-0.7.0/benchmark/v0.5/world_19/answer_key.json +45 -0
  141. causal_worlds-0.7.0/benchmark/v0.5/world_19/data.npz +0 -0
  142. causal_worlds-0.7.0/benchmark/v0.5/world_19/manifest.json +41 -0
  143. causal_worlds-0.7.0/benchmark/v0.5/world_19/spec.json +160 -0
  144. causal_worlds-0.7.0/benchmark/v0.5/world_20/answer_key.json +45 -0
  145. causal_worlds-0.7.0/benchmark/v0.5/world_20/data.npz +0 -0
  146. causal_worlds-0.7.0/benchmark/v0.5/world_20/manifest.json +42 -0
  147. causal_worlds-0.7.0/benchmark/v0.5/world_20/spec.json +199 -0
  148. causal_worlds-0.7.0/benchmark/v0.5/world_21/answer_key.json +29 -0
  149. causal_worlds-0.7.0/benchmark/v0.5/world_21/data.npz +0 -0
  150. causal_worlds-0.7.0/benchmark/v0.5/world_21/manifest.json +40 -0
  151. causal_worlds-0.7.0/benchmark/v0.5/world_21/spec.json +84 -0
  152. causal_worlds-0.7.0/benchmark/v0.5/world_22/answer_key.json +42 -0
  153. causal_worlds-0.7.0/benchmark/v0.5/world_22/data.npz +0 -0
  154. causal_worlds-0.7.0/benchmark/v0.5/world_22/manifest.json +41 -0
  155. causal_worlds-0.7.0/benchmark/v0.5/world_22/spec.json +123 -0
  156. causal_worlds-0.7.0/benchmark/v0.5/world_23/answer_key.json +37 -0
  157. causal_worlds-0.7.0/benchmark/v0.5/world_23/data.npz +0 -0
  158. causal_worlds-0.7.0/benchmark/v0.5/world_23/manifest.json +41 -0
  159. causal_worlds-0.7.0/benchmark/v0.5/world_23/spec.json +157 -0
  160. causal_worlds-0.7.0/benchmark/v0.5/world_24/answer_key.json +29 -0
  161. causal_worlds-0.7.0/benchmark/v0.5/world_24/data.npz +0 -0
  162. causal_worlds-0.7.0/benchmark/v0.5/world_24/manifest.json +40 -0
  163. causal_worlds-0.7.0/benchmark/v0.5/world_24/spec.json +92 -0
  164. causal_worlds-0.7.0/benchmark/v0.5/world_25/answer_key.json +50 -0
  165. causal_worlds-0.7.0/benchmark/v0.5/world_25/data.npz +0 -0
  166. causal_worlds-0.7.0/benchmark/v0.5/world_25/manifest.json +42 -0
  167. causal_worlds-0.7.0/benchmark/v0.5/world_25/spec.json +148 -0
  168. causal_worlds-0.7.0/benchmark/v0.5/world_26/answer_key.json +58 -0
  169. causal_worlds-0.7.0/benchmark/v0.5/world_26/data.npz +0 -0
  170. causal_worlds-0.7.0/benchmark/v0.5/world_26/manifest.json +42 -0
  171. causal_worlds-0.7.0/benchmark/v0.5/world_26/spec.json +190 -0
  172. causal_worlds-0.7.0/benchmark/v0.5/world_27/answer_key.json +37 -0
  173. causal_worlds-0.7.0/benchmark/v0.5/world_27/data.npz +0 -0
  174. causal_worlds-0.7.0/benchmark/v0.5/world_27/manifest.json +41 -0
  175. causal_worlds-0.7.0/benchmark/v0.5/world_27/spec.json +105 -0
  176. causal_worlds-0.7.0/benchmark/v0.5/world_28/answer_key.json +38 -0
  177. causal_worlds-0.7.0/benchmark/v0.5/world_28/data.npz +0 -0
  178. causal_worlds-0.7.0/benchmark/v0.5/world_28/manifest.json +41 -0
  179. causal_worlds-0.7.0/benchmark/v0.5/world_28/spec.json +115 -0
  180. causal_worlds-0.7.0/benchmark/v0.5/world_29/answer_key.json +50 -0
  181. causal_worlds-0.7.0/benchmark/v0.5/world_29/data.npz +0 -0
  182. causal_worlds-0.7.0/benchmark/v0.5/world_29/manifest.json +42 -0
  183. causal_worlds-0.7.0/benchmark/v0.5/world_29/spec.json +178 -0
  184. causal_worlds-0.7.0/benchmark/v0.5/world_30/answer_key.json +29 -0
  185. causal_worlds-0.7.0/benchmark/v0.5/world_30/data.npz +0 -0
  186. causal_worlds-0.7.0/benchmark/v0.5/world_30/manifest.json +40 -0
  187. causal_worlds-0.7.0/benchmark/v0.5/world_30/spec.json +92 -0
  188. causal_worlds-0.7.0/benchmark/v0.5/world_31/answer_key.json +34 -0
  189. causal_worlds-0.7.0/benchmark/v0.5/world_31/data.npz +0 -0
  190. causal_worlds-0.7.0/benchmark/v0.5/world_31/manifest.json +40 -0
  191. causal_worlds-0.7.0/benchmark/v0.5/world_31/spec.json +110 -0
  192. causal_worlds-0.7.0/benchmark/v0.5/world_32/answer_key.json +38 -0
  193. causal_worlds-0.7.0/benchmark/v0.5/world_32/data.npz +0 -0
  194. causal_worlds-0.7.0/benchmark/v0.5/world_32/manifest.json +41 -0
  195. causal_worlds-0.7.0/benchmark/v0.5/world_32/spec.json +177 -0
  196. causal_worlds-0.7.0/benchmark/v0.5/world_33/answer_key.json +25 -0
  197. causal_worlds-0.7.0/benchmark/v0.5/world_33/data.npz +0 -0
  198. causal_worlds-0.7.0/benchmark/v0.5/world_33/manifest.json +40 -0
  199. causal_worlds-0.7.0/benchmark/v0.5/world_33/spec.json +80 -0
  200. causal_worlds-0.7.0/benchmark/v0.5/world_34/answer_key.json +33 -0
  201. causal_worlds-0.7.0/benchmark/v0.5/world_34/data.npz +0 -0
  202. causal_worlds-0.7.0/benchmark/v0.5/world_34/manifest.json +41 -0
  203. causal_worlds-0.7.0/benchmark/v0.5/world_34/spec.json +111 -0
  204. causal_worlds-0.7.0/benchmark/v0.5/world_35/answer_key.json +46 -0
  205. causal_worlds-0.7.0/benchmark/v0.5/world_35/data.npz +0 -0
  206. causal_worlds-0.7.0/benchmark/v0.5/world_35/manifest.json +41 -0
  207. causal_worlds-0.7.0/benchmark/v0.5/world_35/spec.json +169 -0
  208. causal_worlds-0.7.0/docs/architecture.md +129 -0
  209. causal_worlds-0.7.0/docs/blog-the-decisive-experiment.md +80 -0
  210. causal_worlds-0.7.0/docs/engineering.md +182 -0
  211. causal_worlds-0.7.0/docs/getting-started.md +121 -0
  212. causal_worlds-0.7.0/docs/hld.md +305 -0
  213. causal_worlds-0.7.0/docs/lld.md +169 -0
  214. causal_worlds-0.7.0/docs/scope.md +130 -0
  215. causal_worlds-0.7.0/docs/validation.md +68 -0
  216. causal_worlds-0.7.0/evals/author-model-bakeoff/README.md +12 -0
  217. causal_worlds-0.7.0/evals/author-model-bakeoff/report.json +190 -0
  218. causal_worlds-0.7.0/evals/baseline-crossover/README.md +31 -0
  219. causal_worlds-0.7.0/evals/baseline-crossover/report.json +466 -0
  220. causal_worlds-0.7.0/evals/baseline-crossover/run_crossover.py +206 -0
  221. causal_worlds-0.7.0/evals/baseline-crossover/v0.5/README.md +15 -0
  222. causal_worlds-0.7.0/evals/baseline-crossover/v0.5/report.json +1225 -0
  223. causal_worlds-0.7.0/evals/run_author_bakeoff.py +155 -0
  224. causal_worlds-0.7.0/evals/scale/generate_set.py +116 -0
  225. causal_worlds-0.7.0/evals/structural-difficulty/README.md +23 -0
  226. causal_worlds-0.7.0/evals/structural-difficulty/report.json +149 -0
  227. causal_worlds-0.7.0/evals/structural-difficulty/run_analysis.py +113 -0
  228. causal_worlds-0.7.0/evals/structural-difficulty/v0.5/README.md +12 -0
  229. causal_worlds-0.7.0/evals/structural-difficulty/v0.5/report.json +402 -0
  230. causal_worlds-0.7.0/examples/01_grade_your_discoverer.py +45 -0
  231. causal_worlds-0.7.0/examples/02_inspect_a_bundle.py +31 -0
  232. causal_worlds-0.7.0/examples/03_author_a_world.py +38 -0
  233. causal_worlds-0.7.0/examples/README.md +15 -0
  234. causal_worlds-0.7.0/paper/README.md +80 -0
  235. causal_worlds-0.7.0/pyproject.toml +128 -0
  236. causal_worlds-0.7.0/spikes/smoke_live.py +40 -0
  237. causal_worlds-0.7.0/spikes/spike_author.py +267 -0
  238. causal_worlds-0.7.0/spikes/spike_coffee.py +161 -0
  239. causal_worlds-0.7.0/spikes/spike_coffee_general.py +142 -0
  240. causal_worlds-0.7.0/spikes/spike_grader.py +101 -0
  241. causal_worlds-0.7.0/spikes/spike_loop.py +93 -0
  242. causal_worlds-0.7.0/src/causal_worlds/__init__.py +106 -0
  243. causal_worlds-0.7.0/src/causal_worlds/_version.py +3 -0
  244. causal_worlds-0.7.0/src/causal_worlds/artifact.py +127 -0
  245. causal_worlds-0.7.0/src/causal_worlds/author.py +117 -0
  246. causal_worlds-0.7.0/src/causal_worlds/baselines.py +199 -0
  247. causal_worlds-0.7.0/src/causal_worlds/bench.py +38 -0
  248. causal_worlds-0.7.0/src/causal_worlds/cli.py +191 -0
  249. causal_worlds-0.7.0/src/causal_worlds/config.py +30 -0
  250. causal_worlds-0.7.0/src/causal_worlds/container.py +47 -0
  251. causal_worlds-0.7.0/src/causal_worlds/difficulty.py +77 -0
  252. causal_worlds-0.7.0/src/causal_worlds/discover/__init__.py +8 -0
  253. causal_worlds-0.7.0/src/causal_worlds/discover/interventional.py +177 -0
  254. causal_worlds-0.7.0/src/causal_worlds/errors.py +13 -0
  255. causal_worlds-0.7.0/src/causal_worlds/evaluation.py +73 -0
  256. causal_worlds-0.7.0/src/causal_worlds/fakes.py +50 -0
  257. causal_worlds-0.7.0/src/causal_worlds/gates.py +164 -0
  258. causal_worlds-0.7.0/src/causal_worlds/generate.py +134 -0
  259. causal_worlds-0.7.0/src/causal_worlds/judge.py +114 -0
  260. causal_worlds-0.7.0/src/causal_worlds/obs.py +28 -0
  261. causal_worlds-0.7.0/src/causal_worlds/protocols.py +77 -0
  262. causal_worlds-0.7.0/src/causal_worlds/py.typed +0 -0
  263. causal_worlds-0.7.0/src/causal_worlds/sample/__init__.py +5 -0
  264. causal_worlds-0.7.0/src/causal_worlds/sample/substrate.py +223 -0
  265. causal_worlds-0.7.0/src/causal_worlds/schema.py +252 -0
  266. causal_worlds-0.7.0/src/causal_worlds/serde.py +135 -0
  267. causal_worlds-0.7.0/src/causal_worlds/worlds.py +131 -0
  268. causal_worlds-0.7.0/tests/test_artifact.py +52 -0
  269. causal_worlds-0.7.0/tests/test_baselines.py +58 -0
  270. causal_worlds-0.7.0/tests/test_bench.py +62 -0
  271. causal_worlds-0.7.0/tests/test_cli.py +101 -0
  272. causal_worlds-0.7.0/tests/test_difficulty.py +30 -0
  273. causal_worlds-0.7.0/tests/test_discover.py +44 -0
  274. causal_worlds-0.7.0/tests/test_evaluation.py +55 -0
  275. causal_worlds-0.7.0/tests/test_gates.py +67 -0
  276. causal_worlds-0.7.0/tests/test_generate.py +83 -0
  277. causal_worlds-0.7.0/tests/test_llm_adapters.py +86 -0
  278. causal_worlds-0.7.0/tests/test_sample.py +59 -0
  279. causal_worlds-0.7.0/tests/test_schema.py +116 -0
  280. causal_worlds-0.7.0/tests/test_serde.py +23 -0
  281. causal_worlds-0.7.0/tests/test_temporal.py +92 -0
  282. causal_worlds-0.7.0/tests/test_worlds.py +22 -0
  283. causal_worlds-0.7.0/uv.lock +2485 -0
@@ -0,0 +1,26 @@
1
+ {
2
+ "hooks": {
3
+ "PreToolUse": [
4
+ {
5
+ "matcher": "Edit|Write|MultiEdit",
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "f=$(jq -r '.tool_input.file_path // empty'); case \"$f\" in *.env|*/.env|*.env.*) echo 'Refusing to edit secrets (.env) via Claude.' >&2; exit 2;; uv.lock|*/uv.lock) echo 'uv.lock is managed by uv — run `uv lock`, do not hand-edit.' >&2; exit 2;; esac"
10
+ }
11
+ ]
12
+ }
13
+ ],
14
+ "PostToolUse": [
15
+ {
16
+ "matcher": "Edit|Write|MultiEdit",
17
+ "hooks": [
18
+ {
19
+ "type": "command",
20
+ "command": "f=$(jq -r '.tool_input.file_path // empty'); case \"$f\" in *.py) ( cd \"$CLAUDE_PROJECT_DIR\" && uv run ruff format --force-exclude \"$f\" >/dev/null 2>&1; uv run ruff check --fix --force-exclude \"$f\" >/dev/null 2>&1 ) || true ;; esac"
21
+ }
22
+ ]
23
+ }
24
+ ]
25
+ }
26
+ }
@@ -0,0 +1,65 @@
1
+ ---
2
+ name: causal-worlds-conventions
3
+ description: >-
4
+ Engineering + research conventions for the causal-worlds Python package. Load and apply BEFORE writing,
5
+ refactoring, or reviewing ANY code in this repo (src/, tests/, cli, the package), and before adding a dependency
6
+ or a design pattern. Encodes Clean Code (Uncle Bob), SOLID-via-Protocols, the earned-patterns rule, the uv/ruff/
7
+ mypy/pytest/CI toolchain, src-layout, and the research discipline. Full detail in docs/engineering.md.
8
+ ---
9
+
10
+ # causal-worlds conventions
11
+
12
+ Apply these when touching this package. Full reference: [docs/engineering.md](../../../docs/engineering.md).
13
+ Mix of engineering + research; CLI-first (typer); Gemini is an *independent* judge (≠ author model family).
14
+
15
+ ## Before you write code
16
+ - **Clean Code (all of it), NOT Clean Architecture.** Small functions that do one thing at one level of
17
+ abstraction; intention-revealing names; **≤2 args** (bundle into a value object), **no flag args**, **no hidden
18
+ side effects**, **Command-Query Separation**. **Exceptions, not error codes; never return/pass `null`/`None`-as-
19
+ error.** No commented-out code (delete it). No magic numbers (named constants). Respect the Law of Demeter.
20
+ - **SOLID via `typing.Protocol`.** The four variation points get Protocols + injected impls:
21
+ `Discoverer` (grader), `Judge` (LLM), `Substrate`/`World`, `Gate`. Depend on the Protocol; **never** let
22
+ `causal-learn`/`gies`/Gemini types leak past an **adapter**.
23
+ - **Patterns are earned** (Strategy/Adapter for discoverer·judge·substrate; Pipeline for gates). **No speculative
24
+ abstraction.** If you add a pattern, name it and justify it in the PR.
25
+ - **Separate construction from use:** build/inject dependencies at the edge (`cli`, factories); the core never
26
+ instantiates ("news up") its own collaborators.
27
+ - **Structure:** `src/causal_worlds/<feature>/`; tests mirror; third-party imports only inside a feature's adapter.
28
+ - **Docstrings:** Google style. **Type everything** (mypy strict).
29
+ - **`from __future__ import annotations` only when needed** (TYPE_CHECKING-only annotation imports / forward refs);
30
+ **never** in typer CLI or pydantic modules (they read annotations at runtime). 3.13 needs no future import for
31
+ `X | None` / `tuple[...]`.
32
+
33
+ ## Before you commit — run the gate (it must be green)
34
+ ```bash
35
+ make validate # or:
36
+ uv run ruff format --check . && uv run ruff check . && uv run mypy && uv run pytest
37
+ ```
38
+ CI runs the same checks and **fails** on any violation; a green CI run is the merge gate. Use atomic Conventional Commits with **no
39
+ `Co-Authored-By` trailer**. Push/PR only when asked.
40
+
41
+ ## Tests (F.I.R.S.T.)
42
+ Fast · Independent · Repeatable · Self-validating · Timely. One concept per test. Prefer **Hypothesis property
43
+ tests** for invariants (acyclicity, interventions break the right edges, seed→determinism) over fixed-output tests.
44
+
45
+ ## Research code (`spikes/`, `experiments/`)
46
+ NOT shipped; lint/type/coverage-exempt. Held to "**is the finding real and honestly reported**," not production
47
+ polish. **Measured, not asserted:** every claim has a runnable script that prints the evidence. Reproducible via
48
+ **seed + `uv.lock` + pinned model ids** (e.g. `gemini-3.5-flash`). Report honest negatives. Use an **independent
49
+ judge** for LLM-output quality (don't grade a model with itself). A proven spike **graduates** into `src/` rebuilt
50
+ to the standards above — the spike is the proof, not the implementation.
51
+
52
+ ## Boundaries, LLM I/O & observability
53
+ - **Data models per use-case:** frozen `@dataclass` in the pure core (valid-by-construction; parse-don't-validate);
54
+ **pydantic v2** only at boundaries (LLM output, CLI, config) — convert the pydantic boundary model into the
55
+ dataclass core IR at the edge.
56
+ - **LLM structured output:** use **instructor** (pydantic models, **bounded** re-ask on validation failure, then
57
+ raise — never fabricate) behind the `Judge`/author adapter; Gemini is the independent judge.
58
+ - **Observability from day 1:** **Langfuse (OTEL-based)** spans around LLM calls + each pipeline stage, behind a
59
+ thin tracing seam (optional at runtime). Three channels, never conflated: logs (shell), traces (Langfuse/OTel),
60
+ exceptions (control flow). The pure core stays silent.
61
+ - **Errors & logging:** root `CausalWorldsError` + domain subclasses; **fail loud**; library logs to
62
+ `getLogger("causal_worlds")` + `NullHandler` (the app/CLI owns handlers); **never log secrets.**
63
+
64
+ ## Adding a dependency
65
+ Justify it; pin via `uv`; wrap it behind a Protocol+adapter; prefer the standard library and reuse over new deps.
@@ -0,0 +1,19 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ charset = utf-8
6
+ trim_trailing_whitespace = true
7
+ insert_final_newline = true
8
+
9
+ [*.py]
10
+ indent_style = space
11
+ indent_size = 4
12
+ max_line_length = 100
13
+
14
+ [*.{json,yml,yaml,toml}]
15
+ indent_style = space
16
+ indent_size = 2
17
+
18
+ [Makefile]
19
+ indent_style = tab
@@ -0,0 +1,32 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ quality:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+ with:
17
+ python-version: "3.13"
18
+
19
+ - name: Sync (deps + dev tools)
20
+ run: uv sync
21
+
22
+ - name: Format check (ruff)
23
+ run: uv run ruff format --check .
24
+
25
+ - name: Lint (ruff)
26
+ run: uv run ruff check .
27
+
28
+ - name: Type check (mypy strict)
29
+ run: uv run mypy
30
+
31
+ - name: Tests + coverage floor
32
+ run: uv run pytest
@@ -0,0 +1,22 @@
1
+ name: Publish to PyPI
2
+
3
+ # Publishes on a GitHub Release via PyPI Trusted Publishing (OIDC) — no API token is stored anywhere.
4
+ # One-time setup on PyPI is required first; see RELEASING.md.
5
+
6
+ on:
7
+ release:
8
+ types: [published]
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ permissions:
14
+ id-token: write # required for trusted publishing (OIDC)
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v5
19
+ - name: Build sdist + wheel
20
+ run: uv build
21
+ - name: Publish to PyPI
22
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,35 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+ env/
11
+
12
+ # Env / secrets
13
+ .env
14
+ .env.*
15
+ !.env.example
16
+
17
+ # Tooling caches
18
+ .pytest_cache/
19
+ .mypy_cache/
20
+ .ruff_cache/
21
+ .ipynb_checkpoints/
22
+
23
+ # OS / editor
24
+ .DS_Store
25
+ *.swp
26
+ .idea/
27
+ .vscode/
28
+
29
+ # Generated worlds / scratch / logs
30
+ /out/
31
+ /scratch/
32
+ *.log
33
+
34
+ # coverage data
35
+ .coverage
@@ -0,0 +1,19 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-toml
9
+ - id: check-merge-conflict
10
+ - id: check-added-large-files
11
+ args: ["--maxkb=1024"]
12
+ - id: detect-private-key
13
+
14
+ - repo: https://github.com/astral-sh/ruff-pre-commit
15
+ rev: v0.8.4
16
+ hooks:
17
+ - id: ruff
18
+ args: ["--fix"]
19
+ - id: ruff-format
@@ -0,0 +1,182 @@
1
+ # Changelog
2
+
3
+ All notable changes to causal-worlds are documented here. Format: [Keep a Changelog](https://keepachangelog.com/);
4
+ this project follows [Semantic Versioning](https://semver.org/).
5
+
6
+ ## [0.7.0] — 2026-06-23
7
+
8
+ **Temporal worlds (foundation).** Worlds can now carry *time* — lagged edges and autoregression —
9
+ not just cross-sectional structure. (Time-series *grading* + baselines land next.)
10
+
11
+ ### Added
12
+ - **Lagged IR** (`schema`): `Term.lag` (default 0). Only the contemporaneous (lag-0) subgraph must be
13
+ acyclic; lagged edges — including autoregressive self-loops — are valid (they read the past).
14
+ - **Temporal substrate** (`sample`): when any lag is present, sampling becomes sequential over
15
+ timesteps with a burn-in (near-stationary); `do()` interventions hold across time. Cross-sectional
16
+ worlds keep the original vectorized i.i.d. path unchanged.
17
+ - **`temporal_answer_key(spec)`** → lagged ground truth `(src, dst, lag)` incl. autoregression; the
18
+ summary `answer_key` now collapses lags and drops self-loops, so existing tooling is unaffected.
19
+ - **Built-in `supply`** — a temporal world (autoregressive lead time + inventory, a hidden logistics
20
+ confounder), in a separate registry (`worlds.temporal_names()`) so the still-contemporaneous CLI
21
+ `grade`/`gate` don't mis-score it.
22
+
23
+ [0.7.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.7.0
24
+
25
+ ## [0.6.1] — 2026-06-23
26
+
27
+ ### Docs
28
+ - Rewrote the README around a getting-started flow (honest shipped-vs-roadmap; the gym/temporal/
29
+ counterfactual claims are now roadmap, not overclaims), with a lead example, the measured crossover
30
+ result, install/extras, concepts, and a roadmap.
31
+ - Added a guided [`docs/getting-started.md`](docs/getting-started.md) and runnable
32
+ [`examples/`](examples/) (grade-your-discoverer, inspect-a-bundle — keyless — and author-a-world).
33
+
34
+ [0.6.1]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.6.1
35
+
36
+ ## [0.6.0] — 2026-06-23
37
+
38
+ **Use the benchmark.** Grading your own discoverer against a shipped world is now a first-class,
39
+ typed, tested feature — the package's whole purpose.
40
+
41
+ ### Added
42
+ - **`bench`**: `grade_spec(spec, discoverer)` and `grade_bundle(bundle_dir, discoverer)` → a `Report`
43
+ scoring any `Discoverer` against a world's declared answer-key (defaults to the reference grader).
44
+ - **CLI `score`**: `causal-worlds score <bundle> [--discoverer module:Class]` grades a discoverer
45
+ (the reference by default, or any importable one) on a persisted world.
46
+ - **Typed distribution**: ship a PEP 561 `py.typed` marker, plus PyPI metadata (classifiers,
47
+ keywords, project URLs).
48
+
49
+ [0.6.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.6.0
50
+
51
+ ## [0.5.0] — 2026-06-23
52
+
53
+ **Scale resolves the difficulty question.** A 36-world set across an easy→hard complexity spread gives
54
+ the analyses real range — and structural difficulty turns out to predict the observational collapse.
55
+
56
+ ### Added
57
+ - **Author complexity knob** (`author`): `ClaudeAuthor(..., complexity="easy"|"standard"|"hard")`
58
+ varies how many hidden confounders / regime sign-flips to inject, spreading structural difficulty.
59
+ Recorded per world in the manifest (`Provenance.complexity`).
60
+ - **Scaled benchmark** (`benchmark/v0.5`): 35/36 admitted across complexity levels — mean structural
61
+ difficulty by level 0.0 / 1.4 / 3.0; reference-grader SHD 0.36 / 1.75 / 2.33.
62
+ - **Parameterized evals**: the crossover and structural-difficulty harnesses take a benchmark dir;
63
+ results nest under `evals/*/v0.5/`.
64
+
65
+ ### Findings (powered, n=35)
66
+ - **Crossover strengthens**: the interventional-CI grader keeps **confounded-kept = 0** (never reports
67
+ a hidden-confounded pair as causal) at SHD 1.47 / F1 0.91, while PC/FCI/GIES keep 8–17 and post SHD
68
+ 2.7–6.7.
69
+ - **Structural difficulty predicts observational error (corr +0.62)** where name-guessability does not
70
+ (+0.14) — the hardness is structural (confounders + sign-flips), resolving v0.4's open question and
71
+ turning difficulty into a usable instrument.
72
+
73
+ [0.5.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.5.0
74
+
75
+ ## [0.4.0] — 2026-06-23
76
+
77
+ **A structural-difficulty axis.** v0.3 showed name-guessability difficulty doesn't predict discovery
78
+ error — the hardness is structural. This adds that axis and tests it honestly.
79
+
80
+ ### Added
81
+ - **Structural difficulty** (`difficulty`): `structural_difficulty(spec)` scores discovery-hardness
82
+ from the structure — hidden confounders, confounded pairs, regime **sign-flips**, edge density — with
83
+ a headline trap-count `score`. Pure, deterministic, unit-tested.
84
+ - Structural difficulty is now recorded in every admitted world's `manifest.json`.
85
+ - **Re-analysis** (`evals/structural-difficulty`): reuses the crossover report (no new runs) to test
86
+ whether structural difficulty predicts the collapse.
87
+
88
+ ### Findings (honest)
89
+ - At n=12 with a narrow difficulty range, **neither** name-guessability nor structural difficulty
90
+ cleanly predicts the *magnitude* of error (correlations −0.39…+0.14) — a statistical-power problem,
91
+ not a refutation. The v0.3 crossover (standard methods collapse, grader holds) is unaffected.
92
+ Resolving difficulty-predicts-error is deferred to the scaled set (v0.5).
93
+
94
+ [0.4.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.4.0
95
+
96
+ ## [0.3.0] — 2026-06-23
97
+
98
+ **The decisive experiment.** Proves the benchmark's central claim beyond the single `coffee` world:
99
+ standard discovery collapses on our worlds where the reference interventional-CI grader holds.
100
+
101
+ ### Added
102
+ - **Baseline suite** (`baselines`): PC, GES, FCI (`causal-learn`) and GIES (`gies`) wrapped behind the
103
+ `Discoverer` Protocol as adapters — lazy-imported (the `discover` extra), so the package imports and
104
+ CI run without them; graph-parsing logic is pure and unit-tested. `BaselineResult` carries directed
105
+ edges, bidirected (confounding) marks, and the skeleton for a fair cross-method comparison.
106
+ - **Crossover eval** (`evals/baseline-crossover`): every benchmark world vs every method across seeds →
107
+ skeleton-SHD, directed F1, and *confounded-pair-kept-as-causal* (the trap). **Result (n=12): GO.**
108
+ Standard methods keep the hidden-confounded pair as causal in 7.3–10.0 of 12 worlds (PC/FCI/GIES) and
109
+ post 2–4× the skeleton error; the interventional grader stays at confounded-kept 0.33, SHD 1.31,
110
+ F1 0.91.
111
+ - **Difficulty-vs-error analysis** — *honest negative*: name-guessability difficulty does not yet
112
+ predict discovery error (corr ~0.1); the hardness is structural (confounder+regime). Sharpens v0.4.
113
+ - **Publication artifacts**: a technical blog post (`docs/blog-the-decisive-experiment.md`) and a
114
+ Framing-B paper skeleton (`paper/`).
115
+
116
+ ### Notes
117
+ - `causal-learn`'s GES is numpy-2 incompatible (errors on every world) — reported, not hidden.
118
+
119
+ [0.3.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.3.0
120
+
121
+ ## [0.2.0] — 2026-06-23
122
+
123
+ Closes the generative loop: **natural language in, an admitted causal world out**, plus persistence
124
+ and a shipped benchmark set. The LLM seams are real but isolated — the package still imports and CI
125
+ still runs with no API key (the adapters are unit-tested against fakes).
126
+
127
+ ### Added
128
+ - **NL author** (`author`): `ClaudeAuthor` turns a plain-language operation into a `WorldSpec` via
129
+ `instructor` (bounded re-ask), steered toward recoverable, anti-cliché worlds (a hidden confounder
130
+ + a regime sign-flip). Behind the `Author` Protocol; provider SDK lazy-imported.
131
+ - **Independent judge** (`judge`): `GeminiJudge` guesses the structure from names/roles alone (the
132
+ anti-cliché signal) and scores faithfulness — a *different model family* than the author.
133
+ - **T4 anti-cliché gate** (`gates`): with a judge + prose, rejects unfaithful or guess-from-priors
134
+ worlds and records a `difficulty` score (`1 - F1(judge_prior, truth)`).
135
+ - **The loop** (`generate`): `generate` (author→gate→admit with feedback-driven re-author) and
136
+ `generate_many` (never-raising batch) → `AdmittedWorld`.
137
+ - **Artifact persistence** (`artifact`): self-describing on-disk bundle (`spec.json` / `data.npz` /
138
+ `answer_key.json` / `manifest.json`) with full provenance (models, grader version, seed, grade).
139
+ - **Boundary model** (`serde`): one pydantic `WorldSpecModel` — the author's output target and the
140
+ persisted JSON shape — converting to/from the frozen core IR.
141
+ - **CLI**: `generate <prompt> <out>` and `benchmark <prompts_file> <out>`; author/judge resolved
142
+ through the DI container.
143
+ - **Author-model bake-off** (`evals/author-model-bakeoff`): a reproducible, judged comparison that
144
+ picks the default author model with numbers, not assertion — shipped with the release.
145
+ - **Benchmark set** (`benchmark/v0.2`): 12 authored, admitted worlds across distinct operations —
146
+ mean difficulty 0.28, faithfulness 1.00, reference-grader directed SHD 1.25 / F1 0.92.
147
+
148
+ ### Changed
149
+ - Version is single-sourced from `_version.py` (hatchling dynamic). `.coverage` is no longer tracked.
150
+
151
+ [0.2.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.2.0
152
+
153
+ ## [0.1.0] — 2026-06-22
154
+
155
+ First release: **the deterministic benchmark engine**. Generate (programmatically specified) fictional causal
156
+ worlds with a ground-truth answer-key, sample them, grade a causal-discovery method, and score it — runnable as a
157
+ library and a CLI, with no LLM or API key required.
158
+
159
+ ### Added
160
+ - **Schema / IR** (`schema`): `WorldSpec` as the single source of truth (variables incl. hidden confounders +
161
+ generative `Mechanism`s with regime-switching); the `AnswerKey` (observed edges + confounded pairs) is *derived*,
162
+ never stored; `validate()` static gate.
163
+ - **SCM substrate** (`sample`): a deterministic, seeded executable world; `do()` interventions (constant or
164
+ per-row array). The functional core.
165
+ - **Reference grader** (`discover`): `InterventionalCiDiscoverer` — a spec-blind interventional-CI discoverer that
166
+ recovers the confounder + regime-flip trap (directed SHD 0) where standard observational/score-based methods
167
+ (PC, GES, GIES, FCI) fail.
168
+ - **Scoring** (`evaluation`): directed/skeleton SHD, F1, and `confounded_reported` (flags a causal edge claimed for
169
+ a hidden-confounded pair); `Report`.
170
+ - **Validity gates** (`gates`): `run_gates` → T1 (validity) · T2 (sample-sanity) · T3 (non-triviality vs a
171
+ per-world random-graph null). Admits only if all pass.
172
+ - **Built-in worlds** (`worlds`): `coffee` (the confounder + regime-flip trap) and `ecommerce` (easy control).
173
+ - **CLI** (`causal-worlds`): `version` · `worlds` · `grade <world>` · `gate <world>`.
174
+ - **Wiring**: pydantic-settings `config`, a small DI `container`, and a no-op `Tracer` observability seam.
175
+ - **Quality**: uv + ruff (`select=ALL`) + mypy `strict` + pytest with a coverage floor, enforced by CI.
176
+
177
+ ### Not yet (tracked as v0.2 issues)
178
+ NL/`WorldBrief` → spec **author**, the independent **Gemini judge** + the T4 anti-cliché gate, conversational
179
+ **elicitation**, the **Langfuse (OTEL)** tracing adapter, **artifact/manifest persistence**, grader **hardening**
180
+ (FCI-with-interventions) + world-diversity sweep + knob calibration, and more built-in/temporal worlds.
181
+
182
+ [0.1.0]: https://github.com/noumenal-ai/causal-worlds/releases/tag/v0.1.0
@@ -0,0 +1,34 @@
1
+ # causal-worlds — AI working agreement
2
+
3
+ Short, binding agreement for working in this repo. Keep it short; depth lives in
4
+ [docs/engineering.md](docs/engineering.md), auto-applied via the skill
5
+ [.claude/skills/causal-worlds-conventions/](.claude/skills/causal-worlds-conventions/SKILL.md).
6
+
7
+ ## What this is
8
+ A public (MIT) Python package: **generate a fictional-but-coherent causal *operation* from a natural-language
9
+ description** — an executable simulator, the time-series it emits, and a **declared ground-truth causal structure
10
+ (the answer-key)** — for benchmarking causal-discovery agents. A **mix of engineering and research**. CLI-first
11
+ (typer). Consumes **Gemini** as an *independent* LLM judge (must differ from any author model family). Concept &
12
+ approach are **validated** (see [docs/validation.md](docs/validation.md)); this is the production build.
13
+
14
+ ## Non-negotiables (full detail in docs/engineering.md)
15
+ - **Clean Code (Uncle Bob) — all of it, NOT Clean Architecture.** **SOLID** via Python `Protocol`s.
16
+ - **Design patterns only at proven variation points** (Strategy/Adapter for discoverer·judge·substrate, Pipeline of
17
+ gates). **No abstraction for hypothetical futures.** Reuse over fork.
18
+ - **Wrap every third-party lib** (`causal-learn`, `gies`, Gemini) **behind our own Protocol + adapter.**
19
+ - **Tooling:** `uv` · `ruff` (`select=ALL` + curated ignores, line 100, `ruff format`) · `mypy strict` · `pytest`
20
+ with a coverage floor · pre-commit · **CI that fails**. `src`-layout, feature/capability modules. Google docstrings.
21
+ - **Run the gate before committing:** `make validate` (or `uv run ruff format --check . && uv run ruff check . &&
22
+ uv run mypy && uv run pytest`). **CI green is a merge gate** — that's how we avoid leaving the same review comment twice.
23
+ - **Measured, not asserted.** Every behavioral claim is backed by a runnable script/test.
24
+ - **`spikes/` and `experiments/` are research, NOT shipped** (lint/type/coverage-exempt); reproducible via seed +
25
+ `uv.lock` + pinned model ids; honest negatives.
26
+ - **Commits:** Conventional Commits, atomic, **no `Co-Authored-By` trailer**. Push/PR only on explicit request.
27
+
28
+ ## Map
29
+ - [docs/scope.md](docs/scope.md) · [docs/hld.md](docs/hld.md) · [docs/lld.md](docs/lld.md) ·
30
+ [docs/validation.md](docs/validation.md) — product/design + the validation evidence.
31
+ - [docs/architecture.md](docs/architecture.md) — the finalized system design (pipeline, seams, data contract, DI,
32
+ config, artifact, elicitation).
33
+ - [docs/engineering.md](docs/engineering.md) — the binding code-quality + research guidelines.
34
+ - `spikes/` — the validation spikes (research; the proof, not the implementation).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Noumenal AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,26 @@
1
+ .PHONY: install fmt lint type test check validate hooks
2
+
3
+ install: ## sync env + dev tools
4
+ uv sync
5
+
6
+ fmt: ## auto-format
7
+ uv run ruff format .
8
+
9
+ lint: ## lint (with autofix)
10
+ uv run ruff check --fix .
11
+
12
+ type: ## strict type-check
13
+ uv run mypy
14
+
15
+ test: ## tests + coverage floor
16
+ uv run pytest
17
+
18
+ check: ## the read-only gate (what CI runs, minus tests)
19
+ uv run ruff format --check .
20
+ uv run ruff check .
21
+ uv run mypy
22
+
23
+ validate: check test ## the full gate — run before every commit
24
+
25
+ hooks: ## install pre-commit hooks
26
+ uv run pre-commit install