probity-bench 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (541)
  1. probity_bench-1.1.0/LICENSE +21 -0
  2. probity_bench-1.1.0/PKG-INFO +193 -0
  3. probity_bench-1.1.0/README.md +154 -0
  4. probity_bench-1.1.0/probity_bench.egg-info/PKG-INFO +193 -0
  5. probity_bench-1.1.0/probity_bench.egg-info/SOURCES.txt +539 -0
  6. probity_bench-1.1.0/probity_bench.egg-info/dependency_links.txt +1 -0
  7. probity_bench-1.1.0/probity_bench.egg-info/entry_points.txt +2 -0
  8. probity_bench-1.1.0/probity_bench.egg-info/top_level.txt +1 -0
  9. probity_bench-1.1.0/probity_cli/__init__.py +9 -0
  10. probity_bench-1.1.0/probity_cli/__main__.py +14 -0
  11. probity_bench-1.1.0/probity_cli/cli.py +100 -0
  12. probity_bench-1.1.0/probity_cli/config.py +98 -0
  13. probity_bench-1.1.0/probity_cli/data/demo/demo.gif +0 -0
  14. probity_bench-1.1.0/probity_cli/data/demo/demo.tape +21 -0
  15. probity_bench-1.1.0/probity_cli/data/demo/onboard.gif +0 -0
  16. probity_bench-1.1.0/probity_cli/data/demo/onboard.tape +41 -0
  17. probity_bench-1.1.0/probity_cli/data/demo/wobble_demo.py +55 -0
  18. probity_bench-1.1.0/probity_cli/data/engine/corpus_utils.py +25 -0
  19. probity_bench-1.1.0/probity_cli/data/engine/edgar.py +63 -0
  20. probity_bench-1.1.0/probity_cli/data/engine/harness.py +207 -0
  21. probity_bench-1.1.0/probity_cli/data/engine/models.py +244 -0
  22. probity_bench-1.1.0/probity_cli/data/engine/normalize.py +106 -0
  23. probity_bench-1.1.0/probity_cli/data/engine/registry.json +762 -0
  24. probity_bench-1.1.0/probity_cli/data/engine/runner.py +118 -0
  25. probity_bench-1.1.0/probity_cli/data/engine/scorer.py +112 -0
  26. probity_bench-1.1.0/probity_cli/data/engine/task_builder.py +19 -0
  27. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/candidates.jsonl +38 -0
  28. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/oracle.jsonl +13 -0
  29. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/run.log +49 -0
  30. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/run.py +12 -0
  31. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/runs_deepseek-v4f.jsonl +260 -0
  32. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/runs_gemma3-1b.jsonl +260 -0
  33. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/scored.json +390 -0
  34. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/source.py +73 -0
  35. probity_bench-1.1.0/probity_cli/data/leaves/acceleration_trigger/task.py +45 -0
  36. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/oracle.jsonl +10 -0
  37. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/run.py +12 -0
  38. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/runs_deepseek-v4f.jsonl +200 -0
  39. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/runs_gemma3-1b.jsonl +200 -0
  40. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/scored.json +312 -0
  41. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/source.py +105 -0
  42. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_base/task.py +71 -0
  43. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/candidates.jsonl +62 -0
  44. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/oracle.jsonl +5 -0
  45. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/run.log +49 -0
  46. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/run.py +12 -0
  47. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/runs_deepseek-v4f.jsonl +100 -0
  48. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/runs_gemma3-1b.jsonl +100 -0
  49. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/scored.json +182 -0
  50. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/source.log +64 -0
  51. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/source.py +84 -0
  52. probity_bench-1.1.0/probity_cli/data/leaves/antidilution_type/task.py +62 -0
  53. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/oracle.jsonl +5 -0
  54. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/run.py +12 -0
  55. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/runs_deepseek-v4f.jsonl +100 -0
  56. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/runs_gemma3-1b.jsonl +100 -0
  57. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/scored.json +182 -0
  58. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/source.py +83 -0
  59. probity_bench-1.1.0/probity_cli/data/leaves/auto_conversion_trigger/task.py +54 -0
  60. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/candidates_present.jsonl +40 -0
  61. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/fetch.py +120 -0
  62. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/oracle.jsonl +9 -0
  63. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/run.log +49 -0
  64. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/run.py +12 -0
  65. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/runs_deepseek-v4f.jsonl +180 -0
  66. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/runs_gemma3-1b.jsonl +180 -0
  67. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/scored.json +286 -0
  68. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/source.py +88 -0
  69. probity_bench-1.1.0/probity_cli/data/leaves/board_seats_investor/task.py +31 -0
  70. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/candidates_absent.jsonl +24 -0
  71. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/candidates_present.jsonl +30 -0
  72. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/fetch.py +65 -0
  73. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/oracle.jsonl +12 -0
  74. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/run.log +49 -0
  75. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/run.py +12 -0
  76. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/runs_deepseek-v4f.jsonl +240 -0
  77. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/runs_gemma3-1b.jsonl +240 -0
  78. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/scored.json +364 -0
  79. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/source.py +71 -0
  80. probity_bench-1.1.0/probity_cli/data/leaves/cliff_present/task.py +46 -0
  81. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/oracle.jsonl +5 -0
  82. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/run.log +49 -0
  83. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/run.py +12 -0
  84. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/runs_deepseek-v4f.jsonl +100 -0
  85. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/runs_gemma3-1b.jsonl +100 -0
  86. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/scored.json +182 -0
  87. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/source.py +73 -0
  88. probity_bench-1.1.0/probity_cli/data/leaves/conversion_ratio/task.py +33 -0
  89. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/oracle.jsonl +2 -0
  90. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/run.py +12 -0
  91. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/runs_deepseek-v4f.jsonl +40 -0
  92. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/runs_gemma3-1b.jsonl +40 -0
  93. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/scored.json +104 -0
  94. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/source.py +73 -0
  95. probity_bench-1.1.0/probity_cli/data/leaves/convert_vs_preference_decision/task.py +46 -0
  96. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/oracle.jsonl +9 -0
  97. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/run.py +12 -0
  98. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/runs_deepseek-v4f.jsonl +180 -0
  99. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/runs_gemma3-1b.jsonl +180 -0
  100. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/scored.json +286 -0
  101. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/source.py +97 -0
  102. probity_bench-1.1.0/probity_cli/data/leaves/current_ownership_pct/task.py +38 -0
  103. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/oracle.jsonl +16 -0
  104. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/run.log +49 -0
  105. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/run.py +13 -0
  106. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/runs_deepseek-v4f.jsonl +320 -0
  107. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/runs_gemma3-1b.jsonl +320 -0
  108. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/scored.json +468 -0
  109. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/source.py +82 -0
  110. probity_bench-1.1.0/probity_cli/data/leaves/dividend_cumulative/task.py +41 -0
  111. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/oracle.jsonl +6 -0
  112. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/run.py +12 -0
  113. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/runs_deepseek-v4f.jsonl +120 -0
  114. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/runs_gemma3-1b.jsonl +120 -0
  115. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/scored.json +208 -0
  116. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/source.py +92 -0
  117. probity_bench-1.1.0/probity_cli/data/leaves/dividend_rate_pct/task.py +32 -0
  118. probity_bench-1.1.0/probity_cli/data/leaves/down_round_adjustment/DEFERRED.md +43 -0
  119. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/candidates_absent.jsonl +16 -0
  120. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/candidates_present.jsonl +30 -0
  121. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/fetch.py +64 -0
  122. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/oracle.jsonl +12 -0
  123. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/run.py +12 -0
  124. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/runs_deepseek-v4f.jsonl +240 -0
  125. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/runs_gemma3-1b.jsonl +240 -0
  126. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/scored.json +364 -0
  127. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/source.py +85 -0
  128. probity_bench-1.1.0/probity_cli/data/leaves/drag_along/task.py +43 -0
  129. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/oracle.jsonl +1 -0
  130. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/run.py +12 -0
  131. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/runs_deepseek-v4f.jsonl +20 -0
  132. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/runs_gemma3-1b.jsonl +20 -0
  133. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/scored.json +78 -0
  134. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/source.py +91 -0
  135. probity_bench-1.1.0/probity_cli/data/leaves/employee_pool_pct/task.py +38 -0
  136. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/oracle.jsonl +5 -0
  137. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/run.py +12 -0
  138. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/runs_deepseek-v4f.jsonl +100 -0
  139. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/runs_gemma3-1b.jsonl +100 -0
  140. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/scored.json +182 -0
  141. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/source.py +77 -0
  142. probity_bench-1.1.0/probity_cli/data/leaves/exercise_window/task.py +41 -0
  143. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/oracle.jsonl +5 -0
  144. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/run.py +12 -0
  145. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/runs_deepseek-v4f.jsonl +100 -0
  146. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/runs_gemma3-1b.jsonl +100 -0
  147. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/scored.json +182 -0
  148. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/source.py +85 -0
  149. probity_bench-1.1.0/probity_cli/data/leaves/financial_statement_qa/task.py +46 -0
  150. probity_bench-1.1.0/probity_cli/data/leaves/flag_founder_hostile_vesting/DEFERRED.md +31 -0
  151. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/candidates_full_ratchet.jsonl +6 -0
  152. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/candidates_weighted_avg.jsonl +22 -0
  153. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/fetch.py +101 -0
  154. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/oracle.jsonl +7 -0
  155. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/run.log +49 -0
  156. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/run.py +12 -0
  157. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/runs_deepseek-v4f.jsonl +140 -0
  158. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/runs_gemma3-1b.jsonl +140 -0
  159. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/scored.json +234 -0
  160. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/source.py +123 -0
  161. probity_bench-1.1.0/probity_cli/data/leaves/flag_full_ratchet/task.py +48 -0
  162. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/oracle.jsonl +5 -0
  163. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/run.py +12 -0
  164. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/runs_deepseek-v4f.jsonl +100 -0
  165. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/runs_gemma3-1b.jsonl +100 -0
  166. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/scored.json +182 -0
  167. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/source.py +97 -0
  168. probity_bench-1.1.0/probity_cli/data/leaves/flag_internal_inconsistency/task.py +47 -0
  169. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/oracle.jsonl +4 -0
  170. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/run.py +12 -0
  171. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/runs_deepseek-v4f.jsonl +80 -0
  172. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/runs_gemma3-1b.jsonl +80 -0
  173. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/scored.json +156 -0
  174. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/source.py +85 -0
  175. probity_bench-1.1.0/probity_cli/data/leaves/flag_missing_pro_rata/task.py +44 -0
  176. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/candidates_true.jsonl +2 -0
  177. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/oracle.jsonl +10 -0
  178. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/run.log +49 -0
  179. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/run.py +12 -0
  180. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/runs_deepseek-v4f.jsonl +200 -0
  181. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/runs_gemma3-1b.jsonl +200 -0
  182. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/scored.json +312 -0
  183. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/source.py +89 -0
  184. probity_bench-1.1.0/probity_cli/data/leaves/flag_offmarket_liqpref/task.py +44 -0
  185. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/oracle.jsonl +13 -0
  186. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/run.log +49 -0
  187. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/run.py +12 -0
  188. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/runs_deepseek-v4f.jsonl +260 -0
  189. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/runs_gemma3-1b.jsonl +260 -0
  190. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/scored.json +390 -0
  191. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/source.py +75 -0
  192. probity_bench-1.1.0/probity_cli/data/leaves/flag_uncapped_participation/task.py +43 -0
  193. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/oracle.jsonl +2 -0
  194. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/run.py +12 -0
  195. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/runs_deepseek-v4f.jsonl +40 -0
  196. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/runs_gemma3-1b.jsonl +40 -0
  197. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/scored.json +104 -0
  198. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/source.py +74 -0
  199. probity_bench-1.1.0/probity_cli/data/leaves/form_d_fields/task.py +45 -0
  200. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/oracle.jsonl +3 -0
  201. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/run.py +12 -0
  202. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/runs_deepseek-v4f.jsonl +60 -0
  203. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/runs_gemma3-1b.jsonl +60 -0
  204. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/scored.json +130 -0
  205. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/source.py +78 -0
  206. probity_bench-1.1.0/probity_cli/data/leaves/founder_ownership_pct/task.py +38 -0
  207. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/candidates_fully_diluted.jsonl +54 -0
  208. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/candidates_issued_outstanding.jsonl +50 -0
  209. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/fetch.py +97 -0
  210. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/fetch2.py +46 -0
  211. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/oracle.jsonl +8 -0
  212. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/run.py +13 -0
  213. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/runs_deepseek-v4f.jsonl +160 -0
  214. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/runs_gemma3-1b.jsonl +160 -0
  215. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/scored.json +260 -0
  216. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/source.py +108 -0
  217. probity_bench-1.1.0/probity_cli/data/leaves/fully_diluted_basis/task.py +46 -0
  218. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/candidates_absent.jsonl +30 -0
  219. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/candidates_present.jsonl +40 -0
  220. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/fetch.py +128 -0
  221. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/oracle.jsonl +12 -0
  222. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/run.log +49 -0
  223. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/run.py +12 -0
  224. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/runs_deepseek-v4f.jsonl +240 -0
  225. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/runs_gemma3-1b.jsonl +240 -0
  226. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/scored.json +364 -0
  227. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/source.py +69 -0
  228. probity_bench-1.1.0/probity_cli/data/leaves/information_rights/task.py +42 -0
  229. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/oracle.jsonl +4 -0
  230. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/run.py +12 -0
  231. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/runs_deepseek-v4f.jsonl +80 -0
  232. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/runs_gemma3-1b.jsonl +80 -0
  233. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/scored.json +156 -0
  234. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/source.py +85 -0
  235. probity_bench-1.1.0/probity_cli/data/leaves/investor_ownership_pct/task.py +39 -0
  236. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/oracle.jsonl +9 -0
  237. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/run.log +49 -0
  238. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/run.py +12 -0
  239. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/runs_deepseek-v4f.jsonl +180 -0
  240. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/runs_gemma3-1b.jsonl +180 -0
  241. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/scored.json +286 -0
  242. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/source.py +79 -0
  243. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_preference_multiple/task.py +58 -0
  244. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/oracle.jsonl +4 -0
  245. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/run.py +32 -0
  246. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/runs_deepseek-v4f.jsonl +80 -0
  247. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/runs_gemma3-1b.jsonl +80 -0
  248. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/scored.json +156 -0
  249. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/source.py +144 -0
  250. probity_bench-1.1.0/probity_cli/data/leaves/liquidation_waterfall_payout/task.py +91 -0
  251. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/oracle.jsonl +5 -0
  252. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/run.py +12 -0
  253. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/runs_deepseek-v4f.jsonl +100 -0
  254. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/runs_gemma3-1b.jsonl +100 -0
  255. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/scored.json +182 -0
  256. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/source.py +92 -0
  257. probity_bench-1.1.0/probity_cli/data/leaves/multi_round_stacked_dilution/task.py +47 -0
  258. probity_bench-1.1.0/probity_cli/data/leaves/note_conversion_amount/DEFERRED.md +23 -0
  259. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/oracle.jsonl +4 -0
  260. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/run.py +12 -0
  261. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/runs_deepseek-v4f.jsonl +80 -0
  262. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/runs_gemma3-1b.jsonl +80 -0
  263. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/scored.json +156 -0
  264. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/source.py +112 -0
  265. probity_bench-1.1.0/probity_cli/data/leaves/note_discount/task.py +54 -0
  266. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/oracle.jsonl +6 -0
  267. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/run.py +12 -0
  268. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/runs_deepseek-v4f.jsonl +120 -0
  269. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/runs_gemma3-1b.jsonl +120 -0
  270. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/scored.json +208 -0
  271. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/source.py +118 -0
  272. probity_bench-1.1.0/probity_cli/data/leaves/note_interest_rate/task.py +36 -0
  273. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/oracle.jsonl +4 -0
  274. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/run.py +12 -0
  275. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/runs_deepseek-v4f.jsonl +80 -0
  276. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/runs_gemma3-1b.jsonl +80 -0
  277. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/scored.json +156 -0
  278. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/source.py +109 -0
  279. probity_bench-1.1.0/probity_cli/data/leaves/note_maturity_date/task.py +38 -0
  280. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/oracle.jsonl +7 -0
  281. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/run.log +49 -0
  282. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/run.py +12 -0
  283. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/runs_deepseek-v4f.jsonl +140 -0
  284. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/runs_gemma3-1b.jsonl +140 -0
  285. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/scored.json +234 -0
  286. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/source.py +116 -0
  287. probity_bench-1.1.0/probity_cli/data/leaves/note_principal/task.py +30 -0
  288. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/oracle.jsonl +2 -0
  289. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/run.py +12 -0
  290. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/runs_deepseek-v4f.jsonl +40 -0
  291. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/runs_gemma3-1b.jsonl +40 -0
  292. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/scored.json +104 -0
  293. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/source.py +70 -0
  294. probity_bench-1.1.0/probity_cli/data/leaves/note_qualified_financing_threshold/task.py +47 -0
  295. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/oracle.jsonl +4 -0
  296. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/run.log +49 -0
  297. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/run.py +12 -0
  298. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/runs_deepseek-v4f.jsonl +80 -0
  299. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/runs_gemma3-1b.jsonl +80 -0
  300. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/scored.json +156 -0
  301. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/source.py +80 -0
  302. probity_bench-1.1.0/probity_cli/data/leaves/note_valuation_cap/task.py +33 -0
  303. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/oracle.jsonl +3 -0
  304. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/run.py +12 -0
  305. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/runs_deepseek-v4f.jsonl +60 -0
  306. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/runs_gemma3-1b.jsonl +60 -0
  307. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/scored.json +130 -0
  308. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/source.py +77 -0
  309. probity_bench-1.1.0/probity_cli/data/leaves/option_pool_shuffle/task.py +49 -0
  310. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/oracle.jsonl +7 -0
  311. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/run.log +49 -0
  312. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/run.py +12 -0
  313. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/runs_deepseek-v4f.jsonl +140 -0
  314. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/runs_gemma3-1b.jsonl +140 -0
  315. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/scored.json +234 -0
  316. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/source.py +105 -0
  317. probity_bench-1.1.0/probity_cli/data/leaves/option_strike_409a/task.py +43 -0
  318. probity_bench-1.1.0/probity_cli/data/leaves/participating_vs_nonpart_payout/DEFERRED.md +52 -0
  319. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/oracle.jsonl +3 -0
  320. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/run.log +49 -0
  321. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/run.py +12 -0
  322. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/runs_deepseek-v4f.jsonl +60 -0
  323. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/runs_gemma3-1b.jsonl +60 -0
  324. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/scored.json +130 -0
  325. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/source.py +72 -0
  326. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap/task.py +37 -0
  327. probity_bench-1.1.0/probity_cli/data/leaves/participation_cap_hit/DEFERRED.md +47 -0
  328. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/candidates.jsonl +43 -0
  329. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/oracle.jsonl +18 -0
  330. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/run.py +105 -0
  331. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/run_ds.log +26 -0
  332. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/run_ladder.log +74 -0
  333. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/runs_deepseek-v4f.jsonl +360 -0
  334. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/runs_gemma3-1b.jsonl +360 -0
  335. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/scored.json +520 -0
  336. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/source.py +180 -0
  337. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/source_more.log +27 -0
  338. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/source_more.py +97 -0
  339. probity_bench-1.1.0/probity_cli/data/leaves/participation_type/task.py +112 -0
  340. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/oracle.jsonl +5 -0
  341. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/run.py +12 -0
  342. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/runs_deepseek-v4f.jsonl +100 -0
  343. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/runs_gemma3-1b.jsonl +100 -0
  344. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/scored.json +182 -0
  345. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/source.py +78 -0
  346. probity_bench-1.1.0/probity_cli/data/leaves/per_investor_allocation/task.py +40 -0
  347. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/candidates.jsonl +20 -0
  348. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/fetch.py +78 -0
  349. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/oracle.jsonl +4 -0
  350. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/run.log +49 -0
  351. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/run.py +12 -0
  352. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/runs_deepseek-v4f.jsonl +80 -0
  353. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/runs_gemma3-1b.jsonl +80 -0
  354. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/scored.json +156 -0
  355. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/source.py +82 -0
  356. probity_bench-1.1.0/probity_cli/data/leaves/post_money_valuation/task.py +36 -0
  357. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/candidates_post.jsonl +35 -0
  358. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/candidates_pre.jsonl +36 -0
  359. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/fetch.py +79 -0
  360. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/oracle.jsonl +19 -0
  361. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/run.log +49 -0
  362. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/run.py +12 -0
  363. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/runs_deepseek-v4f.jsonl +380 -0
  364. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/runs_gemma3-1b.jsonl +380 -0
  365. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/scored.json +546 -0
  366. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/source.py +156 -0
  367. probity_bench-1.1.0/probity_cli/data/leaves/pre_vs_post_money/task.py +45 -0
  368. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/candidates_pari.jsonl +6 -0
  369. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/candidates_stack.jsonl +6 -0
  370. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/oracle.jsonl +11 -0
  371. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/run.log +49 -0
  372. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/run.py +12 -0
  373. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/runs_deepseek-v4f.jsonl +220 -0
  374. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/runs_gemma3-1b.jsonl +220 -0
  375. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/scored.json +338 -0
  376. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/source.py +90 -0
  377. probity_bench-1.1.0/probity_cli/data/leaves/preference_seniority/task.py +44 -0
  378. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/oracle.jsonl +2 -0
  379. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/run.py +12 -0
  380. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/runs_deepseek-v4f.jsonl +40 -0
  381. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/runs_gemma3-1b.jsonl +40 -0
  382. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/scored.json +104 -0
  383. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/source.py +63 -0
  384. probity_bench-1.1.0/probity_cli/data/leaves/preference_stack_payout/task.py +40 -0
  385. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/oracle.jsonl +8 -0
  386. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/run.py +6 -0
  387. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/runs_deepseek-v4f.jsonl +160 -0
  388. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/runs_gemma3-1b.jsonl +160 -0
  389. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/scored.json +260 -0
  390. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/source.py +170 -0
  391. probity_bench-1.1.0/probity_cli/data/leaves/price_per_share/task.py +31 -0
  392. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/candidates_absent.jsonl +35 -0
  393. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/candidates_present.jsonl +40 -0
  394. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/fetch.py +67 -0
  395. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/oracle.jsonl +12 -0
  396. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/run.log +49 -0
  397. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/run.py +12 -0
  398. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/runs_deepseek-v4f.jsonl +240 -0
  399. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/runs_gemma3-1b.jsonl +240 -0
  400. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/scored.json +364 -0
  401. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/source.py +69 -0
  402. probity_bench-1.1.0/probity_cli/data/leaves/pro_rata_rights/task.py +46 -0
  403. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/candidates_absent.jsonl +22 -0
  404. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/candidates_present.jsonl +40 -0
  405. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/fetch.py +71 -0
  406. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/oracle.jsonl +12 -0
  407. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/run.log +49 -0
  408. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/run.py +12 -0
  409. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/runs_deepseek-v4f.jsonl +240 -0
  410. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/runs_gemma3-1b.jsonl +240 -0
  411. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/scored.json +364 -0
  412. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/source.py +69 -0
  413. probity_bench-1.1.0/probity_cli/data/leaves/protective_provisions/task.py +44 -0
  414. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/candidates_no.jsonl +3 -0
  415. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/candidates_yes.jsonl +4 -0
  416. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/oracle.jsonl +10 -0
  417. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/run.log +49 -0
  418. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/run.py +12 -0
  419. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/runs_deepseek-v4f.jsonl +200 -0
  420. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/runs_gemma3-1b.jsonl +200 -0
  421. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/scored.json +312 -0
  422. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/source.py +89 -0
  423. probity_bench-1.1.0/probity_cli/data/leaves/redemption_rights/task.py +41 -0
  424. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/fetch.py +22 -0
  425. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/oracle.jsonl +12 -0
  426. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/run.log +38 -0
  427. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/run.py +12 -0
  428. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/run_ds.log +49 -0
  429. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/runs_deepseek-v4f.jsonl +240 -0
  430. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/runs_gemma3-1b.jsonl +240 -0
  431. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/scored.json +364 -0
  432. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/source.py +77 -0
  433. probity_bench-1.1.0/probity_cli/data/leaves/rofr_cosale/task.py +48 -0
  434. probity_bench-1.1.0/probity_cli/data/leaves/round_size/oracle.jsonl +10 -0
  435. probity_bench-1.1.0/probity_cli/data/leaves/round_size/run.py +12 -0
  436. probity_bench-1.1.0/probity_cli/data/leaves/round_size/runs_deepseek-v4f.jsonl +200 -0
  437. probity_bench-1.1.0/probity_cli/data/leaves/round_size/runs_gemma3-1b.jsonl +200 -0
  438. probity_bench-1.1.0/probity_cli/data/leaves/round_size/scored.json +312 -0
  439. probity_bench-1.1.0/probity_cli/data/leaves/round_size/source.py +88 -0
  440. probity_bench-1.1.0/probity_cli/data/leaves/round_size/task.py +55 -0
  441. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/oracle.jsonl +5 -0
  442. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/run.py +12 -0
  443. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/runs_deepseek-v4f.jsonl +100 -0
  444. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/runs_gemma3-1b.jsonl +100 -0
  445. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/scored.json +182 -0
  446. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/source.py +96 -0
  447. probity_bench-1.1.0/probity_cli/data/leaves/s1_risk_factors/task.py +42 -0
  448. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/oracle.jsonl +5 -0
  449. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/run.py +12 -0
  450. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/runs_deepseek-v4f.jsonl +100 -0
  451. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/runs_gemma3-1b.jsonl +100 -0
  452. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/scored.json +182 -0
  453. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/source.py +80 -0
  454. probity_bench-1.1.0/probity_cli/data/leaves/s1_use_of_proceeds/task.py +51 -0
  455. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/oracle.jsonl +13 -0
  456. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/run.py +8 -0
  457. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/runs_deepseek-v4f.jsonl +260 -0
  458. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/runs_gemma3-1b.jsonl +260 -0
  459. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/scored.json +390 -0
  460. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/source.py +175 -0
  461. probity_bench-1.1.0/probity_cli/data/leaves/safe_cap_vs_discount_applies/task.py +45 -0
  462. probity_bench-1.1.0/probity_cli/data/leaves/safe_conversion_shares/DEFERRED.md +28 -0
  463. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/oracle.jsonl +9 -0
  464. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/run.log +49 -0
  465. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/run.py +12 -0
  466. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/runs_deepseek-v4f.jsonl +180 -0
  467. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/runs_gemma3-1b.jsonl +180 -0
  468. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/scored.json +286 -0
  469. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/source.py +95 -0
  470. probity_bench-1.1.0/probity_cli/data/leaves/safe_discount_rate/task.py +34 -0
  471. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/oracle.jsonl +7 -0
  472. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/run.log +49 -0
  473. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/run.py +12 -0
  474. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/runs_deepseek-v4f.jsonl +140 -0
  475. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/runs_gemma3-1b.jsonl +140 -0
  476. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/scored.json +234 -0
  477. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/source.py +86 -0
  478. probity_bench-1.1.0/probity_cli/data/leaves/safe_mfn_present/task.py +44 -0
  479. probity_bench-1.1.0/probity_cli/data/leaves/safe_note_conversion_impact/DEFERRED.md +14 -0
  480. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/candidates.jsonl +20 -0
  481. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/oracle.jsonl +16 -0
  482. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/run.log +95 -0
  483. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/run.py +92 -0
  484. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/runs_deepseek-v4f.jsonl +320 -0
  485. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/runs_gemma3-1b.jsonl +320 -0
  486. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/runs_gemma4-12b.jsonl +320 -0
  487. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/runs_llama3.2-3b.jsonl +320 -0
  488. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/scored.json +934 -0
  489. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/source.log +19 -0
  490. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/source.py +62 -0
  491. probity_bench-1.1.0/probity_cli/data/leaves/safe_pre_post/task.py +39 -0
  492. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/fetch.py +66 -0
  493. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/oracle.jsonl +15 -0
  494. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/run.log +49 -0
  495. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/run.py +12 -0
  496. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/runs_deepseek-v4f.jsonl +300 -0
  497. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/runs_gemma3-1b.jsonl +300 -0
  498. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/scored.json +442 -0
  499. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/source.py +71 -0
  500. probity_bench-1.1.0/probity_cli/data/leaves/safe_pro_rata_side_letter/task.py +42 -0
  501. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/oracle.jsonl +8 -0
  502. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/run.log +49 -0
  503. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/run.py +12 -0
  504. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/runs_deepseek-v4f.jsonl +160 -0
  505. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/runs_gemma3-1b.jsonl +160 -0
  506. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/scored.json +260 -0
  507. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/source.py +90 -0
  508. probity_bench-1.1.0/probity_cli/data/leaves/safe_valuation_cap/task.py +31 -0
  509. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/oracle.jsonl +10 -0
  510. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/run.py +12 -0
  511. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/runs_deepseek-v4f.jsonl +200 -0
  512. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/runs_gemma3-1b.jsonl +200 -0
  513. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/scored.json +312 -0
  514. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/source.py +102 -0
  515. probity_bench-1.1.0/probity_cli/data/leaves/securities_exemption/task.py +49 -0
  516. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/candidates_absent.jsonl +48 -0
  517. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/candidates_present.jsonl +58 -0
  518. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/fetch.py +65 -0
  519. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/oracle.jsonl +9 -0
  520. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/run.log +49 -0
  521. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/run.py +12 -0
  522. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/runs_deepseek-v4f.jsonl +180 -0
  523. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/runs_gemma3-1b.jsonl +180 -0
  524. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/scored.json +286 -0
  525. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/source.py +73 -0
  526. probity_bench-1.1.0/probity_cli/data/leaves/vesting_acceleration/task.py +39 -0
  527. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/oracle.jsonl +9 -0
  528. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/run.py +12 -0
  529. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/runs_deepseek-v4f.jsonl +180 -0
  530. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/runs_gemma3-1b.jsonl +180 -0
  531. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/scored.json +286 -0
  532. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/source.py +109 -0
  533. probity_bench-1.1.0/probity_cli/data/leaves/vesting_schedule/task.py +50 -0
  534. probity_bench-1.1.0/probity_cli/data/results/RESULTS.md +2417 -0
  535. probity_bench-1.1.0/probity_cli/data/results/render.py +677 -0
  536. probity_bench-1.1.0/probity_cli/materialize.py +67 -0
  537. probity_bench-1.1.0/probity_cli/onboard.py +191 -0
  538. probity_bench-1.1.0/pyproject.toml +37 -0
  539. probity_bench-1.1.0/setup.cfg +4 -0
  540. probity_bench-1.1.0/tests/test_engine.py +200 -0
  541. probity_bench-1.1.0/tests/test_probity_cli.py +201 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Seyed Mosayeb Alam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: probity-bench
3
+ Version: 1.1.0
4
+ Summary: An LLM reliability + accuracy benchmark for real fundraising documents -- because LLMs are probabilistic and finance needs determinism.
5
+ Author: eikiyo
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Seyed Mosayeb Alam
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/eikiyo/probity
29
+ Project-URL: Repository, https://github.com/eikiyo/probity
30
+ Project-URL: Changelog, https://github.com/eikiyo/probity/blob/main/CHANGELOG.md
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Operating System :: OS Independent
34
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
35
+ Requires-Python: >=3.9
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+ Dynamic: license-file
39
+
40
+ # Probity
41
+
42
+ [![CI](https://github.com/eikiyo/probity/actions/workflows/ci.yml/badge.svg)](https://github.com/eikiyo/probity/actions/workflows/ci.yml)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
44
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
45
+
46
+ ![Probity demo — the same question asked 20 times, same clause, same model, flipping between pre-money and post-money](demo/demo.gif)
47
+
48
+ LLMs are fundamentally probabilistic. Ask one the same question twice and you can get two
49
+ different answers — that's not a bug, it's how sampling works. Most of the time that's fine. It is
50
+ **not fine** when the question is "is this a pre-money or post-money valuation" and the answer
51
+ decides who owns what in a startup financing. Finance needs determinism; LLMs supply probability.
52
+ Nobody was measuring that gap, so Probity does: it benchmarks how often a model's answer *wobbles*
53
+ on real term sheets, charters, SAFEs, convertible notes, and cap tables — before you ever get to
54
+ whether the answer is right.
55
+
56
+ - **Wobble** (the core metric) — does the model give the *same* answer when you ask it the same
57
+ question 20 times at temperature 0.7? A model whose answer flips run to run cannot be trusted in
58
+ a workflow that touches money, even when it is often right. This is label-free: it needs no
59
+ ground truth, only repetition.
60
+ - **Accuracy** — does the model get the answer *right*, graded against a validated answer that a
61
+ human extracted from the source document (not authored by an AI)?
62
+
63
+ These are scored separately and never averaged into one headline — a model can be perfectly
64
+ consistent and consistently wrong. Models are run across a **size ladder** (1B → 12B local, plus a
65
+ hosted model) to test whether wobble falls as capability rises. Heavier models (a 27B local model
66
+ and hosted frontier models) are reserved for a single comprehensive sweep once every test is built.
67
+
68
+ ## Quickstart
69
+
70
+ ### Option A — install the package (fastest way to run a real benchmark yourself)
71
+
72
+ ```bash
73
+ pip install probity-bench
74
+ probity-bench onboard # pick documents to fetch, models to run, and store your API key(s)
75
+ ```
76
+
77
+ `onboard` is a guided wizard — same idea as `openclaw onboard` or `claude setup`: it walks you
78
+ through which leaves to pull real SEC documents for, which models to benchmark (auto-detects local
79
+ Ollama models; DeepSeek/Gemini for hosted), and collects + **verifies** any API key by making one
80
+ real call before it lets you proceed. Everything is stored locally at `~/.probity/` — nothing
81
+ leaves your machine except the model calls you explicitly configure.
82
+
83
+ ![Probity onboarding — documents, models, and API key setup, all local](demo/onboard.gif)
84
+
85
+ The package ships the **full pipeline** — `engine/`, all 60 leaves' code, oracles, and prior
86
+ results — everything except the raw SEC documents themselves (fetch those via `onboard` or
87
+ `source.py`, per leaf) and, obviously, the model weights (those come from Ollama/DeepSeek/Gemini).
88
+
89
+ ```bash
90
+ probity-bench demo # zero-config: replay a real wobble example, no install/network needed
91
+ probity-bench results # print the 2 summary tables from bundled scored.json
92
+ probity-bench list # every leaf + whether you've fetched its corpus
93
+ probity-bench run <leaf> # fetch (if needed) + benchmark one leaf with your configured models
94
+ ```
95
+
96
+ ### Option B — clone the repo (full reproducibility, no package boundary)
97
+
98
+ ```bash
99
+ git clone https://github.com/eikiyo/probity.git
100
+ cd probity
101
+ make setup # runs the test suite + regenerates results/RESULTS.md + this README's tables from disk
102
+ ```
103
+
104
+ That's it — zero third-party dependencies, pure Python 3 stdlib, no network call, no API key.
105
+ (No `make`? `python3 -m unittest discover -s tests && python3 results/render.py` does the same thing.)
106
+
107
+ To **re-run a test yourself** against live models (needs [Ollama](https://ollama.com) running
108
+ `gemma3:1b` locally + a DeepSeek API key — see [`.env.example`](.env.example)):
109
+
110
+ ```bash
111
+ cp .env.example .env && set -a && source .env && set +a
112
+ cd leaves/vesting_schedule # or any other leaf under leaves/
113
+ python3 source.py # fetch the real SEC documents into corpus/
114
+ python3 run.py # run the model ladder, N=20 each, writes scored.json
115
+ python3 ../../results/render.py # regenerate the tables with your fresh numbers
116
+ ```
117
+
118
+ ## Benchmark results
119
+
120
+ <!-- BENCHMARK:START -->
121
+ *60 tests, each item run 20x at temp 0.7 across a model size ladder. **Wobble** (lower = better) is the run-to-run inconsistency rate, weighted by item count across every test that model ran. Full per-test breakdown (all 60 tables): [`results/RESULTS.md`](results/RESULTS.md).*
122
+
123
+ ### Does reliability improve with model size?
124
+
125
+ | Model | Size | Tests covered | **Wobble** ↓ | Accuracy |
126
+ |---|---|---|---|---|
127
+ | `deepseek-v4-flash` | hosted | 58 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
128
+ | `gemma3:1b` | 1B, local | 51 | ![44%](https://img.shields.io/badge/-44%25-red) | ![54%](https://img.shields.io/badge/-54%25-red) |
129
+ | `llama3.2:latest` | 3B, local | 1 | ![56%](https://img.shields.io/badge/-56%25-red) | ![81%](https://img.shields.io/badge/-81%25-yellow) |
130
+ | `gemma4:12b` | 12B, local | 1 | ![0%](https://img.shields.io/badge/-0%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
131
+
132
+ ### By fundraising-document category
133
+
134
+ | Category | Tests | **Wobble** ↓ (deepseek) | Accuracy (deepseek) |
135
+ |---|---|---|---|
136
+ | Priced equity rounds | 16 | ![5%](https://img.shields.io/badge/-5%25-brightgreen) | ![90%](https://img.shields.io/badge/-90%25-brightgreen) |
137
+ | SAFEs & convertible notes | 12 | ![4%](https://img.shields.io/badge/-4%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
138
+ | Cap table math | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![94%](https://img.shields.io/badge/-94%25-brightgreen) |
139
+ | Investor rights & governance | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
140
+ | Founder & employee vesting | 5 | ![2%](https://img.shields.io/badge/-2%25-brightgreen) | ![98%](https://img.shields.io/badge/-98%25-brightgreen) |
141
+ | Regulatory disclosures | 5 | ![15%](https://img.shields.io/badge/-15%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
142
+ | Off-market risk flags | 5 | ![8%](https://img.shields.io/badge/-8%25-brightgreen) | ![92%](https://img.shields.io/badge/-92%25-brightgreen) |
143
+ | Exit waterfalls | 1 | ![25%](https://img.shields.io/badge/-25%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
144
+
145
+ <!-- BENCHMARK:END -->
146
+
147
+ Full per-item breakdown — including which clauses make each model wobble — in
148
+ [`results/RESULTS.md`](results/RESULTS.md).
149
+
150
+ ## Why the answers are trustworthy
151
+
152
+ Most LLM benchmarks in niche domains are built from synthetic data with synthetic answers. That has
153
+ a hidden flaw: if an AI writes both the question and the answer key, the answer key can be wrong in
154
+ exactly the ways the model under test is wrong. Probity avoids this with a strict **oracle layer**:
155
+
156
+ 1. **Source a real document** that contains the ground truth in its own authoritative text — for
157
+ example, a Certificate of Incorporation filed with the SEC that states, in legally precise
158
+ language, whether its preferred stock is participating.
159
+ 2. **A human separates the question from the answer.** The model sees only the clause (the question).
160
+ The validated label, plus the exact quote that proves it, is stored in a separate oracle file the
161
+ model never sees. Items whose answer cannot be determined with confidence are *excluded*, not guessed.
162
+ 3. **Run only the question** through each model, N times, and score the majority answer against the
163
+ validated label.
164
+
165
+ Synthetic instantiation is used only to *multiply* difficulty (varying numbers, off-market terms,
166
+ ambiguous phrasing) on top of a real, human-validated seed — never as the sole source of truth.
167
+
168
+ ## The test map
169
+
170
+ Probity's full test backlog is a structured map of fundraising-reasoning capabilities
171
+ (`engine/registry.json`) — 67 atomic checks across priced equity, convertibles, cap-table math,
172
+ exit waterfalls, investor rights, founder equity, regulatory filings, and off-market risk flags.
173
+ Each check is built one at a time, to depth, against real sourced documents.
174
+
175
+ ## Structure
176
+
177
+ ```
178
+ engine/ the model-agnostic core: clients, run harness, normalizer, reliability+accuracy scorers
179
+ leaves/ one folder per test, each with its real-document corpus, its separated oracle, and its runner
180
+ results/ the living benchmark table
181
+ ```
182
+
183
+ See the [Quickstart](#quickstart) above for the full clone → run → reproduce path.
184
+
185
+ ## Contributing
186
+
187
+ Bug reports, new leaves, and sourcing improvements are welcome — see
188
+ [CONTRIBUTING.md](CONTRIBUTING.md). Security issues: see [SECURITY.md](SECURITY.md), never a
189
+ public issue.
190
+
191
+ ## License
192
+
193
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,154 @@
1
+ # Probity
2
+
3
+ [![CI](https://github.com/eikiyo/probity/actions/workflows/ci.yml/badge.svg)](https://github.com/eikiyo/probity/actions/workflows/ci.yml)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
5
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
6
+
7
+ ![Probity demo — the same question asked 20 times, same clause, same model, flipping between pre-money and post-money](demo/demo.gif)
8
+
9
+ LLMs are fundamentally probabilistic. Ask one the same question twice and you can get two
10
+ different answers — that's not a bug, it's how sampling works. Most of the time that's fine. It is
11
+ **not fine** when the question is "is this a pre-money or post-money valuation" and the answer
12
+ decides who owns what in a startup financing. Finance needs determinism; LLMs supply probability.
13
+ Nobody was measuring that gap, so Probity does: it benchmarks how often a model's answer *wobbles*
14
+ on real term sheets, charters, SAFEs, convertible notes, and cap tables — before you ever get to
15
+ whether the answer is right.
16
+
17
+ - **Wobble** (the core metric) — does the model give the *same* answer when you ask it the same
18
+ question 20 times at temperature 0.7? A model whose answer flips run to run cannot be trusted in
19
+ a workflow that touches money, even when it is often right. This is label-free: it needs no
20
+ ground truth, only repetition.
21
+ - **Accuracy** — does the model get the answer *right*, graded against a validated answer that a
22
+ human extracted from the source document (not authored by an AI)?
23
+
24
+ These are scored separately and never averaged into one headline — a model can be perfectly
25
+ consistent and consistently wrong. Models are run across a **size ladder** (1B → 12B local, plus a
26
+ hosted model) to test whether wobble falls as capability rises. Heavier models (a 27B local model
27
+ and hosted frontier models) are reserved for a single comprehensive sweep once every test is built.
28
+
29
+ ## Quickstart
30
+
31
+ ### Option A — install the package (fastest way to run a real benchmark yourself)
32
+
33
+ ```bash
34
+ pip install probity-bench
35
+ probity-bench onboard # pick documents to fetch, models to run, and store your API key(s)
36
+ ```
37
+
38
+ `onboard` is a guided wizard — same idea as `openclaw onboard` or `claude setup`: it walks you
39
+ through which leaves to pull real SEC documents for, which models to benchmark (auto-detects local
40
+ Ollama models; DeepSeek/Gemini for hosted), and collects + **verifies** any API key by making one
41
+ real call before it lets you proceed. Everything is stored locally at `~/.probity/` — nothing
42
+ leaves your machine except the model calls you explicitly configure.
43
+
44
+ ![Probity onboarding — documents, models, and API key setup, all local](demo/onboard.gif)
45
+
46
+ The package ships the **full pipeline** — `engine/`, all 60 leaves' code, oracles, and prior
47
+ results — everything except the raw SEC documents themselves (fetch those via `onboard` or
48
+ `source.py`, per leaf) and, obviously, no model weights (those come from Ollama/DeepSeek/Gemini).
49
+
50
+ ```bash
51
+ probity-bench demo # zero-config: replay a real wobble example, no model or network needed
52
+ probity-bench results # print the 2 summary tables from bundled scored.json
53
+ probity-bench list # every leaf + whether you've fetched its corpus
54
+ probity-bench run <leaf> # fetch (if needed) + benchmark one leaf with your configured models
55
+ ```
56
+
57
+ ### Option B — clone the repo (full reproducibility, no package boundary)
58
+
59
+ ```bash
60
+ git clone https://github.com/eikiyo/probity.git
61
+ cd probity
62
+ make setup # runs the test suite + regenerates results/RESULTS.md + this README's tables from disk
63
+ ```
64
+
65
+ That's it — zero third-party dependencies, pure Python 3 stdlib, no network call, no API key.
66
+ (No `make`? `python3 -m unittest discover -s tests && python3 results/render.py` does the same thing.)
67
+
68
+ To **re-run a test yourself** against live models (needs [Ollama](https://ollama.com) running
69
+ `gemma3:1b` locally + a DeepSeek API key — see [`.env.example`](.env.example)):
70
+
71
+ ```bash
72
+ cp .env.example .env && set -a && source .env && set +a
73
+ cd leaves/vesting_schedule # or any other leaf under leaves/
74
+ python3 source.py # fetch the real SEC documents into corpus/
75
+ python3 run.py # run the model ladder, N=20 each, writes scored.json
76
+ python3 ../../results/render.py # regenerate the tables with your fresh numbers
77
+ ```
78
+
79
+ ## Benchmark results
80
+
81
+ <!-- BENCHMARK:START -->
82
+ *60 tests, each item run 20 times at temp 0.7 across a model size ladder. **Wobble** (lower = better) is the run-to-run inconsistency rate, weighted by item count across every test that model ran. Full per-test breakdown (all 60 tables): [`results/RESULTS.md`](results/RESULTS.md).*
83
+
84
+ ### Does reliability improve with model size?
85
+
86
+ | Model | Size | Tests covered | **Wobble** ↓ | Accuracy |
87
+ |---|---|---|---|---|
88
+ | `deepseek-v4-flash` | hosted | 58 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
89
+ | `gemma3:1b` | 1B, local | 51 | ![44%](https://img.shields.io/badge/-44%25-red) | ![54%](https://img.shields.io/badge/-54%25-red) |
90
+ | `llama3.2:latest` | 3B, local | 1 | ![56%](https://img.shields.io/badge/-56%25-red) | ![81%](https://img.shields.io/badge/-81%25-yellow) |
91
+ | `gemma4:12b` | 12B, local | 1 | ![0%](https://img.shields.io/badge/-0%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
92
+
93
+ ### By fundraising-document category
94
+
95
+ | Category | Tests | **Wobble** ↓ (deepseek) | Accuracy (deepseek) |
96
+ |---|---|---|---|
97
+ | Priced equity rounds | 16 | ![5%](https://img.shields.io/badge/-5%25-brightgreen) | ![90%](https://img.shields.io/badge/-90%25-brightgreen) |
98
+ | SAFEs & convertible notes | 12 | ![4%](https://img.shields.io/badge/-4%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
99
+ | Cap table math | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![94%](https://img.shields.io/badge/-94%25-brightgreen) |
100
+ | Investor rights & governance | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
101
+ | Founder & employee vesting | 5 | ![2%](https://img.shields.io/badge/-2%25-brightgreen) | ![98%](https://img.shields.io/badge/-98%25-brightgreen) |
102
+ | Regulatory disclosures | 5 | ![15%](https://img.shields.io/badge/-15%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
103
+ | Off-market risk flags | 5 | ![8%](https://img.shields.io/badge/-8%25-brightgreen) | ![92%](https://img.shields.io/badge/-92%25-brightgreen) |
104
+ | Exit waterfalls | 1 | ![25%](https://img.shields.io/badge/-25%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
105
+
106
+ <!-- BENCHMARK:END -->
107
+
108
+ Full per-item breakdown — including which clauses make each model wobble — in
109
+ [`results/RESULTS.md`](results/RESULTS.md).
110
+
111
+ ## Why the answers are trustworthy
112
+
113
+ Most LLM benchmarks in niche domains are built from synthetic data with synthetic answers. That has
114
+ a hidden flaw: if an AI writes both the question and the answer key, the answer key can be wrong in
115
+ exactly the ways the model under test is wrong. Probity avoids this with a strict **oracle layer**:
116
+
117
+ 1. **Source a real document** that contains the ground truth in its own authoritative text — for
118
+ example, a Certificate of Incorporation filed with the SEC that states, in legally precise
119
+ language, whether its preferred stock is participating.
120
+ 2. **A human separates the question from the answer.** The model sees only the clause (the question).
121
+ The validated label, plus the exact quote that proves it, is stored in a separate oracle file the
122
+ model never sees. Items whose answer cannot be determined with confidence are *excluded*, not guessed.
123
+ 3. **Run only the question** through each model, N times, and score the majority answer against the
124
+ validated label.
125
+
126
+ Synthetic instantiation is used only to *multiply* difficulty (varying numbers, off-market terms,
127
+ ambiguous phrasing) on top of a real, human-validated seed — never as the sole source of truth.
128
+
129
+ ## The test map
130
+
131
+ Probity's full test backlog is a structured map of fundraising-reasoning capabilities
132
+ (`engine/registry.json`) — 67 atomic checks across priced equity, convertibles, cap-table math,
133
+ exit waterfalls, investor rights, founder equity, regulatory filings, and off-market risk flags.
134
+ Each check is built one at a time, to depth, against real sourced documents.
135
+
136
+ ## Structure
137
+
138
+ ```
139
+ engine/ the model-agnostic core: clients, run harness, normalizer, reliability+accuracy scorers
140
+ leaves/ one folder per test, each with its real-document corpus, its separated oracle, and its runner
141
+ results/ the living benchmark table
142
+ ```
143
+
144
+ See the [Quickstart](#quickstart) above for the full clone → run → reproduce path.
145
+
146
+ ## Contributing
147
+
148
+ Bug reports, new leaves, and sourcing improvements are welcome — see
149
+ [CONTRIBUTING.md](CONTRIBUTING.md). Security issues: see [SECURITY.md](SECURITY.md), never a
150
+ public issue.
151
+
152
+ ## License
153
+
154
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: probity-bench
3
+ Version: 1.1.0
4
+ Summary: An LLM reliability + accuracy benchmark for real fundraising documents -- because LLMs are probabilistic and finance needs determinism.
5
+ Author: eikiyo
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Seyed Mosayeb Alam
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/eikiyo/probity
29
+ Project-URL: Repository, https://github.com/eikiyo/probity
30
+ Project-URL: Changelog, https://github.com/eikiyo/probity/blob/main/CHANGELOG.md
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Operating System :: OS Independent
34
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
35
+ Requires-Python: >=3.9
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+ Dynamic: license-file
39
+
40
+ # Probity
41
+
42
+ [![CI](https://github.com/eikiyo/probity/actions/workflows/ci.yml/badge.svg)](https://github.com/eikiyo/probity/actions/workflows/ci.yml)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
44
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
45
+
46
+ ![Probity demo — the same question asked 20 times, same clause, same model, flipping between pre-money and post-money](demo/demo.gif)
47
+
48
+ LLMs are fundamentally probabilistic. Ask one the same question twice and you can get two
49
+ different answers — that's not a bug, it's how sampling works. Most of the time that's fine. It is
50
+ **not fine** when the question is "is this a pre-money or post-money valuation" and the answer
51
+ decides who owns what in a startup financing. Finance needs determinism; LLMs supply probability.
52
+ Nobody was measuring that gap, so Probity does: it benchmarks how often a model's answer *wobbles*
53
+ on real term sheets, charters, SAFEs, convertible notes, and cap tables — before you ever get to
54
+ whether the answer is right.
55
+
56
+ - **Wobble** (the core metric) — does the model give the *same* answer when you ask it the same
57
+ question 20 times at temperature 0.7? A model whose answer flips run to run cannot be trusted in
58
+ a workflow that touches money, even when it is often right. This is label-free: it needs no
59
+ ground truth, only repetition.
60
+ - **Accuracy** — does the model get the answer *right*, graded against a validated answer that a
61
+ human extracted from the source document (not authored by an AI)?
62
+
63
+ These are scored separately and never averaged into one headline — a model can be perfectly
64
+ consistent and consistently wrong. Models are run across a **size ladder** (1B → 12B local, plus a
65
+ hosted model) to test whether wobble falls as capability rises. Heavier models (a 27B local model
66
+ and hosted frontier models) are reserved for a single comprehensive sweep once every test is built.
67
+
68
+ ## Quickstart
69
+
70
+ ### Option A — install the package (fastest way to run a real benchmark yourself)
71
+
72
+ ```bash
73
+ pip install probity-bench
74
+ probity-bench onboard # pick documents to fetch, models to run, and store your API key(s)
75
+ ```
76
+
77
+ `onboard` is a guided wizard — same idea as `openclaw onboard` or `claude setup`: it walks you
78
+ through which leaves to pull real SEC documents for, which models to benchmark (auto-detects local
79
+ Ollama models; DeepSeek/Gemini for hosted), and collects + **verifies** any API key by making one
80
+ real call before it lets you proceed. Everything is stored locally at `~/.probity/` — nothing
81
+ leaves your machine except the model calls you explicitly configure.
82
+
83
+ ![Probity onboarding — documents, models, and API key setup, all local](demo/onboard.gif)
84
+
85
+ The package ships the **full pipeline** — `engine/`, all 60 leaves' code, oracles, and prior
86
+ results — everything except the raw SEC documents themselves (fetch those via `onboard` or
87
+ `source.py`, per leaf) and, obviously, no model weights (those come from Ollama/DeepSeek/Gemini).
88
+
89
+ ```bash
90
+ probity-bench demo # zero-config: replay a real wobble example, no model or network needed
91
+ probity-bench results # print the 2 summary tables from bundled scored.json
92
+ probity-bench list # every leaf + whether you've fetched its corpus
93
+ probity-bench run <leaf> # fetch (if needed) + benchmark one leaf with your configured models
94
+ ```
95
+
96
+ ### Option B — clone the repo (full reproducibility, no package boundary)
97
+
98
+ ```bash
99
+ git clone https://github.com/eikiyo/probity.git
100
+ cd probity
101
+ make setup # runs the test suite + regenerates results/RESULTS.md + this README's tables from disk
102
+ ```
103
+
104
+ That's it — zero third-party dependencies, pure Python 3 stdlib, no network call, no API key.
105
+ (No `make`? `python3 -m unittest discover -s tests && python3 results/render.py` does the same thing.)
106
+
107
+ To **re-run a test yourself** against live models (needs [Ollama](https://ollama.com) running
108
+ `gemma3:1b` locally + a DeepSeek API key — see [`.env.example`](.env.example)):
109
+
110
+ ```bash
111
+ cp .env.example .env && set -a && source .env && set +a
112
+ cd leaves/vesting_schedule # or any other leaf under leaves/
113
+ python3 source.py # fetch the real SEC documents into corpus/
114
+ python3 run.py # run the model ladder, N=20 each, writes scored.json
115
+ python3 ../../results/render.py # regenerate the tables with your fresh numbers
116
+ ```
117
+
118
+ ## Benchmark results
119
+
120
+ <!-- BENCHMARK:START -->
121
+ *60 tests, each item run 20 times at temp 0.7 across a model size ladder. **Wobble** (lower = better) is the run-to-run inconsistency rate, weighted by item count across every test that model ran. Full per-test breakdown (all 60 tables): [`results/RESULTS.md`](results/RESULTS.md).*
122
+
123
+ ### Does reliability improve with model size?
124
+
125
+ | Model | Size | Tests covered | **Wobble** ↓ | Accuracy |
126
+ |---|---|---|---|---|
127
+ | `deepseek-v4-flash` | hosted | 58 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
128
+ | `gemma3:1b` | 1B, local | 51 | ![44%](https://img.shields.io/badge/-44%25-red) | ![54%](https://img.shields.io/badge/-54%25-red) |
129
+ | `llama3.2:latest` | 3B, local | 1 | ![56%](https://img.shields.io/badge/-56%25-red) | ![81%](https://img.shields.io/badge/-81%25-yellow) |
130
+ | `gemma4:12b` | 12B, local | 1 | ![0%](https://img.shields.io/badge/-0%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
131
+
132
+ ### By fundraising-document category
133
+
134
+ | Category | Tests | **Wobble** ↓ (deepseek) | Accuracy (deepseek) |
135
+ |---|---|---|---|
136
+ | Priced equity rounds | 16 | ![5%](https://img.shields.io/badge/-5%25-brightgreen) | ![90%](https://img.shields.io/badge/-90%25-brightgreen) |
137
+ | SAFEs & convertible notes | 12 | ![4%](https://img.shields.io/badge/-4%25-brightgreen) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
138
+ | Cap table math | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![94%](https://img.shields.io/badge/-94%25-brightgreen) |
139
+ | Investor rights & governance | 7 | ![6%](https://img.shields.io/badge/-6%25-brightgreen) | ![95%](https://img.shields.io/badge/-95%25-brightgreen) |
140
+ | Founder & employee vesting | 5 | ![2%](https://img.shields.io/badge/-2%25-brightgreen) | ![98%](https://img.shields.io/badge/-98%25-brightgreen) |
141
+ | Regulatory disclosures | 5 | ![15%](https://img.shields.io/badge/-15%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
142
+ | Off-market risk flags | 5 | ![8%](https://img.shields.io/badge/-8%25-brightgreen) | ![92%](https://img.shields.io/badge/-92%25-brightgreen) |
143
+ | Exit waterfalls | 1 | ![25%](https://img.shields.io/badge/-25%25-yellow) | ![100%](https://img.shields.io/badge/-100%25-brightgreen) |
144
+
145
+ <!-- BENCHMARK:END -->
146
+
147
+ Full per-item breakdown — including which clauses make each model wobble — in
148
+ [`results/RESULTS.md`](results/RESULTS.md).
149
+
150
+ ## Why the answers are trustworthy
151
+
152
+ Most LLM benchmarks in niche domains are built from synthetic data with synthetic answers. That has
153
+ a hidden flaw: if an AI writes both the question and the answer key, the answer key can be wrong in
154
+ exactly the ways the model under test is wrong. Probity avoids this with a strict **oracle layer**:
155
+
156
+ 1. **Source a real document** that contains the ground truth in its own authoritative text — for
157
+ example, a Certificate of Incorporation filed with the SEC that states, in legally precise
158
+ language, whether its preferred stock is participating.
159
+ 2. **A human separates the question from the answer.** The model sees only the clause (the question).
160
+ The validated label, plus the exact quote that proves it, is stored in a separate oracle file the
161
+ model never sees. Items whose answer cannot be determined with confidence are *excluded*, not guessed.
162
+ 3. **Run only the question** through each model, N times, and score the majority answer against the
163
+ validated label.
164
+
165
+ Synthetic instantiation is used only to *multiply* difficulty (varying numbers, off-market terms,
166
+ ambiguous phrasing) on top of a real, human-validated seed — never as the sole source of truth.
167
+
168
+ ## The test map
169
+
170
+ Probity's full test backlog is a structured map of fundraising-reasoning capabilities
171
+ (`engine/registry.json`) — 67 atomic checks across priced equity, convertibles, cap-table math,
172
+ exit waterfalls, investor rights, founder equity, regulatory filings, and off-market risk flags.
173
+ Each check is built one at a time, to depth, against real sourced documents.
174
+
175
+ ## Structure
176
+
177
+ ```
178
+ engine/ the model-agnostic core: clients, run harness, normalizer, reliability+accuracy scorers
179
+ leaves/ one folder per test, each with its real-document corpus, its separated oracle, and its runner
180
+ results/ the living benchmark table
181
+ ```
182
+
183
+ See the [Quickstart](#quickstart) above for the full clone → run → reproduce path.
184
+
185
+ ## Contributing
186
+
187
+ Bug reports, new leaves, and sourcing improvements are welcome — see
188
+ [CONTRIBUTING.md](CONTRIBUTING.md). For security issues, follow [SECURITY.md](SECURITY.md) —
189
+ never open a public issue.
190
+
191
+ ## License
192
+
193
+ MIT — see [LICENSE](LICENSE).