@pennyfarthing/core 10.1.0 → 10.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (422)
  1. package/README.md +22 -24
  2. package/package.json +3 -1
  3. package/packages/core/dist/cli/commands/doctor-file-layout.test.js.map +1 -1
  4. package/packages/core/dist/cli/commands/doctor-legacy.test.js +24 -0
  5. package/packages/core/dist/cli/commands/doctor-legacy.test.js.map +1 -1
  6. package/packages/core/dist/cli/commands/doctor.d.ts.map +1 -1
  7. package/packages/core/dist/cli/commands/doctor.js +101 -15
  8. package/packages/core/dist/cli/commands/doctor.js.map +1 -1
  9. package/packages/core/dist/cli/commands/e2e-fresh-install.test.js +2 -2
  10. package/packages/core/dist/cli/commands/e2e-fresh-install.test.js.map +1 -1
  11. package/packages/core/dist/cli/commands/e2e-upgrade.test.js +2 -2
  12. package/packages/core/dist/cli/commands/e2e-upgrade.test.js.map +1 -1
  13. package/packages/core/dist/cli/commands/hooks-consolidation.test.js +2 -2
  14. package/packages/core/dist/cli/commands/hooks-consolidation.test.js.map +1 -1
  15. package/packages/core/dist/cli/commands/init-consolidation.test.js.map +1 -1
  16. package/packages/core/dist/cli/commands/theme.js +1 -1
  17. package/packages/core/dist/cli/commands/theme.js.map +1 -1
  18. package/packages/core/dist/cli/commands/uninstall.d.ts.map +1 -1
  19. package/packages/core/dist/cli/commands/uninstall.js +24 -13
  20. package/packages/core/dist/cli/commands/uninstall.js.map +1 -1
  21. package/packages/core/dist/cli/commands/update-consolidation.test.js +0 -10
  22. package/packages/core/dist/cli/commands/update-consolidation.test.js.map +1 -1
  23. package/packages/core/dist/cli/commands/update.js.map +1 -1
  24. package/packages/core/dist/cli/ocean-profiles.test.js.map +1 -1
  25. package/packages/core/dist/cli/theme-maker.test.js +64 -115
  26. package/packages/core/dist/cli/theme-maker.test.js.map +1 -1
  27. package/packages/core/dist/cli/utils/themes.d.ts.map +1 -1
  28. package/packages/core/dist/cli/utils/themes.js +3 -2
  29. package/packages/core/dist/cli/utils/themes.js.map +1 -1
  30. package/packages/core/dist/index.d.ts +1 -1
  31. package/packages/core/dist/index.d.ts.map +1 -1
  32. package/packages/core/dist/index.js +2 -2
  33. package/packages/core/dist/index.js.map +1 -1
  34. package/packages/core/dist/plugins/plugin-discovery.d.ts +116 -0
  35. package/packages/core/dist/plugins/plugin-discovery.d.ts.map +1 -0
  36. package/packages/core/dist/plugins/plugin-discovery.js +165 -0
  37. package/packages/core/dist/plugins/plugin-discovery.js.map +1 -0
  38. package/packages/core/dist/plugins/plugin-discovery.test.d.ts +22 -0
  39. package/packages/core/dist/plugins/plugin-discovery.test.d.ts.map +1 -0
  40. package/packages/core/dist/plugins/plugin-discovery.test.js +498 -0
  41. package/packages/core/dist/plugins/plugin-discovery.test.js.map +1 -0
  42. package/packages/core/dist/scripts/add-ocean-profiles.js +1 -1
  43. package/packages/core/dist/scripts/add-ocean-profiles.js.map +1 -1
  44. package/packages/core/dist/scripts/generate-all-spiders.js +2 -0
  45. package/packages/core/dist/scripts/generate-all-spiders.js.map +1 -1
  46. package/packages/core/dist/scripts/generate-report.d.ts.map +1 -1
  47. package/packages/core/dist/scripts/generate-report.js +2 -0
  48. package/packages/core/dist/scripts/generate-report.js.map +1 -1
  49. package/packages/core/dist/scripts/generate-spider-report.js.map +1 -1
  50. package/packages/core/dist/scripts/generate-spider.d.ts.map +1 -1
  51. package/packages/core/dist/scripts/generate-spider.js +2 -0
  52. package/packages/core/dist/scripts/generate-spider.js.map +1 -1
  53. package/packages/core/dist/scripts/validate-ocean-profiles.js +1 -1
  54. package/packages/core/dist/scripts/validate-ocean-profiles.js.map +1 -1
  55. package/packages/core/dist/workflow/file-watch.d.ts +82 -0
  56. package/packages/core/dist/workflow/file-watch.d.ts.map +1 -0
  57. package/packages/core/dist/workflow/file-watch.js +198 -0
  58. package/packages/core/dist/workflow/file-watch.js.map +1 -0
  59. package/packages/core/dist/workflow/file-watch.test.d.ts +21 -0
  60. package/packages/core/dist/workflow/file-watch.test.d.ts.map +1 -0
  61. package/packages/core/dist/workflow/file-watch.test.js +469 -0
  62. package/packages/core/dist/workflow/file-watch.test.js.map +1 -0
  63. package/packages/core/dist/workflow/observation-writer.d.ts +79 -0
  64. package/packages/core/dist/workflow/observation-writer.d.ts.map +1 -0
  65. package/packages/core/dist/workflow/observation-writer.js +97 -0
  66. package/packages/core/dist/workflow/observation-writer.js.map +1 -0
  67. package/packages/core/dist/workflow/observation-writer.test.d.ts +18 -0
  68. package/packages/core/dist/workflow/observation-writer.test.d.ts.map +1 -0
  69. package/packages/core/dist/workflow/observation-writer.test.js +424 -0
  70. package/packages/core/dist/workflow/observation-writer.test.js.map +1 -0
  71. package/packages/core/dist/workflow/output-path-normalizer.d.ts +47 -0
  72. package/packages/core/dist/workflow/output-path-normalizer.d.ts.map +1 -0
  73. package/packages/core/dist/workflow/output-path-normalizer.js +79 -0
  74. package/packages/core/dist/workflow/output-path-normalizer.js.map +1 -0
  75. package/packages/core/dist/workflow/output-path-normalizer.test.d.ts +16 -0
  76. package/packages/core/dist/workflow/output-path-normalizer.test.d.ts.map +1 -0
  77. package/packages/core/dist/workflow/output-path-normalizer.test.js +157 -0
  78. package/packages/core/dist/workflow/output-path-normalizer.test.js.map +1 -0
  79. package/packages/core/dist/workflow/story-workflow-routing.test.js +4 -2
  80. package/packages/core/dist/workflow/story-workflow-routing.test.js.map +1 -1
  81. package/packages/core/dist/workflow/tandem-lifecycle.d.ts +117 -0
  82. package/packages/core/dist/workflow/tandem-lifecycle.d.ts.map +1 -0
  83. package/packages/core/dist/workflow/tandem-lifecycle.js +186 -0
  84. package/packages/core/dist/workflow/tandem-lifecycle.js.map +1 -0
  85. package/packages/core/dist/workflow/tandem-lifecycle.test.d.ts +16 -0
  86. package/packages/core/dist/workflow/tandem-lifecycle.test.d.ts.map +1 -0
  87. package/packages/core/dist/workflow/tandem-lifecycle.test.js +531 -0
  88. package/packages/core/dist/workflow/tandem-lifecycle.test.js.map +1 -0
  89. package/packages/core/dist/workflow/tool-watch.d.ts +68 -0
  90. package/packages/core/dist/workflow/tool-watch.d.ts.map +1 -0
  91. package/packages/core/dist/workflow/tool-watch.js +166 -0
  92. package/packages/core/dist/workflow/tool-watch.js.map +1 -0
  93. package/packages/core/dist/workflow/tool-watch.test.d.ts +18 -0
  94. package/packages/core/dist/workflow/tool-watch.test.d.ts.map +1 -0
  95. package/packages/core/dist/workflow/tool-watch.test.js +717 -0
  96. package/packages/core/dist/workflow/tool-watch.test.js.map +1 -0
  97. package/packages/core/dist/workflow/variable-resolver.js +1 -1
  98. package/packages/core/dist/workflow/variable-resolver.js.map +1 -1
  99. package/packages/core/dist/workflow/workflow-migration.test.js +8 -4
  100. package/packages/core/dist/workflow/workflow-migration.test.js.map +1 -1
  101. package/packages/core/dist/workflow/workflow-schema.d.ts +7 -0
  102. package/packages/core/dist/workflow/workflow-schema.d.ts.map +1 -1
  103. package/packages/core/dist/workflow/workflow-schema.js +44 -0
  104. package/packages/core/dist/workflow/workflow-schema.js.map +1 -1
  105. package/packages/core/dist/workflow/workflow-schema.test.d.ts.map +1 -1
  106. package/packages/core/dist/workflow/workflow-schema.test.js +192 -0
  107. package/packages/core/dist/workflow/workflow-schema.test.js.map +1 -1
  108. package/pennyfarthing-dist/agents/README.md +3 -1
  109. package/pennyfarthing-dist/agents/ba.md +165 -0
  110. package/pennyfarthing-dist/agents/handoff.md +18 -3
  111. package/pennyfarthing-dist/agents/sm-finish.md +1 -1
  112. package/pennyfarthing-dist/agents/sm-handoff.md +27 -4
  113. package/pennyfarthing-dist/agents/sm.md +11 -5
  114. package/pennyfarthing-dist/agents/tandem-backseat.md +119 -0
  115. package/pennyfarthing-dist/commands/ba.md +17 -0
  116. package/pennyfarthing-dist/commands/setup.md +4 -0
  117. package/pennyfarthing-dist/guides/agent-behavior.md +62 -6
  118. package/pennyfarthing-dist/guides/bikelane.md +3 -2
  119. package/pennyfarthing-dist/guides/scale-levels.md +4 -6
  120. package/pennyfarthing-dist/guides/tandem-protocol.md +158 -0
  121. package/pennyfarthing-dist/guides/workflow-schema.md +1 -1
  122. package/pennyfarthing-dist/personas/themes/a-team.yaml +30 -0
  123. package/pennyfarthing-dist/personas/themes/alice-in-wonderland.yaml +30 -0
  124. package/pennyfarthing-dist/personas/themes/battlestar-galactica.yaml +30 -0
  125. package/pennyfarthing-dist/personas/themes/blade-runner.yaml +30 -0
  126. package/pennyfarthing-dist/personas/themes/catch-22.yaml +30 -0
  127. package/pennyfarthing-dist/personas/themes/control.yaml +30 -0
  128. package/pennyfarthing-dist/personas/themes/cowboy-bebop.yaml +31 -0
  129. package/pennyfarthing-dist/personas/themes/discworld.yaml +32 -1
  130. package/pennyfarthing-dist/personas/themes/doctor-who.yaml +31 -0
  131. package/pennyfarthing-dist/personas/themes/dune.yaml +32 -0
  132. package/pennyfarthing-dist/personas/themes/fifth-element.yaml +327 -0
  133. package/pennyfarthing-dist/personas/themes/firefly.yaml +31 -0
  134. package/pennyfarthing-dist/personas/themes/game-of-thrones.yaml +30 -0
  135. package/pennyfarthing-dist/personas/themes/harry-potter.yaml +30 -0
  136. package/pennyfarthing-dist/personas/themes/hitchhikers-guide.yaml +30 -0
  137. package/pennyfarthing-dist/personas/themes/lord-of-the-rings.yaml +30 -0
  138. package/pennyfarthing-dist/personas/themes/mad-max.yaml +30 -0
  139. package/pennyfarthing-dist/personas/themes/mash.yaml +33 -0
  140. package/pennyfarthing-dist/personas/themes/princess-bride.yaml +34 -0
  141. package/pennyfarthing-dist/personas/themes/sandman.yaml +33 -0
  142. package/pennyfarthing-dist/personas/themes/star-trek-tng.yaml +34 -0
  143. package/pennyfarthing-dist/personas/themes/star-wars.yaml +33 -0
  144. package/pennyfarthing-dist/personas/themes/the-expanse.yaml +30 -0
  145. package/pennyfarthing-dist/personas/themes/the-matrix.yaml +30 -0
  146. package/pennyfarthing-dist/personas/themes/watchmen.yaml +30 -0
  147. package/pennyfarthing-dist/personas/themes/west-wing.yaml +30 -0
  148. package/pennyfarthing-dist/personas/themes/x-files.yaml +30 -0
  149. package/pennyfarthing-dist/scripts/README.md +1 -1
  150. package/pennyfarthing-dist/scripts/core/agent-session.sh +1 -1
  151. package/pennyfarthing-dist/scripts/hooks/bell-mode-hook.sh +131 -54
  152. package/pennyfarthing-dist/scripts/hooks/post-merge.sh +20 -10
  153. package/pennyfarthing-dist/scripts/misc/statusline.sh +50 -8
  154. package/pennyfarthing-dist/scripts/portraits/generate-portraits.py +2 -2
  155. package/pennyfarthing-dist/scripts/validation/validate-agent-schema.sh +1 -0
  156. package/pennyfarthing-dist/scripts/workflow/README.md +2 -2
  157. package/pennyfarthing-dist/scripts/workflow/finish-story.sh +10 -189
  158. package/pennyfarthing-dist/skills/skill-registry.schema.json +8 -0
  159. package/pennyfarthing-dist/skills/skill-registry.yaml +1 -1
  160. package/pennyfarthing-dist/skills/sprint/skill.md +25 -2
  161. package/pennyfarthing-dist/skills/theme/skill.md +1 -1
  162. package/pennyfarthing-dist/skills/workflow/skill.md +24 -1
  163. package/pennyfarthing-dist/workflows/architecture/workflow.yaml +65 -0
  164. package/pennyfarthing-dist/workflows/architecture.yaml +2 -2
  165. package/pennyfarthing-dist/workflows/bdd-tandem.yaml +70 -0
  166. package/pennyfarthing-dist/workflows/epics-and-stories/workflow.yaml +2 -2
  167. package/pennyfarthing-dist/workflows/implementation-readiness/workflow.yaml +2 -2
  168. package/pennyfarthing-dist/workflows/prd/workflow.yaml +2 -2
  169. package/pennyfarthing-dist/workflows/product-brief/workflow.yaml +2 -2
  170. package/pennyfarthing-dist/workflows/project-context/workflow.yaml +2 -2
  171. package/pennyfarthing-dist/workflows/quick-dev/workflow.yaml +2 -2
  172. package/pennyfarthing-dist/workflows/research/workflow.yaml +2 -2
  173. package/pennyfarthing-dist/workflows/retrospective/workflow.yaml +1 -1
  174. package/pennyfarthing-dist/workflows/sprint-planning/workflow.yaml +3 -3
  175. package/pennyfarthing-dist/workflows/tdd-tandem.yaml +61 -0
  176. package/pennyfarthing-dist/workflows/ux-design/workflow.yaml +2 -2
  177. package/pennyfarthing_scripts/__pycache__/cli.cpython-314.pyc +0 -0
  178. package/pennyfarthing_scripts/__pycache__/hooks.cpython-314.pyc +0 -0
  179. package/pennyfarthing_scripts/__pycache__/pretooluse_hook.cpython-314.pyc +0 -0
  180. package/pennyfarthing_scripts/__pycache__/schema_validation_hook.cpython-314.pyc +0 -0
  181. package/pennyfarthing_scripts/__pycache__/workflow.cpython-314.pyc +0 -0
  182. package/pennyfarthing_scripts/bellmode_hook.py +202 -47
  183. package/pennyfarthing_scripts/bikerack/__init__.py +36 -0
  184. package/pennyfarthing_scripts/bikerack/__main__.py +5 -0
  185. package/pennyfarthing_scripts/bikerack/__pycache__/__init__.cpython-314.pyc +0 -0
  186. package/pennyfarthing_scripts/bikerack/__pycache__/__main__.cpython-314.pyc +0 -0
  187. package/pennyfarthing_scripts/bikerack/__pycache__/cli.cpython-314.pyc +0 -0
  188. package/pennyfarthing_scripts/bikerack/__pycache__/launcher.cpython-314.pyc +0 -0
  189. package/pennyfarthing_scripts/bikerack/cli.py +148 -0
  190. package/pennyfarthing_scripts/bikerack/launcher.py +181 -0
  191. package/pennyfarthing_scripts/brownfield/__init__.py +6 -6
  192. package/pennyfarthing_scripts/brownfield/__main__.py +1 -0
  193. package/pennyfarthing_scripts/brownfield/cli.py +0 -1
  194. package/pennyfarthing_scripts/brownfield/discover.py +1 -2
  195. package/pennyfarthing_scripts/cli.py +16 -6
  196. package/pennyfarthing_scripts/codemarkers/__init__.py +5 -1
  197. package/pennyfarthing_scripts/codemarkers/__pycache__/__init__.cpython-314.pyc +0 -0
  198. package/pennyfarthing_scripts/codemarkers/__pycache__/__main__.cpython-314.pyc +0 -0
  199. package/pennyfarthing_scripts/codemarkers/__pycache__/analyze.cpython-314.pyc +0 -0
  200. package/pennyfarthing_scripts/codemarkers/__pycache__/cli.cpython-314.pyc +0 -0
  201. package/pennyfarthing_scripts/codemarkers/__pycache__/formatters.cpython-314.pyc +0 -0
  202. package/pennyfarthing_scripts/codemarkers/__pycache__/models.cpython-314.pyc +0 -0
  203. package/pennyfarthing_scripts/codemarkers/analyze.py +177 -2
  204. package/pennyfarthing_scripts/codemarkers/cli.py +50 -0
  205. package/pennyfarthing_scripts/codemarkers/formatters.py +0 -1
  206. package/pennyfarthing_scripts/codemarkers/models.py +15 -0
  207. package/pennyfarthing_scripts/common/__init__.py +8 -9
  208. package/pennyfarthing_scripts/common/__pycache__/__init__.cpython-314.pyc +0 -0
  209. package/pennyfarthing_scripts/common/__pycache__/config.cpython-314.pyc +0 -0
  210. package/pennyfarthing_scripts/common/config.py +1 -1
  211. package/pennyfarthing_scripts/complexity/__init__.py +1 -1
  212. package/pennyfarthing_scripts/complexity/__pycache__/__init__.cpython-314.pyc +0 -0
  213. package/pennyfarthing_scripts/complexity/__pycache__/__main__.cpython-314.pyc +0 -0
  214. package/pennyfarthing_scripts/complexity/__pycache__/analyze.cpython-314.pyc +0 -0
  215. package/pennyfarthing_scripts/complexity/__pycache__/cli.cpython-314.pyc +0 -0
  216. package/pennyfarthing_scripts/complexity/__pycache__/formatters.cpython-314.pyc +0 -0
  217. package/pennyfarthing_scripts/complexity/__pycache__/models.cpython-314.pyc +0 -0
  218. package/pennyfarthing_scripts/complexity/analyze.py +1 -1
  219. package/pennyfarthing_scripts/complexity/cli.py +5 -1
  220. package/pennyfarthing_scripts/complexity/formatters.py +1 -1
  221. package/pennyfarthing_scripts/context.py +14 -15
  222. package/pennyfarthing_scripts/deadcode/__pycache__/__init__.cpython-314.pyc +0 -0
  223. package/pennyfarthing_scripts/deadcode/__pycache__/__main__.cpython-314.pyc +0 -0
  224. package/pennyfarthing_scripts/deadcode/__pycache__/analyze.cpython-314.pyc +0 -0
  225. package/pennyfarthing_scripts/deadcode/__pycache__/cli.cpython-314.pyc +0 -0
  226. package/pennyfarthing_scripts/deadcode/__pycache__/formatters.cpython-314.pyc +0 -0
  227. package/pennyfarthing_scripts/deadcode/__pycache__/models.cpython-314.pyc +0 -0
  228. package/pennyfarthing_scripts/deadcode/analyze.py +3 -4
  229. package/pennyfarthing_scripts/deadcode/cli.py +2 -2
  230. package/pennyfarthing_scripts/dependencies/__init__.py +2 -2
  231. package/pennyfarthing_scripts/dependencies/__pycache__/__init__.cpython-314.pyc +0 -0
  232. package/pennyfarthing_scripts/dependencies/__pycache__/__main__.cpython-314.pyc +0 -0
  233. package/pennyfarthing_scripts/dependencies/__pycache__/analyze.cpython-314.pyc +0 -0
  234. package/pennyfarthing_scripts/dependencies/__pycache__/cli.cpython-314.pyc +0 -0
  235. package/pennyfarthing_scripts/dependencies/__pycache__/formatters.cpython-314.pyc +0 -0
  236. package/pennyfarthing_scripts/dependencies/__pycache__/models.cpython-314.pyc +0 -0
  237. package/pennyfarthing_scripts/dependencies/analyze.py +1 -1
  238. package/pennyfarthing_scripts/dependencies/cli.py +8 -4
  239. package/pennyfarthing_scripts/dependencies/formatters.py +1 -1
  240. package/pennyfarthing_scripts/git/__init__.py +5 -5
  241. package/pennyfarthing_scripts/git/create_branches.py +3 -2
  242. package/pennyfarthing_scripts/git/status_all.py +1 -1
  243. package/pennyfarthing_scripts/healthscore/__init__.py +2 -2
  244. package/pennyfarthing_scripts/healthscore/__main__.py +8 -0
  245. package/pennyfarthing_scripts/healthscore/__pycache__/__init__.cpython-314.pyc +0 -0
  246. package/pennyfarthing_scripts/healthscore/__pycache__/__main__.cpython-314.pyc +0 -0
  247. package/pennyfarthing_scripts/healthscore/__pycache__/analyze.cpython-314.pyc +0 -0
  248. package/pennyfarthing_scripts/healthscore/__pycache__/cli.cpython-314.pyc +0 -0
  249. package/pennyfarthing_scripts/healthscore/__pycache__/formatters.cpython-314.pyc +0 -0
  250. package/pennyfarthing_scripts/healthscore/__pycache__/models.cpython-314.pyc +0 -0
  251. package/pennyfarthing_scripts/healthscore/analyze.py +452 -21
  252. package/pennyfarthing_scripts/healthscore/cli.py +5 -1
  253. package/pennyfarthing_scripts/healthscore/models.py +0 -1
  254. package/pennyfarthing_scripts/hooks.py +8 -11
  255. package/pennyfarthing_scripts/hotspots/__init__.py +6 -6
  256. package/pennyfarthing_scripts/hotspots/__pycache__/__init__.cpython-314.pyc +0 -0
  257. package/pennyfarthing_scripts/hotspots/__pycache__/analyze.cpython-314.pyc +0 -0
  258. package/pennyfarthing_scripts/hotspots/__pycache__/cli.cpython-314.pyc +0 -0
  259. package/pennyfarthing_scripts/hotspots/__pycache__/models.cpython-314.pyc +0 -0
  260. package/pennyfarthing_scripts/hotspots/analyze.py +128 -14
  261. package/pennyfarthing_scripts/hotspots/cli.py +2 -2
  262. package/pennyfarthing_scripts/hotspots/models.py +0 -1
  263. package/pennyfarthing_scripts/jira/__init__.py +15 -17
  264. package/pennyfarthing_scripts/jira/__pycache__/__init__.cpython-314.pyc +0 -0
  265. package/pennyfarthing_scripts/jira/__pycache__/bidirectional.cpython-314.pyc +0 -0
  266. package/pennyfarthing_scripts/jira/__pycache__/claim.cpython-314.pyc +0 -0
  267. package/pennyfarthing_scripts/jira/__pycache__/cli.cpython-314.pyc +0 -0
  268. package/pennyfarthing_scripts/jira/__pycache__/client.cpython-314.pyc +0 -0
  269. package/pennyfarthing_scripts/jira/__pycache__/create.cpython-314.pyc +0 -0
  270. package/pennyfarthing_scripts/jira/__pycache__/epic.cpython-314.pyc +0 -0
  271. package/pennyfarthing_scripts/jira/__pycache__/reconcile.cpython-314.pyc +0 -0
  272. package/pennyfarthing_scripts/jira/__pycache__/story.cpython-314.pyc +0 -0
  273. package/pennyfarthing_scripts/jira/__pycache__/sync.cpython-314.pyc +0 -0
  274. package/pennyfarthing_scripts/jira/bidirectional.py +2 -3
  275. package/pennyfarthing_scripts/jira/claim.py +21 -0
  276. package/pennyfarthing_scripts/jira/cli.py +2 -2
  277. package/pennyfarthing_scripts/jira/client.py +4 -4
  278. package/pennyfarthing_scripts/jira/create.py +45 -1
  279. package/pennyfarthing_scripts/jira/epic.py +3 -2
  280. package/pennyfarthing_scripts/jira/reconcile.py +0 -1
  281. package/pennyfarthing_scripts/jira/story.py +2 -0
  282. package/pennyfarthing_scripts/jira/sync.py +1 -1
  283. package/pennyfarthing_scripts/migration/__pycache__/__init__.cpython-314.pyc +0 -0
  284. package/pennyfarthing_scripts/migration/__pycache__/session.cpython-314.pyc +0 -0
  285. package/pennyfarthing_scripts/migration/__pycache__/skill.cpython-314.pyc +0 -0
  286. package/pennyfarthing_scripts/migration/__pycache__/step.cpython-314.pyc +0 -0
  287. package/pennyfarthing_scripts/migration/__pycache__/validate.cpython-314.pyc +0 -0
  288. package/pennyfarthing_scripts/migration/skill.py +0 -1
  289. package/pennyfarthing_scripts/migration/step.py +0 -1
  290. package/pennyfarthing_scripts/migration/validate.py +8 -5
  291. package/pennyfarthing_scripts/patch_mode.py +2 -2
  292. package/pennyfarthing_scripts/preflight/__init__.py +1 -1
  293. package/pennyfarthing_scripts/preflight/__pycache__/__init__.cpython-314.pyc +0 -0
  294. package/pennyfarthing_scripts/preflight/__pycache__/finish.cpython-314.pyc +0 -0
  295. package/pennyfarthing_scripts/preflight/finish.py +0 -1
  296. package/pennyfarthing_scripts/pretooluse_hook.py +6 -7
  297. package/pennyfarthing_scripts/prime/__init__.py +2 -0
  298. package/pennyfarthing_scripts/prime/__pycache__/__init__.cpython-314.pyc +0 -0
  299. package/pennyfarthing_scripts/prime/__pycache__/cli.cpython-314.pyc +0 -0
  300. package/pennyfarthing_scripts/prime/__pycache__/loader.cpython-314.pyc +0 -0
  301. package/pennyfarthing_scripts/prime/__pycache__/persona.cpython-314.pyc +0 -0
  302. package/pennyfarthing_scripts/prime/__pycache__/tiers.cpython-314.pyc +0 -0
  303. package/pennyfarthing_scripts/prime/cli.py +18 -1
  304. package/pennyfarthing_scripts/prime/loader.py +72 -3
  305. package/pennyfarthing_scripts/prime/persona.py +4 -2
  306. package/pennyfarthing_scripts/prime/tiers.py +17 -4
  307. package/pennyfarthing_scripts/schema_validation_hook.py +2 -3
  308. package/pennyfarthing_scripts/sprint/__init__.py +10 -12
  309. package/pennyfarthing_scripts/sprint/__main__.py +2 -2
  310. package/pennyfarthing_scripts/sprint/__pycache__/__init__.cpython-314.pyc +0 -0
  311. package/pennyfarthing_scripts/sprint/__pycache__/archive.cpython-314.pyc +0 -0
  312. package/pennyfarthing_scripts/sprint/__pycache__/archive_epic.cpython-314.pyc +0 -0
  313. package/pennyfarthing_scripts/sprint/__pycache__/cli.cpython-314.pyc +0 -0
  314. package/pennyfarthing_scripts/sprint/__pycache__/epic_add.cpython-314.pyc +0 -0
  315. package/pennyfarthing_scripts/sprint/__pycache__/import_epic.cpython-314.pyc +0 -0
  316. package/pennyfarthing_scripts/sprint/__pycache__/loader.cpython-314.pyc +0 -0
  317. package/pennyfarthing_scripts/sprint/__pycache__/status.cpython-314.pyc +0 -0
  318. package/pennyfarthing_scripts/sprint/__pycache__/story_add.cpython-314.pyc +0 -0
  319. package/pennyfarthing_scripts/sprint/__pycache__/story_finish.cpython-314.pyc +0 -0
  320. package/pennyfarthing_scripts/sprint/__pycache__/story_update.cpython-314.pyc +0 -0
  321. package/pennyfarthing_scripts/sprint/__pycache__/validate_cmd.cpython-314.pyc +0 -0
  322. package/pennyfarthing_scripts/sprint/__pycache__/validator.cpython-314.pyc +0 -0
  323. package/pennyfarthing_scripts/sprint/__pycache__/work.cpython-314.pyc +0 -0
  324. package/pennyfarthing_scripts/sprint/__pycache__/yaml_io.cpython-314.pyc +0 -0
  325. package/pennyfarthing_scripts/sprint/archive.py +0 -1
  326. package/pennyfarthing_scripts/sprint/archive_epic.py +1 -4
  327. package/pennyfarthing_scripts/sprint/cli.py +34 -28
  328. package/pennyfarthing_scripts/sprint/epic_add.py +8 -1
  329. package/pennyfarthing_scripts/sprint/import_epic.py +42 -18
  330. package/pennyfarthing_scripts/sprint/loader.py +6 -0
  331. package/pennyfarthing_scripts/sprint/status.py +1 -2
  332. package/pennyfarthing_scripts/sprint/story_add.py +2 -2
  333. package/pennyfarthing_scripts/sprint/story_finish.py +3 -5
  334. package/pennyfarthing_scripts/sprint/story_update.py +11 -3
  335. package/pennyfarthing_scripts/sprint/validate_cmd.py +0 -1
  336. package/pennyfarthing_scripts/sprint/validator.py +120 -6
  337. package/pennyfarthing_scripts/sprint/work.py +1 -4
  338. package/pennyfarthing_scripts/sprint/yaml_io.py +10 -2
  339. package/pennyfarthing_scripts/story/__init__.py +14 -16
  340. package/pennyfarthing_scripts/story/__pycache__/__init__.cpython-314.pyc +0 -0
  341. package/pennyfarthing_scripts/story/__pycache__/size.cpython-314.pyc +0 -0
  342. package/pennyfarthing_scripts/story/__pycache__/template.cpython-314.pyc +0 -0
  343. package/pennyfarthing_scripts/story/size.py +0 -1
  344. package/pennyfarthing_scripts/story/template.py +0 -1
  345. package/pennyfarthing_scripts/swebench.py +1 -2
  346. package/pennyfarthing_scripts/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  347. package/pennyfarthing_scripts/tests/__pycache__/test_bikerack.cpython-314-pytest-9.0.2.pyc +0 -0
  348. package/pennyfarthing_scripts/tests/__pycache__/test_epic_shard_validation.cpython-314-pytest-9.0.2.pyc +0 -0
  349. package/pennyfarthing_scripts/tests/__pycache__/test_healthscore.cpython-314-pytest-9.0.2.pyc +0 -0
  350. package/pennyfarthing_scripts/tests/__pycache__/test_sprint_validator.cpython-314-pytest-9.0.2.pyc +0 -0
  351. package/pennyfarthing_scripts/tests/__pycache__/test_yaml_io.cpython-314-pytest-9.0.2.pyc +0 -0
  352. package/pennyfarthing_scripts/tests/conftest.py +1 -2
  353. package/pennyfarthing_scripts/tests/test_bikerack.py +785 -0
  354. package/pennyfarthing_scripts/tests/test_brownfield.py +10 -13
  355. package/pennyfarthing_scripts/tests/test_cli_modules.py +0 -4
  356. package/pennyfarthing_scripts/tests/test_codemarkers.py +13 -8
  357. package/pennyfarthing_scripts/tests/test_common.py +9 -4
  358. package/pennyfarthing_scripts/tests/test_epic_shard_validation.py +699 -0
  359. package/pennyfarthing_scripts/tests/test_git_utils.py +10 -13
  360. package/pennyfarthing_scripts/tests/test_healthscore.py +17 -25
  361. package/pennyfarthing_scripts/tests/test_jira_package.py +0 -3
  362. package/pennyfarthing_scripts/tests/test_package_structure.py +3 -16
  363. package/pennyfarthing_scripts/tests/test_patch_mode.py +7 -11
  364. package/pennyfarthing_scripts/tests/test_prime.py +39 -21
  365. package/pennyfarthing_scripts/tests/test_sprint_package.py +3 -8
  366. package/pennyfarthing_scripts/tests/test_sprint_validator.py +53 -5
  367. package/pennyfarthing_scripts/tests/test_story_add.py +3 -7
  368. package/pennyfarthing_scripts/tests/test_story_package.py +0 -3
  369. package/pennyfarthing_scripts/tests/test_story_update.py +5 -10
  370. package/pennyfarthing_scripts/tests/test_tiers.py +18 -17
  371. package/pennyfarthing_scripts/tests/test_token_counting.py +19 -13
  372. package/pennyfarthing_scripts/tests/test_topology_loader.py +620 -0
  373. package/pennyfarthing_scripts/tests/test_validate_cmd.py +2 -7
  374. package/pennyfarthing_scripts/tests/test_workflow_check.py +0 -2
  375. package/pennyfarthing_scripts/tests/test_yaml_io.py +0 -3
  376. package/pennyfarthing_scripts/theme/__pycache__/__init__.cpython-314.pyc +0 -0
  377. package/pennyfarthing_scripts/theme/__pycache__/cli.cpython-314.pyc +0 -0
  378. package/pennyfarthing_scripts/theme/cli.py +3 -2
  379. package/pennyfarthing_scripts/validate/__init__.py +21 -0
  380. package/pennyfarthing_scripts/validate/__pycache__/__init__.cpython-314.pyc +0 -0
  381. package/pennyfarthing_scripts/validate/__pycache__/cli.cpython-314.pyc +0 -0
  382. package/pennyfarthing_scripts/validate/adapters/__init__.py +0 -0
  383. package/pennyfarthing_scripts/validate/adapters/__pycache__/__init__.cpython-314.pyc +0 -0
  384. package/pennyfarthing_scripts/validate/adapters/__pycache__/agent.cpython-314.pyc +0 -0
  385. package/pennyfarthing_scripts/validate/adapters/__pycache__/schema.cpython-314.pyc +0 -0
  386. package/pennyfarthing_scripts/validate/adapters/__pycache__/skill_command.cpython-314.pyc +0 -0
  387. package/pennyfarthing_scripts/validate/adapters/__pycache__/sprint.cpython-314.pyc +0 -0
  388. package/pennyfarthing_scripts/validate/adapters/__pycache__/workflow.cpython-314.pyc +0 -0
  389. package/pennyfarthing_scripts/validate/adapters/agent.py +239 -0
  390. package/pennyfarthing_scripts/validate/adapters/schema.py +30 -0
  391. package/pennyfarthing_scripts/validate/adapters/skill_command.py +291 -0
  392. package/pennyfarthing_scripts/validate/adapters/sprint.py +69 -0
  393. package/pennyfarthing_scripts/validate/adapters/workflow.py +320 -0
  394. package/pennyfarthing_scripts/validate/cli.py +141 -0
  395. package/pennyfarthing_scripts/welcome_hook.py +2 -3
  396. package/pennyfarthing_scripts/workflow.py +3 -3
  397. package/scripts/README.md +3 -15
  398. package/pennyfarthing-dist/commands/benchmark-control.md +0 -69
  399. package/pennyfarthing-dist/commands/benchmark.md +0 -485
  400. package/pennyfarthing-dist/commands/job-fair.md +0 -102
  401. package/pennyfarthing-dist/commands/solo.md +0 -447
  402. package/pennyfarthing-dist/guides/benchmarks.md +0 -62
  403. package/pennyfarthing-dist/scripts/test/ensure-swebench-data.sh +0 -59
  404. package/pennyfarthing-dist/scripts/test/ground-truth-judge.py +0 -220
  405. package/pennyfarthing-dist/scripts/test/swebench-judge.py +0 -374
  406. package/pennyfarthing-dist/scripts/test/test-cache.sh +0 -165
  407. package/pennyfarthing-dist/scripts/test/test-setup.sh +0 -337
  408. package/pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh +0 -13
  409. package/pennyfarthing-dist/scripts/theme/compute_theme_tiers.py +0 -402
  410. package/pennyfarthing-dist/scripts/theme/update-theme-tiers.sh +0 -97
  411. package/pennyfarthing-dist/skills/finalize-run/SKILL.md +0 -261
  412. package/pennyfarthing-dist/skills/judge/SKILL.md +0 -644
  413. package/pennyfarthing-dist/skills/persona-benchmark/SKILL.md +0 -187
  414. package/pennyfarthing-dist/workflows/dev-story/checklist.md +0 -80
  415. package/pennyfarthing-dist/workflows/dev-story/instructions.xml +0 -410
  416. package/pennyfarthing-dist/workflows/dev-story/workflow.yaml +0 -50
  417. package/pennyfarthing-dist/workflows/quick-spec/steps/step-01-understand.md +0 -201
  418. package/pennyfarthing-dist/workflows/quick-spec/steps/step-02-investigate.md +0 -156
  419. package/pennyfarthing-dist/workflows/quick-spec/steps/step-03-generate.md +0 -140
  420. package/pennyfarthing-dist/workflows/quick-spec/steps/step-04-review.md +0 -203
  421. package/pennyfarthing-dist/workflows/quick-spec/tech-spec-template.md +0 -74
  422. package/pennyfarthing-dist/workflows/quick-spec/workflow.yaml +0 -27
@@ -1,220 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Ground-truth judge for SWE-bench scenarios.
4
-
5
- Compares Claude's proposed fix against the actual SWE-bench patch.
6
- Scores based on:
7
- - File identification (20%)
8
- - Function/location identification (20%)
9
- - Fix logic match (40%)
10
- - Completeness (20%)
11
- """
12
-
13
- import json
14
- import re
15
- import sys
16
- from pathlib import Path
17
- from difflib import SequenceMatcher
18
-
19
- # Add parent to path for pennyfarthing_scripts imports
20
- sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
21
-
22
- from pennyfarthing_scripts.swebench import (
23
- extract_patch_info,
24
- extract_problem_keywords,
25
- find_scenario,
26
- get_meaningful_patterns,
27
- load_swebench_data,
28
- )
29
-
30
-
31
def score_response(response_text, ground_truth):
    """Score a response against the ground-truth patch on a 100-point scale.

    The score is the sum of four capped sections:
    file identification (20), location identification (20),
    fix logic match (40) and completeness (20).

    Args:
        response_text: The response to evaluate, as a string.
        ground_truth: Mapping with a required 'patch' entry and an optional
            'problem_statement' entry.

    Returns:
        A dict with the four section scores, a 'details' dict of supporting
        counts, and 'total' (the section sum rounded to one decimal).

    Raises:
        KeyError: If ``ground_truth`` has no 'patch' entry.
    """
    patch_info = extract_patch_info(ground_truth['patch'])

    scores = {
        'file_identification': 0,
        'location_identification': 0,
        'fix_logic_match': 0,
        'completeness': 0,
        'details': {}
    }
    details = scores['details']

    response_lower = response_text.lower()
    # Whitespace-collapsed form used for code-line matching; computed once
    # because it does not depend on the addition being checked.
    response_normalized = re.sub(r'\s+', ' ', response_lower)

    # 1. FILE IDENTIFICATION (20 points)
    # A file counts when either its bare name or its full path is mentioned.
    files_found = sum(
        1 for f in patch_info.files
        if Path(f).name.lower() in response_lower or f.lower() in response_lower
    )

    if patch_info.files:
        file_score = (files_found / len(patch_info.files)) * 20
        scores['file_identification'] = min(20, file_score)
        details['files_expected'] = patch_info.files
        details['files_found'] = files_found
    else:
        scores['file_identification'] = 20  # No specific file in patch

    # 2. LOCATION IDENTIFICATION (20 points)
    # NOTE(review): patch_info.functions presumably holds hunk context
    # strings such as "def foo(...)" -- confirm against extract_patch_info.
    locations_found = 0
    for func in patch_info.functions:
        func_match = re.search(r'(def|class)\s+(\w+)', func)
        if func_match:
            if func_match.group(2).lower() in response_lower:
                locations_found += 1
        elif func.strip() and func.strip().split()[0].lower() in response_lower:
            # Fallback for entries without def/class: use the first token.
            # It must be lower-cased because the haystack is lower-cased;
            # otherwise any capitalised token could never match.
            locations_found += 1

    if patch_info.functions:
        loc_score = (locations_found / len(patch_info.functions)) * 20
        scores['location_identification'] = min(20, loc_score)
        details['locations_expected'] = patch_info.functions[:3]
        details['locations_found'] = locations_found
    else:
        scores['location_identification'] = 10  # Partial credit

    # 3. FIX LOGIC MATCH (40 points): 20 for key patterns + 20 for added lines
    meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)
    patterns_found = sum(
        1 for pattern in meaningful_patterns
        if pattern.lower() in response_lower
    )

    if meaningful_patterns:
        pattern_score = (patterns_found / len(meaningful_patterns)) * 20
        details['patterns_expected'] = meaningful_patterns[:10]
        details['patterns_found'] = patterns_found
    else:
        pattern_score = 10

    # Only the first 5 added lines are compared against the response.
    additions_matched = 0
    for addition in patch_info.additions[:5]:
        addition_normalized = re.sub(r'\s+', ' ', addition.lower())
        # Cheap substring test first; the fuzzy ratio is only computed when
        # the exact (whitespace-insensitive) match fails.
        if (
            addition_normalized in response_normalized
            or SequenceMatcher(None, addition_normalized, response_normalized).ratio() > 0.6
        ):
            additions_matched += 1

    if patch_info.additions:
        addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
        details['additions_matched'] = additions_matched
    else:
        addition_score = 10

    scores['fix_logic_match'] = min(40, pattern_score + addition_score)

    # 4. COMPLETENESS (20 points): four independent 5-point checks
    completeness_score = 0

    # Has code block?
    if '```' in response_text:
        completeness_score += 5

    # Has test considerations?
    if 'test' in response_lower:
        completeness_score += 5

    # Mentions the specific error/issue?
    problem_keywords = extract_problem_keywords(ground_truth.get('problem_statement', ''))
    keywords_found = sum(1 for kw in problem_keywords if kw.lower() in response_lower)
    if problem_keywords:
        completeness_score += min(5, (keywords_found / len(problem_keywords)) * 5)
    else:
        completeness_score += 2.5

    # Has explanation of why fix works?
    explanation_words = ['because', 'this fixes', 'this resolves', 'the issue', 'the problem', 'solution']
    if any(word in response_lower for word in explanation_words):
        completeness_score += 5

    scores['completeness'] = min(20, completeness_score)

    # Total
    scores['total'] = round(
        scores['file_identification'] +
        scores['location_identification'] +
        scores['fix_logic_match'] +
        scores['completeness'],
        1,
    )

    return scores
154
-
155
-
156
def main():
    """Command-line entry point for the ground-truth judge.

    Reads ``<scenario_name>`` and ``<response_file>`` from ``sys.argv``,
    scores the 'result' field of the response JSON against the matching
    scenario, prints a report, and writes the result as JSON next to the
    response file.

    Returns:
        The scores dict produced by ``score_response``.

    Exits with status 1 on missing arguments, an unknown scenario, or a
    response file without a non-empty 'result' field.
    """
    if len(sys.argv) < 3:
        print("Usage: ground-truth-judge.py <scenario_name> <response_file>")
        print("Example: ground-truth-judge.py flask-5014 run_20260102T134237Z.json")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load SWE-bench data
    swebench_data = load_swebench_data()

    # Find scenario
    scenario = find_scenario(swebench_data, scenario_name)
    if not scenario:
        print(f"Error: Scenario '{scenario_name}' not found in SWE-bench data")
        sys.exit(1)

    # Load response (explicit encoding: do not depend on the platform locale)
    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    response_text = response_data.get('result', '')
    if not response_text:
        print("Error: No 'result' field in response file")
        sys.exit(1)

    # Score
    scores = score_response(response_text, scenario)

    # Output
    print(f"\n{'='*60}")
    print(f"GROUND TRUTH EVALUATION: {scenario_name}")
    print(f"{'='*60}")
    print(f"\nScores:")
    print(f" File Identification: {scores['file_identification']:5.1f}/20")
    print(f" Location Identification: {scores['location_identification']:5.1f}/20")
    print(f" Fix Logic Match: {scores['fix_logic_match']:5.1f}/40")
    print(f" Completeness: {scores['completeness']:5.1f}/20")
    print(f" {'─'*40}")
    print(f" TOTAL: {scores['total']:5.1f}/100")

    print(f"\nDetails:")
    for key, value in scores['details'].items():
        print(f" {key}: {value}")

    # Output JSON for programmatic use
    output = {
        'scenario': scenario_name,
        'instance_id': scenario.get('instance_id'),
        'scores': scores,
        'ground_truth_patch_preview': scenario.get('patch', '')[:300]
    }

    # Save judge output. Only the file name is rewritten (never a directory
    # component), and a name without the 'run_' marker gets the prefix
    # prepended so the output can never overwrite the response file itself.
    response_path = Path(response_file)
    judged_name = response_path.name.replace('run_', 'gt_judge_', 1)
    if judged_name == response_path.name:
        judged_name = f"gt_judge_{response_path.name}"
    output_path = str(response_path.with_name(judged_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    return scores


if __name__ == '__main__':
    main()
@@ -1,374 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- SWE-bench scenario judge using:
4
- 1. Scenario-specific scoring rubric from YAML
5
- 2. Ground-truth validation from actual SWE-bench patches
6
-
7
- Scoring structure:
8
- - root_cause (30%): IDENTIFIES_BUG_LOCATION (15) + EXPLAINS_WHY_BROKEN (15)
9
- - fix_quality (40%): FIX_ADDRESSES_ISSUE (20) + FIX_IS_MINIMAL (10) + FIX_SYNTAX_CORRECT (10)
10
- - completeness (20%): EDGE_CASES (10) + TEST_COVERAGE (10)
11
- - persona (10%): IN_CHARACTER (10)
12
- """
13
-
14
- import json
15
- import re
16
- import sys
17
- from pathlib import Path
18
- from difflib import SequenceMatcher
19
-
20
- # Add parent to path for pennyfarthing_scripts imports
21
- sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
22
-
23
- from pennyfarthing_scripts.swebench import (
24
- extract_patch_info,
25
- find_scenario,
26
- load_swebench_data,
27
- )
28
-
29
-
30
def score_identifies_bug_location(response, ground_truth):
    """Score IDENTIFIES_BUG_LOCATION (max 15 pts) from the ground-truth patch.

    Half of the points come from mentioning the patched files, the other
    half from mentioning the def/class names found in the patch's function
    entries. Returns a ``(score, details)`` tuple.
    """
    info = extract_patch_info(ground_truth.get('patch', ''))
    haystack = response.lower()

    points = 0
    notes = []

    # Files (7.5 pts): bare file name or full path mentioned in the response.
    if info.files:
        file_hits = sum(
            1 for path in info.files
            if Path(path).name.lower() in haystack or path.lower() in haystack
        )
        points += (file_hits / len(info.files)) * 7.5
        notes.append(f"Files: {file_hits}/{len(info.files)} found")

    # Functions/classes (7.5 pts).
    if not info.functions:
        # Partial credit if no specific function in patch
        points += 3.75
    else:
        name_hits = 0
        for header in info.functions:
            found = re.search(r'(def|class)\s+(\w+)', header)
            if found and found.group(2).lower() in haystack:
                name_hits += 1
        points += min(7.5, (name_hits / len(info.functions)) * 7.5)
        notes.append(f"Functions: {name_hits}/{len(info.functions)} found")

    return min(15, points), notes
67
-
68
-
69
def score_explains_why_broken(response, ground_truth):
    """Score EXPLAINS_WHY_BROKEN (max 15 pts).

    Awards 7.5 pts when the response contains an explanation marker and up
    to 7.5 pts for the share of key terms from the problem statement that
    appear in the response (3.75 pts when there are no key terms).

    Args:
        response: The response text to evaluate.
        ground_truth: Mapping that may contain 'problem_statement'; a
            missing or ``None`` value is treated as an empty statement.

    Returns:
        A ``(score, details)`` tuple; ``details`` is a list of strings.
    """
    response_lower = response.lower()
    problem = (ground_truth.get('problem_statement') or '').lower()

    score = 0
    details = []

    # Key terms: quoted/backticked spans plus *Error / *Exception names.
    candidates = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem)
    candidates += re.findall(r'\b\w+Error\b|\b\w+Exception\b', problem, re.IGNORECASE)
    # De-duplicate while keeping first-appearance order. Going through a set
    # would make the 10-term cut-off depend on string hash order, so the same
    # input could score differently from run to run.
    key_terms = list(dict.fromkeys(candidates))[:10]

    # Check for explanation of the issue
    explanation_markers = ['because', 'this happens', 'the issue', 'the problem', 'fails when', 'breaks when', 'causes']
    if any(marker in response_lower for marker in explanation_markers):
        score += 7.5
        details.append("Has explanation of why broken")

    # Check for key terms from problem (already lower-cased via `problem`)
    if key_terms:
        terms_found = sum(1 for term in key_terms if term in response_lower)
        score += (terms_found / len(key_terms)) * 7.5
        details.append(f"Key terms: {terms_found}/{len(key_terms)}")
    else:
        score += 3.75

    return min(15, score), details
99
-
100
-
101
def score_fix_addresses_issue(response, ground_truth):
    """Score FIX_ADDRESSES_ISSUE (max 20 pts) against the ground-truth patch.

    Up to 15 pts for how many of the first five added patch lines show up
    in the response (full credit for a whitespace-insensitive substring
    match, half credit for a fuzzy match), plus 5 pts when the response
    contains a fenced code block. Returns a ``(score, details)`` tuple.
    """
    info = extract_patch_info(ground_truth.get('patch', ''))
    flat_response = re.sub(r'\s+', ' ', response.lower())

    # Compare each sampled added line against the flattened response.
    credit = 0
    for added_line in info.additions[:5]:
        flat_line = re.sub(r'\s+', ' ', added_line.lower())
        if flat_line in flat_response:
            credit += 1
        elif SequenceMatcher(None, flat_line, flat_response).ratio() > 0.7:
            credit += 0.5

    points = 0
    notes = []

    if info.additions:
        sampled = min(5, len(info.additions))
        points += (credit / sampled) * 15
        notes.append(f"Code matches: {credit}/{sampled}")

    # A fenced block is taken as evidence that a concrete fix was proposed.
    if '```' in response:
        points += 5
        notes.append("Has code block")

    return min(20, points), notes
136
-
137
-
138
def score_fix_is_minimal(response, ground_truth):
    """Score FIX_IS_MINIMAL (max 10 pts).

    Compares the number of lines inside the response's fenced code blocks
    with the number of added plus deleted lines in the ground-truth patch.
    Returns a ``(score, details)`` tuple.
    """
    info = extract_patch_info(ground_truth.get('patch', ''))
    patch_lines = len(info.additions) + len(info.deletions)

    # Total line count across every fenced block in the response.
    fenced = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
    response_code_lines = sum(len(chunk.strip().split('\n')) for chunk in fenced)

    # Nothing to compare against: neutral score, no detail line.
    if patch_lines <= 0:
        return 5, []

    # A response without any code lines is treated as ratio 1.
    ratio = response_code_lines / patch_lines if response_code_lines > 0 else 1

    if ratio <= 2:
        points, verdict = 10, "Minimal"
    elif ratio <= 4:
        points, verdict = 5, "Somewhat verbose"
    else:
        points, verdict = 2, "Over-engineered"

    return points, [f"{verdict}: {response_code_lines} lines (patch: {patch_lines})"]
168
-
169
-
170
def score_fix_syntax_correct(response):
    """Score FIX_SYNTAX_CORRECT (max 10 pts).

    Extracts fenced ``python`` code blocks from the response (falling back
    to untagged fences when there are none) and checks that each one
    compiles as Python source.

    Args:
        response: The response text to evaluate.

    Returns:
        A ``(score, details)`` tuple: 10 when every block compiles, 5 when
        any block fails to compile or when there are no blocks to check.
    """
    import textwrap

    # Extract code blocks
    code_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
    if not code_blocks:
        code_blocks = re.findall(r'```\n(.*?)```', response, re.DOTALL)

    if not code_blocks:
        return 5, ["No code blocks to validate"]

    for block in code_blocks:
        try:
            # Dedent first: a uniformly indented snippet (e.g. a fragment
            # copied from inside a class) is not a syntax error by itself.
            compile(textwrap.dedent(block), '<string>', 'exec')
        except (SyntaxError, ValueError):
            # ValueError: compile() rejects source containing null bytes on
            # some Python versions; treat it like any other invalid source
            # instead of crashing the judge.
            return 5, ["Syntax errors detected"]

    return 10, ["Syntax valid"]
201
-
202
-
203
def score_edge_cases(response):
    """Score EDGE_CASES (max 10 pts).

    Counts how many distinct edge-case marker phrases occur in the
    lower-cased response; each one is worth 2 pts, capped at 10.
    Returns a ``(score, details)`` tuple.
    """
    text = response.lower()
    edge_markers = (
        'edge case', 'corner case', 'what if', 'consider', 'also', 'none',
        'empty', 'null', 'zero', 'negative', 'boundary',
    )

    hits = len([marker for marker in edge_markers if marker in text])

    return min(10, hits * 2), [f"Edge case markers: {hits}"]
217
-
218
-
219
- def score_test_coverage(response):
220
- """Score TEST_COVERAGE (10 pts)."""
221
- response_lower = response.lower()
222
-
223
- score = 0
224
- details = []
225
-
226
- # Check for test-related content
227
- has_test_section = 'test' in response_lower
228
- has_test_function = 'def test_' in response_lower or 'test_' in response
229
- has_assert = 'assert' in response_lower or 'pytest' in response_lower
230
-
231
- if has_test_function:
232
- score += 5
233
- details.append("Has test function")
234
- if has_assert:
235
- score += 3
236
- details.append("Has assertions")
237
- if has_test_section:
238
- score += 2
239
- details.append("Has test section")
240
-
241
- return min(10, score), details
242
-
243
-
244
def score_in_character(response, persona="senior developer"):
    """Score IN_CHARACTER (max 10 pts).

    Counts professional-tone marker phrases in the lower-cased response at
    2 pts each, capped at 10. The ``persona`` argument is accepted but not
    consulted by this check. Returns a ``(score, details)`` tuple.
    """
    text = response.lower()

    # For control baseline, check professional tone
    professional_markers = (
        'i recommend', 'we should', 'this approach', 'the fix', 'analysis',
        'root cause',
    )
    hits = len([marker for marker in professional_markers if marker in text])

    return min(10, hits * 2), [f"Professional markers: {hits}"]
259
-
260
-
261
def judge_response(scenario_name, response_text, swebench_data):
    """Judge a response with every rubric criterion and the ground truth.

    Returns ``{'error': ...}`` when the scenario cannot be found. Otherwise
    returns a dict with the scenario name, instance id, per-category scores
    (each with a 'subtotal'), the rounded total, per-category detail
    strings, and the files touched by the ground-truth patch.
    """
    truth = find_scenario(swebench_data, scenario_name)

    if not truth:
        return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}

    # Category -> ordered (criterion, (score, details)) pairs. Scorers run
    # in the order they are listed here.
    rubric = (
        # root_cause (30%)
        ('root_cause', (
            ('IDENTIFIES_BUG_LOCATION', score_identifies_bug_location(response_text, truth)),
            ('EXPLAINS_WHY_BROKEN', score_explains_why_broken(response_text, truth)),
        )),
        # fix_quality (40%)
        ('fix_quality', (
            ('FIX_ADDRESSES_ISSUE', score_fix_addresses_issue(response_text, truth)),
            ('FIX_IS_MINIMAL', score_fix_is_minimal(response_text, truth)),
            ('FIX_SYNTAX_CORRECT', score_fix_syntax_correct(response_text)),
        )),
        # completeness (20%)
        ('completeness', (
            ('EDGE_CASES', score_edge_cases(response_text)),
            ('TEST_COVERAGE', score_test_coverage(response_text)),
        )),
        # persona (10%)
        ('persona', (
            ('IN_CHARACTER', score_in_character(response_text)),
        )),
    )

    scores = {}
    all_details = {}
    total = 0
    for category, results in rubric:
        entry = {}
        notes = []
        subtotal = 0
        for criterion, (points, criterion_notes) in results:
            entry[criterion] = points
            subtotal += points
            notes.extend(criterion_notes)
        # 'subtotal' goes last so it follows the criteria in the output.
        entry['subtotal'] = subtotal
        scores[category] = entry
        all_details[category] = notes
        total += subtotal

    patch_info = extract_patch_info(truth.get('patch', ''))
    return {
        'scenario': scenario_name,
        'instance_id': truth.get('instance_id'),
        'scores': scores,
        'total': round(total, 1),
        'details': all_details,
        'ground_truth_files': patch_info.files
    }
328
-
329
-
330
def main():
    """Command-line entry point for the SWE-bench judge.

    Reads ``<scenario_name>`` and ``<response_file>`` from ``sys.argv``,
    judges the response text stored in the JSON file (its 'result' field,
    or 'response_text' as a fallback), prints a per-category report, and
    writes the full result as JSON next to the response file.

    Exits with status 1 on missing arguments or when the scenario cannot
    be found.
    """
    if len(sys.argv) < 3:
        print("Usage: swebench-judge.py <scenario_name> <response_file>")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load data
    swebench_data = load_swebench_data()

    # Explicit encoding: do not depend on the platform locale.
    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    # Handle different JSON structures
    response_text = response_data.get('result', '') or response_data.get('response_text', '')

    # Judge
    result = judge_response(scenario_name, response_text, swebench_data)

    # judge_response reports an unknown scenario as {'error': ...}; without
    # this check the report below would fail with KeyError on 'scores'.
    if 'error' in result:
        print(f"Error: {result['error']}")
        sys.exit(1)

    # Display
    print(f"\n{'='*60}")
    print(f"SWE-BENCH JUDGE: {scenario_name}")
    print(f"{'='*60}")

    for category, scores in result['scores'].items():
        print(f"\n{category.upper()} ({scores['subtotal']:.1f} pts)")
        for criterion, score in scores.items():
            if criterion != 'subtotal':
                print(f" {criterion}: {score:.1f}")

    print(f"\n{'─'*40}")
    print(f"TOTAL: {result['total']}/100")

    print(f"\nGround truth files: {result['ground_truth_files']}")

    # Save. Only the file name is rewritten (never a directory component),
    # and a name without the 'run_' marker gets the prefix prepended so the
    # output can never overwrite the response file itself.
    response_path = Path(response_file)
    judged_name = response_path.name.replace('run_', 'swebench_judge_', 1)
    if judged_name == response_path.name:
        judged_name = f"swebench_judge_{response_path.name}"
    output_path = str(response_path.with_name(judged_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to: {output_path}")


if __name__ == '__main__':
    main()