@pennyfarthing/core 10.1.0 → 10.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -24
- package/package.json +3 -1
- package/packages/core/dist/cli/commands/doctor-file-layout.test.js.map +1 -1
- package/packages/core/dist/cli/commands/doctor-legacy.test.js +24 -0
- package/packages/core/dist/cli/commands/doctor-legacy.test.js.map +1 -1
- package/packages/core/dist/cli/commands/doctor.d.ts.map +1 -1
- package/packages/core/dist/cli/commands/doctor.js +101 -15
- package/packages/core/dist/cli/commands/doctor.js.map +1 -1
- package/packages/core/dist/cli/commands/e2e-fresh-install.test.js +2 -2
- package/packages/core/dist/cli/commands/e2e-fresh-install.test.js.map +1 -1
- package/packages/core/dist/cli/commands/e2e-upgrade.test.js +2 -2
- package/packages/core/dist/cli/commands/e2e-upgrade.test.js.map +1 -1
- package/packages/core/dist/cli/commands/hooks-consolidation.test.js +2 -2
- package/packages/core/dist/cli/commands/hooks-consolidation.test.js.map +1 -1
- package/packages/core/dist/cli/commands/init-consolidation.test.js.map +1 -1
- package/packages/core/dist/cli/commands/theme.js +1 -1
- package/packages/core/dist/cli/commands/theme.js.map +1 -1
- package/packages/core/dist/cli/commands/uninstall.d.ts.map +1 -1
- package/packages/core/dist/cli/commands/uninstall.js +24 -13
- package/packages/core/dist/cli/commands/uninstall.js.map +1 -1
- package/packages/core/dist/cli/commands/update-consolidation.test.js +0 -10
- package/packages/core/dist/cli/commands/update-consolidation.test.js.map +1 -1
- package/packages/core/dist/cli/commands/update.js.map +1 -1
- package/packages/core/dist/cli/ocean-profiles.test.js.map +1 -1
- package/packages/core/dist/cli/theme-maker.test.js +64 -115
- package/packages/core/dist/cli/theme-maker.test.js.map +1 -1
- package/packages/core/dist/cli/utils/themes.d.ts.map +1 -1
- package/packages/core/dist/cli/utils/themes.js +3 -2
- package/packages/core/dist/cli/utils/themes.js.map +1 -1
- package/packages/core/dist/index.d.ts +1 -1
- package/packages/core/dist/index.d.ts.map +1 -1
- package/packages/core/dist/index.js +2 -2
- package/packages/core/dist/index.js.map +1 -1
- package/packages/core/dist/plugins/plugin-discovery.d.ts +116 -0
- package/packages/core/dist/plugins/plugin-discovery.d.ts.map +1 -0
- package/packages/core/dist/plugins/plugin-discovery.js +165 -0
- package/packages/core/dist/plugins/plugin-discovery.js.map +1 -0
- package/packages/core/dist/plugins/plugin-discovery.test.d.ts +22 -0
- package/packages/core/dist/plugins/plugin-discovery.test.d.ts.map +1 -0
- package/packages/core/dist/plugins/plugin-discovery.test.js +498 -0
- package/packages/core/dist/plugins/plugin-discovery.test.js.map +1 -0
- package/packages/core/dist/scripts/add-ocean-profiles.js +1 -1
- package/packages/core/dist/scripts/add-ocean-profiles.js.map +1 -1
- package/packages/core/dist/scripts/generate-all-spiders.js +2 -0
- package/packages/core/dist/scripts/generate-all-spiders.js.map +1 -1
- package/packages/core/dist/scripts/generate-report.d.ts.map +1 -1
- package/packages/core/dist/scripts/generate-report.js +2 -0
- package/packages/core/dist/scripts/generate-report.js.map +1 -1
- package/packages/core/dist/scripts/generate-spider-report.js.map +1 -1
- package/packages/core/dist/scripts/generate-spider.d.ts.map +1 -1
- package/packages/core/dist/scripts/generate-spider.js +2 -0
- package/packages/core/dist/scripts/generate-spider.js.map +1 -1
- package/packages/core/dist/scripts/validate-ocean-profiles.js +1 -1
- package/packages/core/dist/scripts/validate-ocean-profiles.js.map +1 -1
- package/packages/core/dist/workflow/file-watch.d.ts +82 -0
- package/packages/core/dist/workflow/file-watch.d.ts.map +1 -0
- package/packages/core/dist/workflow/file-watch.js +198 -0
- package/packages/core/dist/workflow/file-watch.js.map +1 -0
- package/packages/core/dist/workflow/file-watch.test.d.ts +21 -0
- package/packages/core/dist/workflow/file-watch.test.d.ts.map +1 -0
- package/packages/core/dist/workflow/file-watch.test.js +469 -0
- package/packages/core/dist/workflow/file-watch.test.js.map +1 -0
- package/packages/core/dist/workflow/observation-writer.d.ts +79 -0
- package/packages/core/dist/workflow/observation-writer.d.ts.map +1 -0
- package/packages/core/dist/workflow/observation-writer.js +97 -0
- package/packages/core/dist/workflow/observation-writer.js.map +1 -0
- package/packages/core/dist/workflow/observation-writer.test.d.ts +18 -0
- package/packages/core/dist/workflow/observation-writer.test.d.ts.map +1 -0
- package/packages/core/dist/workflow/observation-writer.test.js +424 -0
- package/packages/core/dist/workflow/observation-writer.test.js.map +1 -0
- package/packages/core/dist/workflow/output-path-normalizer.d.ts +47 -0
- package/packages/core/dist/workflow/output-path-normalizer.d.ts.map +1 -0
- package/packages/core/dist/workflow/output-path-normalizer.js +79 -0
- package/packages/core/dist/workflow/output-path-normalizer.js.map +1 -0
- package/packages/core/dist/workflow/output-path-normalizer.test.d.ts +16 -0
- package/packages/core/dist/workflow/output-path-normalizer.test.d.ts.map +1 -0
- package/packages/core/dist/workflow/output-path-normalizer.test.js +157 -0
- package/packages/core/dist/workflow/output-path-normalizer.test.js.map +1 -0
- package/packages/core/dist/workflow/story-workflow-routing.test.js +4 -2
- package/packages/core/dist/workflow/story-workflow-routing.test.js.map +1 -1
- package/packages/core/dist/workflow/tandem-lifecycle.d.ts +117 -0
- package/packages/core/dist/workflow/tandem-lifecycle.d.ts.map +1 -0
- package/packages/core/dist/workflow/tandem-lifecycle.js +186 -0
- package/packages/core/dist/workflow/tandem-lifecycle.js.map +1 -0
- package/packages/core/dist/workflow/tandem-lifecycle.test.d.ts +16 -0
- package/packages/core/dist/workflow/tandem-lifecycle.test.d.ts.map +1 -0
- package/packages/core/dist/workflow/tandem-lifecycle.test.js +531 -0
- package/packages/core/dist/workflow/tandem-lifecycle.test.js.map +1 -0
- package/packages/core/dist/workflow/tool-watch.d.ts +68 -0
- package/packages/core/dist/workflow/tool-watch.d.ts.map +1 -0
- package/packages/core/dist/workflow/tool-watch.js +166 -0
- package/packages/core/dist/workflow/tool-watch.js.map +1 -0
- package/packages/core/dist/workflow/tool-watch.test.d.ts +18 -0
- package/packages/core/dist/workflow/tool-watch.test.d.ts.map +1 -0
- package/packages/core/dist/workflow/tool-watch.test.js +717 -0
- package/packages/core/dist/workflow/tool-watch.test.js.map +1 -0
- package/packages/core/dist/workflow/variable-resolver.js +1 -1
- package/packages/core/dist/workflow/variable-resolver.js.map +1 -1
- package/packages/core/dist/workflow/workflow-migration.test.js +8 -4
- package/packages/core/dist/workflow/workflow-migration.test.js.map +1 -1
- package/packages/core/dist/workflow/workflow-schema.d.ts +7 -0
- package/packages/core/dist/workflow/workflow-schema.d.ts.map +1 -1
- package/packages/core/dist/workflow/workflow-schema.js +44 -0
- package/packages/core/dist/workflow/workflow-schema.js.map +1 -1
- package/packages/core/dist/workflow/workflow-schema.test.d.ts.map +1 -1
- package/packages/core/dist/workflow/workflow-schema.test.js +192 -0
- package/packages/core/dist/workflow/workflow-schema.test.js.map +1 -1
- package/pennyfarthing-dist/agents/README.md +3 -1
- package/pennyfarthing-dist/agents/ba.md +165 -0
- package/pennyfarthing-dist/agents/handoff.md +18 -3
- package/pennyfarthing-dist/agents/sm-finish.md +1 -1
- package/pennyfarthing-dist/agents/sm-handoff.md +27 -4
- package/pennyfarthing-dist/agents/sm.md +11 -5
- package/pennyfarthing-dist/agents/tandem-backseat.md +119 -0
- package/pennyfarthing-dist/commands/ba.md +17 -0
- package/pennyfarthing-dist/commands/setup.md +4 -0
- package/pennyfarthing-dist/guides/agent-behavior.md +62 -6
- package/pennyfarthing-dist/guides/bikelane.md +3 -2
- package/pennyfarthing-dist/guides/scale-levels.md +4 -6
- package/pennyfarthing-dist/guides/tandem-protocol.md +158 -0
- package/pennyfarthing-dist/guides/workflow-schema.md +1 -1
- package/pennyfarthing-dist/personas/themes/a-team.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/alice-in-wonderland.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/battlestar-galactica.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/blade-runner.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/catch-22.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/control.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/cowboy-bebop.yaml +31 -0
- package/pennyfarthing-dist/personas/themes/discworld.yaml +32 -1
- package/pennyfarthing-dist/personas/themes/doctor-who.yaml +31 -0
- package/pennyfarthing-dist/personas/themes/dune.yaml +32 -0
- package/pennyfarthing-dist/personas/themes/fifth-element.yaml +327 -0
- package/pennyfarthing-dist/personas/themes/firefly.yaml +31 -0
- package/pennyfarthing-dist/personas/themes/game-of-thrones.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/harry-potter.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/hitchhikers-guide.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/lord-of-the-rings.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/mad-max.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/mash.yaml +33 -0
- package/pennyfarthing-dist/personas/themes/princess-bride.yaml +34 -0
- package/pennyfarthing-dist/personas/themes/sandman.yaml +33 -0
- package/pennyfarthing-dist/personas/themes/star-trek-tng.yaml +34 -0
- package/pennyfarthing-dist/personas/themes/star-wars.yaml +33 -0
- package/pennyfarthing-dist/personas/themes/the-expanse.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/the-matrix.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/watchmen.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/west-wing.yaml +30 -0
- package/pennyfarthing-dist/personas/themes/x-files.yaml +30 -0
- package/pennyfarthing-dist/scripts/README.md +1 -1
- package/pennyfarthing-dist/scripts/core/agent-session.sh +1 -1
- package/pennyfarthing-dist/scripts/hooks/bell-mode-hook.sh +131 -54
- package/pennyfarthing-dist/scripts/hooks/post-merge.sh +20 -10
- package/pennyfarthing-dist/scripts/misc/statusline.sh +50 -8
- package/pennyfarthing-dist/scripts/portraits/generate-portraits.py +2 -2
- package/pennyfarthing-dist/scripts/validation/validate-agent-schema.sh +1 -0
- package/pennyfarthing-dist/scripts/workflow/README.md +2 -2
- package/pennyfarthing-dist/scripts/workflow/finish-story.sh +10 -189
- package/pennyfarthing-dist/skills/skill-registry.schema.json +8 -0
- package/pennyfarthing-dist/skills/skill-registry.yaml +1 -1
- package/pennyfarthing-dist/skills/sprint/skill.md +25 -2
- package/pennyfarthing-dist/skills/theme/skill.md +1 -1
- package/pennyfarthing-dist/skills/workflow/skill.md +24 -1
- package/pennyfarthing-dist/workflows/architecture/workflow.yaml +65 -0
- package/pennyfarthing-dist/workflows/architecture.yaml +2 -2
- package/pennyfarthing-dist/workflows/bdd-tandem.yaml +70 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/implementation-readiness/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/prd/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/product-brief/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/project-context/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/quick-dev/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/research/workflow.yaml +2 -2
- package/pennyfarthing-dist/workflows/retrospective/workflow.yaml +1 -1
- package/pennyfarthing-dist/workflows/sprint-planning/workflow.yaml +3 -3
- package/pennyfarthing-dist/workflows/tdd-tandem.yaml +61 -0
- package/pennyfarthing-dist/workflows/ux-design/workflow.yaml +2 -2
- package/pennyfarthing_scripts/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/__pycache__/hooks.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/__pycache__/pretooluse_hook.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/__pycache__/schema_validation_hook.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/__pycache__/workflow.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/bellmode_hook.py +202 -47
- package/pennyfarthing_scripts/bikerack/__init__.py +36 -0
- package/pennyfarthing_scripts/bikerack/__main__.py +5 -0
- package/pennyfarthing_scripts/bikerack/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/bikerack/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/bikerack/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/bikerack/__pycache__/launcher.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/bikerack/cli.py +148 -0
- package/pennyfarthing_scripts/bikerack/launcher.py +181 -0
- package/pennyfarthing_scripts/brownfield/__init__.py +6 -6
- package/pennyfarthing_scripts/brownfield/__main__.py +1 -0
- package/pennyfarthing_scripts/brownfield/cli.py +0 -1
- package/pennyfarthing_scripts/brownfield/discover.py +1 -2
- package/pennyfarthing_scripts/cli.py +16 -6
- package/pennyfarthing_scripts/codemarkers/__init__.py +5 -1
- package/pennyfarthing_scripts/codemarkers/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/__pycache__/formatters.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/codemarkers/analyze.py +177 -2
- package/pennyfarthing_scripts/codemarkers/cli.py +50 -0
- package/pennyfarthing_scripts/codemarkers/formatters.py +0 -1
- package/pennyfarthing_scripts/codemarkers/models.py +15 -0
- package/pennyfarthing_scripts/common/__init__.py +8 -9
- package/pennyfarthing_scripts/common/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/common/__pycache__/config.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/common/config.py +1 -1
- package/pennyfarthing_scripts/complexity/__init__.py +1 -1
- package/pennyfarthing_scripts/complexity/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/__pycache__/formatters.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/complexity/analyze.py +1 -1
- package/pennyfarthing_scripts/complexity/cli.py +5 -1
- package/pennyfarthing_scripts/complexity/formatters.py +1 -1
- package/pennyfarthing_scripts/context.py +14 -15
- package/pennyfarthing_scripts/deadcode/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/__pycache__/formatters.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/deadcode/analyze.py +3 -4
- package/pennyfarthing_scripts/deadcode/cli.py +2 -2
- package/pennyfarthing_scripts/dependencies/__init__.py +2 -2
- package/pennyfarthing_scripts/dependencies/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/__pycache__/formatters.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/dependencies/analyze.py +1 -1
- package/pennyfarthing_scripts/dependencies/cli.py +8 -4
- package/pennyfarthing_scripts/dependencies/formatters.py +1 -1
- package/pennyfarthing_scripts/git/__init__.py +5 -5
- package/pennyfarthing_scripts/git/create_branches.py +3 -2
- package/pennyfarthing_scripts/git/status_all.py +1 -1
- package/pennyfarthing_scripts/healthscore/__init__.py +2 -2
- package/pennyfarthing_scripts/healthscore/__main__.py +8 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/__main__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/formatters.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/healthscore/analyze.py +452 -21
- package/pennyfarthing_scripts/healthscore/cli.py +5 -1
- package/pennyfarthing_scripts/healthscore/models.py +0 -1
- package/pennyfarthing_scripts/hooks.py +8 -11
- package/pennyfarthing_scripts/hotspots/__init__.py +6 -6
- package/pennyfarthing_scripts/hotspots/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/hotspots/__pycache__/analyze.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/hotspots/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/hotspots/__pycache__/models.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/hotspots/analyze.py +128 -14
- package/pennyfarthing_scripts/hotspots/cli.py +2 -2
- package/pennyfarthing_scripts/hotspots/models.py +0 -1
- package/pennyfarthing_scripts/jira/__init__.py +15 -17
- package/pennyfarthing_scripts/jira/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/bidirectional.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/claim.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/client.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/create.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/epic.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/reconcile.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/story.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/__pycache__/sync.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/jira/bidirectional.py +2 -3
- package/pennyfarthing_scripts/jira/claim.py +21 -0
- package/pennyfarthing_scripts/jira/cli.py +2 -2
- package/pennyfarthing_scripts/jira/client.py +4 -4
- package/pennyfarthing_scripts/jira/create.py +45 -1
- package/pennyfarthing_scripts/jira/epic.py +3 -2
- package/pennyfarthing_scripts/jira/reconcile.py +0 -1
- package/pennyfarthing_scripts/jira/story.py +2 -0
- package/pennyfarthing_scripts/jira/sync.py +1 -1
- package/pennyfarthing_scripts/migration/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/migration/__pycache__/session.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/migration/__pycache__/skill.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/migration/__pycache__/step.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/migration/__pycache__/validate.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/migration/skill.py +0 -1
- package/pennyfarthing_scripts/migration/step.py +0 -1
- package/pennyfarthing_scripts/migration/validate.py +8 -5
- package/pennyfarthing_scripts/patch_mode.py +2 -2
- package/pennyfarthing_scripts/preflight/__init__.py +1 -1
- package/pennyfarthing_scripts/preflight/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/preflight/__pycache__/finish.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/preflight/finish.py +0 -1
- package/pennyfarthing_scripts/pretooluse_hook.py +6 -7
- package/pennyfarthing_scripts/prime/__init__.py +2 -0
- package/pennyfarthing_scripts/prime/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/prime/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/prime/__pycache__/loader.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/prime/__pycache__/persona.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/prime/__pycache__/tiers.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/prime/cli.py +18 -1
- package/pennyfarthing_scripts/prime/loader.py +72 -3
- package/pennyfarthing_scripts/prime/persona.py +4 -2
- package/pennyfarthing_scripts/prime/tiers.py +17 -4
- package/pennyfarthing_scripts/schema_validation_hook.py +2 -3
- package/pennyfarthing_scripts/sprint/__init__.py +10 -12
- package/pennyfarthing_scripts/sprint/__main__.py +2 -2
- package/pennyfarthing_scripts/sprint/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/archive.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/archive_epic.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/epic_add.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/import_epic.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/loader.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/status.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/story_add.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/story_finish.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/story_update.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/validate_cmd.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/validator.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/work.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/__pycache__/yaml_io.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/sprint/archive.py +0 -1
- package/pennyfarthing_scripts/sprint/archive_epic.py +1 -4
- package/pennyfarthing_scripts/sprint/cli.py +34 -28
- package/pennyfarthing_scripts/sprint/epic_add.py +8 -1
- package/pennyfarthing_scripts/sprint/import_epic.py +42 -18
- package/pennyfarthing_scripts/sprint/loader.py +6 -0
- package/pennyfarthing_scripts/sprint/status.py +1 -2
- package/pennyfarthing_scripts/sprint/story_add.py +2 -2
- package/pennyfarthing_scripts/sprint/story_finish.py +3 -5
- package/pennyfarthing_scripts/sprint/story_update.py +11 -3
- package/pennyfarthing_scripts/sprint/validate_cmd.py +0 -1
- package/pennyfarthing_scripts/sprint/validator.py +120 -6
- package/pennyfarthing_scripts/sprint/work.py +1 -4
- package/pennyfarthing_scripts/sprint/yaml_io.py +10 -2
- package/pennyfarthing_scripts/story/__init__.py +14 -16
- package/pennyfarthing_scripts/story/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/story/__pycache__/size.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/story/__pycache__/template.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/story/size.py +0 -1
- package/pennyfarthing_scripts/story/template.py +0 -1
- package/pennyfarthing_scripts/swebench.py +1 -2
- package/pennyfarthing_scripts/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/__pycache__/test_bikerack.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/__pycache__/test_epic_shard_validation.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/__pycache__/test_healthscore.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/__pycache__/test_sprint_validator.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/__pycache__/test_yaml_io.cpython-314-pytest-9.0.2.pyc +0 -0
- package/pennyfarthing_scripts/tests/conftest.py +1 -2
- package/pennyfarthing_scripts/tests/test_bikerack.py +785 -0
- package/pennyfarthing_scripts/tests/test_brownfield.py +10 -13
- package/pennyfarthing_scripts/tests/test_cli_modules.py +0 -4
- package/pennyfarthing_scripts/tests/test_codemarkers.py +13 -8
- package/pennyfarthing_scripts/tests/test_common.py +9 -4
- package/pennyfarthing_scripts/tests/test_epic_shard_validation.py +699 -0
- package/pennyfarthing_scripts/tests/test_git_utils.py +10 -13
- package/pennyfarthing_scripts/tests/test_healthscore.py +17 -25
- package/pennyfarthing_scripts/tests/test_jira_package.py +0 -3
- package/pennyfarthing_scripts/tests/test_package_structure.py +3 -16
- package/pennyfarthing_scripts/tests/test_patch_mode.py +7 -11
- package/pennyfarthing_scripts/tests/test_prime.py +39 -21
- package/pennyfarthing_scripts/tests/test_sprint_package.py +3 -8
- package/pennyfarthing_scripts/tests/test_sprint_validator.py +53 -5
- package/pennyfarthing_scripts/tests/test_story_add.py +3 -7
- package/pennyfarthing_scripts/tests/test_story_package.py +0 -3
- package/pennyfarthing_scripts/tests/test_story_update.py +5 -10
- package/pennyfarthing_scripts/tests/test_tiers.py +18 -17
- package/pennyfarthing_scripts/tests/test_token_counting.py +19 -13
- package/pennyfarthing_scripts/tests/test_topology_loader.py +620 -0
- package/pennyfarthing_scripts/tests/test_validate_cmd.py +2 -7
- package/pennyfarthing_scripts/tests/test_workflow_check.py +0 -2
- package/pennyfarthing_scripts/tests/test_yaml_io.py +0 -3
- package/pennyfarthing_scripts/theme/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/theme/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/theme/cli.py +3 -2
- package/pennyfarthing_scripts/validate/__init__.py +21 -0
- package/pennyfarthing_scripts/validate/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/__pycache__/cli.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__init__.py +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/__init__.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/agent.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/schema.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/skill_command.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/sprint.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/__pycache__/workflow.cpython-314.pyc +0 -0
- package/pennyfarthing_scripts/validate/adapters/agent.py +239 -0
- package/pennyfarthing_scripts/validate/adapters/schema.py +30 -0
- package/pennyfarthing_scripts/validate/adapters/skill_command.py +291 -0
- package/pennyfarthing_scripts/validate/adapters/sprint.py +69 -0
- package/pennyfarthing_scripts/validate/adapters/workflow.py +320 -0
- package/pennyfarthing_scripts/validate/cli.py +141 -0
- package/pennyfarthing_scripts/welcome_hook.py +2 -3
- package/pennyfarthing_scripts/workflow.py +3 -3
- package/scripts/README.md +3 -15
- package/pennyfarthing-dist/commands/benchmark-control.md +0 -69
- package/pennyfarthing-dist/commands/benchmark.md +0 -485
- package/pennyfarthing-dist/commands/job-fair.md +0 -102
- package/pennyfarthing-dist/commands/solo.md +0 -447
- package/pennyfarthing-dist/guides/benchmarks.md +0 -62
- package/pennyfarthing-dist/scripts/test/ensure-swebench-data.sh +0 -59
- package/pennyfarthing-dist/scripts/test/ground-truth-judge.py +0 -220
- package/pennyfarthing-dist/scripts/test/swebench-judge.py +0 -374
- package/pennyfarthing-dist/scripts/test/test-cache.sh +0 -165
- package/pennyfarthing-dist/scripts/test/test-setup.sh +0 -337
- package/pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh +0 -13
- package/pennyfarthing-dist/scripts/theme/compute_theme_tiers.py +0 -402
- package/pennyfarthing-dist/scripts/theme/update-theme-tiers.sh +0 -97
- package/pennyfarthing-dist/skills/finalize-run/SKILL.md +0 -261
- package/pennyfarthing-dist/skills/judge/SKILL.md +0 -644
- package/pennyfarthing-dist/skills/persona-benchmark/SKILL.md +0 -187
- package/pennyfarthing-dist/workflows/dev-story/checklist.md +0 -80
- package/pennyfarthing-dist/workflows/dev-story/instructions.xml +0 -410
- package/pennyfarthing-dist/workflows/dev-story/workflow.yaml +0 -50
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-01-understand.md +0 -201
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-02-investigate.md +0 -156
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-03-generate.md +0 -140
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-04-review.md +0 -203
- package/pennyfarthing-dist/workflows/quick-spec/tech-spec-template.md +0 -74
- package/pennyfarthing-dist/workflows/quick-spec/workflow.yaml +0 -27
|
@@ -1,220 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Ground-truth judge for SWE-bench scenarios.
|
|
4
|
-
|
|
5
|
-
Compares Claude's proposed fix against the actual SWE-bench patch.
|
|
6
|
-
Scores based on:
|
|
7
|
-
- File identification (20%)
|
|
8
|
-
- Function/location identification (20%)
|
|
9
|
-
- Fix logic match (40%)
|
|
10
|
-
- Completeness (20%)
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
import json
|
|
14
|
-
import re
|
|
15
|
-
import sys
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
from difflib import SequenceMatcher
|
|
18
|
-
|
|
19
|
-
# Add parent to path for pennyfarthing_scripts imports
|
|
20
|
-
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
21
|
-
|
|
22
|
-
from pennyfarthing_scripts.swebench import (
|
|
23
|
-
extract_patch_info,
|
|
24
|
-
extract_problem_keywords,
|
|
25
|
-
find_scenario,
|
|
26
|
-
get_meaningful_patterns,
|
|
27
|
-
load_swebench_data,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def score_response(response_text, ground_truth):
    """Score a response against the ground-truth SWE-bench patch.

    Four components are computed and summed into ``total``:

    * file identification (max 20)
    * function/class location identification (max 20)
    * fix logic match (max 40): key patterns (20) + added lines (20)
    * completeness (max 20)

    Args:
        response_text: The proposed fix as plain text.
        ground_truth: Mapping with a ``'patch'`` entry and, optionally, a
            ``'problem_statement'`` entry.

    Returns:
        dict with the four component scores, a ``'details'`` dict and a
        ``'total'`` rounded to one decimal place.

    Raises:
        KeyError: If ``ground_truth`` has no ``'patch'`` entry.
    """

    def _squash(text):
        # Collapse whitespace runs so indentation differences do not matter.
        return re.sub(r'\s+', ' ', text).strip()

    patch_info = extract_patch_info(ground_truth['patch'])

    scores = {
        'file_identification': 0,
        'location_identification': 0,
        'fix_logic_match': 0,
        'completeness': 0,
        'details': {}
    }

    response_lower = response_text.lower()

    # 1. FILE IDENTIFICATION (20 points)
    # A file counts when either its base name or its full path is mentioned.
    files_found = 0
    for f in patch_info.files:
        filename = Path(f).name
        if filename.lower() in response_lower or f.lower() in response_lower:
            files_found += 1

    if patch_info.files:
        file_score = (files_found / len(patch_info.files)) * 20
        scores['file_identification'] = min(20, file_score)
        scores['details']['files_expected'] = patch_info.files
        scores['details']['files_found'] = files_found
    else:
        scores['file_identification'] = 20  # No specific file in patch

    # 2. LOCATION IDENTIFICATION (20 points)
    # Look for function/class names mentioned in the patch.
    locations_found = 0
    for func in patch_info.functions:
        func_match = re.search(r'(def|class)\s+(\w+)', func)
        if func_match:
            if func_match.group(2).lower() in response_lower:
                locations_found += 1
        elif func.strip() and func.strip().split()[0].lower() in response_lower:
            # The response is lower-cased, so the fallback token must be too;
            # otherwise mixed-case tokens could never match.
            locations_found += 1

    if patch_info.functions:
        loc_score = (locations_found / len(patch_info.functions)) * 20
        scores['location_identification'] = min(20, loc_score)
        scores['details']['locations_expected'] = patch_info.functions[:3]
        scores['details']['locations_found'] = locations_found
    else:
        scores['location_identification'] = 10  # Partial credit

    # 3. FIX LOGIC MATCH (40 points)
    # 3a. Key code patterns from the fix that appear in the response.
    meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)
    patterns_found = sum(
        1 for pattern in meaningful_patterns if pattern.lower() in response_lower
    )

    if meaningful_patterns:
        pattern_score = (patterns_found / len(meaningful_patterns)) * 20
        scores['details']['patterns_expected'] = meaningful_patterns[:10]
        scores['details']['patterns_found'] = patterns_found
    else:
        pattern_score = 10

    # 3b. Added lines from the patch. Normalise the response once, outside
    # the loop. Blank additions are skipped because an empty string is a
    # substring of every response and would earn credit for nothing.
    response_normalized = _squash(response_lower)
    response_lines = [
        line for line in map(_squash, response_lower.splitlines()) if line
    ]
    candidate_additions = [
        norm
        for norm in (_squash(a.lower()) for a in patch_info.additions)
        if norm
    ][:5]  # Check first 5 non-blank additions

    additions_matched = 0
    for addition in candidate_additions:
        if addition in response_normalized:
            additions_matched += 1
            continue
        # Fuzzy-match line by line. Comparing one patch line against the
        # whole response yields a ratio near zero for any realistic response
        # length, so the 0.6 threshold would never be reached.
        matcher = SequenceMatcher(None, b=addition)
        for line in response_lines:
            matcher.set_seq1(line)
            if matcher.ratio() > 0.6:
                additions_matched += 1
                break

    if candidate_additions:
        addition_score = (additions_matched / len(candidate_additions)) * 20
        scores['details']['additions_matched'] = additions_matched
    else:
        addition_score = 10

    scores['fix_logic_match'] = min(40, pattern_score + addition_score)

    # 4. COMPLETENESS (20 points)
    completeness_score = 0

    # Has code block?
    if '```' in response_text:
        completeness_score += 5

    # Has test considerations?
    if 'test' in response_lower:
        completeness_score += 5

    # Mentions the specific error/issue?
    problem_keywords = extract_problem_keywords(ground_truth.get('problem_statement', ''))
    keywords_found = sum(1 for kw in problem_keywords if kw.lower() in response_lower)
    if problem_keywords:
        completeness_score += min(5, (keywords_found / len(problem_keywords)) * 5)
    else:
        completeness_score += 2.5

    # Has explanation of why fix works?
    explanation_words = ['because', 'this fixes', 'this resolves', 'the issue', 'the problem', 'solution']
    if any(word in response_lower for word in explanation_words):
        completeness_score += 5

    scores['completeness'] = min(20, completeness_score)

    # Total
    scores['total'] = round(
        scores['file_identification'] +
        scores['location_identification'] +
        scores['fix_logic_match'] +
        scores['completeness'],
        1,
    )

    return scores
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def main():
    """CLI entry point: score one response file against a SWE-bench scenario.

    Expects ``sys.argv[1]`` to be the scenario name and ``sys.argv[2]`` the
    path of a JSON response file with a ``'result'`` field. Prints a score
    report and writes the result as JSON next to the response file.

    Returns:
        The scores dict produced by ``score_response``.

    Exits with status 1 on bad usage, an unknown scenario, or a response
    file without a ``'result'`` field.
    """
    if len(sys.argv) < 3:
        print("Usage: ground-truth-judge.py <scenario_name> <response_file>")
        print("Example: ground-truth-judge.py flask-5014 run_20260102T134237Z.json")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load SWE-bench data
    swebench_data = load_swebench_data()

    # Find scenario
    scenario = find_scenario(swebench_data, scenario_name)
    if not scenario:
        print(f"Error: Scenario '{scenario_name}' not found in SWE-bench data")
        sys.exit(1)

    # Load response
    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    response_text = response_data.get('result', '')
    if not response_text:
        print("Error: No 'result' field in response file")
        sys.exit(1)

    # Score
    scores = score_response(response_text, scenario)

    # Output
    print(f"\n{'='*60}")
    print(f"GROUND TRUTH EVALUATION: {scenario_name}")
    print(f"{'='*60}")
    print("\nScores:")
    print(f"  File Identification:     {scores['file_identification']:5.1f}/20")
    print(f"  Location Identification: {scores['location_identification']:5.1f}/20")
    print(f"  Fix Logic Match:         {scores['fix_logic_match']:5.1f}/40")
    print(f"  Completeness:            {scores['completeness']:5.1f}/20")
    print(f"  {'─'*40}")
    print(f"  TOTAL:                   {scores['total']:5.1f}/100")

    print("\nDetails:")
    for key, value in scores['details'].items():
        print(f"  {key}: {value}")

    # Output JSON for programmatic use
    output = {
        'scenario': scenario_name,
        'instance_id': scenario.get('instance_id'),
        'scores': scores,
        'ground_truth_patch_preview': scenario.get('patch', '')[:300]
    }

    # Save judge output. Only the file name is rewritten, never a directory
    # component. A name without 'run_' gets a prefix instead, because a plain
    # str.replace would return the input path unchanged and the response file
    # would be overwritten.
    response_path = Path(response_file)
    if 'run_' in response_path.name:
        output_name = response_path.name.replace('run_', 'gt_judge_')
    else:
        output_name = 'gt_judge_' + response_path.name
    output_path = str(response_path.with_name(output_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    return scores


if __name__ == '__main__':
    main()
|
|
@@ -1,374 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
SWE-bench scenario judge using:
|
|
4
|
-
1. Scenario-specific scoring rubric from YAML
|
|
5
|
-
2. Ground-truth validation from actual SWE-bench patches
|
|
6
|
-
|
|
7
|
-
Scoring structure:
|
|
8
|
-
- root_cause (30%): IDENTIFIES_BUG_LOCATION (15) + EXPLAINS_WHY_BROKEN (15)
|
|
9
|
-
- fix_quality (40%): FIX_ADDRESSES_ISSUE (20) + FIX_IS_MINIMAL (10) + FIX_SYNTAX_CORRECT (10)
|
|
10
|
-
- completeness (20%): EDGE_CASES (10) + TEST_COVERAGE (10)
|
|
11
|
-
- persona (10%): IN_CHARACTER (10)
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import json
|
|
15
|
-
import re
|
|
16
|
-
import sys
|
|
17
|
-
from pathlib import Path
|
|
18
|
-
from difflib import SequenceMatcher
|
|
19
|
-
|
|
20
|
-
# Add parent to path for pennyfarthing_scripts imports
|
|
21
|
-
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
22
|
-
|
|
23
|
-
from pennyfarthing_scripts.swebench import (
|
|
24
|
-
extract_patch_info,
|
|
25
|
-
find_scenario,
|
|
26
|
-
load_swebench_data,
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def score_identifies_bug_location(response, ground_truth):
    """Score IDENTIFIES_BUG_LOCATION (15 pts) against the ground-truth patch.

    Up to 7.5 points come from mentioning the patched files (base name or
    full path) and up to 7.5 from mentioning the ``def``/``class`` names
    found in the patch's function entries. A patch with no function entries
    earns a flat 3.75 for that half.

    Returns:
        ``(score, details)`` where ``details`` is a list of summary strings.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    haystack = response.lower()
    expected_files = patch_info.files
    expected_funcs = patch_info.functions

    points = 0
    notes = []

    # Files half (7.5 pts)
    if expected_files:
        file_hits = sum(
            1
            for path in expected_files
            if Path(path).name.lower() in haystack or path.lower() in haystack
        )
        points += (file_hits / len(expected_files)) * 7.5
        notes.append(f"Files: {file_hits}/{len(expected_files)} found")

    # Functions/classes half (7.5 pts)
    if not expected_funcs:
        points += 3.75  # Partial credit if no specific function in patch
    else:
        func_hits = 0
        for header in expected_funcs:
            found = re.search(r'(def|class)\s+(\w+)', header)
            if found and found.group(2).lower() in haystack:
                func_hits += 1
        points += min(7.5, (func_hits / len(expected_funcs)) * 7.5)
        notes.append(f"Functions: {func_hits}/{len(expected_funcs)} found")

    return min(15, points), notes
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def score_explains_why_broken(response, ground_truth):
    """Score EXPLAINS_WHY_BROKEN (15 pts).

    7.5 points are awarded when the response contains any explanation marker
    phrase. The other 7.5 are proportional to how many key terms from the
    problem statement reappear in the response. Key terms are quoted or
    back-ticked spans plus ``*Error``/``*Exception`` names, capped at 10.
    With no key terms, a flat 3.75 is given for that half.

    Args:
        response: The response text to score.
        ground_truth: Mapping that may contain a ``'problem_statement'``.

    Returns:
        ``(score, details)`` where ``details`` is a list of summary strings.
    """
    response_lower = response.lower()
    problem = ground_truth.get('problem_statement', '').lower()

    score = 0
    details = []

    # Extract key terms from problem statement
    quoted_terms = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem)
    error_names = re.findall(r'\b\w+Error\b|\b\w+Exception\b', problem, re.IGNORECASE)
    # De-duplicate while keeping first-seen order. Going through set() would
    # make the 10-term cut-off depend on string hash order, so the same input
    # could score differently from run to run.
    key_terms = list(dict.fromkeys(quoted_terms + error_names))[:10]

    # Check for explanation of the issue
    explanation_markers = ['because', 'this happens', 'the issue', 'the problem', 'fails when', 'breaks when', 'causes']
    if any(marker in response_lower for marker in explanation_markers):
        score += 7.5
        details.append("Has explanation of why broken")

    # Check for key terms from problem
    if key_terms:
        terms_found = sum(1 for term in key_terms if term.lower() in response_lower)
        score += (terms_found / len(key_terms)) * 7.5
        details.append(f"Key terms: {terms_found}/{len(key_terms)}")
    else:
        score += 3.75

    return min(15, score), details
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def score_fix_addresses_issue(response, ground_truth):
    """Score FIX_ADDRESSES_ISSUE (20 pts) using the ground-truth patch.

    Up to 15 points are proportional to how many of the first five non-blank
    added lines of the patch appear in the response. An exact
    (whitespace-insensitive) match counts 1, a fuzzy match against a single
    response line counts 0.5. A further 5 points are given when the response
    contains a fenced code block.

    Args:
        response: The response text to score.
        ground_truth: Mapping that may contain a ``'patch'`` entry.

    Returns:
        ``(score, details)`` where ``details`` is a list of summary strings.
    """

    def _squash(text):
        # Collapse whitespace runs so indentation differences do not matter.
        return re.sub(r'\s+', ' ', text).strip()

    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    response_lower = response.lower()

    score = 0
    details = []

    # Normalise the response once rather than on every loop iteration.
    response_norm = _squash(response_lower)
    response_lines = [
        line for line in map(_squash, response_lower.splitlines()) if line
    ]
    # Skip blank additions: an empty string is a substring of every response
    # and would be counted as a match.
    candidates = [
        norm
        for norm in (_squash(a.lower()) for a in patch_info.additions)
        if norm
    ][:5]

    # Check if key additions from patch appear in response
    additions_matched = 0
    for addition_norm in candidates:
        if addition_norm in response_norm:
            additions_matched += 1
            continue
        # Fuzzy match per response line. A ratio taken against the whole
        # response is dominated by the length difference and would never
        # reach the threshold.
        matcher = SequenceMatcher(None, b=addition_norm)
        for line in response_lines:
            matcher.set_seq1(line)
            if matcher.ratio() > 0.7:
                additions_matched += 0.5
                break

    if candidates:
        score += (additions_matched / len(candidates)) * 15
        details.append(f"Code matches: {additions_matched}/{len(candidates)}")

    # Check for code block with fix
    if '```' in response:
        score += 5
        details.append("Has code block")

    return min(20, score), details
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def score_fix_is_minimal(response, ground_truth):
    """Score FIX_IS_MINIMAL (10 pts).

    Compares the number of code lines inside the response's fenced blocks
    with the size of the ground-truth patch (additions + deletions). A ratio
    up to 2 earns 10, up to 4 earns 5, and anything larger earns 2. A
    response with no code lines is treated as ratio 1, and an empty patch
    yields a flat 5.

    Returns:
        ``(score, details)`` where ``details`` is a list of summary strings.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    patch_size = len(patch_info.additions) + len(patch_info.deletions)

    # Nothing to compare against: neutral score, no detail line.
    if patch_size <= 0:
        return 5, []

    # Total line count over every fenced code block in the response.
    fenced = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
    code_line_total = 0
    for snippet in fenced:
        code_line_total += snippet.strip().count('\n') + 1

    size_ratio = code_line_total / patch_size if code_line_total > 0 else 1

    if size_ratio <= 2:
        points, label = 10, "Minimal"
    elif size_ratio <= 4:
        points, label = 5, "Somewhat verbose"
    else:
        points, label = 2, "Over-engineered"

    notes = [f"{label}: {code_line_total} lines (patch: {patch_size})"]
    return min(10, points), notes
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
def score_fix_syntax_correct(response):
    """Score FIX_SYNTAX_CORRECT (10 pts).

    Fenced code blocks tagged as Python are compiled, without being run, to
    check their syntax. If there are none, untagged fenced blocks are used
    instead. All blocks compiling earns 10. A syntax error, or having no
    blocks to check, earns 5.

    Args:
        response: The response text to score.

    Returns:
        ``(score, details)`` where ``details`` is a list of summary strings.
    """
    import textwrap

    details = []

    # Extract code blocks. Accept the common tag spellings (python, python3,
    # py, any case), optional trailing spaces and CRLF line endings.
    code_blocks = re.findall(
        r'```(?:python3?|py)[ \t]*\r?\n(.*?)```',
        response,
        re.DOTALL | re.IGNORECASE,
    )
    if not code_blocks:
        code_blocks = re.findall(r'```[ \t]*\r?\n(.*?)```', response, re.DOTALL)

    if not code_blocks:
        details.append("No code blocks to validate")
        return 5, details

    valid = True
    for block in code_blocks:
        try:
            # Dedent first: a snippet quoted from inside a function or class
            # is uniformly indented and would otherwise be rejected with an
            # IndentationError even though the code is fine.
            compile(textwrap.dedent(block), '<string>', 'exec')
        except (SyntaxError, ValueError):
            # ValueError covers source containing null bytes on interpreters
            # that do not report it as a SyntaxError.
            valid = False
            break

    if valid:
        score = 10
        details.append("Syntax valid")
    else:
        score = 5
        details.append("Syntax errors detected")

    return min(10, score), details
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
def score_edge_cases(response):
    """Score EDGE_CASES (10 pts).

    Each edge-case marker phrase found in the lower-cased response is worth
    2 points, capped at 10.

    Returns:
        ``(score, details)`` where ``details`` holds one summary string.
    """
    text = response.lower()
    markers = (
        'edge case', 'corner case', 'what if', 'consider', 'also', 'none',
        'empty', 'null', 'zero', 'negative', 'boundary',
    )

    hits = len([marker for marker in markers if marker in text])
    points = min(10, hits * 2)

    return points, [f"Edge case markers: {hits}"]
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
def score_test_coverage(response):
|
|
220
|
-
"""Score TEST_COVERAGE (10 pts)."""
|
|
221
|
-
response_lower = response.lower()
|
|
222
|
-
|
|
223
|
-
score = 0
|
|
224
|
-
details = []
|
|
225
|
-
|
|
226
|
-
# Check for test-related content
|
|
227
|
-
has_test_section = 'test' in response_lower
|
|
228
|
-
has_test_function = 'def test_' in response_lower or 'test_' in response
|
|
229
|
-
has_assert = 'assert' in response_lower or 'pytest' in response_lower
|
|
230
|
-
|
|
231
|
-
if has_test_function:
|
|
232
|
-
score += 5
|
|
233
|
-
details.append("Has test function")
|
|
234
|
-
if has_assert:
|
|
235
|
-
score += 3
|
|
236
|
-
details.append("Has assertions")
|
|
237
|
-
if has_test_section:
|
|
238
|
-
score += 2
|
|
239
|
-
details.append("Has test section")
|
|
240
|
-
|
|
241
|
-
return min(10, score), details
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
def score_in_character(response, persona="senior developer"):
    """Score IN_CHARACTER (10 pts).

    For the control baseline this checks professional tone: each
    professional marker phrase found in the lower-cased response is worth
    2 points, capped at 10. The ``persona`` argument is accepted but not
    consulted.

    Returns:
        ``(score, details)`` where ``details`` holds one summary string.
    """
    text = response.lower()
    tone_phrases = (
        'i recommend', 'we should', 'this approach', 'the fix', 'analysis',
        'root cause',
    )

    hits = len([phrase for phrase in tone_phrases if phrase in text])
    points = min(10, hits * 2)

    return points, [f"Professional markers: {hits}"]
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def judge_response(scenario_name, response_text, swebench_data):
    """Run every rubric criterion and assemble the full judgment.

    Looks the scenario up in ``swebench_data``, scores the response in four
    categories (root_cause, fix_quality, completeness, persona) and returns
    the per-criterion scores, per-category subtotals, the overall total and
    the collected detail strings.

    Returns:
        A result dict, or ``{'error': ...}`` when the scenario is unknown.
    """
    ground_truth = find_scenario(swebench_data, scenario_name)
    if not ground_truth:
        return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}

    # Category -> ordered (criterion, (score, details)) pairs. The scorers
    # run here, in rubric order, as the list is built.
    rubric = [
        ('root_cause', [  # 30%
            ('IDENTIFIES_BUG_LOCATION', score_identifies_bug_location(response_text, ground_truth)),
            ('EXPLAINS_WHY_BROKEN', score_explains_why_broken(response_text, ground_truth)),
        ]),
        ('fix_quality', [  # 40%
            ('FIX_ADDRESSES_ISSUE', score_fix_addresses_issue(response_text, ground_truth)),
            ('FIX_IS_MINIMAL', score_fix_is_minimal(response_text, ground_truth)),
            ('FIX_SYNTAX_CORRECT', score_fix_syntax_correct(response_text)),
        ]),
        ('completeness', [  # 20%
            ('EDGE_CASES', score_edge_cases(response_text)),
            ('TEST_COVERAGE', score_test_coverage(response_text)),
        ]),
        ('persona', [  # 10%
            ('IN_CHARACTER', score_in_character(response_text)),
        ]),
    ]

    scores = {}
    all_details = {}
    total = 0
    for category, outcomes in rubric:
        entry = {}
        notes = []
        subtotal = 0
        for criterion, (points, detail) in outcomes:
            entry[criterion] = points
            subtotal = subtotal + points
            notes = notes + detail
        entry['subtotal'] = subtotal
        scores[category] = entry
        all_details[category] = notes
        total = total + subtotal

    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    return {
        'scenario': scenario_name,
        'instance_id': ground_truth.get('instance_id'),
        'scores': scores,
        'total': round(total, 1),
        'details': all_details,
        'ground_truth_files': patch_info.files
    }
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
def main():
    """CLI entry point: judge one response file for a SWE-bench scenario.

    Expects ``sys.argv[1]`` to be the scenario name and ``sys.argv[2]`` the
    path of a JSON response file whose text is under ``'result'`` or
    ``'response_text'``. Prints a per-category report and writes the full
    result as JSON next to the response file.

    Exits with status 1 on bad usage or when the scenario cannot be found.
    """
    if len(sys.argv) < 3:
        print("Usage: swebench-judge.py <scenario_name> <response_file>")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load data
    swebench_data = load_swebench_data()

    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    # Handle different JSON structures
    response_text = response_data.get('result', '') or response_data.get('response_text', '')

    # Judge
    result = judge_response(scenario_name, response_text, swebench_data)

    # judge_response reports an unknown scenario as {'error': ...}. Stop
    # here; reading result['scores'] below would raise KeyError.
    if 'error' in result:
        print(f"Error: {result['error']}")
        sys.exit(1)

    # Display
    print(f"\n{'='*60}")
    print(f"SWE-BENCH JUDGE: {scenario_name}")
    print(f"{'='*60}")

    for category, scores in result['scores'].items():
        print(f"\n{category.upper()} ({scores['subtotal']:.1f} pts)")
        for criterion, score in scores.items():
            if criterion != 'subtotal':
                print(f"  {criterion}: {score:.1f}")

    print(f"\n{'─'*40}")
    print(f"TOTAL: {result['total']}/100")

    print(f"\nGround truth files: {result['ground_truth_files']}")

    # Save. Only the file name is rewritten, never a directory component.
    # A name without 'run_' gets a prefix instead, because a plain
    # str.replace would return the input path unchanged and the response
    # file would be overwritten.
    response_path = Path(response_file)
    if 'run_' in response_path.name:
        output_name = response_path.name.replace('run_', 'swebench_judge_')
    else:
        output_name = 'swebench_judge_' + response_path.name
    output_path = str(response_path.with_name(output_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to: {output_path}")


if __name__ == '__main__':
    main()
|