@pennyfarthing/core 10.0.0 → 10.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +14 -0
- package/package.json +22 -12
- package/pennyfarthing-dist/agents/README.md +348 -0
- package/pennyfarthing-dist/agents/architect.md +180 -0
- package/pennyfarthing-dist/agents/dev.md +169 -0
- package/pennyfarthing-dist/agents/devops.md +203 -0
- package/pennyfarthing-dist/agents/handoff.md +235 -0
- package/pennyfarthing-dist/agents/orchestrator.md +182 -0
- package/pennyfarthing-dist/agents/pm.md +152 -0
- package/pennyfarthing-dist/agents/reviewer-preflight.md +129 -0
- package/pennyfarthing-dist/agents/reviewer.md +197 -0
- package/pennyfarthing-dist/agents/sm-file-summary.md +79 -0
- package/pennyfarthing-dist/agents/sm-finish.md +82 -0
- package/pennyfarthing-dist/agents/sm-handoff.md +129 -0
- package/pennyfarthing-dist/agents/sm-setup.md +251 -0
- package/pennyfarthing-dist/agents/sm.md +298 -0
- package/pennyfarthing-dist/agents/tea.md +161 -0
- package/pennyfarthing-dist/agents/tech-writer.md +226 -0
- package/pennyfarthing-dist/agents/testing-runner.md +184 -0
- package/pennyfarthing-dist/agents/ux-designer.md +236 -0
- package/pennyfarthing-dist/agents/workflow-status-check.md +96 -0
- package/pennyfarthing-dist/commands/architect.md +9 -0
- package/pennyfarthing-dist/commands/benchmark-control.md +69 -0
- package/pennyfarthing-dist/commands/benchmark.md +485 -0
- package/pennyfarthing-dist/commands/brainstorming.md +91 -0
- package/pennyfarthing-dist/commands/check.md +156 -0
- package/pennyfarthing-dist/commands/chore.md +218 -0
- package/pennyfarthing-dist/commands/close-epic.md +139 -0
- package/pennyfarthing-dist/commands/continue-session.md +218 -0
- package/pennyfarthing-dist/commands/create-branches-from-story.md +358 -0
- package/pennyfarthing-dist/commands/create-theme.md +29 -0
- package/pennyfarthing-dist/commands/dev.md +9 -0
- package/pennyfarthing-dist/commands/devops.md +9 -0
- package/pennyfarthing-dist/commands/fix-blocker.md +22 -0
- package/pennyfarthing-dist/commands/git-cleanup.md +57 -0
- package/pennyfarthing-dist/commands/health-check.md +143 -0
- package/pennyfarthing-dist/commands/help.md +264 -0
- package/pennyfarthing-dist/commands/job-fair.md +102 -0
- package/pennyfarthing-dist/commands/list-themes.md +21 -0
- package/pennyfarthing-dist/commands/new-work.md +23 -0
- package/pennyfarthing-dist/commands/orchestrator.md +9 -0
- package/pennyfarthing-dist/commands/parallel-work.md +73 -0
- package/pennyfarthing-dist/commands/party-mode.md +77 -0
- package/pennyfarthing-dist/commands/patch.md +210 -0
- package/pennyfarthing-dist/commands/permissions.md +193 -0
- package/pennyfarthing-dist/commands/pm.md +9 -0
- package/pennyfarthing-dist/commands/prime.md +136 -0
- package/pennyfarthing-dist/commands/release.md +74 -0
- package/pennyfarthing-dist/commands/repo-status.md +49 -0
- package/pennyfarthing-dist/commands/retro.md +200 -0
- package/pennyfarthing-dist/commands/reviewer.md +9 -0
- package/pennyfarthing-dist/commands/run-ci.md +116 -0
- package/pennyfarthing-dist/commands/set-theme.md +56 -0
- package/pennyfarthing-dist/commands/setup.md +65 -0
- package/pennyfarthing-dist/commands/show-theme.md +21 -0
- package/pennyfarthing-dist/commands/sm.md +9 -0
- package/pennyfarthing-dist/commands/solo.md +447 -0
- package/pennyfarthing-dist/commands/sprint-planning.md +109 -0
- package/pennyfarthing-dist/commands/sprint.md +142 -0
- package/pennyfarthing-dist/commands/standalone.md +194 -0
- package/pennyfarthing-dist/commands/start-epic.md +168 -0
- package/pennyfarthing-dist/commands/sync-epic-to-jira.md +184 -0
- package/pennyfarthing-dist/commands/sync-work-with-sprint.md +373 -0
- package/pennyfarthing-dist/commands/tea.md +9 -0
- package/pennyfarthing-dist/commands/tech-writer.md +9 -0
- package/pennyfarthing-dist/commands/theme-maker.md +676 -0
- package/pennyfarthing-dist/commands/update-domain-docs.md +83 -0
- package/pennyfarthing-dist/commands/ux-designer.md +9 -0
- package/pennyfarthing-dist/commands/work.md +25 -0
- package/pennyfarthing-dist/commands/workflow.md +21 -0
- package/pennyfarthing-dist/guides/agent-behavior.md +92 -0
- package/pennyfarthing-dist/guides/agent-coordination.md +475 -0
- package/pennyfarthing-dist/guides/agent-tag-taxonomy.md +432 -0
- package/pennyfarthing-dist/guides/agent-template-strategic.md +148 -0
- package/pennyfarthing-dist/guides/agent-template-tactical.md +162 -0
- package/pennyfarthing-dist/guides/hooks.md +230 -0
- package/pennyfarthing-dist/guides/measurement-framework.md +210 -0
- package/pennyfarthing-dist/guides/patterns/approval-gates-pattern.md +766 -0
- package/pennyfarthing-dist/guides/patterns/fan-out-fan-in-pattern.md +574 -0
- package/pennyfarthing-dist/guides/patterns/helper-delegation-pattern.md +488 -0
- package/pennyfarthing-dist/guides/patterns/tdd-flow-pattern.md +402 -0
- package/pennyfarthing-dist/guides/permission-protocol.md +188 -0
- package/pennyfarthing-dist/guides/persona-loading.md +46 -0
- package/pennyfarthing-dist/guides/prompt-patterns.md +338 -0
- package/pennyfarthing-dist/guides/scale-levels.md +114 -0
- package/pennyfarthing-dist/guides/session-artifacts.md +193 -0
- package/pennyfarthing-dist/guides/session-schema.md +346 -0
- package/pennyfarthing-dist/guides/skill-schema.md +412 -0
- package/pennyfarthing-dist/guides/workflow-schema.md +257 -0
- package/pennyfarthing-dist/guides/workflow-step-schema.md +512 -0
- package/pennyfarthing-dist/guides/worktree-mode.md +113 -0
- package/pennyfarthing-dist/guides/xml-tags.md +627 -0
- package/pennyfarthing-dist/output-styles/teaching.md +33 -0
- package/pennyfarthing-dist/output-styles/terse.md +20 -0
- package/pennyfarthing-dist/output-styles/verbose.md +28 -0
- package/pennyfarthing-dist/personas/themes/a-team.yaml +331 -0
- package/pennyfarthing-dist/personas/themes/alice-in-wonderland.yaml +324 -0
- package/pennyfarthing-dist/personas/themes/battlestar-galactica.yaml +282 -0
- package/pennyfarthing-dist/personas/themes/blade-runner.yaml +289 -0
- package/pennyfarthing-dist/personas/themes/catch-22.yaml +304 -0
- package/pennyfarthing-dist/personas/themes/control.yaml +201 -0
- package/pennyfarthing-dist/personas/themes/cowboy-bebop.yaml +315 -0
- package/pennyfarthing-dist/personas/themes/discworld.yaml +334 -0
- package/pennyfarthing-dist/personas/themes/doctor-who.yaml +284 -0
- package/pennyfarthing-dist/personas/themes/dune.yaml +301 -0
- package/pennyfarthing-dist/personas/themes/firefly.yaml +320 -0
- package/pennyfarthing-dist/personas/themes/game-of-thrones.yaml +284 -0
- package/pennyfarthing-dist/personas/themes/harry-potter.yaml +316 -0
- package/pennyfarthing-dist/personas/themes/hitchhikers-guide.yaml +323 -0
- package/pennyfarthing-dist/personas/themes/lord-of-the-rings.yaml +326 -0
- package/pennyfarthing-dist/personas/themes/mad-max.yaml +349 -0
- package/pennyfarthing-dist/personas/themes/mash.yaml +329 -0
- package/pennyfarthing-dist/personas/themes/princess-bride.yaml +344 -0
- package/pennyfarthing-dist/personas/themes/sandman.yaml +282 -0
- package/pennyfarthing-dist/personas/themes/star-trek-tng.yaml +358 -0
- package/pennyfarthing-dist/personas/themes/star-wars.yaml +297 -0
- package/pennyfarthing-dist/personas/themes/the-expanse.yaml +337 -0
- package/pennyfarthing-dist/personas/themes/the-matrix.yaml +342 -0
- package/pennyfarthing-dist/personas/themes/watchmen.yaml +285 -0
- package/pennyfarthing-dist/personas/themes/west-wing.yaml +285 -0
- package/pennyfarthing-dist/personas/themes/x-files.yaml +296 -0
- package/pennyfarthing-dist/scripts/README.md +87 -0
- package/pennyfarthing-dist/scripts/core/README.md +25 -0
- package/pennyfarthing-dist/scripts/core/agent-session.sh +390 -0
- package/pennyfarthing-dist/scripts/core/check-context.sh +194 -0
- package/pennyfarthing-dist/scripts/core/handoff-marker.sh +112 -0
- package/pennyfarthing-dist/scripts/core/phase-check-start.sh +90 -0
- package/pennyfarthing-dist/scripts/core/prime.sh +30 -0
- package/pennyfarthing-dist/scripts/cyclist/is-cyclist.sh +21 -0
- package/pennyfarthing-dist/scripts/git/README.md +25 -0
- package/pennyfarthing-dist/scripts/git/create-feature-branches.sh +267 -0
- package/pennyfarthing-dist/scripts/git/git-status-all.sh +152 -0
- package/pennyfarthing-dist/scripts/git/install-git-hooks.sh +79 -0
- package/pennyfarthing-dist/scripts/git/release.sh +246 -0
- package/pennyfarthing-dist/scripts/git/worktree-manager.sh +497 -0
- package/pennyfarthing-dist/scripts/health/drift-detection.sh +156 -0
- package/pennyfarthing-dist/scripts/hooks/README.md +32 -0
- package/pennyfarthing-dist/scripts/hooks/__pycache__/question_reflector_check.cpython-314.pyc +0 -0
- package/pennyfarthing-dist/scripts/hooks/bell-mode-hook.sh +106 -0
- package/pennyfarthing-dist/scripts/hooks/context-circuit-breaker.sh +95 -0
- package/pennyfarthing-dist/scripts/hooks/context-warning.sh +65 -0
- package/pennyfarthing-dist/scripts/hooks/otel-auto-config.sh +35 -0
- package/pennyfarthing-dist/scripts/hooks/post-merge.sh +150 -0
- package/pennyfarthing-dist/scripts/hooks/pre-commit.sh +190 -0
- package/pennyfarthing-dist/scripts/hooks/pre-edit-check.sh +71 -0
- package/pennyfarthing-dist/scripts/hooks/pre-push.sh +42 -0
- package/pennyfarthing-dist/scripts/hooks/question-reflector-check.sh +20 -0
- package/pennyfarthing-dist/scripts/hooks/question_reflector_check.py +499 -0
- package/pennyfarthing-dist/scripts/hooks/schema-validation.sh +30 -0
- package/pennyfarthing-dist/scripts/hooks/session-start.sh +97 -0
- package/pennyfarthing-dist/scripts/hooks/session-stop.sh +65 -0
- package/pennyfarthing-dist/scripts/hooks/sprint-yaml-validation.sh +78 -0
- package/pennyfarthing-dist/scripts/hooks/welcome-hook.sh +94 -0
- package/pennyfarthing-dist/scripts/jira/README.md +36 -0
- package/pennyfarthing-dist/scripts/jira/create-jira-epic.sh +95 -0
- package/pennyfarthing-dist/scripts/jira/create-jira-story.sh +91 -0
- package/pennyfarthing-dist/scripts/jira/jira-claim-story.sh +22 -0
- package/pennyfarthing-dist/scripts/jira/jira-lib.sh +464 -0
- package/pennyfarthing-dist/scripts/jira/jira-reconcile.sh +260 -0
- package/pennyfarthing-dist/scripts/jira/jira-sync-story.sh +18 -0
- package/pennyfarthing-dist/scripts/jira/jira-sync.sh +16 -0
- package/pennyfarthing-dist/scripts/jira/sync-epic-jira.sh +16 -0
- package/pennyfarthing-dist/scripts/jira/sync-epic-to-jira.sh +16 -0
- package/pennyfarthing-dist/scripts/lib/README.md +29 -0
- package/pennyfarthing-dist/scripts/lib/background-tasks.sh +177 -0
- package/pennyfarthing-dist/scripts/lib/checkpoint.sh +136 -0
- package/pennyfarthing-dist/scripts/lib/common.sh +212 -0
- package/pennyfarthing-dist/scripts/lib/file-lock.sh +269 -0
- package/pennyfarthing-dist/scripts/lib/find-root.sh +58 -0
- package/pennyfarthing-dist/scripts/lib/logging.sh +186 -0
- package/pennyfarthing-dist/scripts/lib/retry.sh +76 -0
- package/pennyfarthing-dist/scripts/maintenance/migrate-theme-schema.mjs +102 -0
- package/pennyfarthing-dist/scripts/maintenance/sidecar-health.sh +91 -0
- package/pennyfarthing-dist/scripts/misc/README.md +44 -0
- package/pennyfarthing-dist/scripts/misc/add-short-names.sh +13 -0
- package/pennyfarthing-dist/scripts/misc/add_short_names.py +226 -0
- package/pennyfarthing-dist/scripts/misc/backlog.sh +77 -0
- package/pennyfarthing-dist/scripts/misc/check-status.sh +247 -0
- package/pennyfarthing-dist/scripts/misc/find-related-work.sh +231 -0
- package/pennyfarthing-dist/scripts/misc/generate-skill-docs.sh +107 -0
- package/pennyfarthing-dist/scripts/misc/log-skill-usage.sh +74 -0
- package/pennyfarthing-dist/scripts/misc/migrate-bmad-workflow.sh +10 -0
- package/pennyfarthing-dist/scripts/misc/migrate_bmad_workflow.py +319 -0
- package/pennyfarthing-dist/scripts/misc/repo-scan.sh +141 -0
- package/pennyfarthing-dist/scripts/misc/repo-utils.sh +778 -0
- package/pennyfarthing-dist/scripts/misc/run-ci.sh +212 -0
- package/pennyfarthing-dist/scripts/misc/run-timestamp.sh +7 -0
- package/pennyfarthing-dist/scripts/misc/session-cleanup.sh +319 -0
- package/pennyfarthing-dist/scripts/misc/skill-usage-report.sh +193 -0
- package/pennyfarthing-dist/scripts/misc/statusline.sh +257 -0
- package/pennyfarthing-dist/scripts/misc/uninstall.sh +275 -0
- package/pennyfarthing-dist/scripts/misc/validate-subagent-frontmatter.sh +160 -0
- package/pennyfarthing-dist/scripts/portraits/generate-portraits.py +417 -0
- package/pennyfarthing-dist/scripts/portraits/generate-portraits.sh +54 -0
- package/pennyfarthing-dist/scripts/sprint/README.md +29 -0
- package/pennyfarthing-dist/scripts/sprint/archive-story.sh +133 -0
- package/pennyfarthing-dist/scripts/sprint/available-stories.sh +91 -0
- package/pennyfarthing-dist/scripts/sprint/check-story.sh +158 -0
- package/pennyfarthing-dist/scripts/sprint/get-epic-field.sh +52 -0
- package/pennyfarthing-dist/scripts/sprint/get-story-field.sh +63 -0
- package/pennyfarthing-dist/scripts/sprint/list-future.sh +145 -0
- package/pennyfarthing-dist/scripts/sprint/new-sprint.sh +110 -0
- package/pennyfarthing-dist/scripts/sprint/promote-epic.sh +148 -0
- package/pennyfarthing-dist/scripts/sprint/sprint-common.sh +415 -0
- package/pennyfarthing-dist/scripts/sprint/sprint-info.sh +33 -0
- package/pennyfarthing-dist/scripts/sprint/sprint-metrics.sh +230 -0
- package/pennyfarthing-dist/scripts/sprint/sprint-status.sh +134 -0
- package/pennyfarthing-dist/scripts/sprint/validate-sprint-yaml.sh +139 -0
- package/pennyfarthing-dist/scripts/story/README.md +23 -0
- package/pennyfarthing-dist/scripts/story/create-story.sh +19 -0
- package/pennyfarthing-dist/scripts/story/size-story.sh +18 -0
- package/pennyfarthing-dist/scripts/story/story-template.sh +18 -0
- package/pennyfarthing-dist/scripts/test/README.md +23 -0
- package/pennyfarthing-dist/scripts/test/ensure-swebench-data.sh +59 -0
- package/pennyfarthing-dist/scripts/test/ground-truth-judge.py +220 -0
- package/pennyfarthing-dist/scripts/test/swebench-judge.py +374 -0
- package/pennyfarthing-dist/scripts/test/test-cache.sh +165 -0
- package/pennyfarthing-dist/scripts/test/test-setup.sh +337 -0
- package/pennyfarthing-dist/scripts/tests/check.test.sh +582 -0
- package/pennyfarthing-dist/scripts/tests/dev-story-workflow-import.test.sh +515 -0
- package/pennyfarthing-dist/scripts/tests/epics-and-stories-workflow-import.test.sh +599 -0
- package/pennyfarthing-dist/scripts/tests/handoff-phase-update.test.sh +332 -0
- package/pennyfarthing-dist/scripts/tests/implementation-readiness-workflow-import.test.sh +573 -0
- package/pennyfarthing-dist/scripts/tests/migrate-bmad-workflow.test.sh +859 -0
- package/pennyfarthing-dist/scripts/tests/prd-workflow-import.test.sh +662 -0
- package/pennyfarthing-dist/scripts/tests/project-context-workflow-import.test.sh +589 -0
- package/pennyfarthing-dist/scripts/tests/test-character-voice.sh +106 -0
- package/pennyfarthing-dist/scripts/tests/test-drift-detection.sh +597 -0
- package/pennyfarthing-dist/scripts/tests/test-post-merge-hook.sh +514 -0
- package/pennyfarthing-dist/scripts/tests/test-session-checkpoint.sh +517 -0
- package/pennyfarthing-dist/scripts/tests/test-solo-command.sh +331 -0
- package/pennyfarthing-dist/scripts/tests/ux-design-workflow-import.test.sh +647 -0
- package/pennyfarthing-dist/scripts/theme/README.md +22 -0
- package/pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh +13 -0
- package/pennyfarthing-dist/scripts/theme/compute_theme_tiers.py +402 -0
- package/pennyfarthing-dist/scripts/theme/list-themes.sh +30 -0
- package/pennyfarthing-dist/scripts/theme/update-theme-tiers.sh +97 -0
- package/pennyfarthing-dist/scripts/validation/validate-agent-schema.sh +576 -0
- package/pennyfarthing-dist/scripts/workflow/README.md +28 -0
- package/pennyfarthing-dist/scripts/workflow/check.py +502 -0
- package/pennyfarthing-dist/scripts/workflow/check.sh +24 -0
- package/pennyfarthing-dist/scripts/workflow/complete-step.py +304 -0
- package/pennyfarthing-dist/scripts/workflow/finish-story.sh +154 -0
- package/pennyfarthing-dist/scripts/workflow/fix-session-phase.sh +222 -0
- package/pennyfarthing-dist/scripts/workflow/get-workflow-type.py +61 -0
- package/pennyfarthing-dist/scripts/workflow/get-workflow-type.sh +13 -0
- package/pennyfarthing-dist/scripts/workflow/list-workflows.sh +124 -0
- package/pennyfarthing-dist/scripts/workflow/phase-owner.sh +34 -0
- package/pennyfarthing-dist/scripts/workflow/resume-workflow.sh +157 -0
- package/pennyfarthing-dist/scripts/workflow/show-workflow.sh +132 -0
- package/pennyfarthing-dist/scripts/workflow/start-workflow.sh +250 -0
- package/pennyfarthing-dist/scripts/workflow/workflow-status.sh +161 -0
- package/pennyfarthing-dist/skills/agentic-patterns/SKILL.md +246 -0
- package/pennyfarthing-dist/skills/changelog/SKILL.md +385 -0
- package/pennyfarthing-dist/skills/code-review/SKILL.md +172 -0
- package/pennyfarthing-dist/skills/context-engineering/SKILL.md +277 -0
- package/pennyfarthing-dist/skills/cyclist/SKILL.md +88 -0
- package/pennyfarthing-dist/skills/dev-patterns/SKILL.md +461 -0
- package/pennyfarthing-dist/skills/finalize-run/SKILL.md +261 -0
- package/pennyfarthing-dist/skills/jira/SKILL.md +508 -0
- package/pennyfarthing-dist/skills/judge/SKILL.md +644 -0
- package/pennyfarthing-dist/skills/just/SKILL.md +414 -0
- package/pennyfarthing-dist/skills/mermaid/SKILL.md +256 -0
- package/pennyfarthing-dist/skills/otel/skill.md +227 -0
- package/pennyfarthing-dist/skills/permissions/skill.md +157 -0
- package/pennyfarthing-dist/skills/persona-benchmark/SKILL.md +187 -0
- package/pennyfarthing-dist/skills/skill-registry.schema.json +107 -0
- package/pennyfarthing-dist/skills/skill-registry.yaml +393 -0
- package/pennyfarthing-dist/skills/sprint/scripts/archive-story.sh +101 -0
- package/pennyfarthing-dist/skills/sprint/scripts/available-stories.sh +97 -0
- package/pennyfarthing-dist/skills/sprint/scripts/check-story.sh +164 -0
- package/pennyfarthing-dist/skills/sprint/scripts/create-jira-epic.sh +101 -0
- package/pennyfarthing-dist/skills/sprint/scripts/new-sprint.sh +116 -0
- package/pennyfarthing-dist/skills/sprint/scripts/promote-epic.sh +164 -0
- package/pennyfarthing-dist/skills/sprint/scripts/sprint-info.sh +39 -0
- package/pennyfarthing-dist/skills/sprint/scripts/sprint-status.sh +147 -0
- package/pennyfarthing-dist/skills/sprint/scripts/sync-epic-jira.sh +93 -0
- package/pennyfarthing-dist/skills/sprint/skill.md +465 -0
- package/pennyfarthing-dist/skills/story/scripts/create-story.sh +159 -0
- package/pennyfarthing-dist/skills/story/scripts/size-story.sh +198 -0
- package/pennyfarthing-dist/skills/story/scripts/story-template.sh +162 -0
- package/pennyfarthing-dist/skills/story/skill.md +219 -0
- package/pennyfarthing-dist/skills/systematic-debugging/SKILL.md +446 -0
- package/pennyfarthing-dist/skills/testing/SKILL.md +121 -0
- package/pennyfarthing-dist/skills/testing/references/troubleshooting.md +124 -0
- package/pennyfarthing-dist/skills/theme/skill.md +141 -0
- package/pennyfarthing-dist/skills/theme-creation/SKILL.md +178 -0
- package/pennyfarthing-dist/skills/workflow/scripts/list-workflows.sh +91 -0
- package/pennyfarthing-dist/skills/workflow/scripts/resume-workflow.sh +163 -0
- package/pennyfarthing-dist/skills/workflow/scripts/show-workflow.sh +138 -0
- package/pennyfarthing-dist/skills/workflow/scripts/start-workflow.sh +273 -0
- package/pennyfarthing-dist/skills/workflow/scripts/workflow-status.sh +167 -0
- package/pennyfarthing-dist/skills/workflow/skill.md +345 -0
- package/pennyfarthing-dist/skills/yq/SKILL.md +272 -0
- package/pennyfarthing-dist/templates/LEADERBOARD.schema.yaml +187 -0
- package/pennyfarthing-dist/templates/LEADERBOARD.template.md +59 -0
- package/pennyfarthing-dist/templates/agent-scopes.yaml.template +276 -0
- package/pennyfarthing-dist/templates/pennyfarthing-settings.yaml.template +61 -0
- package/pennyfarthing-dist/templates/persona-config.yaml.template +22 -0
- package/pennyfarthing-dist/templates/preferences.yaml.template +15 -0
- package/pennyfarthing-dist/templates/settings.local.json.template +130 -0
- package/pennyfarthing-dist/templates/setup-env.sh.template +18 -0
- package/pennyfarthing-dist/templates/shared-context.md.template +70 -0
- package/pennyfarthing-dist/templates/sidecar/decisions.md.template +40 -0
- package/pennyfarthing-dist/templates/sidecar/gotchas.md.template +37 -0
- package/pennyfarthing-dist/templates/sidecar/patterns.md.template +34 -0
- package/pennyfarthing-dist/workflows/agent-docs.yaml +70 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-01-initialize.md +113 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-01b-continue.md +105 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-02-context.md +127 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-03-patterns.md +145 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-04-components.md +150 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-05-interfaces.md +145 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-06-risks.md +154 -0
- package/pennyfarthing-dist/workflows/architecture/steps/step-07-document.md +172 -0
- package/pennyfarthing-dist/workflows/architecture/templates/architecture-decision.md +102 -0
- package/pennyfarthing-dist/workflows/architecture.yaml +65 -0
- package/pennyfarthing-dist/workflows/bdd.yaml +60 -0
- package/pennyfarthing-dist/workflows/brainstorming/brain-methods.csv +62 -0
- package/pennyfarthing-dist/workflows/brainstorming/checklist.md +44 -0
- package/pennyfarthing-dist/workflows/brainstorming/instructions.md +736 -0
- package/pennyfarthing-dist/workflows/brainstorming/workflow.yaml +49 -0
- package/pennyfarthing-dist/workflows/code-review/checklist.md +23 -0
- package/pennyfarthing-dist/workflows/code-review/instructions.md +234 -0
- package/pennyfarthing-dist/workflows/code-review/workflow.yaml +51 -0
- package/pennyfarthing-dist/workflows/dev-story/checklist.md +80 -0
- package/pennyfarthing-dist/workflows/dev-story/instructions.xml +410 -0
- package/pennyfarthing-dist/workflows/dev-story/workflow.yaml +50 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/steps/step-01-validate-prerequisites.md +281 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/steps/step-02-design-epics.md +256 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/steps/step-03-create-stories.md +298 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/steps/step-04-final-validation.md +177 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/steps/step-05-import-to-future.md +145 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/templates/epics-template.md +57 -0
- package/pennyfarthing-dist/workflows/epics-and-stories/workflow.yaml +28 -0
- package/pennyfarthing-dist/workflows/git-cleanup/steps/step-01-analyze.md +103 -0
- package/pennyfarthing-dist/workflows/git-cleanup/steps/step-02-categorize.md +147 -0
- package/pennyfarthing-dist/workflows/git-cleanup/steps/step-03-execute.md +215 -0
- package/pennyfarthing-dist/workflows/git-cleanup/steps/step-04-verify.md +97 -0
- package/pennyfarthing-dist/workflows/git-cleanup/steps/step-05-complete.md +78 -0
- package/pennyfarthing-dist/workflows/git-cleanup.yaml +59 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-01-document-discovery.md +211 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-02-prd-analysis.md +199 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-03-epic-coverage-validation.md +202 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-04-ux-alignment.md +162 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-05-epic-quality-review.md +280 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/steps/step-06-final-assessment.md +158 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/templates/readiness-report-template.md +4 -0
- package/pennyfarthing-dist/workflows/implementation-readiness/workflow.yaml +40 -0
- package/pennyfarthing-dist/workflows/interactive-debug/steps/step-01-connect.md +257 -0
- package/pennyfarthing-dist/workflows/interactive-debug/steps/step-02-explore.md +107 -0
- package/pennyfarthing-dist/workflows/interactive-debug/steps/step-03-fix.md +127 -0
- package/pennyfarthing-dist/workflows/interactive-debug/steps/step-04-commit.md +122 -0
- package/pennyfarthing-dist/workflows/interactive-debug/workflow.yaml +51 -0
- package/pennyfarthing-dist/workflows/patch.yaml +67 -0
- package/pennyfarthing-dist/workflows/prd/data/domain-complexity.csv +13 -0
- package/pennyfarthing-dist/workflows/prd/data/prd-purpose.md +197 -0
- package/pennyfarthing-dist/workflows/prd/data/project-types.csv +11 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-01-init.md +197 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-01b-continue.md +159 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-02-discovery.md +230 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-03-success.md +232 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-04-journeys.md +219 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-05-domain.md +213 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-06-innovation.md +232 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-07-project-type.md +243 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-08-scoping.md +234 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-09-functional.md +237 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-10-nonfunctional.md +248 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-11-polish.md +223 -0
- package/pennyfarthing-dist/workflows/prd/steps-c/step-12-complete.md +186 -0
- package/pennyfarthing-dist/workflows/prd/steps-e/step-e-01-discovery.md +253 -0
- package/pennyfarthing-dist/workflows/prd/steps-e/step-e-01b-legacy-conversion.md +214 -0
- package/pennyfarthing-dist/workflows/prd/steps-e/step-e-02-review.md +255 -0
- package/pennyfarthing-dist/workflows/prd/steps-e/step-e-03-edit.md +259 -0
- package/pennyfarthing-dist/workflows/prd/steps-e/step-e-04-complete.md +174 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-01-discovery.md +224 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-02-format-detection.md +197 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-02b-parity-check.md +215 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-03-density-validation.md +180 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-04-brief-coverage-validation.md +220 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-05-measurability-validation.md +234 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-06-traceability-validation.md +223 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-07-implementation-leakage-validation.md +211 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-08-domain-compliance-validation.md +249 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-09-project-type-validation.md +269 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-10-smart-validation.md +215 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-11-holistic-quality-validation.md +270 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-12-completeness-validation.md +248 -0
- package/pennyfarthing-dist/workflows/prd/steps-v/step-v-13-report-complete.md +238 -0
- package/pennyfarthing-dist/workflows/prd/templates/prd-template.md +10 -0
- package/pennyfarthing-dist/workflows/prd/workflow.yaml +42 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-01-init.md +195 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-01b-continue.md +180 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-02-vision.md +221 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-03-users.md +224 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-04-metrics.md +228 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-05-scope.md +243 -0
- package/pennyfarthing-dist/workflows/product-brief/steps/step-06-complete.md +216 -0
- package/pennyfarthing-dist/workflows/product-brief/templates/product-brief.template.md +10 -0
- package/pennyfarthing-dist/workflows/product-brief/workflow.yaml +31 -0
- package/pennyfarthing-dist/workflows/project-context/project-context-template.md +21 -0
- package/pennyfarthing-dist/workflows/project-context/steps/step-01-discover.md +206 -0
- package/pennyfarthing-dist/workflows/project-context/steps/step-02-generate.md +349 -0
- package/pennyfarthing-dist/workflows/project-context/steps/step-03-complete.md +306 -0
- package/pennyfarthing-dist/workflows/project-context/workflow.yaml +27 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-01-discover.md +157 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-02-clone-repos.md +217 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-03-repos-yaml.md +159 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-04-claude-md.md +186 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-05-shared-context.md +185 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-06-task-runner.md +279 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-07-theme.md +200 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-08-theme-packs.md +142 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-09-cyclist.md +245 -0
- package/pennyfarthing-dist/workflows/project-setup/steps/step-10-complete.md +204 -0
- package/pennyfarthing-dist/workflows/project-setup/workflow.yaml +41 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-01-mode-detection.md +177 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-02-context-gathering.md +143 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-03-execute.md +138 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-04-self-check.md +135 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-05-adversarial-review.md +129 -0
- package/pennyfarthing-dist/workflows/quick-dev/steps/step-06-resolve-findings.md +163 -0
- package/pennyfarthing-dist/workflows/quick-dev/workflow.yaml +27 -0
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-01-understand.md +201 -0
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-02-investigate.md +156 -0
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-03-generate.md +140 -0
- package/pennyfarthing-dist/workflows/quick-spec/steps/step-04-review.md +203 -0
- package/pennyfarthing-dist/workflows/quick-spec/tech-spec-template.md +74 -0
- package/pennyfarthing-dist/workflows/quick-spec/workflow.yaml +27 -0
- package/pennyfarthing-dist/workflows/release/steps/step-01-preflight.md +105 -0
- package/pennyfarthing-dist/workflows/release/steps/step-02-bump.md +95 -0
- package/pennyfarthing-dist/workflows/release/steps/step-03-changelog.md +125 -0
- package/pennyfarthing-dist/workflows/release/steps/step-04-readme.md +101 -0
- package/pennyfarthing-dist/workflows/release/steps/step-05-claude-md.md +102 -0
- package/pennyfarthing-dist/workflows/release/steps/step-06-retro.md +59 -0
- package/pennyfarthing-dist/workflows/release/steps/step-07-commit.md +109 -0
- package/pennyfarthing-dist/workflows/release/steps/step-08-merge.md +65 -0
- package/pennyfarthing-dist/workflows/release/steps/step-09-push.md +75 -0
- package/pennyfarthing-dist/workflows/release/steps/step-10-publish.md +93 -0
- package/pennyfarthing-dist/workflows/release/steps/step-11-finalize.md +71 -0
- package/pennyfarthing-dist/workflows/release.yaml +62 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-01-init.md +159 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-02-domain-analysis.md +253 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-03-competitive-landscape.md +263 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-04-regulatory-focus.md +232 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-05-technical-trends.md +260 -0
- package/pennyfarthing-dist/workflows/research/steps-domain/step-06-research-synthesis.md +477 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-01-init.md +205 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-02-customer-behavior.md +262 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-02-customer-insights.md +227 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-03-customer-pain-points.md +275 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-04-customer-decisions.md +286 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-05-competitive-analysis.md +203 -0
- package/pennyfarthing-dist/workflows/research/steps-market/step-06-research-completion.md +510 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-01-init.md +159 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-02-technical-overview.md +264 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-03-integration-patterns.md +274 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-04-architectural-patterns.md +228 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-05-implementation-research.md +267 -0
- package/pennyfarthing-dist/workflows/research/steps-technical/step-06-research-synthesis.md +522 -0
- package/pennyfarthing-dist/workflows/research/templates/research.template.md +29 -0
- package/pennyfarthing-dist/workflows/research/workflow.yaml +45 -0
- package/pennyfarthing-dist/workflows/retrospective/checklist.md +31 -0
- package/pennyfarthing-dist/workflows/retrospective/instructions.md +1443 -0
- package/pennyfarthing-dist/workflows/retrospective/workflow.yaml +50 -0
- package/pennyfarthing-dist/workflows/sprint-planning/checklist.md +33 -0
- package/pennyfarthing-dist/workflows/sprint-planning/sprint-status-template.yaml +55 -0
- package/pennyfarthing-dist/workflows/sprint-planning/steps/step-01-parse-epic-files.md +69 -0
- package/pennyfarthing-dist/workflows/sprint-planning/steps/step-02-build-sprint-status.md +61 -0
- package/pennyfarthing-dist/workflows/sprint-planning/steps/step-03-status-detection.md +80 -0
- package/pennyfarthing-dist/workflows/sprint-planning/steps/step-04-generate-status-file.md +90 -0
- package/pennyfarthing-dist/workflows/sprint-planning/steps/step-05-validate-and-report.md +78 -0
- package/pennyfarthing-dist/workflows/sprint-planning/workflow.yaml +34 -0
- package/pennyfarthing-dist/workflows/tdd.yaml +50 -0
- package/pennyfarthing-dist/workflows/trivial.yaml +40 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-01-init.md +141 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-01b-continue.md +133 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-02-discovery.md +196 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-03-core-experience.md +222 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-04-emotional-response.md +225 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-05-inspiration.md +240 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-06-design-system.md +258 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-07-defining-experience.md +260 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-08-visual-foundation.md +230 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-09-design-directions.md +230 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-10-user-journeys.md +247 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-11-component-strategy.md +254 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-12-ux-patterns.md +243 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-13-responsive-accessibility.md +270 -0
- package/pennyfarthing-dist/workflows/ux-design/steps/step-14-complete.md +234 -0
- package/pennyfarthing-dist/workflows/ux-design/ux-design-template.md +13 -0
- package/pennyfarthing-dist/workflows/ux-design/workflow.yaml +41 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Ground-truth judge for SWE-bench scenarios.
|
|
4
|
+
|
|
5
|
+
Compares Claude's proposed fix against the actual SWE-bench patch.
|
|
6
|
+
Scores based on:
|
|
7
|
+
- File identification (20%)
|
|
8
|
+
- Function/location identification (20%)
|
|
9
|
+
- Fix logic match (40%)
|
|
10
|
+
- Completeness (20%)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from difflib import SequenceMatcher
|
|
18
|
+
|
|
19
|
+
# Add parent to path for pennyfarthing_scripts imports
|
|
20
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
21
|
+
|
|
22
|
+
from pennyfarthing_scripts.swebench import (
|
|
23
|
+
extract_patch_info,
|
|
24
|
+
extract_problem_keywords,
|
|
25
|
+
find_scenario,
|
|
26
|
+
get_meaningful_patterns,
|
|
27
|
+
load_swebench_data,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def score_response(response_text, ground_truth):
    """Score a response against the ground-truth patch.

    The score has four components: file identification (20), location
    identification (20), fix logic match (40) and completeness (20).

    Args:
        response_text: The answer being judged, as plain text.
        ground_truth: Scenario mapping. ``'patch'`` is required;
            ``'problem_statement'`` is optional.

    Returns:
        A dict with the four component scores, a ``'details'`` dict holding
        the raw counts behind them, and ``'total'`` rounded to one decimal.

    Raises:
        KeyError: If ``ground_truth`` has no ``'patch'`` entry.
    """
    # The parsed patch is read through .files, .functions, .key_patterns
    # and .additions below.
    patch_info = extract_patch_info(ground_truth['patch'])

    scores = {
        'file_identification': 0,
        'location_identification': 0,
        'fix_logic_match': 0,
        'completeness': 0,
        'details': {}
    }

    # All matching below is case-insensitive substring matching.
    response_lower = response_text.lower()

    # 1. FILE IDENTIFICATION (20 points)
    # A file counts when either its full path or its base name is mentioned.
    files_found = 0
    for f in patch_info.files:
        filename = Path(f).name
        if filename.lower() in response_lower or f.lower() in response_lower:
            files_found += 1

    if patch_info.files:
        file_score = (files_found / len(patch_info.files)) * 20
        scores['file_identification'] = min(20, file_score)
        scores['details']['files_expected'] = patch_info.files
        scores['details']['files_found'] = files_found
    else:
        scores['file_identification'] = 20  # No specific file in patch

    # 2. LOCATION IDENTIFICATION (20 points)
    # Look for function/class names mentioned in the patch.
    locations_found = 0
    for func in patch_info.functions:
        func_match = re.search(r'(def|class)\s+(\w+)', func)
        if func_match:
            func_name = func_match.group(2)
            if func_name.lower() in response_lower:
                locations_found += 1
        else:
            # No def/class keyword: fall back to the first token of the
            # entry. It must be lowercased too, because the response is.
            tokens = func.split()
            if tokens and tokens[0].lower() in response_lower:
                locations_found += 1

    if patch_info.functions:
        loc_score = (locations_found / len(patch_info.functions)) * 20
        scores['location_identification'] = min(20, loc_score)
        scores['details']['locations_expected'] = patch_info.functions[:3]
        scores['details']['locations_found'] = locations_found
    else:
        scores['location_identification'] = 10  # Partial credit

    # 3. FIX LOGIC MATCH (40 points = 20 patterns + 20 additions)
    meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)

    patterns_found = sum(
        1 for pattern in meaningful_patterns
        if pattern.lower() in response_lower
    )

    if meaningful_patterns:
        pattern_score = (patterns_found / len(meaningful_patterns)) * 20
        scores['details']['patterns_expected'] = meaningful_patterns[:10]
        scores['details']['patterns_found'] = patterns_found
    else:
        pattern_score = 10

    # Whitespace-normalise the response once; it does not change per addition.
    response_normalized = re.sub(r'\s+', ' ', response_lower)

    additions_matched = 0
    for addition in patch_info.additions[:5]:  # Check first 5 additions
        addition_normalized = re.sub(r'\s+', ' ', addition.lower())

        # Cheap exact containment first; the fuzzy ratio is only computed
        # when that fails.
        # NOTE(review): the ratio is taken against the WHOLE response, so it
        # can only exceed 0.6 when the response is about as short as the
        # addition itself -- confirm whether per-line matching was intended.
        if addition_normalized in response_normalized:
            additions_matched += 1
        elif SequenceMatcher(None, addition_normalized, response_normalized).ratio() > 0.6:
            additions_matched += 1

    if patch_info.additions:
        addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
        scores['details']['additions_matched'] = additions_matched
    else:
        addition_score = 10

    scores['fix_logic_match'] = min(40, pattern_score + addition_score)

    # 4. COMPLETENESS (20 points, four checks worth up to 5 each)
    completeness_score = 0

    # Has code block?
    if '```' in response_text:
        completeness_score += 5

    # Has test considerations?
    if 'test' in response_lower:
        completeness_score += 5

    # Mentions the specific error/issue?
    problem_keywords = extract_problem_keywords(ground_truth.get('problem_statement', ''))
    keywords_found = sum(1 for kw in problem_keywords if kw.lower() in response_lower)
    if problem_keywords:
        completeness_score += min(5, (keywords_found / len(problem_keywords)) * 5)
    else:
        completeness_score += 2.5

    # Has explanation of why fix works?
    explanation_words = ['because', 'this fixes', 'this resolves', 'the issue', 'the problem', 'solution']
    if any(word in response_lower for word in explanation_words):
        completeness_score += 5

    scores['completeness'] = min(20, completeness_score)

    # Total
    scores['total'] = round(
        scores['file_identification'] +
        scores['location_identification'] +
        scores['fix_logic_match'] +
        scores['completeness']
    , 1)

    return scores
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def main():
    """Command-line entry point: judge one response file for one scenario.

    Expects ``<scenario_name> <response_file>`` in ``sys.argv``. The response
    file is JSON whose ``'result'`` field holds the text to score. Scores are
    printed and also written as JSON next to the response file.

    Returns:
        The scores dict produced by ``score_response``.

    Exits with status 1 on bad usage, an unknown scenario, or an empty
    ``'result'`` field.
    """
    if len(sys.argv) < 3:
        print("Usage: ground-truth-judge.py <scenario_name> <response_file>")
        print("Example: ground-truth-judge.py flask-5014 run_20260102T134237Z.json")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load SWE-bench data
    swebench_data = load_swebench_data()

    # Find scenario
    scenario = find_scenario(swebench_data, scenario_name)
    if not scenario:
        print(f"Error: Scenario '{scenario_name}' not found in SWE-bench data")
        sys.exit(1)

    # Load response
    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    response_text = response_data.get('result', '')
    if not response_text:
        print("Error: No 'result' field in response file")
        sys.exit(1)

    # Score
    scores = score_response(response_text, scenario)

    # Output
    print(f"\n{'='*60}")
    print(f"GROUND TRUTH EVALUATION: {scenario_name}")
    print(f"{'='*60}")
    print(f"\nScores:")
    print(f" File Identification: {scores['file_identification']:5.1f}/20")
    print(f" Location Identification: {scores['location_identification']:5.1f}/20")
    print(f" Fix Logic Match: {scores['fix_logic_match']:5.1f}/40")
    print(f" Completeness: {scores['completeness']:5.1f}/20")
    print(f" {'─'*40}")
    print(f" TOTAL: {scores['total']:5.1f}/100")

    print(f"\nDetails:")
    for key, value in scores['details'].items():
        print(f" {key}: {value}")

    # Output JSON for programmatic use
    output = {
        'scenario': scenario_name,
        'instance_id': scenario.get('instance_id'),
        'scores': scores,
        'ground_truth_patch_preview': scenario.get('patch', '')[:300]
    }

    # Save judge output beside the response file. Only the file NAME is
    # rewritten, so a 'run_' inside a directory name is left alone. If the
    # name has no 'run_' at all, a prefix is added instead; otherwise the
    # output path would equal the input path and overwrite the response.
    response_path = Path(response_file)
    judged_name = response_path.name.replace('run_', 'gt_judge_')
    if judged_name == response_path.name:
        judged_name = f'gt_judge_{response_path.name}'
    output_path = str(response_path.with_name(judged_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    return scores


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SWE-bench scenario judge using:
|
|
4
|
+
1. Scenario-specific scoring rubric from YAML
|
|
5
|
+
2. Ground-truth validation from actual SWE-bench patches
|
|
6
|
+
|
|
7
|
+
Scoring structure:
|
|
8
|
+
- root_cause (30%): IDENTIFIES_BUG_LOCATION (15) + EXPLAINS_WHY_BROKEN (15)
|
|
9
|
+
- fix_quality (40%): FIX_ADDRESSES_ISSUE (20) + FIX_IS_MINIMAL (10) + FIX_SYNTAX_CORRECT (10)
|
|
10
|
+
- completeness (20%): EDGE_CASES (10) + TEST_COVERAGE (10)
|
|
11
|
+
- persona (10%): IN_CHARACTER (10)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from difflib import SequenceMatcher
|
|
19
|
+
|
|
20
|
+
# Add parent to path for pennyfarthing_scripts imports
|
|
21
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
22
|
+
|
|
23
|
+
from pennyfarthing_scripts.swebench import (
|
|
24
|
+
extract_patch_info,
|
|
25
|
+
find_scenario,
|
|
26
|
+
load_swebench_data,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def score_identifies_bug_location(response, ground_truth):
    """Score IDENTIFIES_BUG_LOCATION (15 pts) using ground truth.

    Half of the points come from mentioning the patched files and half from
    mentioning the patched functions/classes. Matching is case-insensitive
    substring search over the response.

    Args:
        response: The answer text being judged.
        ground_truth: Scenario mapping; ``'patch'`` is read if present.

    Returns:
        ``(score, details)`` where ``score`` is capped at 15 and ``details``
        is a list of human-readable count strings.
    """
    parsed = extract_patch_info(ground_truth.get('patch', ''))
    haystack = response.lower()

    score = 0
    details = []

    # Files (7.5 pts): a file counts if its base name or full path appears.
    expected_files = parsed.files
    files_found = sum(
        1 for path in expected_files
        if Path(path).name.lower() in haystack or path.lower() in haystack
    )
    if expected_files:
        score += (files_found / len(expected_files)) * 7.5
        details.append(f"Files: {files_found}/{len(expected_files)} found")

    # Functions/classes (7.5 pts): only entries carrying a def/class keyword
    # can be matched by name.
    expected_funcs = parsed.functions
    funcs_found = 0
    for entry in expected_funcs:
        found = re.search(r'(def|class)\s+(\w+)', entry)
        if found and found.group(2).lower() in haystack:
            funcs_found += 1

    if not expected_funcs:
        score += 3.75  # Partial credit if no specific function in patch
    else:
        score += min(7.5, (funcs_found / len(expected_funcs)) * 7.5)
        details.append(f"Functions: {funcs_found}/{len(expected_funcs)} found")

    return min(15, score), details
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def score_explains_why_broken(response, ground_truth):
    """Score EXPLAINS_WHY_BROKEN (15 pts).

    7.5 points are given when the response contains any explanation marker
    phrase. Up to 7.5 more are given for the share of key terms from the
    problem statement that the response mentions.

    Args:
        response: The answer text being judged.
        ground_truth: Scenario mapping; ``'problem_statement'`` is read if
            present (a missing or ``None`` value is treated as empty).

    Returns:
        ``(score, details)`` where ``score`` is capped at 15 and ``details``
        is a list of human-readable strings.
    """
    response_lower = response.lower()
    problem = (ground_truth.get('problem_statement') or '').lower()

    score = 0
    details = []

    # Key terms: anything quoted/backticked, plus *Error / *Exception names.
    key_terms = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem)
    key_terms += re.findall(r'\b\w+Error\b|\b\w+Exception\b', problem, re.IGNORECASE)
    # De-duplicate in first-seen order. A set would make the 10 surviving
    # terms depend on string-hash randomisation, so scores could differ
    # between runs.
    key_terms = list(dict.fromkeys(key_terms))[:10]

    # Check for explanation of the issue
    explanation_markers = ['because', 'this happens', 'the issue', 'the problem', 'fails when', 'breaks when', 'causes']
    if any(marker in response_lower for marker in explanation_markers):
        score += 7.5
        details.append("Has explanation of why broken")

    # Check for key terms from problem
    terms_found = sum(1 for term in key_terms if term.lower() in response_lower)
    if key_terms:
        score += (terms_found / len(key_terms)) * 7.5
        details.append(f"Key terms: {terms_found}/{len(key_terms)}")
    else:
        score += 3.75  # Partial credit when the problem names no key terms

    return min(15, score), details
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def score_fix_addresses_issue(response, ground_truth):
    """Score FIX_ADDRESSES_ISSUE (20 pts) using ground truth patch.

    Up to 15 points come from how many of the first five added lines of the
    patch appear in the response. An exact (whitespace-insensitive) match
    counts 1 and a fuzzy match counts 0.5. A further 5 points are given when
    the response contains a fenced code block.

    Args:
        response: The answer text being judged.
        ground_truth: Scenario mapping; ``'patch'`` is read if present.

    Returns:
        ``(score, details)`` where ``score`` is capped at 20 and ``details``
        is a list of human-readable strings.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))

    score = 0
    details = []

    # Normalise the response once, not once per addition.
    response_norm = re.sub(r'\s+', ' ', response.lower())

    # Check if key additions from patch appear in response
    additions_matched = 0
    for addition in patch_info.additions[:5]:
        addition_norm = re.sub(r'\s+', ' ', addition.lower())

        if addition_norm in response_norm:
            additions_matched += 1
        # NOTE(review): the ratio is taken against the whole response, so the
        # fuzzy branch only fires when the response is about as short as the
        # addition -- confirm whether per-line matching was intended.
        elif SequenceMatcher(None, addition_norm, response_norm).ratio() > 0.7:
            additions_matched += 0.5

    if patch_info.additions:
        checked = min(5, len(patch_info.additions))
        score += (additions_matched / checked) * 15
        details.append(f"Code matches: {additions_matched}/{checked}")

    # Check for code block with fix
    if '```' in response:
        score += 5
        details.append("Has code block")

    return min(20, score), details
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def score_fix_is_minimal(response, ground_truth):
    """Score FIX_IS_MINIMAL (10 pts).

    Compares the number of lines inside the response's fenced code blocks
    with the number of added plus deleted lines in the ground-truth patch.

    Args:
        response: The answer text being judged.
        ground_truth: Scenario mapping; ``'patch'`` is read if present.

    Returns:
        ``(score, details)``: 10 when the response code is at most 2x the
        patch size, 5 at most 4x, 2 beyond that. A flat 5 with no details is
        returned when the patch has no added or deleted lines.
    """
    parsed = extract_patch_info(ground_truth.get('patch', ''))
    patch_lines = len(parsed.additions) + len(parsed.deletions)

    # Total line count across every fenced block in the response.
    fenced = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
    response_code_lines = 0
    for snippet in fenced:
        response_code_lines += len(snippet.strip().split('\n'))

    details = []
    if patch_lines <= 0:
        return min(10, 5), details

    # A response without any code lines is treated as ratio 1.
    if response_code_lines > 0:
        ratio = response_code_lines / patch_lines
    else:
        ratio = 1

    if ratio <= 2:
        score, label = 10, "Minimal"
    elif ratio <= 4:
        score, label = 5, "Somewhat verbose"
    else:
        score, label = 2, "Over-engineered"
    details.append(f"{label}: {response_code_lines} lines (patch: {patch_lines})")

    return min(10, score), details
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def score_fix_syntax_correct(response):
    """Score FIX_SYNTAX_CORRECT (10 pts).

    Extracts fenced ``python`` code blocks (or, if there are none, unlabeled
    fenced blocks) and checks that each one compiles as Python.

    Args:
        response: The answer text being judged.

    Returns:
        ``(score, details)``: 10 when every block compiles, 5 when any block
        fails to compile, and 5 when there are no blocks to check.
    """
    import textwrap  # local import: only this function needs it

    score = 0
    details = []

    # Extract code blocks
    code_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
    if not code_blocks:
        code_blocks = re.findall(r'```\n(.*?)```', response, re.DOTALL)

    if code_blocks:
        valid = True
        for block in code_blocks:
            try:
                # Strip common leading indentation first, so a uniformly
                # indented snippet does not fail with IndentationError.
                compile(textwrap.dedent(block), '<string>', 'exec')
            except (SyntaxError, ValueError):
                # ValueError: compile() rejects source containing null bytes
                # on some Python versions.
                valid = False
                break

        if valid:
            score = 10
            details.append("Syntax valid")
        else:
            score = 5
            details.append("Syntax errors detected")
    else:
        score = 5
        details.append("No code blocks to validate")

    return min(10, score), details
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def score_edge_cases(response):
    """Score EDGE_CASES (10 pts).

    Awards 2 points for each distinct edge-case marker phrase found in the
    response (case-insensitive substring match), capped at 10.

    Args:
        response: The answer text being judged.

    Returns:
        ``(score, details)`` where ``details`` holds one string reporting the
        number of markers found.
    """
    text = response.lower()
    edge_markers = ['edge case', 'corner case', 'what if', 'consider', 'also', 'none', 'empty', 'null', 'zero', 'negative', 'boundary']

    # Each marker counts at most once, however often it occurs.
    found = len([marker for marker in edge_markers if marker in text])

    return min(10, found * 2), [f"Edge case markers: {found}"]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def score_test_coverage(response):
|
|
220
|
+
"""Score TEST_COVERAGE (10 pts)."""
|
|
221
|
+
response_lower = response.lower()
|
|
222
|
+
|
|
223
|
+
score = 0
|
|
224
|
+
details = []
|
|
225
|
+
|
|
226
|
+
# Check for test-related content
|
|
227
|
+
has_test_section = 'test' in response_lower
|
|
228
|
+
has_test_function = 'def test_' in response_lower or 'test_' in response
|
|
229
|
+
has_assert = 'assert' in response_lower or 'pytest' in response_lower
|
|
230
|
+
|
|
231
|
+
if has_test_function:
|
|
232
|
+
score += 5
|
|
233
|
+
details.append("Has test function")
|
|
234
|
+
if has_assert:
|
|
235
|
+
score += 3
|
|
236
|
+
details.append("Has assertions")
|
|
237
|
+
if has_test_section:
|
|
238
|
+
score += 2
|
|
239
|
+
details.append("Has test section")
|
|
240
|
+
|
|
241
|
+
return min(10, score), details
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def score_in_character(response, persona="senior developer"):
    """Score IN_CHARACTER (10 pts).

    Awards 2 points for each distinct professional-tone marker phrase found
    in the response (case-insensitive substring match), capped at 10.

    Args:
        response: The answer text being judged.
        persona: Accepted for interface compatibility; not read by the
            scoring below.

    Returns:
        ``(score, details)`` where ``details`` holds one string reporting the
        number of markers found.
    """
    text = response.lower()

    # For control baseline, check professional tone
    professional_markers = ['i recommend', 'we should', 'this approach', 'the fix', 'analysis', 'root cause']
    found = len([marker for marker in professional_markers if marker in text])

    return min(10, found * 2), [f"Professional markers: {found}"]
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def judge_response(scenario_name, response_text, swebench_data):
    """Full judgment using scenario rubric + ground truth.

    Runs every rubric criterion against the response and groups the results
    into the four categories root_cause, fix_quality, completeness, persona.

    Args:
        scenario_name: Name used to look the scenario up in ``swebench_data``.
        response_text: The answer text being judged.
        swebench_data: Dataset passed through to ``find_scenario``.

    Returns:
        On success, a dict with ``scenario``, ``instance_id``, ``scores``
        (per-category criterion scores plus ``subtotal``), ``total`` (rounded
        to one decimal), ``details`` and ``ground_truth_files``. If the
        scenario is not found, a dict with a single ``'error'`` message.
    """
    ground_truth = find_scenario(swebench_data, scenario_name)
    if not ground_truth:
        return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}

    # category -> ordered (criterion name, zero-argument scorer) pairs.
    rubric = (
        ('root_cause', (  # 30%
            ('IDENTIFIES_BUG_LOCATION', lambda: score_identifies_bug_location(response_text, ground_truth)),
            ('EXPLAINS_WHY_BROKEN', lambda: score_explains_why_broken(response_text, ground_truth)),
        )),
        ('fix_quality', (  # 40%
            ('FIX_ADDRESSES_ISSUE', lambda: score_fix_addresses_issue(response_text, ground_truth)),
            ('FIX_IS_MINIMAL', lambda: score_fix_is_minimal(response_text, ground_truth)),
            ('FIX_SYNTAX_CORRECT', lambda: score_fix_syntax_correct(response_text)),
        )),
        ('completeness', (  # 20%
            ('EDGE_CASES', lambda: score_edge_cases(response_text)),
            ('TEST_COVERAGE', lambda: score_test_coverage(response_text)),
        )),
        ('persona', (  # 10%
            ('IN_CHARACTER', lambda: score_in_character(response_text)),
        )),
    )

    scores = {}
    all_details = {}
    for category, criteria in rubric:
        entry = {}
        notes = []
        for criterion, scorer in criteria:
            points, criterion_notes = scorer()
            entry[criterion] = points
            notes = notes + criterion_notes
        # 'subtotal' is added last so it follows the criteria in the dict.
        entry['subtotal'] = sum(entry.values())
        scores[category] = entry
        all_details[category] = notes

    total = sum(entry['subtotal'] for entry in scores.values())

    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    return {
        'scenario': scenario_name,
        'instance_id': ground_truth.get('instance_id'),
        'scores': scores,
        'total': round(total, 1),
        'details': all_details,
        'ground_truth_files': patch_info.files
    }
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def main():
    """Command-line entry point: judge one response file for one scenario.

    Expects ``<scenario_name> <response_file>`` in ``sys.argv``. The response
    file is JSON; the text to judge is read from its ``'result'`` field or,
    failing that, ``'response_text'``. The result is printed and also written
    as JSON next to the response file.

    Exits with status 1 on bad usage or when the scenario cannot be found.
    """
    if len(sys.argv) < 3:
        print("Usage: swebench-judge.py <scenario_name> <response_file>")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load data
    swebench_data = load_swebench_data()

    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    # Handle different JSON structures; fall back to '' so the scorers
    # always receive a string.
    response_text = response_data.get('result', '') or response_data.get('response_text', '') or ''

    # Judge
    result = judge_response(scenario_name, response_text, swebench_data)

    # judge_response reports an unknown scenario as {'error': ...}; stop
    # here rather than hitting a KeyError on result['scores'] below.
    if 'error' in result:
        print(f"Error: {result['error']}")
        sys.exit(1)

    # Display
    print(f"\n{'='*60}")
    print(f"SWE-BENCH JUDGE: {scenario_name}")
    print(f"{'='*60}")

    for category, scores in result['scores'].items():
        print(f"\n{category.upper()} ({scores['subtotal']:.1f} pts)")
        for criterion, score in scores.items():
            if criterion != 'subtotal':
                print(f" {criterion}: {score:.1f}")

    print(f"\n{'─'*40}")
    print(f"TOTAL: {result['total']}/100")

    print(f"\nGround truth files: {result['ground_truth_files']}")

    # Save beside the response file. Only the file NAME is rewritten, so a
    # 'run_' inside a directory name is left alone. If the name has no
    # 'run_' at all, a prefix is added instead; otherwise the output path
    # would equal the input path and overwrite the response.
    response_path = Path(response_file)
    judged_name = response_path.name.replace('run_', 'swebench_judge_')
    if judged_name == response_path.name:
        judged_name = f'swebench_judge_{response_path.name}'
    output_path = str(response_path.with_name(judged_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to: {output_path}")


if __name__ == '__main__':
    main()
|