workflow-ai 1.0.63 → 1.0.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -145
- package/configs/agent-health-rules.yaml +64 -0
- package/configs/config.yaml +134 -0
- package/configs/pipeline.yaml +901 -0
- package/configs/ticket-movement-rules.yaml +80 -0
- package/package.json +1 -1
- package/src/global-dir.mjs +25 -1
- package/src/init.mjs +20 -3
- package/src/lib/agent-health-registry.mjs +245 -0
- package/src/lib/artifact-snapshot.mjs +233 -0
- package/src/lib/error-classifier.mjs +274 -0
- package/src/lib/test-error-classifier.mjs +60 -0
- package/src/lib/test-extends.mjs +58 -0
- package/src/lib/test-version.mjs +21 -0
- package/src/scripts/move-to-review.js +5 -7
- package/src/scripts/reset-agent-health.js +62 -0
- package/src/scripts/run-skill-tests.js +348 -136
- package/src/skills/analyze-report/README.md +44 -0
- package/src/skills/analyze-report/SKILL.md +121 -0
- package/src/skills/analyze-report/algorithms/progress-assessment.md +108 -0
- package/src/skills/analyze-report/knowledge/analysis-frameworks.md +66 -0
- package/src/skills/analyze-report/knowledge/report-structure.md +61 -0
- package/src/skills/analyze-report/scripts/calc-plan-metrics.js +234 -0
- package/src/skills/analyze-report/templates/analysis-report.md +80 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-1.md +69 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-2.md +103 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-3.md +99 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/judge.json +163 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-1.md +89 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-2.md +88 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-3.md +100 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-1.md +77 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-2.md +64 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-3.md +110 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-1.md +74 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-2.md +38 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-3.md +61 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/meta.json +115 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001-evidence-from-log.yaml +60 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-1.md +90 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-2.md +89 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-3.md +77 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/judge.json +163 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-1.md +84 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-2.md +77 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-3.md +89 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-1.md +103 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-2.md +103 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-3.md +103 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-1.md +93 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-2.md +93 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-3.md +86 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/meta.json +115 -0
- package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002-result-block-format.yaml +44 -0
- package/src/skills/analyze-report/tests/fixtures/REPORT-002-incorrect-attribution.md +27 -0
- package/src/skills/analyze-report/tests/fixtures/pipeline-2026-04-06_qa-001-skip.log +32 -0
- package/src/skills/analyze-report/tests/index.yaml +25 -0
- package/src/skills/analyze-report/tests/rubrics/evidence-from-log.md +22 -0
- package/src/skills/analyze-report/tests/rubrics/result-block-format.md +22 -0
- package/src/skills/analyze-report/workflows/progress.md +158 -0
- package/src/skills/analyze-report/workflows/retrospective.md +143 -0
- package/src/skills/coach/README.md +43 -0
- package/src/skills/coach/SKILL.md +167 -0
- package/src/skills/coach/SKILL.md.legacy +157 -0
- package/src/skills/coach/algorithms/gap-analysis.md +69 -0
- package/src/skills/coach/algorithms/improvement-prioritization.md +62 -0
- package/src/skills/coach/algorithms/skill-scoring.md +80 -0
- package/src/skills/coach/knowledge/audit-applied-changes-clean.txt +11 -0
- package/src/skills/coach/knowledge/backlog-management.md +67 -0
- package/src/skills/coach/knowledge/backlog-management.md.legacy +90 -0
- package/src/skills/coach/knowledge/common-antipatterns.md +76 -0
- package/src/skills/coach/knowledge/prompt-engineering.md +45 -0
- package/src/skills/coach/knowledge/shared-knowledge-guide.md +44 -0
- package/src/skills/coach/knowledge/skill-anatomy.md +49 -0
- package/src/skills/coach/knowledge/test-authorship.md +141 -0
- package/src/skills/coach/templates/audit-report.md +39 -0
- package/src/skills/coach/templates/coach-backlog-init.yaml +14 -0
- package/src/skills/coach/templates/coach-backlog-init.yaml.legacy +10 -0
- package/src/skills/coach/templates/improvement-plan.md +42 -0
- package/src/skills/coach/templates/new-skill.md +95 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-1.md +58 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-2.md +65 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-3.md +58 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/judge.json +151 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-1.md +46 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-2.md +0 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-3.md +75 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-1.md +81 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-2.md +101 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-3.md +91 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-1.md +48 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-2.md +30 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-3.md +55 -0
- package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +94 -0
- package/src/skills/coach/tests/cases/TC-COACH-001-evidence-based-temporal-diagram.yaml +53 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-1.md +46 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-2.md +50 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-3.md +48 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/judge.json +151 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-1.md +0 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-2.md +37 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-3.md +30 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-1.md +23 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-2.md +29 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-3.md +35 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-1.md +13 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-2.md +19 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-3.md +33 -0
- package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +94 -0
- package/src/skills/coach/tests/cases/TC-COACH-002-root-cause-first.yaml +57 -0
- package/src/skills/coach/tests/fixtures/pipeline-2026-04-06_id-collision.log +77 -0
- package/src/skills/coach/tests/index.yaml +29 -0
- package/src/skills/coach/tests/rubrics/calibration/evidence-based-bad.md +13 -0
- package/src/skills/coach/tests/rubrics/calibration/evidence-based-good.md +29 -0
- package/src/skills/coach/tests/rubrics/evidence-based.md +26 -0
- package/src/skills/coach/tests/rubrics/root-cause-first.md +21 -0
- package/src/skills/coach/workflows/analyze.md +79 -0
- package/src/skills/coach/workflows/analyze.md.legacy +64 -0
- package/src/skills/coach/workflows/audit.md +74 -0
- package/src/skills/coach/workflows/audit.md.legacy +59 -0
- package/src/skills/coach/workflows/create.md +80 -0
- package/src/skills/coach/workflows/create.md.legacy +67 -0
- package/src/skills/coach/workflows/improve.md +71 -0
- package/src/skills/coach/workflows/improve.md.legacy +60 -0
- package/src/skills/coach/workflows/research.md +55 -0
- package/src/skills/coach/workflows/review.md +52 -0
- package/src/skills/coach/workflows/review.md.legacy +48 -0
- package/src/skills/coach/workflows/test.md +97 -0
- package/src/skills/create-plan/README.md +39 -0
- package/src/skills/create-plan/SKILL.md +104 -0
- package/src/skills/create-plan/algorithms/risk-assessment.md +73 -0
- package/src/skills/create-plan/knowledge/plan-completeness.md +67 -0
- package/src/skills/create-plan/knowledge/plan-lifecycle.md +33 -0
- package/src/skills/create-plan/knowledge/task-verification-pairs.md +151 -0
- package/src/skills/create-plan/scripts/validate-completeness.js +182 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-1.md +5 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-2.md +39 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-3.md +35 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/judge.json +167 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-1.md +5 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-2.md +10 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-3.md +5 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-1.md +26 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-2.md +86 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-3.md +5 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-1.md +11 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-2.md +15 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-3.md +14 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/meta.json +119 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001-validate-completeness.yaml +41 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-1.md +25 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-2.md +30 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-3.md +37 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/judge.json +164 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-1.md +3 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-2.md +11 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-3.md +13 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-1.md +44 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-2.md +5 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-3.md +49 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-1.md +6 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-2.md +11 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-3.md +16 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/meta.json +116 -0
- package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002-task-granularity.yaml +39 -0
- package/src/skills/create-plan/tests/index.yaml +25 -0
- package/src/skills/create-plan/tests/rubrics/task-granularity.md +21 -0
- package/src/skills/create-plan/tests/rubrics/validate-completeness.md +21 -0
- package/src/skills/create-plan/workflows/create.md +136 -0
- package/src/skills/create-report/README.md +40 -0
- package/src/skills/create-report/SKILL.md +73 -0
- package/src/skills/create-report/algorithms/metric-calculation.md +93 -0
- package/src/skills/create-report/knowledge/report-metrics.md +82 -0
- package/src/skills/create-report/scripts/calc-metrics.js +383 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-1.md +25 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-2.md +26 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-3.md +28 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/judge.json +163 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-1.md +4 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-2.md +3 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-3.md +6 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-1.md +8 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-2.md +12 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-3.md +7 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-1.md +12 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-2.md +22 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-3.md +13 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/meta.json +115 -0
- package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001-root-cause-attribution.yaml +57 -0
- package/src/skills/create-report/tests/index.yaml +20 -0
- package/src/skills/create-report/tests/rubrics/root-cause-attribution.md +21 -0
- package/src/skills/create-report/workflows/standard.md +175 -0
- package/src/skills/decompose-gaps/README.md +39 -0
- package/src/skills/decompose-gaps/SKILL.md +78 -0
- package/src/skills/decompose-gaps/algorithms/scope-check.md +110 -0
- package/src/skills/decompose-gaps/knowledge/scope-validation.md +65 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-1.md +41 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-2.md +41 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-3.md +56 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/judge.json +164 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-1.md +25 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-2.md +17 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-3.md +22 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-1.md +25 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-2.md +5 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-3.md +29 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-1.md +27 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-2.md +35 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-3.md +18 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/meta.json +116 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001-scope-exclusion.yaml +46 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-1.md +27 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-2.md +30 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-3.md +27 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/judge.json +163 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-1.md +0 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-2.md +15 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-3.md +7 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-1.md +21 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-2.md +38 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-3.md +16 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-1.md +5 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-2.md +10 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-3.md +9 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/meta.json +115 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002-glob-before-write.yaml +36 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-1.md +30 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-2.md +30 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-3.md +30 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/judge.json +165 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-1.md +5 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-2.md +26 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-3.md +5 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-1.md +39 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-2.md +37 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-3.md +45 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-1.md +26 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-2.md +27 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-3.md +7 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/meta.json +117 -0
- package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003-parent-plan-mandatory.yaml +41 -0
- package/src/skills/decompose-gaps/tests/index.yaml +30 -0
- package/src/skills/decompose-gaps/tests/rubrics/glob-before-write.md +21 -0
- package/src/skills/decompose-gaps/tests/rubrics/parent-plan-mandatory.md +22 -0
- package/src/skills/decompose-gaps/tests/rubrics/scope-exclusion.md +21 -0
- package/src/skills/decompose-gaps/workflows/decompose.md +123 -0
- package/src/skills/decompose-plan/README.md +43 -0
- package/src/skills/decompose-plan/SKILL.md +87 -0
- package/src/skills/decompose-plan/algorithms/deduplication.md +101 -0
- package/src/skills/decompose-plan/knowledge/atomicity-checklist.md +139 -0
- package/src/skills/decompose-plan/knowledge/capabilities.md +68 -0
- package/src/skills/decompose-plan/knowledge/human-task-rules.md +82 -0
- package/src/skills/decompose-plan/knowledge/scope-guard-checklist.md +73 -0
- package/src/skills/decompose-plan/scripts/check-atomicity-limit.js +47 -0
- package/src/skills/decompose-plan/scripts/check-duplicates.js +323 -0
- package/src/skills/decompose-plan/scripts/verify-atomicity.js +408 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-1.md +30 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-2.md +36 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-3.md +37 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-1.md +20 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-2.md +17 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-3.md +28 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-1.md +114 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-2.md +137 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-3.md +188 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-1.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-2.md +32 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-3.md +110 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/meta.json +115 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001-atomicity-no-1to1.yaml +56 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-1.md +47 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-2.md +54 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-3.md +43 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-1.md +15 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-2.md +5 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-3.md +12 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-1.md +34 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-2.md +30 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-3.md +35 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-1.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-2.md +31 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-3.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/meta.json +115 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002-get-next-id-mandatory.yaml +44 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-1.md +21 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-2.md +38 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-3.md +30 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-1.md +31 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-2.md +35 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-3.md +48 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-1.md +167 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-2.md +62 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-3.md +174 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-1.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-2.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-3.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/meta.json +115 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003-verbatim-dod-transfer.yaml +42 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-1.md +55 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-2.md +49 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-3.md +49 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-1.md +104 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-2.md +45 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-3.md +58 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-1.md +193 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-2.md +202 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-3.md +155 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-1.md +52 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-2.md +17 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-3.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/meta.json +115 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004-executor-atomicity.yaml +64 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-1.md +59 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-2.md +204 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-3.md +213 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-1.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-2.md +57 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-3.md +54 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-1.md +147 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-2.md +165 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-3.md +133 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-1.md +81 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-2.md +108 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-3.md +3 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/meta.json +114 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005-capabilities-registry.yaml +78 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-1.md +225 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-2.md +66 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-3.md +36 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/judge.json +163 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-1.md +42 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-2.md +67 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-3.md +40 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-1.md +122 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-2.md +131 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-3.md +138 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-1.md +41 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-2.md +88 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-3.md +0 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/meta.json +115 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006-dod-threshold.yaml +72 -0
- package/src/skills/decompose-plan/tests/index.yaml +45 -0
- package/src/skills/decompose-plan/tests/rubrics/atomicity-no-1to1.md +21 -0
- package/src/skills/decompose-plan/tests/rubrics/capabilities-registry.md +21 -0
- package/src/skills/decompose-plan/tests/rubrics/dod-threshold.md +21 -0
- package/src/skills/decompose-plan/tests/rubrics/executor-atomicity.md +21 -0
- package/src/skills/decompose-plan/tests/rubrics/get-next-id-mandatory.md +21 -0
- package/src/skills/decompose-plan/tests/rubrics/verbatim-dod-transfer.md +21 -0
- package/src/skills/decompose-plan/workflows/decompose.md +305 -0
- package/src/skills/deep-research/README.md +36 -0
- package/src/skills/deep-research/SKILL.md +106 -0
- package/src/skills/deep-research/algorithms/source-scoring.md +63 -0
- package/src/skills/deep-research/algorithms/synthesis.md +67 -0
- package/src/skills/deep-research/knowledge/data-validation.md +44 -0
- package/src/skills/deep-research/knowledge/perplexity-config.md +30 -0
- package/src/skills/deep-research/knowledge/research-methodology.md +54 -0
- package/src/skills/deep-research/knowledge/source-evaluation.md +33 -0
- package/src/skills/deep-research/scripts/perplexity-research.js +315 -0
- package/src/skills/deep-research/templates/brief-summary.md +25 -0
- package/src/skills/deep-research/templates/research-report.md +76 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-1.md +48 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-2.md +88 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-3.md +56 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/judge.json +163 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-1.md +58 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-2.md +249 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-3.md +44 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-1.md +96 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-2.md +56 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-3.md +94 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-1.md +11 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-2.md +1 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-3.md +1 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/meta.json +115 -0
- package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001-self-check-url.yaml +58 -0
- package/src/skills/deep-research/tests/index.yaml +20 -0
- package/src/skills/deep-research/tests/rubrics/self-check-url.md +34 -0
- package/src/skills/deep-research/workflows/base-checklist.md +19 -0
- package/src/skills/deep-research/workflows/benchmark.md +38 -0
- package/src/skills/deep-research/workflows/competitor.md +44 -0
- package/src/skills/deep-research/workflows/custom.md +32 -0
- package/src/skills/deep-research/workflows/market.md +44 -0
- package/src/skills/deep-research/workflows/technology.md +40 -0
- package/src/skills/deep-research/workflows/trend.md +40 -0
- package/src/skills/execute-task/README.md +44 -0
- package/src/skills/execute-task/SKILL.md +292 -0
- package/src/skills/execute-task/algorithms/execution-strategy.md +136 -0
- package/src/skills/execute-task/knowledge/context-checkpoints.md +75 -0
- package/src/skills/execute-task/knowledge/ticket-structure.md +70 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-1.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-2.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-3.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/judge.json +124 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-1.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-2.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-3.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-1.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-2.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-3.md +11 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +88 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001-no-ticket-creation.yaml +48 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-1.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-2.md +6 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-3.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/judge.json +124 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-1.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-2.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-3.md +8 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-1.md +9 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-2.md +26 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-3.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/meta.json +89 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002-no-duplicate-dod.yaml +44 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-1.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-2.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-3.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/judge.json +46 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/meta.json +37 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003-verification-proportionality.yaml +46 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-1.md +18 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-2.md +16 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-3.md +14 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/judge.json +124 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-1.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-2.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-3.md +1 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-1.md +8 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-2.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-3.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/meta.json +89 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004-no-foreign-ticket-edit.yaml +50 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-1.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-2.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-3.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/judge.json +124 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-1.md +15 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-2.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-3.md +5 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-1.md +11 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-2.md +11 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-3.md +4 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +88 -0
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005-ticket-fields-updated.yaml +39 -0
- package/src/skills/execute-task/tests/fixtures/IMPL-902-create-file.md +41 -0
- package/src/skills/execute-task/tests/fixtures/IMPL-904-current-task.md +40 -0
- package/src/skills/execute-task/tests/fixtures/IMPL-906-fill-ticket.md +42 -0
- package/src/skills/execute-task/tests/fixtures/QA-901-button-click.md +41 -0
- package/src/skills/execute-task/tests/fixtures/QA-903-visual-figma.md +40 -0
- package/src/skills/execute-task/tests/fixtures/TASK-905-done-with-typo.md +36 -0
- package/src/skills/execute-task/tests/index.yaml +39 -0
- package/src/skills/execute-task/tests/rubrics/no-duplicate-dod.md +22 -0
- package/src/skills/execute-task/tests/rubrics/no-foreign-ticket-edit.md +20 -0
- package/src/skills/execute-task/tests/rubrics/no-ticket-creation.md +21 -0
- package/src/skills/execute-task/tests/rubrics/ticket-fields-updated.md +23 -0
- package/src/skills/execute-task/tests/rubrics/verification-proportionality.md +22 -0
- package/src/skills/execute-task/workflows/execute.md +104 -0
- package/src/skills/manual-testing/README.md +63 -0
- package/src/skills/manual-testing/SKILL.md +176 -0
- package/src/skills/manual-testing/algorithms/blocked-tool-strategy.md +74 -0
- package/src/skills/manual-testing/algorithms/bug-severity.md +73 -0
- package/src/skills/manual-testing/algorithms/mcp-budget.md +97 -0
- package/src/skills/manual-testing/algorithms/test-prioritization.md +69 -0
- package/src/skills/manual-testing/knowledge/browser-extension-testing.md +102 -0
- package/src/skills/manual-testing/knowledge/browser-tools.md +114 -0
- package/src/skills/manual-testing/knowledge/desktop-tools-advanced.md +92 -0
- package/src/skills/manual-testing/knowledge/desktop-tools-core.md +76 -0
- package/src/skills/manual-testing/knowledge/sandbox-advanced.md +83 -0
- package/src/skills/manual-testing/knowledge/sandbox-core.md +67 -0
- package/src/skills/manual-testing/knowledge/stateful-edge-cases.md +69 -0
- package/src/skills/manual-testing/knowledge/test-case-design.md +107 -0
- package/src/skills/manual-testing/knowledge/testing-types.md +45 -0
- package/src/skills/manual-testing/templates/bug-report.md +52 -0
- package/src/skills/manual-testing/templates/test-case.md +34 -0
- package/src/skills/manual-testing/templates/test-plan.md +97 -0
- package/src/skills/manual-testing/templates/test-session-report.md +56 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-1.md +34 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-2.md +32 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-3.md +30 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/judge.json +163 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-1.md +0 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-2.md +7 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-3.md +0 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-1.md +4 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-2.md +15 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-3.md +8 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-1.md +5 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-2.md +7 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-3.md +7 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/meta.json +114 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001-sandbox-mandatory.yaml +38 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-1.md +44 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-2.md +32 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-3.md +47 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/judge.json +163 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-1.md +19 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-2.md +15 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-3.md +24 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-1.md +19 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-2.md +13 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-3.md +18 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-1.md +21 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-2.md +15 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-3.md +14 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/meta.json +114 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002-visual-tc-screenshot.yaml +37 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-1.md +76 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-2.md +71 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-3.md +85 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/judge.json +46 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/meta.json +36 -0
- package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003-qa-non-ui-assertion.yaml +65 -0
- package/src/skills/manual-testing/tests/index.yaml +30 -0
- package/src/skills/manual-testing/tests/last-run-tc001-sonnet.log +140 -0
- package/src/skills/manual-testing/tests/last-run-tc002.log +1 -0
- package/src/skills/manual-testing/tests/last-run.log +1469 -0
- package/src/skills/manual-testing/tests/rubrics/qa-non-ui-assertion.md +31 -0
- package/src/skills/manual-testing/tests/rubrics/sandbox-mandatory.md +20 -0
- package/src/skills/manual-testing/tests/rubrics/visual-tc-screenshot.md +21 -0
- package/src/skills/manual-testing/workflows/acceptance.md +80 -0
- package/src/skills/manual-testing/workflows/exploratory.md +84 -0
- package/src/skills/manual-testing/workflows/regression.md +76 -0
- package/src/skills/manual-testing/workflows/smoke.md +109 -0
- package/src/skills/manual-testing/workflows/test-plan.md +75 -0
- package/src/skills/review-result/README.md +59 -0
- package/src/skills/review-result/SKILL.md +138 -0
- package/src/skills/review-result/algorithms/verification.md +112 -0
- package/src/skills/review-result/knowledge/dod-patterns.md +115 -0
- package/src/skills/review-result/scripts/verify-artifacts.js +384 -0
- package/src/skills/review-result/templates/verdict.md +153 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-1.md +22 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-2.md +7 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-3.md +21 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-1.md +6 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-2.md +6 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-3.md +18 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/judge.json +164 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-1.md +5 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-2.md +7 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-3.md +6 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-1.md +49 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-2.md +28 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-3.md +37 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-1.md +22 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-2.md +13 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-3.md +21 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/meta.json +116 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001-visual-tc-trigger.yaml +51 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-1.md +23 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-2.md +22 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-3.md +28 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-1.md +4 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-2.md +36 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-3.md +4 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/judge.json +163 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-1.md +4 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-2.md +0 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-3.md +4 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-1.md +39 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-2.md +25 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-3.md +32 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-1.md +34 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-2.md +8 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-3.md +23 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/meta.json +115 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002-path-line-suffix.yaml +39 -0
- package/src/skills/review-result/tests/fixtures/IMPL-902-path-with-line.md +43 -0
- package/src/skills/review-result/tests/fixtures/QA-901-visual-button.md +46 -0
- package/src/skills/review-result/tests/index.yaml +25 -0
- package/src/skills/review-result/tests/rubrics/path-line-suffix.md +19 -0
- package/src/skills/review-result/tests/rubrics/visual-tc-trigger.md +19 -0
- package/src/skills/review-result/workflows/review.md +209 -0
|
@@ -13,6 +13,55 @@ const __filename = fileURLToPath(import.meta.url);
|
|
|
13
13
|
const __dirname = path.dirname(__filename);
|
|
14
14
|
const projectRoot = findProjectRoot(process.cwd());
|
|
15
15
|
|
|
16
|
+
import os from 'os';
|
|
17
|
+
import { execSync } from 'child_process';
|
|
18
|
+
|
|
19
|
+
/**
 * Creates an isolated, throw-away project directory for one skill-test run.
 *
 * Layout produced under the OS temp directory:
 *   <tmpRoot>/.workflow/{tickets/*, plans/*, reports, logs}   empty directories
 *   <tmpRoot>/.workflow/coach-backlog.yaml                    seeded with an empty backlog
 *   <tmpRoot>/.workflow/src/skills                            COPY of <projectRoot>/src/skills
 *   <tmpRoot>/.workflow/src/scripts                           link to <projectRoot>/src/scripts
 *   <tmpRoot>/.workflow/config                                link to <projectRoot>/configs
 *
 * @param {string} skillName - Skill under test; becomes part of the directory name.
 * @param {string} [suffix=''] - Optional extra discriminator (e.g. case/agent/trial).
 * @returns {string} Absolute path of the newly created temp root.
 * @throws {Error} If the directory skeleton or the skills copy cannot be created.
 *   In that case the partially built temp directory is removed before rethrowing,
 *   because the caller never receives the path and could not clean it up itself.
 */
function createTestWorkdir(skillName, suffix = '') {
  const rawPrefix = suffix ? `wf-test-${skillName}-${suffix}-` : `wf-test-${skillName}-`;
  // The prefix must stay a single path segment: ids containing separators or
  // characters that are illegal in Windows file names would make mkdtemp fail.
  const prefix = rawPrefix.replace(/[\\/:*?"<>|\x00-\x1f]/g, '_');
  const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), prefix));

  const workflowDir = path.join(tmpRoot, '.workflow');
  const srcDir = path.join(workflowDir, 'src');
  const realSkills = path.join(projectRoot, 'src', 'skills');
  const realScripts = path.join(projectRoot, 'src', 'scripts');
  const realConfigs = path.join(projectRoot, 'configs');
  const linkSkills = path.join(srcDir, 'skills');
  const linkScripts = path.join(srcDir, 'scripts');
  const configDir = path.join(workflowDir, 'config');

  const subDirs = [
    'tickets/backlog',
    'tickets/ready',
    'tickets/in-progress',
    'tickets/review',
    'tickets/done',
    'tickets/archive',
    'plans/current',
    'plans/archive',
    'reports',
    'logs',
  ];

  try {
    fs.mkdirSync(workflowDir, { recursive: true });
    for (const sub of subDirs) {
      fs.mkdirSync(path.join(workflowDir, sub), { recursive: true });
    }
    fs.writeFileSync(
      path.join(workflowDir, 'coach-backlog.yaml'),
      'version: 1\nanalyzed_tickets: []\naudited_skills: {}\n',
      'utf8'
    );
    fs.mkdirSync(srcDir, { recursive: true });

    // Skills are COPIED (not junctioned) so that agents cannot write to real source files.
    fs.cpSync(realSkills, linkSkills, { recursive: true, dereference: true });
  } catch (err) {
    // No links exist yet at this point, so a plain recursive delete cannot
    // reach into the real project directories.
    try {
      fs.rmSync(tmpRoot, { recursive: true, force: true });
    } catch (cleanupErr) {
      console.warn(`[Runner] Could not remove partial test workdir ${tmpRoot}: ${cleanupErr.message}`);
    }
    throw err;
  }

  // Scripts and configs are junctioned — read-only for agents in practice.
  // 'junction' yields an NTFS junction on Windows and is ignored elsewhere,
  // where a regular symlink is created. Linking stays best-effort, but a
  // failure is reported instead of being silently dropped.
  const links = [
    [realScripts, linkScripts],
    [realConfigs, configDir],
  ];
  for (const [target, linkPath] of links) {
    try {
      fs.symlinkSync(target, linkPath, 'junction');
    } catch (err) {
      console.warn(`[Runner] Could not link ${linkPath} -> ${target}: ${err.message}`);
    }
  }

  return tmpRoot;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Removes a temp directory previously created for a skill-test run.
 *
 * Cleanup is best-effort and never throws; problems are reported with a warning
 * so that leaked temp directories do not go unnoticed.
 *
 * @param {string|null|undefined} tmpRoot - Temp root to delete. Falsy or
 *   non-existent paths are ignored.
 * @returns {void}
 */
function cleanupTestWorkdir(tmpRoot) {
  if (!tmpRoot || !fs.existsSync(tmpRoot)) return;

  // Remove junctions first so that their targets are not touched by rmSync.
  if (process.platform === 'win32') {
    for (const link of ['src/scripts', 'config']) {
      const linkPath = path.join(tmpRoot, '.workflow', link);
      try {
        execSync(`rmdir "${linkPath}"`, { stdio: 'pipe', shell: true });
      } catch {
        // rmdir also fails when the link was never created; whether that
        // matters is decided by the existence check below.
      }

      let linkStillPresent;
      try {
        const stat = fs.lstatSync(linkPath, { throwIfNoEntry: false });
        linkStillPresent = Boolean(stat && stat.isSymbolicLink());
      } catch {
        // Cannot inspect the path: assume the link is there and stay safe.
        linkStillPresent = true;
      }

      if (linkStillPresent) {
        // Deleting recursively with a live junction in place is exactly what
        // the step above is meant to prevent, so leave the directory alone.
        console.warn(`[Runner] Could not remove junction ${linkPath}; leaving ${tmpRoot} in place`);
        return;
      }
    }
  }

  try {
    fs.rmSync(tmpRoot, { recursive: true, force: true });
  } catch (err) {
    console.warn(`[Runner] Could not remove test workdir ${tmpRoot}: ${err.message}`);
  }
}
|
|
64
|
+
|
|
16
65
|
function parseArgs() {
|
|
17
66
|
const args = process.argv.slice(2);
|
|
18
67
|
const opts = {
|
|
@@ -720,13 +769,23 @@ async function writeJudgeResults(skillName, caseId, results) {
|
|
|
720
769
|
const skillsDir = findSkillsDir();
|
|
721
770
|
const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
|
|
722
771
|
ensureDir(caseDir);
|
|
723
|
-
|
|
724
|
-
const
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
772
|
+
|
|
773
|
+
const judgePath = path.join(caseDir, 'judge.json');
|
|
774
|
+
let judgeData = { per_model: {}, rubric_scores: [], timestamp: new Date().toISOString() };
|
|
775
|
+
if (fs.existsSync(judgePath)) {
|
|
776
|
+
try {
|
|
777
|
+
const existing = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
|
|
778
|
+
judgeData.per_model = existing.per_model || {};
|
|
779
|
+
judgeData.rubric_scores = existing.rubric_scores || [];
|
|
780
|
+
} catch {}
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
const newAgentIds = new Set(Object.keys(results.per_model || {}));
|
|
784
|
+
judgeData.rubric_scores = judgeData.rubric_scores.filter(r => !newAgentIds.has(r.agentId));
|
|
785
|
+
for (const r of (results.rubric_scores || [])) {
|
|
786
|
+
judgeData.rubric_scores.push(r);
|
|
787
|
+
}
|
|
788
|
+
|
|
730
789
|
for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
|
|
731
790
|
judgeData.per_model[agentId] = {
|
|
732
791
|
pass_count: modelData.pass_count,
|
|
@@ -738,12 +797,10 @@ async function writeJudgeResults(skillName, caseId, results) {
|
|
|
738
797
|
}))
|
|
739
798
|
};
|
|
740
799
|
}
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
'utf8'
|
|
746
|
-
);
|
|
800
|
+
|
|
801
|
+
judgeData.timestamp = new Date().toISOString();
|
|
802
|
+
|
|
803
|
+
fs.writeFileSync(judgePath, JSON.stringify(judgeData, null, 2), 'utf8');
|
|
747
804
|
}
|
|
748
805
|
|
|
749
806
|
async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
|
|
@@ -776,7 +833,7 @@ async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0
|
|
|
776
833
|
}
|
|
777
834
|
|
|
778
835
|
async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
|
|
779
|
-
const { trials = 3,
|
|
836
|
+
const { trials = 3, timeout = 300 } = options;
|
|
780
837
|
|
|
781
838
|
const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
|
|
782
839
|
if (!judgeAgentConfig) {
|
|
@@ -799,12 +856,12 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
|
|
|
799
856
|
};
|
|
800
857
|
|
|
801
858
|
const caseId = caseDef?.id || 'unknown';
|
|
802
|
-
|
|
803
|
-
function buildTargetPrompt() {
|
|
859
|
+
|
|
860
|
+
function buildTargetPrompt(taskWorkdir) {
|
|
804
861
|
let targetPrompt = '';
|
|
805
862
|
const testsDir = findSkillTestsDir(skillName);
|
|
806
863
|
const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';
|
|
807
|
-
|
|
864
|
+
|
|
808
865
|
if (testCase.scenario?.system_prompt_file) {
|
|
809
866
|
const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
|
|
810
867
|
if (fs.existsSync(systemPromptPath)) {
|
|
@@ -824,6 +881,28 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
|
|
|
824
881
|
targetPrompt += `## ${input.as || 'Input'}\n`;
|
|
825
882
|
targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
|
|
826
883
|
}
|
|
884
|
+
} else if (input.kind === 'inline') {
|
|
885
|
+
if (input.content) {
|
|
886
|
+
targetPrompt += `## ${input.as || 'Input'}\n`;
|
|
887
|
+
targetPrompt += input.content + '\n\n';
|
|
888
|
+
}
|
|
889
|
+
} else if (input.kind === 'ticket_file') {
|
|
890
|
+
const fixturePath = path.join(testsDir, caseDir, input.path);
|
|
891
|
+
const destDir = input.dest_dir || 'in-progress';
|
|
892
|
+
const ticketId = input.ticket_id;
|
|
893
|
+
if (!ticketId) {
|
|
894
|
+
throw new Error(`ticket_file input requires ticket_id (case ${caseId})`);
|
|
895
|
+
}
|
|
896
|
+
if (!taskWorkdir) {
|
|
897
|
+
throw new Error(`ticket_file input requires task workdir (case ${caseId})`);
|
|
898
|
+
}
|
|
899
|
+
if (!fs.existsSync(fixturePath)) {
|
|
900
|
+
throw new Error(`ticket_file fixture not found: ${fixturePath}`);
|
|
901
|
+
}
|
|
902
|
+
const destPath = path.join(taskWorkdir, '.workflow', 'tickets', destDir, `${ticketId}.md`);
|
|
903
|
+
fs.mkdirSync(path.dirname(destPath), { recursive: true });
|
|
904
|
+
fs.copyFileSync(fixturePath, destPath);
|
|
905
|
+
targetPrompt += `## Context\nticket_id: ${ticketId}\n\n`;
|
|
827
906
|
}
|
|
828
907
|
}
|
|
829
908
|
}
|
|
@@ -831,46 +910,65 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
|
|
|
831
910
|
if (!targetPrompt.trim()) {
|
|
832
911
|
targetPrompt = testCase.prompt || testCase.input || '';
|
|
833
912
|
}
|
|
834
|
-
|
|
913
|
+
|
|
835
914
|
return targetPrompt;
|
|
836
915
|
}
|
|
837
916
|
|
|
917
|
+
const allTasks = [];
|
|
838
918
|
for (const agentId of targetAgents) {
|
|
839
919
|
const agentConfig = pipelineConfig.agents[agentId];
|
|
840
920
|
if (!agentConfig) {
|
|
841
921
|
throw new Error(`Target agent not found: ${agentId}`);
|
|
842
922
|
}
|
|
843
|
-
|
|
844
923
|
results.per_model[agentId] = {
|
|
845
924
|
trials: [],
|
|
846
925
|
pass_count: 0,
|
|
847
926
|
total: trials
|
|
848
927
|
};
|
|
849
|
-
|
|
850
|
-
const tasks = [];
|
|
851
928
|
for (let trial = 1; trial <= trials; trial++) {
|
|
852
|
-
|
|
929
|
+
allTasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
|
|
853
930
|
}
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
const allResults = await Promise.all(
|
|
934
|
+
allTasks.map(async (task) => {
|
|
935
|
+
const taskSuffix = `${caseId}-${task.agentId}-t${task.trial}`;
|
|
936
|
+
let taskWorkdir = null;
|
|
937
|
+
try {
|
|
938
|
+
taskWorkdir = createTestWorkdir(skillName, taskSuffix);
|
|
939
|
+
const targetPrompt = buildTargetPrompt(taskWorkdir);
|
|
940
|
+
const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
|
|
941
|
+
timeout,
|
|
942
|
+
stageId: `${caseId}-${task.agentId}-trial-${task.trial}`,
|
|
943
|
+
projectRoot: taskWorkdir
|
|
944
|
+
});
|
|
945
|
+
|
|
946
|
+
// Snapshot ticket files after target-run (for judge to inspect actual file state).
|
|
947
|
+
let ticketFilesSection = '';
|
|
948
|
+
const ticketInputs = (testCase.scenario?.inputs || []).filter(i => i.kind === 'ticket_file');
|
|
949
|
+
for (const input of ticketInputs) {
|
|
950
|
+
const ticketPath = path.join(
|
|
951
|
+
taskWorkdir,
|
|
952
|
+
'.workflow', 'tickets',
|
|
953
|
+
input.dest_dir || 'in-progress',
|
|
954
|
+
`${input.ticket_id}.md`
|
|
955
|
+
);
|
|
956
|
+
if (fs.existsSync(ticketPath)) {
|
|
957
|
+
const content = fs.readFileSync(ticketPath, 'utf8');
|
|
958
|
+
ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id} (${input.dest_dir || 'in-progress'}/)\n\n\`\`\`markdown\n${content}\n\`\`\`\n`;
|
|
959
|
+
} else {
|
|
960
|
+
ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id}\n\n(file missing at ${input.dest_dir || 'in-progress'}/${input.ticket_id}.md)\n`;
|
|
961
|
+
}
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
const judgePrompt = `You are a judge evaluating the output of an AI agent.
|
|
867
965
|
|
|
868
966
|
## Rubric
|
|
869
967
|
${rubric}
|
|
870
968
|
|
|
871
969
|
## Target Agent Output
|
|
872
970
|
${targetOutput.output || targetOutput.status || 'No output'}
|
|
873
|
-
|
|
971
|
+
${ticketFilesSection}
|
|
874
972
|
## Task
|
|
875
973
|
${testCase.description || testCase.name || 'Evaluate the response'}
|
|
876
974
|
|
|
@@ -881,54 +979,77 @@ score: <number 1-5>
|
|
|
881
979
|
reason: <brief explanation>
|
|
882
980
|
---RESULT---`;
|
|
883
981
|
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
});
|
|
888
|
-
|
|
889
|
-
let score = 3;
|
|
890
|
-
const parsed = parseJudgeResult(judgeResult.output);
|
|
891
|
-
if (parsed && parsed.score) {
|
|
892
|
-
score = parsed.score;
|
|
893
|
-
}
|
|
894
|
-
|
|
895
|
-
await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
|
|
896
|
-
|
|
897
|
-
return {
|
|
898
|
-
trial: task.trial,
|
|
899
|
-
agentId: task.agentId,
|
|
900
|
-
score,
|
|
901
|
-
output: targetOutput.output || '',
|
|
902
|
-
judge_output: judgeResult.output || '',
|
|
903
|
-
passed: score >= 4
|
|
904
|
-
};
|
|
905
|
-
} catch (err) {
|
|
906
|
-
console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
|
|
907
|
-
return {
|
|
908
|
-
trial: task.trial,
|
|
909
|
-
agentId: task.agentId,
|
|
910
|
-
score: 1,
|
|
911
|
-
error: err.message,
|
|
912
|
-
passed: false
|
|
913
|
-
};
|
|
914
|
-
}
|
|
915
|
-
})
|
|
916
|
-
);
|
|
917
|
-
|
|
918
|
-
for (const result of batchResults) {
|
|
919
|
-
results.per_model[result.agentId].trials.push(result);
|
|
920
|
-
if (result.passed) {
|
|
921
|
-
results.per_model[result.agentId].pass_count++;
|
|
922
|
-
}
|
|
923
|
-
results.rubric_scores.push({
|
|
924
|
-
agentId: result.agentId,
|
|
925
|
-
trial: result.trial,
|
|
926
|
-
score: result.score
|
|
982
|
+
const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
|
|
983
|
+
timeout: 60,
|
|
984
|
+
stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
|
|
927
985
|
});
|
|
986
|
+
|
|
987
|
+
let score = 3;
|
|
988
|
+
const parsed = parseJudgeResult(judgeResult.output);
|
|
989
|
+
if (parsed && parsed.score) {
|
|
990
|
+
score = parsed.score;
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
|
|
994
|
+
|
|
995
|
+
return {
|
|
996
|
+
trial: task.trial,
|
|
997
|
+
agentId: task.agentId,
|
|
998
|
+
score,
|
|
999
|
+
output: targetOutput.output || '',
|
|
1000
|
+
judge_output: judgeResult.output || '',
|
|
1001
|
+
passed: score >= 4,
|
|
1002
|
+
errored: false
|
|
1003
|
+
};
|
|
1004
|
+
} catch (err) {
|
|
1005
|
+
console.error(`[Runner] Trial errored: ${task.agentId} trial ${task.trial} — ${err.message}`);
|
|
1006
|
+
try {
|
|
1007
|
+
await writeTrialOutput(
|
|
1008
|
+
skillName,
|
|
1009
|
+
caseId,
|
|
1010
|
+
task.agentId,
|
|
1011
|
+
task.trial,
|
|
1012
|
+
`# TRIAL ERRORED\n\nagent: ${task.agentId}\ntrial: ${task.trial}\nerror: ${err.message}\n`
|
|
1013
|
+
);
|
|
1014
|
+
} catch {}
|
|
1015
|
+
return {
|
|
1016
|
+
trial: task.trial,
|
|
1017
|
+
agentId: task.agentId,
|
|
1018
|
+
score: null,
|
|
1019
|
+
error: err.message,
|
|
1020
|
+
passed: false,
|
|
1021
|
+
errored: true
|
|
1022
|
+
};
|
|
1023
|
+
} finally {
|
|
1024
|
+
if (taskWorkdir) {
|
|
1025
|
+
cleanupTestWorkdir(taskWorkdir);
|
|
1026
|
+
}
|
|
928
1027
|
}
|
|
1028
|
+
})
|
|
1029
|
+
);
|
|
1030
|
+
|
|
1031
|
+
for (const result of allResults) {
|
|
1032
|
+
results.per_model[result.agentId].trials.push(result);
|
|
1033
|
+
if (result.errored) {
|
|
1034
|
+
results.per_model[result.agentId].error_count = (results.per_model[result.agentId].error_count || 0) + 1;
|
|
1035
|
+
} else if (result.passed) {
|
|
1036
|
+
results.per_model[result.agentId].pass_count++;
|
|
929
1037
|
}
|
|
1038
|
+
results.rubric_scores.push({
|
|
1039
|
+
agentId: result.agentId,
|
|
1040
|
+
trial: result.trial,
|
|
1041
|
+
score: result.score,
|
|
1042
|
+
errored: !!result.errored,
|
|
1043
|
+
error: result.error || undefined
|
|
1044
|
+
});
|
|
930
1045
|
}
|
|
931
|
-
|
|
1046
|
+
for (const agentId of Object.keys(results.per_model)) {
|
|
1047
|
+
results.per_model[agentId].trials.sort((a, b) => a.trial - b.trial);
|
|
1048
|
+
}
|
|
1049
|
+
results.rubric_scores.sort((a, b) =>
|
|
1050
|
+
a.agentId === b.agentId ? a.trial - b.trial : a.agentId.localeCompare(b.agentId)
|
|
1051
|
+
);
|
|
1052
|
+
|
|
932
1053
|
return results;
|
|
933
1054
|
}
|
|
934
1055
|
|
|
@@ -961,19 +1082,27 @@ function aggregateResults(results, testCase) {
|
|
|
961
1082
|
|
|
962
1083
|
for (const [agentId, modelData] of Object.entries(results.per_model)) {
|
|
963
1084
|
const passCount = modelData.pass_count;
|
|
1085
|
+
const errorCount = modelData.error_count || 0;
|
|
964
1086
|
const total = modelData.total;
|
|
1087
|
+
const effective = total - errorCount;
|
|
965
1088
|
const threshold = Math.ceil(total / 2);
|
|
966
|
-
|
|
1089
|
+
|
|
967
1090
|
let passed;
|
|
968
|
-
|
|
1091
|
+
let errored = false;
|
|
1092
|
+
if (effective === 0) {
|
|
1093
|
+
passed = false;
|
|
1094
|
+
errored = true;
|
|
1095
|
+
} else if (useAll) {
|
|
969
1096
|
passed = passCount === total;
|
|
970
1097
|
} else {
|
|
971
1098
|
passed = passCount >= threshold;
|
|
972
1099
|
}
|
|
973
|
-
|
|
1100
|
+
|
|
974
1101
|
perModelResults[agentId] = {
|
|
975
1102
|
passed,
|
|
1103
|
+
errored,
|
|
976
1104
|
pass_count: passCount,
|
|
1105
|
+
error_count: errorCount,
|
|
977
1106
|
total,
|
|
978
1107
|
threshold: useAll ? total : threshold
|
|
979
1108
|
};
|
|
@@ -991,40 +1120,66 @@ async function writeMetaJson(caseId, skillName, status, durationMs, l2Results =
|
|
|
991
1120
|
const skillsDir = findSkillsDir();
|
|
992
1121
|
const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
|
|
993
1122
|
ensureDir(caseDir);
|
|
994
|
-
|
|
1123
|
+
|
|
1124
|
+
const metaPath = path.join(caseDir, 'meta.json');
|
|
1125
|
+
let existing = null;
|
|
1126
|
+
if (fs.existsSync(metaPath)) {
|
|
1127
|
+
try {
|
|
1128
|
+
existing = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
|
|
1129
|
+
} catch {}
|
|
1130
|
+
}
|
|
1131
|
+
|
|
995
1132
|
const meta = {
|
|
996
1133
|
date: new Date().toISOString(),
|
|
997
1134
|
skill_sha: getSkillSha(skillName),
|
|
998
1135
|
status,
|
|
999
1136
|
duration_ms: durationMs
|
|
1000
1137
|
};
|
|
1001
|
-
|
|
1138
|
+
|
|
1002
1139
|
if (l1_skipped) {
|
|
1003
1140
|
meta.l1_skipped = true;
|
|
1004
1141
|
}
|
|
1005
|
-
|
|
1142
|
+
|
|
1143
|
+
const mergedPerModel = (existing && existing.per_model) ? { ...existing.per_model } : {};
|
|
1144
|
+
let mergedRubricScores = (existing && existing.rubric_scores) ? [...existing.rubric_scores] : [];
|
|
1145
|
+
|
|
1006
1146
|
if (l2Results) {
|
|
1007
1147
|
const aggregated = aggregateResults(l2Results, {});
|
|
1008
|
-
|
|
1009
|
-
|
|
1148
|
+
const newAgentIds = new Set(Object.keys(aggregated.per_model || {}));
|
|
1149
|
+
for (const [agentId, data] of Object.entries(aggregated.per_model || {})) {
|
|
1150
|
+
mergedPerModel[agentId] = data;
|
|
1151
|
+
}
|
|
1152
|
+
mergedRubricScores = mergedRubricScores.filter(r => !newAgentIds.has(r.agentId));
|
|
1153
|
+
for (const r of (l2Results.rubric_scores || [])) {
|
|
1154
|
+
mergedRubricScores.push(r);
|
|
1155
|
+
}
|
|
1010
1156
|
if (l2Results.tokens) {
|
|
1011
1157
|
meta.tokens = l2Results.tokens;
|
|
1012
1158
|
}
|
|
1013
1159
|
}
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1160
|
+
|
|
1161
|
+
if (Object.keys(mergedPerModel).length > 0) {
|
|
1162
|
+
meta.per_model = mergedPerModel;
|
|
1163
|
+
}
|
|
1164
|
+
if (mergedRubricScores.length > 0) {
|
|
1165
|
+
meta.rubric_scores = mergedRubricScores;
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
const allPassed = Object.values(mergedPerModel).every(m => m.passed);
|
|
1169
|
+
if (Object.keys(mergedPerModel).length > 0) {
|
|
1170
|
+
meta.status = allPassed ? 'passed' : 'failed';
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1173
|
+
fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2), 'utf8');
|
|
1020
1174
|
}
|
|
1021
1175
|
|
|
1022
1176
|
async function runTestsForSkill(skillName, opts) {
|
|
1177
|
+
console.log(`[Runner] Per-task isolated workdirs will be created for each (case × agent × trial)`);
|
|
1023
1178
|
const result = {
|
|
1024
1179
|
skill: skillName,
|
|
1025
1180
|
status: 'passed',
|
|
1026
1181
|
total: 0,
|
|
1027
|
-
current_run: { passed: 0, failed: 0 },
|
|
1182
|
+
current_run: { passed: 0, failed: 0, no_coverage: 0 },
|
|
1028
1183
|
baseline_ref: 'origin/main',
|
|
1029
1184
|
target_agents: [],
|
|
1030
1185
|
judge_agent: null
|
|
@@ -1117,14 +1272,56 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1117
1272
|
|
|
1118
1273
|
const runL2 = !opts.layer || opts.layer === 'l2';
|
|
1119
1274
|
|
|
1120
|
-
|
|
1275
|
+
const casesWithRubric = cases.filter(cd => {
|
|
1276
|
+
try {
|
|
1277
|
+
const tc = loadTestCase(skillName, cd.file);
|
|
1278
|
+
return tc.assertions?.rubric && tc.assertions.rubric.length > 0;
|
|
1279
|
+
} catch { return false; }
|
|
1280
|
+
});
|
|
1281
|
+
const anyHasRubric = casesWithRubric.length > 0;
|
|
1282
|
+
|
|
1283
|
+
if (casesWithRubric.length < cases.length) {
|
|
1284
|
+
const missing = cases.length - casesWithRubric.length;
|
|
1285
|
+
console.log(`[Runner] ${missing}/${cases.length} cases have no rubric — L2 will be skipped for them`);
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric) {
|
|
1121
1289
|
const trials = opts.fast ? 1 : 3;
|
|
1122
1290
|
const totalModels = effectiveTargetAgents.length;
|
|
1123
|
-
|
|
1124
|
-
|
|
1291
|
+
await preFlightApproval(casesWithRubric.length, totalModels, trials);
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
let secretScanFailed = false;
|
|
1295
|
+
let calibrationFailedResult = null;
|
|
1296
|
+
|
|
1297
|
+
const anyRunL1 = !opts.layer || opts.layer === 'deterministic';
|
|
1298
|
+
const anyRunL2 = !opts.layer || opts.layer === 'l2';
|
|
1299
|
+
|
|
1300
|
+
if (anyRunL1 && !opts.skipSecretScan) {
|
|
1301
|
+
const scanResult = await runSecretScan();
|
|
1302
|
+
if (!scanResult.passed) {
|
|
1303
|
+
secretScanFailed = true;
|
|
1304
|
+
result.error = 'Secret scan failed - secrets detected in fixtures';
|
|
1305
|
+
}
|
|
1125
1306
|
}
|
|
1126
1307
|
|
|
1127
|
-
|
|
1308
|
+
if (anyRunL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric && !secretScanFailed) {
|
|
1309
|
+
const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
|
|
1310
|
+
if (!calibrationResult.passed) {
|
|
1311
|
+
console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
|
|
1312
|
+
calibrationFailedResult = calibrationResult;
|
|
1313
|
+
result.status = 'calibration_failed';
|
|
1314
|
+
result.error = calibrationResult.error;
|
|
1315
|
+
result.calibration = calibrationResult;
|
|
1316
|
+
return { ...result, cases, currentRunStatuses };
|
|
1317
|
+
}
|
|
1318
|
+
if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
|
|
1319
|
+
console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
|
|
1320
|
+
}
|
|
1321
|
+
console.log('[Runner] Calibration gate PASSED');
|
|
1322
|
+
}
|
|
1323
|
+
|
|
1324
|
+
await Promise.all(cases.map(async (caseDef) => {
|
|
1128
1325
|
const caseStart = Date.now();
|
|
1129
1326
|
|
|
1130
1327
|
try {
|
|
@@ -1136,17 +1333,13 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1136
1333
|
const runL1 = !opts.layer || opts.layer === 'deterministic';
|
|
1137
1334
|
const runL2 = !opts.layer || opts.layer === 'l2';
|
|
1138
1335
|
|
|
1139
|
-
// Secret scan
|
|
1140
|
-
if (runL1 && !opts.skipSecretScan) {
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
currentRunStatuses[caseDef.id] = 'failed';
|
|
1147
|
-
await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
|
|
1148
|
-
continue;
|
|
1149
|
-
}
|
|
1336
|
+
// Secret scan result propagated from pre-loop
|
|
1337
|
+
if (runL1 && !opts.skipSecretScan && secretScanFailed) {
|
|
1338
|
+
result.current_run.failed++;
|
|
1339
|
+
result.status = 'failed';
|
|
1340
|
+
currentRunStatuses[caseDef.id] = 'failed';
|
|
1341
|
+
await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
|
|
1342
|
+
return;
|
|
1150
1343
|
}
|
|
1151
1344
|
|
|
1152
1345
|
// L0 static assertions
|
|
@@ -1158,7 +1351,7 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1158
1351
|
result.status = 'failed';
|
|
1159
1352
|
currentRunStatuses[caseDef.id] = 'failed';
|
|
1160
1353
|
await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
|
|
1161
|
-
|
|
1354
|
+
return;
|
|
1162
1355
|
}
|
|
1163
1356
|
}
|
|
1164
1357
|
|
|
@@ -1167,13 +1360,28 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1167
1360
|
const l1Results = runL1Assertions(mockOutput, testCase);
|
|
1168
1361
|
const l1Failed = l1Results.filter(r => !r.passed);
|
|
1169
1362
|
const l1Skipped = l1Results.some(r => r.skipped);
|
|
1363
|
+
const l1Declared = (testCase.assertions?.deterministic || []).length;
|
|
1364
|
+
const l1Executed = l1Results.filter(r => !r.skipped).length;
|
|
1170
1365
|
|
|
1171
|
-
const
|
|
1172
|
-
|
|
1366
|
+
const willRunL2 = runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric;
|
|
1367
|
+
const noCoverage = l1Declared > 0 && l1Executed === 0 && !willRunL2;
|
|
1173
1368
|
|
|
1369
|
+
let caseStatus;
|
|
1174
1370
|
if (l1Failed.length > 0) {
|
|
1371
|
+
caseStatus = 'failed';
|
|
1372
|
+
} else if (noCoverage) {
|
|
1373
|
+
caseStatus = 'no_coverage';
|
|
1374
|
+
} else {
|
|
1375
|
+
caseStatus = 'passed';
|
|
1376
|
+
}
|
|
1377
|
+
currentRunStatuses[caseDef.id] = caseStatus;
|
|
1378
|
+
|
|
1379
|
+
if (caseStatus === 'failed') {
|
|
1175
1380
|
result.current_run.failed++;
|
|
1176
1381
|
result.status = 'failed';
|
|
1382
|
+
} else if (caseStatus === 'no_coverage') {
|
|
1383
|
+
result.current_run.no_coverage = (result.current_run.no_coverage || 0) + 1;
|
|
1384
|
+
console.log(`[Runner] ${caseDef.id}: no_coverage — L1 assertions require agent output but L2 is not configured (no rubric or no agents)`);
|
|
1177
1385
|
} else {
|
|
1178
1386
|
result.current_run.passed++;
|
|
1179
1387
|
}
|
|
@@ -1182,36 +1390,25 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1182
1390
|
result.l1_skipped = true;
|
|
1183
1391
|
}
|
|
1184
1392
|
|
|
1185
|
-
if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
|
|
1186
|
-
const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
|
|
1187
|
-
|
|
1188
|
-
if (!calibrationResult.passed) {
|
|
1189
|
-
console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
|
|
1190
|
-
result.status = 'calibration_failed';
|
|
1191
|
-
result.error = calibrationResult.error;
|
|
1192
|
-
result.calibration = calibrationResult;
|
|
1193
|
-
return result;
|
|
1194
|
-
}
|
|
1195
|
-
|
|
1196
|
-
if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
|
|
1197
|
-
console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
|
|
1198
|
-
}
|
|
1199
|
-
|
|
1200
|
-
console.log('[Runner] Calibration gate PASSED');
|
|
1201
|
-
}
|
|
1202
|
-
|
|
1203
1393
|
let l2Results = null;
|
|
1204
|
-
if (
|
|
1394
|
+
if (willRunL2) {
|
|
1205
1395
|
const trials = opts.fast ? 1 : 3;
|
|
1206
1396
|
const index = loadIndexYaml(skillName);
|
|
1207
1397
|
const defaultTimeout = index.execution?.default_timeout_s || 300;
|
|
1208
1398
|
const timeout = testCase.execution?.timeout_s || defaultTimeout;
|
|
1399
|
+
const caseTargetAgents = testCase.execution?.target_agents;
|
|
1400
|
+
const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
|
|
1401
|
+
? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
|
|
1402
|
+
: effectiveTargetAgents;
|
|
1403
|
+
if (caseTargetAgents && caseTargetAgents.length > 0) {
|
|
1404
|
+
console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
|
|
1405
|
+
}
|
|
1209
1406
|
try {
|
|
1210
1407
|
l2Results = await runL2Evaluation(
|
|
1211
1408
|
skillName,
|
|
1212
1409
|
testCase,
|
|
1213
1410
|
caseDef,
|
|
1214
|
-
|
|
1411
|
+
perCaseAgents,
|
|
1215
1412
|
judgeAgent,
|
|
1216
1413
|
pipelineConfig,
|
|
1217
1414
|
{ trials, concurrency: 2, timeout }
|
|
@@ -1238,6 +1435,13 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1238
1435
|
const trials = opts.fast ? 1 : 3;
|
|
1239
1436
|
const defaultTimeout = index.execution?.default_timeout_s || 300;
|
|
1240
1437
|
const timeout = testCase.execution?.timeout_s || defaultTimeout;
|
|
1438
|
+
const caseTargetAgents = testCase.execution?.target_agents;
|
|
1439
|
+
const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
|
|
1440
|
+
? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
|
|
1441
|
+
: effectiveTargetAgents;
|
|
1442
|
+
if (caseTargetAgents && caseTargetAgents.length > 0) {
|
|
1443
|
+
console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
|
|
1444
|
+
}
|
|
1241
1445
|
let l2Results = null;
|
|
1242
1446
|
let caseStatus = 'passed';
|
|
1243
1447
|
try {
|
|
@@ -1245,7 +1449,7 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1245
1449
|
skillName,
|
|
1246
1450
|
testCase,
|
|
1247
1451
|
caseDef,
|
|
1248
|
-
|
|
1452
|
+
perCaseAgents,
|
|
1249
1453
|
judgeAgent,
|
|
1250
1454
|
pipelineConfig,
|
|
1251
1455
|
{ trials, concurrency: 2, timeout }
|
|
@@ -1283,6 +1487,10 @@ async function runTestsForSkill(skillName, opts) {
|
|
|
1283
1487
|
currentRunStatuses[caseDef.id] = 'error';
|
|
1284
1488
|
await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
|
|
1285
1489
|
}
|
|
1490
|
+
}));
|
|
1491
|
+
|
|
1492
|
+
if (result.status === 'passed' && result.current_run.no_coverage > 0 && result.current_run.passed === 0) {
|
|
1493
|
+
result.status = 'no_coverage';
|
|
1286
1494
|
}
|
|
1287
1495
|
} catch (e) {
|
|
1288
1496
|
result.status = 'error';
|
|
@@ -1307,7 +1515,7 @@ async function runSkillTests(opts) {
|
|
|
1307
1515
|
skill: opts.skill || 'unknown',
|
|
1308
1516
|
mode: 'deterministic',
|
|
1309
1517
|
total: 0,
|
|
1310
|
-
current_run: { passed: 0, failed: 0 },
|
|
1518
|
+
current_run: { passed: 0, failed: 0, no_coverage: 0 },
|
|
1311
1519
|
baseline_ref: 'origin/main',
|
|
1312
1520
|
git_head_comparison: null,
|
|
1313
1521
|
verdict: 'ready_for_user_review',
|
|
@@ -1323,6 +1531,7 @@ async function runSkillTests(opts) {
|
|
|
1323
1531
|
results.total = skillResult.total;
|
|
1324
1532
|
results.current_run.passed = skillResult.current_run.passed;
|
|
1325
1533
|
results.current_run.failed = skillResult.current_run.failed;
|
|
1534
|
+
results.current_run.no_coverage = skillResult.current_run.no_coverage || 0;
|
|
1326
1535
|
results.status = skillResult.status;
|
|
1327
1536
|
results.target_agents = skillResult.target_agents;
|
|
1328
1537
|
results.judge_agent = skillResult.judge_agent;
|
|
@@ -1406,7 +1615,7 @@ async function runSkillTests(opts) {
|
|
|
1406
1615
|
}
|
|
1407
1616
|
|
|
1408
1617
|
return results;
|
|
1409
|
-
}
|
|
1618
|
+
}
|
|
1410
1619
|
|
|
1411
1620
|
function printResult(result) {
|
|
1412
1621
|
console.log('---RESULT---');
|
|
@@ -1416,6 +1625,9 @@ function printResult(result) {
|
|
|
1416
1625
|
console.log(`total: ${result.total}`);
|
|
1417
1626
|
console.log(`current_run.passed: ${result.current_run.passed}`);
|
|
1418
1627
|
console.log(`current_run.failed: ${result.current_run.failed}`);
|
|
1628
|
+
if (result.current_run.no_coverage) {
|
|
1629
|
+
console.log(`current_run.no_coverage: ${result.current_run.no_coverage}`);
|
|
1630
|
+
}
|
|
1419
1631
|
|
|
1420
1632
|
if (result.baseline_ref) {
|
|
1421
1633
|
console.log(`baseline_ref: ${result.baseline_ref}`);
|