devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -1,54 +0,0 @@
1
- # PHASE 1 — BUILD (agent prompt body)
2
-
3
- Spawned when PHASE 1 runs. Engine: BUILD row of `engine-routing.md`.
4
-
5
- The orchestrator passes the task description as the final section and sets the team flag (`team: true|false`) per the orchestrator rule: `team: true` only when the `--team` flag is passed OR `state.route.selected == "strict"`.
6
-
7
- ---
8
-
9
- <spec_integrity_check>
10
- Before reading anything else:
11
- - If `pipeline.state.json:source.type == "spec"`, compute the SHA-256 of the contents of the file at `state.source.spec_path`. If it differs from `state.source.spec_sha256`, write `phases.build.verdict: "BLOCKED"` with reason `"spec_sha256 mismatch"` and return. The spec changed mid-run — invariant violation per `references/pipeline-state.md`.
12
- - If `source.type == "generated"` and `state.source.criteria_sha256` exists, verify the same way.
13
- - If the hash field is absent (first phase to populate the file), skip this check this one time only.
14
- </spec_integrity_check>
15
-
16
- <goal>
17
- Implement code changes that satisfy every pending criterion in `pipeline.state.json:criteria[]` without violating anything declared Out of Scope or Constraints. Make the source's intent run in the code.
18
- </goal>
19
-
20
- <input>
21
- - Canonical criteria: `pipeline.state.json:source`. Follow `source.spec_path` (read directly, do not copy) or `source.criteria_path` (`.devlyn/criteria.generated.md` — may not yet exist; see OUTPUT CONTRACT).
22
- - Codebase at `pipeline.state.json:base_ref.sha`.
23
- - Task statement appended at the bottom of this prompt.
24
- </input>
25
-
26
- <output_contract>
27
- - **Code changes** implementing every `pending` criterion. Verify with `git diff`.
28
- - **state.json criteria updates**: for each criterion satisfied, set `status: "implemented"` and append an `evidence` record `{"file": "...", "line": N, "note": "brief"}`.
29
- - **If `source.type == "generated"` and `.devlyn/criteria.generated.md` does not exist**: create it once with `## Requirements` (each `- [ ]` testable in under 30 seconds, specific, scoped), `## Out of Scope`, `## Verification`. Populate `state.criteria[]` with `{"id": "C<N>", "ref": "criteria.generated://requirements/<N-1>", "status": "pending", "evidence": [], "failed_by_finding_ids": []}`. Classify task complexity into `low` / `medium` / `high` and write to `phases.build.complexity`. Compute `criteria_sha256 = sha256(criteria.generated.md)` and store in `state.source.criteria_sha256`.
30
- - **No pending criterion remains**: every `criteria[]` entry must transition to `status: "implemented"` with an `evidence` record before you exit. If a criterion genuinely cannot be satisfied (missing external dep, blocking ambiguity), set `phases.build.verdict: "BLOCKED"` and report. Never exit with a criterion still `pending`. BUILD must not mark any criterion `failed` — that's EVAL-only. Legal transitions: `pending → implemented`, or halt via `verdict: "BLOCKED"`.
31
- - **Tests** added or updated for changed behavior. Run the full test suite before stopping.
32
- - **Team** (only if orchestrator set `team: true`): use `TeamCreate` per the role table below; collect findings; shut down the team before exiting. Otherwise implement directly — the default.
33
- </output_contract>
34
-
35
- <quality_bar>
36
- - Criteria and Out-of-Scope are the contract — never weaken, reword, or delete them.
37
- - Read only files the source implicates (Architecture Notes + Dependencies + touched patterns), not the whole codebase.
38
- - Bugs: failing test first, then fix. Features: follow existing patterns, then write tests. Refactors: tests pass before and after.
39
- - Fix root causes only — no `any`, `@ts-ignore`, silent `catch`, or hardcoded values.
40
- </quality_bar>
41
-
42
- <principle>
43
- The source is the contract. Your output is evidence that the contract now runs in code.
44
- </principle>
45
-
46
- <team_role_selection>
47
- When `team: true`, select teammates by task type (per-role engine routing per `references/engine-routing.md`):
48
- - Bug fix: root-cause-analyst + test-engineer (+ security-auditor / performance-engineer as needed)
49
- - Feature: implementation-planner + test-engineer (+ ux-designer / architecture-reviewer / api-designer as needed)
50
- - Refactor: architecture-reviewer + test-engineer
51
- - UI/UX: product-designer + ux-designer + ui-designer (+ accessibility-auditor as needed)
52
- </team_role_selection>
53
-
54
- The task is: [orchestrator pastes the task description here]
@@ -1,45 +0,0 @@
1
- # PHASE 2 — EVALUATE (agent prompt body)
2
-
3
- Spawned when PHASE 2 runs. Engine: Claude (cross-model critic when builder was Codex).
4
-
5
- ---
6
-
7
- <spec_integrity_check>
8
- Before reading anything: verify source hash per `references/phases/phase-1-build.md#spec_integrity_check`. Apply the same rule (spec_sha256 for spec runs, criteria_sha256 for generated).
9
- </spec_integrity_check>
10
-
11
- <goal>
12
- Independently verify whether every criterion in `pipeline.state.json:criteria[]` is satisfied by the current code. Surface every defect with file:line evidence. You are a skeptic, not a cheerleader — praise is not your job.
13
- </goal>
14
-
15
- <input>
16
- - Canonical rubric: `pipeline.state.json:source`. Follow `source.spec_path` or `source.criteria_path` and read Requirements + Out of Scope + Verification directly.
17
- - Change surface: `git diff <pipeline.state.json:base_ref.sha>` + `git status`. Read every changed/new file in full — not just the hunks.
18
- - Prior browser findings at `.devlyn/browser_validate.findings.jsonl` (if that phase ran).
19
- </input>
20
-
21
- <output_contract>
22
- - **`.devlyn/evaluate.findings.jsonl`** — one JSON per line (schema: `references/findings-schema.md`). Per finding:
23
- `id` (`EVAL-<4digit>`), `rule_id` (stable kebab-case, e.g. `correctness.silent-error`, `ux.missing-error-state`, `architecture.duplication`, `security.missing-validation`, `types.any-cast-escape`, `style.let-vs-const`, `scope.out-of-scope-violation`, `hygiene.unused-import`), `level` (`error`/`warning`/`note` — map from severity: CRITICAL/HIGH → error, MEDIUM → warning, LOW → note), `severity` (`CRITICAL`/`HIGH`/`MEDIUM`/`LOW`), `confidence` (0.0–1.0), `message` (one line naming the issue, not symptoms), `file`, `line` (1-based primary location), `phase: "evaluate"`, `criterion_ref` (exact `ref` string from a `criteria[]` entry — e.g. `"spec://requirements/2"` — when the finding fails a specific criterion; or a section anchor from `state.source.criteria_anchors` such as `"spec://constraints"` / `"spec://out-of-scope"` when cross-cutting; `null` when scope-broader than any anchor), `fix_hint` (concrete action quoting file:line), `blocking` (CRITICAL/HIGH/MEDIUM default true, LOW false), `status: "open"`. Dedup key is `(rule_id, file, line)` — no fingerprint bookkeeping.
24
- - **`.devlyn/evaluate.log.md`** — 3–5 line human summary: verdict + criteria pass/fail counts + top 3 risks + cross-cutting patterns if any. Prose here; structured data in the JSONL.
25
- - **state.json criteria updates** — every `criteria[]` entry leaves Evaluate in a terminal state. Incoming status from BUILD is normally `implemented`; transition each to `status: "verified"` (append `evidence` record confirming satisfaction) OR `status: "failed"` (set `failed_by_finding_ids` to the IDs you emitted). If a criterion is still `pending` (BUILD did not satisfy it), mark it `failed` with a finding whose `rule_id` is `correctness.criterion-unimplemented`. No `criteria[]` entry may remain `pending` or `implemented` after Evaluate.
26
- - **state.json phases.evaluate** — `verdict` per taxonomy, `engine: "claude"`, `model`, timing, `round`, `artifacts.{findings_file, log_file}`.
27
-
28
- Verdict taxonomy: `BLOCKED` (any CRITICAL) / `NEEDS_WORK` (HIGH or MEDIUM present) / `PASS_WITH_ISSUES` (LOW only) / `PASS` (clean).
29
- </output_contract>
30
-
31
- <quality_bar>
32
- - Every finding points at a file:line you have opened and read. No real anchor = speculation; exclude it.
33
- - Every failed criterion maps to ≥1 finding `id`.
34
- - **Coverage over comfort**: report uncertain and LOW findings too; downstream filters rank them. Missing a real defect ships broken code — the asymmetry is decisive.
35
- - Audit each changed file for: correctness (logic errors, silent failures, null access, wrong API contracts), architecture (pattern violations, duplication, missing integration), security (if auth/secrets/user-data touched: injection, hardcoded credentials, missing validation), frontend (if UI changed: missing error/loading/empty states, React anti-patterns, server/client boundaries), test coverage (untested modules, missing edge cases), hygiene (unused imports, dead code, unused deps — `hygiene.*` at LOW), defensive programming (recursion depth/cycle guards, boundary conditions, missing null checks — severity per blast radius: `correctness.*` when it can crash or corrupt, `hygiene.*` when cosmetic).
36
- - Calibration: a catch block that logs but doesn't surface the error to the user → HIGH, not MEDIUM (logging ≠ error handling). A `let` that could be `const` → LOW (linters catch it). "Error handling is generally quite good" is not a finding — count instances, name files.
37
- - "Pre-existing" findings still count if they relate to the criteria. Working software, not blame attribution.
38
- - **Out-of-Scope violations are findings**: if BUILD added behavior the source's `## Out of Scope` excludes, emit `rule_id: "scope.out-of-scope-violation"`, `severity: HIGH`, `criterion_ref: "spec://out-of-scope"` (or `"criteria.generated://out-of-scope"`), `fix_hint` naming what to remove.
39
- </quality_bar>
40
-
41
- <principle>
42
- Missing a real defect is worse than reporting an extra one. Asymmetric cost demands bias toward reporting.
43
- </principle>
44
-
45
- Do not delete `pipeline.state.json` or the JSONL/log files.
@@ -1,84 +0,0 @@
1
- # PHASE 3 — CRITIC (agent prompt body)
2
-
3
- Spawned when PHASE 3 runs. Engine: CRITIC row of `engine-routing.md` — design sub-pass always Claude; security sub-pass Dual on `--engine auto`, single on others.
4
-
5
- **Findings-only**: CRITIC does NOT write code. Orchestrator routes `NEEDS_WORK`/`BLOCKED` findings into PHASE 2.5 with `triggered_by: "critic"`. No bespoke mini-loop inside CRITIC.
6
-
7
- ---
8
-
9
- <spec_integrity_check>
10
- Before reading anything: verify source hash per `references/phases/phase-1-build.md#spec_integrity_check`.
11
- </spec_integrity_check>
12
-
13
- <goal>
14
- One post-EVAL critic pass with two parallel sub-concerns. Produce a single `.devlyn/critic.findings.jsonl` tagged by rule_id prefix, plus a single `.devlyn/critic.log.md`.
15
- </goal>
16
-
17
- <input>
18
- - Change surface: `git diff <pipeline.state.json:base_ref.sha>`. Read every changed file in full, not just the hunks.
19
- - `package.json` / `requirements.txt` / lockfiles (`package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`, `Pipfile.lock`, `poetry.lock`, `Cargo.lock`, `go.sum`) — for dependency audit.
20
- </input>
21
-
22
- ## Sub-pass 1: DESIGN (always Claude)
23
-
24
- <design_goal>
25
- Read the diff cold — no checklist, no prior-phase context. Find what a staff engineer would block before this PR ships. Any hesitation is a finding.
26
- </design_goal>
27
-
28
- <design_quality_bar>
29
- - Every finding anchored to `file:line` in code you have opened, with a concrete fix. Vague ≠ finding.
30
- - `fix_hint` is a specific change ("change X to Y because Z"), never "consider improving".
31
- - Interrogate: would this survive 10x traffic? A midnight oncall page? A junior dev in 6 months? Are baked-in assumptions stated out loud (hardcoded limits, implicit ordering, missed business-logic edges)? Is error handling actually helpful or does it prevent crashes while leaving users confused? Are there simpler idiomatic approaches — not "clever" but genuinely better?
32
- - Do not open with praise.
33
- - Rule_ids: `design.non-atomic-transaction`, `design.duplicate-pattern`, `design.hidden-assumption`, `design.unidiomatic-pattern`, `design.missing-integration`, etc.
34
- - Severities: CRITICAL / HIGH / MEDIUM — no LOW (design is ship/no-ship).
35
-
36
- **Design sub-verdict**: `PASS` only if zero design findings. Any open design finding → `NEEDS_WORK`.
37
- </design_quality_bar>
38
-
39
- ## Sub-pass 2: SECURITY (Dual on `--engine auto`, single otherwise)
40
-
41
- <security_goal>
42
- Dedicated security audit of all recent changes. NOT a general code review — focus exclusively on security concerns. File:line evidence for every finding.
43
- </security_goal>
44
-
45
- <security_quality_bar>
46
- Check every changed file for:
47
- 1. **Input validation**: trace every user input entry → storage/output. SQL injection, XSS, command injection, path traversal, SSRF.
48
- 2. **Auth & authorization**: new endpoints protected? Auth checks consistent? Privilege escalation / BOLA paths?
49
- 3. **Secrets & credentials**: grep for hardcoded API keys, tokens, passwords, private keys. Secrets from env vars. `.gitignore` covers sensitive files.
50
- 4. **Data exposure**: error messages leaking internal details? Logs capturing sensitive data? API responses returning more than needed?
51
- 5. **Dependencies** — **MANDATORY** when any dep manifest or lockfile changed (see `<input>` list above). Run the package manager's audit command:
52
- - `npm audit --json` (Node/pnpm/yarn — all emit `npm audit`-compatible JSON)
53
- - `pip-audit --format json`
54
- - `cargo audit`
55
- - `govulncheck ./...`
56
- Report findings at CRITICAL/HIGH as blocking. Record the command run and its JSON output in `critic.log.md`.
57
- 6. **CSRF/CORS**: new endpoints with side effects → CSRF protection. CORS not overly permissive.
58
-
59
- Rule_ids: `security.sql-injection`, `security.xss`, `security.path-traversal`, `security.ssrf`, `security.hardcoded-credential`, `security.missing-input-validation`, `security.missing-auth-check`, `security.privilege-escalation`, `security.data-exposure`, `security.insecure-dependency`, `security.missing-csrf`, `security.permissive-cors`.
60
-
61
- **Security sub-verdict** (stricter than general — same as v3.2 SECURITY):
62
- - `PASS` — zero findings
63
- - `PASS_WITH_ISSUES` — LOW only
64
- - `NEEDS_WORK` — HIGH or MEDIUM present (security MEDIUM is blocking by design)
65
- - `BLOCKED` — any CRITICAL
66
-
67
- **Dual merging** (when `--engine auto`): same finding from both models → keep more detailed wording, mark "confirmed by both". Codex-only → prefix message with `[codex]`. Conflicts → keep both. When the two models assign different severities, take the more severe one.
68
- </security_quality_bar>
69
-
70
- ## Output contract
71
-
72
- - **`.devlyn/critic.findings.jsonl`** — one JSONL file containing BOTH sub-passes' findings. Every line carries `phase: "critic"`. Rule_id prefix (`design.*` vs `security.*`) distinguishes sub-pass. ID prefix: `CRIT-<4digit>` (single sequence shared by both sub-passes for simplicity).
73
- - **`.devlyn/critic.log.md`** — single prose summary: two sections ("Design" + "Security"). Each section: verdict + top 3 concerns framed actionably. Security section records the dep-audit command and its result.
74
- - **state.json phases.critic** — record both sub-verdicts AND the combined verdict. Combined verdict = WORSE of the two:
75
- - Any `BLOCKED` → `BLOCKED`
76
- - Any `NEEDS_WORK` → `NEEDS_WORK`
77
- - Any `PASS_WITH_ISSUES` → `PASS_WITH_ISSUES`
78
- - Both `PASS` → `PASS`
79
-
80
- ## Principles
81
-
82
- - Cold eyes catch what structured reviews miss. For design: "would I ship this with my name on it?" is the only question.
83
- - For security: OWASP-anchored findings, file:line evidence. Speculative security concerns without a concrete attack vector are noise.
84
- - Do NOT write code changes. Do NOT commit. Orchestrator handles routing.
@@ -1,114 +0,0 @@
1
- # Pipeline Routing — 3 Routes + Stage A + Stage B LITE
2
-
3
- Auto-resolve adapts its pipeline shape to each task. Single source of truth for route selection; the orchestrator reads it, SKILL.md does not restate the rules.
4
-
5
- ## The 3 routes
6
-
7
- | Route | Intended for | Phases that run |
8
- |-------|-------------|-----------------|
9
- | `fast` | Trivial / low-complexity, zero risk signals | PARSE → BUILD → BUILD GATE → [BROWSER if web] → EVAL → [FIX if findings] → FINAL REPORT |
10
- | `standard` | Default for medium work | `fast` + CRITIC (findings-only) + DOCS |
11
- | `strict` | High-complexity OR risk signals present OR escalated | `standard` + team-assembled BUILD + BUILD GATE strict mode |
12
-
13
- Every route runs PARSE, BUILD, BUILD GATE, EVAL, and FINAL REPORT. Routes differ in whether CRITIC/DOCS run and whether BUILD assembles a team.
14
-
15
- **Findings-only** (CRITIC) means the phase emits a `.findings.jsonl` + `.log.md` but does not write code. The orchestrator routes any NEEDS_WORK/BLOCKED findings through the unified fix loop (see SKILL.md `PHASE 2.5`), which re-runs EVAL. This enforces the post-EVAL invariant: all semantic changes go through EVAL.
16
-
17
- ## Default guardrails (route-invariant under `auto`)
18
-
19
- These hold across all three routes with no `--bypass`:
20
-
21
- 1. **BUILD GATE PASS** — `fast` runs the gate too.
22
- 2. **Independent EVAL PASS** — file:line evidence required.
23
- 3. **Every criterion terminal** (`verified` or `failed`).
24
- 4. **Zero open HIGH/CRITICAL findings** at pipeline exit (subject to `--max-rounds` — see exhaustion table).
25
- 5. **Web file changes force BROWSER VALIDATE** (`.tsx/.jsx/.vue/.svelte/.css/.html`, `page.*/layout.*/route.*`).
26
- 6. **Post-BUILD risk detection auto-escalates** via Stage B LITE.
27
-
28
- ## The `--bypass` flag
29
-
30
- Semantics: `--bypass <phase>[,<phase>...]`. Bypassable phases: `build-gate`, `browser`, `critic`, `docs`.
31
-
32
- Every bypass is recorded in `state.route.bypasses` and surfaced in the final report's `Guardrails bypassed:` line.
33
-
34
- **Deprecated aliases** (still accepted; a warning is logged once): `--skip-build-gate`, `--skip-browser`, `--skip-review`, `--skip-clean`, `--skip-docs`, `--security-review skip`, and `--bypass simplify|review|clean|security|challenge`. Aliases for the post-EVAL group map to `--bypass critic`; the rest map to the appropriate phase. They will be removed in the next minor version.
35
-
36
- ## `--max-rounds` exhaustion
37
-
38
- When the fix loop exhausts `max_rounds` with findings still open:
39
-
40
- | `triggered_by` | exhaustion behavior |
41
- |---|---|
42
- | `build_gate` | **halt** — skip to FINAL REPORT with `BUILD GATE EXHAUSTED` banner |
43
- | `browser_validate` | **halt** — skip to FINAL REPORT with `BROWSER EXHAUSTED` banner |
44
- | `evaluate` | **proceed_with_warning** — FINAL REPORT shows `EVAL EXHAUSTED` banner + open findings |
45
- | `critic` | **proceed_with_warning** — FINAL REPORT shows `CRITIC EXHAUSTED` banner + open findings |
46
-
47
- Guardrail #4 is suspended under `proceed_with_warning` exhaustion: the report banner shows what remains unresolved.
48
-
49
- ## Stage A — Pre-build (PHASE 0)
50
-
51
- Decision order (first match wins):
52
-
53
- 1. **User override** (`--route fast|standard|strict`): set `route.selected`, `route.user_override: true`. Stage B LITE will not run.
54
- 2. **Hard blocker**: missing spec or unmet internal deps → halt BLOCKED.
55
- 3. **Risk keywords in source**: grep source body (spec body for spec-driven, task description for generated) for `auth, login, session, token, secret, password, crypto, api, env, permission, access, database, migration, payment`. Any hit → `strict`. Record matched keywords.
56
- 4. **Complexity-based** (spec-driven only):
57
- - `spec.frontmatter.complexity == "high"` → `strict`
58
- - `spec.frontmatter.complexity == "medium"` → `standard`
59
- - `spec.frontmatter.complexity == "low"` → `fast`
60
- 5. **Generated tasks**: default to `standard`, Stage B LITE may escalate after BUILD.
61
-
62
- Stage A writes to `state.route.stage_a.{at, reasons}`.
63
-
64
- ## Stage B LITE — Post-BUILD-GATE (PHASE 1.4)
65
-
66
- **One rule** (simplified from v3.2's multi-heuristic machinery). Does not run if `route.user_override == true`. Only escalates, never de-escalates.
67
-
68
- **Rule**: escalate to `strict` if `git diff <state.base_ref.sha>` meets ANY of:
69
-
70
- - **Risk keyword in diff content** — matches any of the 14 Stage A risk keywords.
71
- - **API surface** — changed files include paths under `src/api/`, `routes/`, `handlers/`, `app/api/`.
72
- - **Dependency change** — any of: `package.json`, `requirements.txt`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`, `Pipfile.lock`, `poetry.lock`, `Cargo.toml`, `Cargo.lock`, `go.mod`, `go.sum`.
73
-
74
- Stage B LITE writes to `state.route.stage_b.{at, escalated_from, reasons}`. No escalation → `stage_b.at` remains `null`.
75
-
76
- ## Phase inclusion matrix
77
-
78
- | Phase | `fast` | `standard` | `strict` |
79
- |-------|--------|-----------|----------|
80
- | 0 PARSE + PREFLIGHT + ROUTE | ✓ | ✓ | ✓ |
81
- | 1 BUILD (solo) | ✓ | ✓ | — (team) |
82
- | 1 BUILD (team) | `--team` | `--team` | ✓ |
83
- | 1.4 BUILD GATE (auto) | ✓ | ✓ | — (strict+docker) |
84
- | 1.4 BUILD GATE (strict+docker) | — | — | ✓ |
85
- | 1.5 BROWSER VALIDATE | ✓ (web) | ✓ (web) | ✓ (web) |
86
- | 2 EVALUATE | ✓ | ✓ | ✓ |
87
- | 2.5 UNIFIED FIX LOOP | ✓ (if findings) | ✓ (if findings) | ✓ (if findings) |
88
- | 3 CRITIC (findings-only) | — | ✓ | ✓ (Dual security sub-pass) |
89
- | 4 DOCS (doc-files only) | — | ✓ | ✓ |
90
- | 5 FINAL REPORT + ARCHIVE | ✓ | ✓ | ✓ |
91
-
92
- Legend: ✓ runs, — skipped by route. `--bypass <phase>` forces skip on any route. `fast` skips CRITIC and DOCS.
93
-
94
- ## Terminal-state algorithm (PHASE 5)
95
-
96
- Final verdict computed across all findings files in precedence order:
97
-
98
- 1. **BUILD GATE FAIL** at exhaustion → `BLOCKED` with `BUILD GATE EXHAUSTED`.
99
- 2. **BROWSER VALIDATE BLOCKED** at exhaustion → `BLOCKED`.
100
- 3. **Any unresolved CRITICAL** in any `<phase>.findings.jsonl` → `BLOCKED`.
101
- 4. **Any unresolved HIGH** with `rule_id` prefix `correctness.*` / `security.*` / `design.*` → `NEEDS_WORK`.
102
- 5. **Any unresolved HIGH** (other categories) → `NEEDS_WORK`.
103
- 6. **Any unresolved MEDIUM `security.*`** → `NEEDS_WORK` (security stricter than general by design).
104
- 7. **Any unresolved MEDIUM** (other categories) → `PASS_WITH_ISSUES`.
105
- 8. **Only LOW / none** → `PASS`.
106
-
107
- "Unresolved" means `status == "open"` in the latest round's file.
108
-
109
- ## Non-goals
110
-
111
- - Per-criterion routing (every phase sees every criterion).
112
- - Route re-evaluation mid-round.
113
- - De-escalation.
114
- - Replacing `--bypass` (bypass is an orthogonal opt-out).
@@ -1,201 +0,0 @@
1
- # Pipeline State — `.devlyn/pipeline.state.json`
2
-
3
- Control plane for a single auto-resolve run. Contains pointers and state only — never copied content from the spec or findings files.
4
-
5
- ## Purpose
6
-
7
- Every phase reads `pipeline.state.json` to answer:
8
- - What base git SHA am I diffing against? (prevents diff-scope drift across phases)
9
- - Where is the canonical criteria source? (spec file path or generated file path)
10
- - What route was selected and why?
11
- - Which criteria are verified / failed and with what evidence?
12
- - What is the current fix-loop round and max?
13
- - Where are the artifacts from phases that already ran?
14
- - What SHA did EVALUATE first pass at? (post-EVAL invariant check)
15
-
16
- State.json is the only cross-phase mutable state. Spec files and `<phase>.findings.jsonl` are immutable within a run.
17
-
18
- ## File location
19
-
20
- `.devlyn/pipeline.state.json` during a run; moved to `.devlyn/runs/<run_id>/pipeline.state.json` at PHASE 5 (archive).
21
-
22
- Created by PHASE 0 on run start. At PHASE 5, the entire `.devlyn/` run artifact set is **moved** (not deleted) into `.devlyn/runs/<run_id>/`. See `## Archive contract` below.
23
-
24
- ## Canonical schema (v1.2)
25
-
26
- ```json
27
- {
28
- "version": "1.2",
29
- "run_id": "ar-<ISO8601-compact>-<uuidv7-short>",
30
- "started_at": "<ISO-8601 UTC>",
31
- "engine": "auto" | "codex" | "claude",
32
- "base_ref": {
33
- "branch": "<string, e.g. 'main'>",
34
- "sha": "<full 40-char git sha captured at Phase 0 start>"
35
- },
36
- "eval_passed_sha": "<git sha recorded when PHASE 2 first returns PASS or PASS_WITH_ISSUES>" | null,
37
- "route": {
38
- "selected": "fast" | "standard" | "strict" | null,
39
- "user_override": true | false,
40
- "bypasses": ["<phase-name>", "..."],
41
- "stage_a": {
42
- "at": "<ISO-8601 UTC>" | null,
43
- "reasons": ["<string>", "..."]
44
- },
45
- "stage_b": {
46
- "at": "<ISO-8601 UTC>" | null,
47
- "escalated_from": "fast" | "standard" | null,
48
- "reasons": ["<string>", "..."]
49
- }
50
- },
51
- "source": {
52
- "type": "spec" | "generated",
53
- "spec_path": "<string path>" | null,
54
- "spec_sha256": "<hex>" | null,
55
- "criteria_path": "<string path>" | null,
56
- "criteria_sha256": "<hex>" | null,
57
- "criteria_anchors": ["spec://requirements", "..."]
58
- },
59
- "criteria": [
60
- {
61
- "id": "C1",
62
- "ref": "<anchor>",
63
- "status": "pending" | "implemented" | "verified" | "failed",
64
- "evidence": [
65
- {"file": "<string>", "line": <int>, "note": "<string>"}
66
- ],
67
- "failed_by_finding_ids": ["<string>"]
68
- }
69
- ],
70
- "phases": {
71
- "<phase_name>": {
72
- "verdict": "PASS" | "PASS_WITH_ISSUES" | "NEEDS_WORK" | "FAIL" | "BLOCKED" | null,
73
- "engine": "codex" | "claude" | "bash" | "dual" | null,
74
- "model": "<string>" | null,
75
- "started_at": "<ISO-8601 UTC>" | null,
76
- "completed_at": "<ISO-8601 UTC>" | null,
77
- "duration_ms": <int> | null,
78
- "round": <int>,
79
- "triggered_by": "<phase-name>" | null,
80
- "pre_sha": "<git sha captured before this phase spawned; used for per-phase diff invariant>" | null,
81
- "artifacts": {
82
- "findings_file": "<path>" | null,
83
- "log_file": "<path>" | null
84
- }
85
- }
86
- },
87
- "rounds": {
88
- "global": <int>,
89
- "max_rounds": <int>
90
- },
91
- "perf": { // OPTIONAL — present only when --perf flag is passed (v3.4 demoted from mandatory)
92
- "wall_ms": <int>,
93
- "tokens_total": <int>,
94
- "per_phase": [
95
- {"phase": "<name>", "engine": "codex" | "claude" | "bash" | "dual", "wall_ms": <int>, "tokens": <int>, "round": <int>, "triggered_by": "<phase>" | null}
96
- ]
97
- }
98
- }
99
- ```
100
-
101
- ## Field semantics
102
-
103
- ### Top-level
104
-
105
- - `version` — schema version; current value `1.2`. Orchestrators must refuse incompatible versions.
106
- - `run_id` — unique, time-sortable run identifier in format `ar-<UTC-compact>-<12 hex>`. Example: `ar-20260423T163044Z-018f4c2a1b9c`.
107
- - `started_at` — Phase 0 start, ISO-8601 UTC.
108
- - `engine` — user-provided `--engine` flag value, or `auto` default.
109
- - `base_ref` — git state captured at Phase 0. **All subsequent `git diff` commands use this SHA**, not `HEAD~1` or `main`. This eliminates diff-scope drift.
110
- - `eval_passed_sha` — `null` until PHASE 2 first returns `PASS` or `PASS_WITH_ISSUES`. At that moment the orchestrator records `git rev-parse HEAD` here. After this field is populated, the **post-EVAL findings-only invariant** applies: PHASE 3 (CRITIC) must not write any non-doc files (reverted on violation), and PHASE 4 (DOCS) may only touch doc-allowlist paths. See `invariants` section of the skill.
111
-
112
- ### Route
113
-
114
- - `selected` — `fast` / `standard` / `strict`, or `null` before Phase 0 decides.
115
- - `user_override` — `true` if user passed `--route <value>`.
116
- - `bypasses` — list of phase names the user explicitly bypassed via `--bypass <phase>`. Surfaced in the final report's `Guardrails bypassed` line. Empty list if no bypass.
117
- - `stage_a` — initial routing at Phase 0, based on spec frontmatter + content scan.
118
- - `stage_b` — post-BUILD checkpoint at Phase 1.4 completion. **Can only escalate** (fast → standard → strict), never de-escalate. `at` is `null` if no escalation.
119
- - `reasons` — human-readable decision rationale, surfaced in final report.
120
-
121
- ### Source
122
-
123
- - `type` — `spec` (roadmap spec file) or `generated` (ad-hoc task).
124
- - `spec_path` + `spec_sha256` — canonical spec pointer + integrity hash for spec runs. Each phase re-computes and compares before reading. Mismatch → phase writes `verdict: "BLOCKED"` with reason `spec_sha256 mismatch`.
125
- - `criteria_path` + `criteria_sha256` — same pair for generated runs. `criteria_sha256` is populated by PHASE 1 BUILD after it creates `criteria.generated.md`. Subsequent phases verify it the same way.
126
- - `criteria_anchors` — enumerated anchors downstream phases may reference.
127
-
128
- ### Criteria
129
-
130
- One entry per testable criterion extracted from the source. State machine: `pending → implemented → verified | failed`.
131
-
132
- ### Phases
133
-
134
- Key is phase name (v3.4 set): `build`, `build_gate`, `browser_validate`, `evaluate`, `fix_loop`, `critic`, `docs`, `final_report`.
135
-
136
- - `verdict` — `PASS` / `PASS_WITH_ISSUES` / `NEEDS_WORK` / `FAIL` / `BLOCKED` / `null`. **Single canonical verdict source** — orchestrator branches on this, never by parsing artifact files.
137
- - `engine` / `model` — which model ran this phase. `bash` for build-gate. `dual` for `critic` security sub-pass on `--engine auto`.
138
- - `round` — which fix-loop round this execution belongs to. Phases that run once: `1`. `build_gate`, `browser_validate`, `evaluate`, `critic` increment with fix-loop iterations.
139
- - `triggered_by` — for phases re-run via the unified fix loop (PHASE 2.5), records the triggering phase name (`build_gate` / `browser_validate` / `evaluate` / `critic`). Also written on fix-loop entries themselves. `null` for the first run.
140
- - `pre_sha` — captured by the orchestrator immediately before spawning a post-EVAL phase (`git rev-parse HEAD`). Used by the post-EVAL invariant to diff **only what this phase touched**. Applies to `critic` and `docs`. `null` for PARSE/BUILD/BUILD_GATE/BROWSER/EVAL (those use `base_ref.sha`).
141
- - `artifacts` — pointers to phase output files. Phases that emit structured findings write both `findings_file` and `log_file`. `critic` writes a single `.devlyn/critic.findings.jsonl` carrying both design and security rule_id prefixes. DOCS leaves both `null` (its output is git commits).
142
- - `sub_verdicts` (only on `critic`) — `{"design": <verdict>, "security": <verdict>}`; overall `verdict` = WORSE of the two per `references/phases/phase-3-critic.md`.
143
- - `dep_audit` (only on `critic`) — `{"ran": bool, "command": "<cmd>", "high": N, "critical": N}` populated when critic's security sub-pass ran `npm audit` / `pip-audit` / equivalent.
144
-
145
- ### Rounds
146
-
147
- - `global` — shared round counter across all fix-loop invocations regardless of trigger. Increments once per fix-loop iteration.
148
- - `max_rounds` — cap from `--max-rounds` flag (default 4).
149
-
150
- ### Perf (opt-in via `--perf`, v3.4)
151
-
152
- When `--perf` is passed, the orchestrator records wall-time and token consumption per phase for retrospective benchmarking. When the flag is omitted (the default), the `perf` block is absent from state.json and the orchestrator skips timing/token bookkeeping — Karpathy P2 (Simplicity First) applied: no mandatory meta-measurement.
153
-
154
- When enabled:
155
- - `wall_ms` — total wall-clock from PHASE 0 start to PHASE 5 end, in milliseconds.
156
- - `tokens_total` — sum of `per_phase[].tokens`.
157
- - `per_phase` — one entry per phase execution. Fields: `phase`, `engine`, `wall_ms`, `tokens` (from subagent `total_tokens` or Codex usage; `bash` reports 0), `round`, `triggered_by`.
158
-
159
- Written at phase completion; totals roll up at PHASE 5.
160
-
161
- ## Anchor syntax
162
-
163
- Format: `<scheme>://<section>[/<index>]`. `scheme` is `spec` or `criteria.generated`. `section` is slug-lowercased H2. `index` is optional 0-based position.
164
-
165
- ## Write protocol
166
-
167
- - **Phase 0 (PARSE + PREFLIGHT + ROUTE)** — creates state.json with `version`, `run_id`, `started_at`, `engine`, `base_ref`, `rounds.max_rounds`, empty `phases`, and (after preflight step) populates `source`, `criteria[]` with `status: pending`, `route.selected`, `route.stage_a`, `route.bypasses`. `eval_passed_sha` remains `null`.
168
- - **Each phase start** — orchestrator writes `phases.<name>.started_at`, `round`, `triggered_by` (if re-run).
169
- - **Each phase end** — phase writes `phases.<name>.{verdict, completed_at, duration_ms, artifacts}`. Build and Evaluate additionally update `criteria[]` state. **When EVALUATE first returns PASS/PASS_WITH_ISSUES**, orchestrator sets `state.eval_passed_sha = git rev-parse HEAD` — this is the reference point for the post-EVAL invariant.
170
- - **Phase 1.4 completion checkpoint** — orchestrator runs Stage B LITE routing check; writes `route.stage_b` on escalation.
171
- - **Phase 5 (FINAL REPORT + ARCHIVE)** — reads state.json for the report, renders the report, then archives (see below).
172
-
173
- ## Archive contract (PHASE 5)
174
-
175
- Best-effort move-and-prune. Replaces the previous "delete `.devlyn/`" behavior.
176
-
177
- 1. Create `.devlyn/runs/<run_id>/` with `mkdir -p`.
178
- 2. Move `.devlyn/pipeline.state.json`, every `.devlyn/<phase>.findings.jsonl`, every `.devlyn/<phase>.log.md`, every `.devlyn/fix-batch.round-*.json`, and `.devlyn/criteria.generated.md` (if exists) into that directory. Use `mv` (atomic within a filesystem).
179
- 3. Prune to the last 10 completed runs. List `.devlyn/runs/*/pipeline.state.json`, sort by enclosing `run_id` (lexicographic = chronological because run_ids start with a compact ISO8601 timestamp), and delete the oldest directories until at most 10 remain. **Never delete a directory whose `pipeline.state.json` has `phases.final_report.verdict == null`** — those are still in flight.
180
- 4. Kill any dev-server process spawned by PHASE 1.5 (BROWSER VALIDATE).
181
-
182
- Best-effort; no cross-process lock. Pruning is idempotent on sorted run_id list, so concurrent runs at worst delete a run already slated for pruning.
183
-
184
- ## Integrity invariants
185
-
186
- The orchestrator enforces:
187
-
188
- 1. `base_ref.sha` never changes after Phase 0.
189
- 2. `source.spec_sha256` (or `source.criteria_sha256` for generated runs) is re-verified at every phase start. Mismatch → the phase writes `verdict: "BLOCKED"` with reason. Missing hash is allowed ONLY on the phase that first populates it (PHASE 0 for spec; PHASE 1 for generated).
190
- 3. `route.selected` can only escalate via `stage_b`. No de-escalation.
191
- 4. `rounds.global` never exceeds `rounds.max_rounds`.
192
- 5. `criteria[].status` progression is monotonic per round: `pending → implemented → verified | failed`. A `failed` criterion can return to `implemented` via a subsequent fix-loop round, then be re-evaluated.
193
- 6. **Post-EVAL findings-only** (per-phase diff, not cumulative): once `eval_passed_sha` is non-null, each post-EVAL phase (CRITIC, DOCS) records `phases.<phase>.pre_sha = git rev-parse HEAD` at spawn time. After completion, the orchestrator runs `git diff --name-only <phases.<phase>.pre_sha> -- ':!.devlyn/**'`. For CRITIC (findings-only), any non-empty diff triggers `git reset --hard <pre_sha>` + `invariant.post-eval-code-mutation` finding + fix-loop entry. For DOCS, only doc-file-allowlist paths are legal; everything else triggers the same flow. `pre_sha` (not cumulative `eval_passed_sha`) is the correct baseline because fix-loop commits between EVAL and CRITIC are legitimate — they were re-EVALed. The `:!.devlyn/**` pathspec excludes orchestrator bookkeeping writes.
194
-
195
- Violations indicate a bug in the orchestrator. Do not attempt silent recovery.
196
-
197
- ## Non-goals
198
-
199
- - Crash-resume / workflow-engine semantics. State.json enables audit and orchestrator branching, not resume-from-crash.
200
- - Full SARIF export from state.json. `<phase>.findings.jsonl` is the SARIF-aligned surface; state.json is internal.
201
- - Per-finding history across runs. Current run's findings live in its `runs/<run_id>/` directory; cross-run comparison is manual.
@@ -1,96 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Compute auto-resolve terminal verdict per references/pipeline-routing.md#terminal-state-algorithm.
3
-
4
- Usage:
5
- python3 scripts/terminal_verdict.py [--devlyn-dir .devlyn] [--json]
6
-
7
- Reads every `.devlyn/<phase>.findings.jsonl`, filters `status == "open"`, applies the
8
- precedence list, and prints the verdict (stdout) and exit code.
9
-
10
- Exit codes: 0 = PASS | 1 = PASS_WITH_ISSUES | 2 = NEEDS_WORK | 3 = BLOCKED
11
-
12
- The pipeline routing file defines the authoritative precedence. This script implements
13
- it deterministically so the orchestrator does not re-reason through the rule set per run.
14
- """
15
- from __future__ import annotations
16
-
17
- import argparse
18
- import json
19
- import pathlib
20
- import sys
21
- from collections import Counter
22
-
23
-
24
# Ordered (verdict, predicate) pairs. Each predicate receives the list of open
# findings; consumers walk the list top to bottom and the first predicate that
# returns True decides the verdict.
#
# Fields are read with .get() so a finding that lacks "severity" or "rule_id"
# (or carries a null rule_id) simply fails to match a rule instead of raising
# and aborting the whole verdict computation.
#
# NOTE(review): the routing doc's terminal-state list maps "only LOW / none" to
# PASS, whereas the LOW rule below yields PASS_WITH_ISSUES. Behavior is kept
# as-is here; confirm which of the two is intended.
PRECEDENCE = [
    # Any CRITICAL blocks outright.
    ("BLOCKED", lambda fs: any(f.get("severity") == "CRITICAL" for f in fs)),
    # HIGH in the correctness/security/design categories.
    ("NEEDS_WORK", lambda fs: any(
        f.get("severity") == "HIGH"
        and str(f.get("rule_id") or "").startswith(("correctness.", "security.", "design."))
        for f in fs
    )),
    # HIGH in any other category.
    ("NEEDS_WORK", lambda fs: any(f.get("severity") == "HIGH" for f in fs)),
    # Security MEDIUM is treated as blocking, unlike other MEDIUM findings.
    ("NEEDS_WORK", lambda fs: any(
        f.get("severity") == "MEDIUM"
        and str(f.get("rule_id") or "").startswith("security.")
        for f in fs
    )),
    ("PASS_WITH_ISSUES", lambda fs: any(f.get("severity") == "MEDIUM" for f in fs)),
    ("PASS_WITH_ISSUES", lambda fs: any(f.get("severity") == "LOW" for f in fs)),
    ("PASS", lambda fs: True),  # fallthrough
]

# Process exit code for each verdict label.
EXIT = {"PASS": 0, "PASS_WITH_ISSUES": 1, "NEEDS_WORK": 2, "BLOCKED": 3}
43
-
44
-
45
def collect_open(devlyn: pathlib.Path) -> list[dict]:
    """Return every open finding recorded under *devlyn*.

    Reads each ``*.findings.jsonl`` file directly inside *devlyn* (one JSON
    document per line) and keeps the objects whose ``status`` equals ``"open"``.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are reported on stderr and skipped so a
    single bad record cannot abort the whole run.

    Args:
        devlyn: Directory containing the ``<phase>.findings.jsonl`` files.

    Returns:
        The open findings as dicts, ordered by file name and then line order.
    """
    open_findings: list[dict] = []
    # sorted(): glob order depends on the filesystem; sorting keeps output stable.
    for jsonl in sorted(devlyn.glob("*.findings.jsonl")):
        for raw in jsonl.read_text(encoding="utf-8").splitlines():
            line = raw.strip()
            if not line:
                continue
            try:
                finding = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line surfaces explicitly rather than silently dropping.
                sys.stderr.write(f"warn: malformed finding in {jsonl}: {line[:80]}\n")
                continue
            if not isinstance(finding, dict):
                # Valid JSON but not an object (e.g. a list or bare string):
                # it has no "status" field to inspect, so treat it like a bad line.
                sys.stderr.write(f"warn: non-object finding in {jsonl}: {line[:80]}\n")
                continue
            if finding.get("status") == "open":
                open_findings.append(finding)
    return open_findings
61
-
62
-
63
def compute(findings: list[dict]) -> str:
    """Return the verdict label for *findings*.

    Walks ``PRECEDENCE`` in order and yields the label of the first entry whose
    predicate accepts the findings list; ``"PASS"`` is the result when no
    predicate matches.
    """
    matching_labels = (label for label, pred in PRECEDENCE if pred(findings))
    return next(matching_labels, "PASS")
68
-
69
-
70
def main() -> int:
    """Command-line entry point: print the terminal verdict and return its exit code.

    Parses ``--devlyn-dir`` and ``--json``, gathers the open findings from that
    directory, computes the verdict and writes it to stdout, either as a JSON
    summary (``--json``) or as two plain-text lines.

    Returns:
        ``EXIT[verdict]`` for the computed verdict, or 3 when the given
        directory does not exist (an error message is written to stderr).
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to no description.
    doc_lines = (__doc__ or "").strip().splitlines()
    p = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    p.add_argument("--devlyn-dir", default=".devlyn", help="path to .devlyn/ (default: ./.devlyn)")
    p.add_argument("--json", action="store_true", help="emit JSON summary to stdout")
    args = p.parse_args()

    devlyn = pathlib.Path(args.devlyn_dir)
    if not devlyn.is_dir():
        sys.stderr.write(f"error: {devlyn} is not a directory\n")
        return 3

    findings = collect_open(devlyn)
    verdict = compute(findings)
    # Tolerate findings without a severity: count them under "UNKNOWN" rather than
    # raising KeyError. str() keeps the keys sortable and JSON-serializable.
    by_sev = Counter(str(f.get("severity", "UNKNOWN")) for f in findings)

    if args.json:
        json.dump({"verdict": verdict, "open": len(findings), "by_severity": dict(by_sev)}, sys.stdout)
        sys.stdout.write("\n")
    else:
        breakdown = " ".join(f"{sev}={count}" for sev, count in sorted(by_sev.items()))
        sys.stdout.write(f"{verdict}\n")
        sys.stdout.write(f"open: {len(findings)} ({breakdown})\n")

    return EXIT[verdict]


if __name__ == "__main__":
    raise SystemExit(main())