devlyn-cli 1.15.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +129 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,22 @@
1
+ {
2
+ "name": "bench-test-repo",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "description": "Deterministic base Node project for devlyn-cli auto-resolve benchmarks. Every fixture starts from a fresh copy of this directory.",
6
+ "bin": {
7
+ "bench-cli": "bin/cli.js"
8
+ },
9
+ "scripts": {
10
+ "cli": "node bin/cli.js",
11
+ "start": "node server/index.js",
12
+ "test": "node --test tests/",
13
+ "lint:json": "node scripts/lint-json.js"
14
+ },
15
+ "dependencies": {
16
+ "express": "4.19.2"
17
+ },
18
+ "devDependencies": {},
19
+ "engines": {
20
+ "node": ">=18.0.0"
21
+ }
22
+ }
@@ -0,0 +1,17 @@
1
+ // Playwright config used only by browser-validate benchmark fixtures.
2
+ // Runs against web/index.html served via `npx serve web` (fixture setup.sh
3
+ // starts the server). Keep config minimal.
4
+ module.exports = {
5
+ testDir: './tests/e2e',
6
+ timeout: 30_000,
7
+ use: {
8
+ baseURL: 'http://127.0.0.1:5173',
9
+ headless: true,
10
+ },
11
+ webServer: {
12
+ command: 'npx --yes serve -l 5173 web',
13
+ port: 5173,
14
+ reuseExistingServer: !process.env.CI,
15
+ timeout: 15_000,
16
+ },
17
+ };
@@ -0,0 +1,37 @@
1
+ // Tiny Express server used by backend-contract fixtures. Intentionally small.
2
+ const express = require('express');
3
+
4
+ const app = express();
5
+ app.use(express.json());
6
+
7
+ const items = [
8
+ { id: 1, name: 'alpha', qty: 3 },
9
+ { id: 2, name: 'beta', qty: 5 },
10
+ ];
11
+
12
+ app.get('/health', (_req, res) => {
13
+ res.json({ status: 'ok' });
14
+ });
15
+
16
+ app.get('/items', (_req, res) => {
17
+ res.json({ items });
18
+ });
19
+
20
+ app.get('/items/:id', (req, res) => {
21
+ const id = Number(req.params.id);
22
+ const item = items.find((it) => it.id === id);
23
+ if (!item) {
24
+ res.status(404).json({ error: 'not_found', id });
25
+ return;
26
+ }
27
+ res.json({ item });
28
+ });
29
+
30
+ if (require.main === module) {
31
+ const port = Number(process.env.PORT) || 3000;
32
+ app.listen(port, () => {
33
+ console.log(`bench-test-repo server listening on :${port}`);
34
+ });
35
+ }
36
+
37
+ module.exports = { app };
@@ -0,0 +1,25 @@
1
+ const { test } = require('node:test');
2
+ const assert = require('node:assert');
3
+ const { execFileSync } = require('node:child_process');
4
+ const path = require('node:path');
5
+
6
+ const CLI = path.join(__dirname, '..', 'bin', 'cli.js');
7
+
8
+ function run(args) {
9
+ return execFileSync('node', [CLI, ...args], { encoding: 'utf8' });
10
+ }
11
+
12
+ test('hello default', () => {
13
+ const out = run(['hello']);
14
+ assert.match(out, /Hello, world!/);
15
+ });
16
+
17
+ test('hello with --name', () => {
18
+ const out = run(['hello', '--name', 'alice']);
19
+ assert.match(out, /Hello, alice!/);
20
+ });
21
+
22
+ test('version prints package version', () => {
23
+ const out = run(['version']);
24
+ assert.match(out, /\d+\.\d+\.\d+/);
25
+ });
@@ -0,0 +1,58 @@
1
+ const { test } = require('node:test');
2
+ const assert = require('node:assert');
3
+ const http = require('node:http');
4
+ const { app } = require('../server');
5
+
6
+ function startServer() {
7
+ return new Promise((resolve) => {
8
+ const server = http.createServer(app);
9
+ server.listen(0, () => resolve(server));
10
+ });
11
+ }
12
+
13
// Issue a GET request against the in-process test server and resolve with
// `{ status, body }`, where `body` is the JSON-decoded response payload.
// Rejects on request errors, response-stream errors, or a non-JSON payload.
function get(server, path) {
  return new Promise((resolve, reject) => {
    const { port } = server.address();
    http
      .get(`http://127.0.0.1:${port}${path}`, (res) => {
        let body = '';
        // Decode as UTF-8 at the stream level so a multi-byte character split
        // across two chunks is not corrupted by string concatenation.
        res.setEncoding('utf8');
        res.on('data', (chunk) => (body += chunk));
        res.on('error', reject);
        res.on('end', () => {
          // JSON.parse throws synchronously inside this event handler. Without
          // the try/catch the throw becomes an uncaught exception instead of a
          // rejection, and the awaiting test never sees a proper failure.
          try {
            resolve({ status: res.statusCode, body: JSON.parse(body) });
          } catch (err) {
            reject(err);
          }
        });
      })
      .on('error', reject);
  });
}
25
+
26
+ test('GET /health returns ok', async () => {
27
+ const server = await startServer();
28
+ try {
29
+ const { status, body } = await get(server, '/health');
30
+ assert.strictEqual(status, 200);
31
+ assert.deepStrictEqual(body, { status: 'ok' });
32
+ } finally {
33
+ server.close();
34
+ }
35
+ });
36
+
37
+ test('GET /items returns list', async () => {
38
+ const server = await startServer();
39
+ try {
40
+ const { status, body } = await get(server, '/items');
41
+ assert.strictEqual(status, 200);
42
+ assert.ok(Array.isArray(body.items));
43
+ assert.ok(body.items.length >= 2);
44
+ } finally {
45
+ server.close();
46
+ }
47
+ });
48
+
49
+ test('GET /items/:id returns 404 for missing', async () => {
50
+ const server = await startServer();
51
+ try {
52
+ const { status, body } = await get(server, '/items/99999');
53
+ assert.strictEqual(status, 404);
54
+ assert.strictEqual(body.error, 'not_found');
55
+ } finally {
56
+ server.close();
57
+ }
58
+ });
@@ -0,0 +1,37 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <title>bench-test-repo</title>
6
+ <style>
7
+ body {
8
+ font-family: system-ui, sans-serif;
9
+ margin: 2rem;
10
+ max-width: 40rem;
11
+ }
12
+ button {
13
+ padding: 0.5rem 1rem;
14
+ font-size: 1rem;
15
+ }
16
+ #output {
17
+ margin-top: 1rem;
18
+ padding: 1rem;
19
+ border: 1px solid #ccc;
20
+ border-radius: 4px;
21
+ min-height: 2rem;
22
+ }
23
+ </style>
24
+ </head>
25
+ <body>
26
+ <h1>bench-test-repo</h1>
27
+ <p>Minimal page used by browser-validate benchmark fixtures.</p>
28
+ <button id="greet">Greet</button>
29
+ <div id="output" data-testid="output"></div>
30
+ <script>
31
+ document.getElementById('greet').addEventListener('click', () => {
32
+ const out = document.getElementById('output');
33
+ out.textContent = 'Hello from bench-test-repo';
34
+ });
35
+ </script>
36
+ </body>
37
+ </html>
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env python3
2
+ """Build the iter-0033c pair-eligible manifest (Codex R0/R0.5 + R0-infra/R0.5-infra).
3
+
4
+ Manifest captures the immutable Gate-3 input to iter-0033c-compare.py:
5
+ - which fixtures are pair-eligible (high-value ∪ L1≤L0 ∪ F9-if-iter-0033a-passed)
6
+ - what the Gate-3 threshold count is
7
+ - sha256 over the canonical document so any post-write tampering is detectable
8
+
9
+ Hashing pattern is the pre-stamp form lifted from
10
+ benchmark/auto-resolve/scripts/pair-plan-lint.py:81-91 — deep-copy the manifest,
11
+ zero out `manifest_sha256`, serialize with `sort_keys=True, separators=(",",":"),
12
+ ensure_ascii=False, allow_nan=False`, then sha256 the bytes.
13
+
14
+ Inputs (all required):
15
+ --c1-summary <path> iter-0033 (C1) summary.json (selection grounds; never a comparison baseline)
16
+ --f9-judge <path> iter-0033a F9 judge.json (F9 inclusion proof)
17
+ --l1-rerun-summary <path> L1 rerun summary at iter-0033c HEAD (fresh baseline; recorded in the manifest by path + sha256 only, not parsed by this script)
18
+ --output <path> destination .devlyn/manifests/iter-0033c-pair-eligible.json
19
+
20
+ Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set"):
21
+ high_value = {F2, F3, F4, F6, F7}
22
+ promoted_by_l1_le_l0 = {f ∈ C1 summary | solo_claude.score ≤ bare.score}
23
+ conditional_excluded = {F1, F5} # promoted only if L1≤L0
24
+ reporting_only = {F8} # excluded from Gate 3
25
+ pair_eligible = high_value ∪ promoted_by_l1_le_l0 ∪ {F9 if iter-0033a passed}
26
+ − reporting_only
27
+ − conditional_excluded that did not get promoted
28
+ """
29
+ import argparse
30
+ import copy
31
+ import hashlib
32
+ import json
33
+ import subprocess
34
+ import sys
35
+ from pathlib import Path
36
+
37
+ HIGH_VALUE = ["F2", "F3", "F4", "F6", "F7"]
38
+ CONDITIONAL = ["F1", "F5"]
39
+ REPORTING_ONLY = ["F8"]
40
+
41
+
42
def file_sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the raw bytes stored at *path*."""
    digest = hashlib.sha256()
    digest.update(path.read_bytes())
    return digest.hexdigest()
44
+
45
+
46
def canonical_manifest_sha256(manifest: dict) -> str:
    """Hash *manifest* in its pre-stamp form.

    A deep copy of the manifest has its ``manifest_sha256`` field blanked, is
    serialized as compact, key-sorted, non-ASCII-preserving JSON (NaN and
    infinity rejected), and the UTF-8 bytes of that string are SHA-256 hashed.
    The caller's dict is left untouched.
    """
    unstamped = copy.deepcopy(manifest)
    unstamped["manifest_sha256"] = ""
    dump_options = {
        "sort_keys": True,
        "separators": (",", ":"),
        "ensure_ascii": False,
        "allow_nan": False,
    }
    payload = json.dumps(unstamped, **dump_options).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()
58
+
59
+
60
def fixture_short_id(full: str) -> str:
    """Reduce a full fixture name to the text before its first hyphen.

    'F3-backend-contract-risk' -> 'F3'. A name with no hyphen is returned
    unchanged.
    """
    head, _sep, _tail = full.partition("-")
    return head
63
+
64
+
65
def compute_promoted_l1_le_l0(c1_rows: list) -> list:
    """Return short fixture IDs (e.g. 'F3') where solo_claude.score ≤ bare.score in C1.

    Args:
        c1_rows: row dicts from the C1 summary; each is read for
            ``arms.solo_claude.score``, ``arms.bare.score`` and ``fixture``.

    Returns:
        Short IDs in row order (duplicates are not removed). Rows where either
        score is absent or null are skipped.

    Raises:
        KeyError: if a qualifying row has no ``fixture`` key.
    """
    promoted = []
    for row in c1_rows:
        # `dict.get(key, {})` only applies the default when the key is absent;
        # a JSON `null` would come back as None and break the chained .get().
        # `or {}` covers both cases, matching the guard used in f9_passed().
        arms = row.get("arms") or {}
        solo = (arms.get("solo_claude") or {}).get("score")
        bare = (arms.get("bare") or {}).get("score")
        if solo is None or bare is None:
            continue
        if solo <= bare:
            promoted.append(fixture_short_id(row["fixture"]))
    return promoted
77
+
78
+
79
def f9_passed(f9_judge: dict) -> bool:
    """Decide whether iter-0033a passed from its judge document.

    Passing requires both ``a_score`` and ``b_score`` to be present, A's score
    to be strictly greater than B's, and ``disqualifiers["A"]`` to be falsy.
    """
    score_a = f9_judge.get("a_score")
    score_b = f9_judge.get("b_score")
    disqualified = f9_judge.get("disqualifiers") or {}
    if score_a is None or score_b is None:
        return False
    if not (score_a > score_b):
        return False
    return not bool(disqualified.get("A", False))
87
+
88
+
89
def head_sha() -> str:
    """Return the output of ``git rev-parse HEAD`` with surrounding whitespace removed.

    Best-effort: any failure (git missing, non-zero exit, undecodable output)
    yields an empty string instead of an exception.
    """
    command = ["git", "rev-parse", "HEAD"]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=True,
        )
        return completed.stdout.decode().strip()
    except Exception:
        return ""
97
+
98
+
99
def main() -> int:
    """Build and write the iter-0033c pair-eligible manifest.

    Reads the four required CLI flags, derives the pair-eligible fixture list
    from the C1 summary and the F9 judge document, computes the Gate-3
    threshold (ceil of half the eligible count), stamps the manifest with its
    canonical sha256, and writes it to ``--output`` as indented JSON.

    The L1 rerun summary is only checked for existence and recorded by
    path + sha256; its contents are not parsed here.

    Returns:
        0 on success; 2 when an input file is missing, or when the C1 summary
        or F9 judge file is not a readable JSON object.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--c1-summary", required=True)
    ap.add_argument("--f9-judge", required=True)
    ap.add_argument("--l1-rerun-summary", required=True)
    ap.add_argument("--output", required=True)
    args = ap.parse_args()

    c1_path = Path(args.c1_summary)
    f9_path = Path(args.f9_judge)
    l1_path = Path(args.l1_rerun_summary)
    out_path = Path(args.output)

    for p, label in [(c1_path, "c1-summary"), (f9_path, "f9-judge"), (l1_path, "l1-rerun-summary")]:
        if not p.is_file():
            print(f"error: {label} not found: {p}", file=sys.stderr)
            return 2

    # Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
    # the locale default, and turn malformed input into the same exit-2 error
    # path used for missing files instead of an unhandled traceback.
    parsed = {}
    for p, label in [(c1_path, "c1-summary"), (f9_path, "f9-judge")]:
        try:
            doc = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            print(f"error: {label} is not readable JSON: {p} ({exc})", file=sys.stderr)
            return 2
        if not isinstance(doc, dict):
            print(f"error: {label} must be a JSON object: {p}", file=sys.stderr)
            return 2
        parsed[label] = doc
    c1 = parsed["c1-summary"]
    f9 = parsed["f9-judge"]

    # `rows: null` is treated the same as an absent key.
    promoted = compute_promoted_l1_le_l0(c1.get("rows") or [])
    f9_in = f9_passed(f9)

    pair_eligible = list(HIGH_VALUE)  # frozen high-value list, ordered
    for fx in promoted:
        if fx not in pair_eligible and fx not in REPORTING_ONLY:
            pair_eligible.append(fx)
    if f9_in and "F9" not in pair_eligible:
        pair_eligible.append("F9")
    pair_eligible = [fx for fx in pair_eligible if fx not in REPORTING_ONLY]

    conditional_promoted = [fx for fx in CONDITIONAL if fx in promoted]
    conditional_excluded = [fx for fx in CONDITIONAL if fx not in promoted]

    def _fixture_sort_key(fid: str) -> tuple:
        # Numeric ordering so 'F10' sorts after 'F9'. Ids whose suffix is not
        # an integer are ordered after the numeric ones for the same prefix
        # instead of crashing the whole run inside int().
        prefix, suffix = fid[:1], fid[1:]
        try:
            return (prefix, 0, int(suffix), "")
        except ValueError:
            return (prefix, 1, 0, suffix)

    pair_eligible_sorted = sorted(pair_eligible, key=_fixture_sort_key)

    gate3_total = len(pair_eligible_sorted)
    gate3_threshold = (gate3_total + 1) // 2  # ≥50% — ceil(gate3_total / 2)

    manifest = {
        "schema_version": "1.0",
        "iter": "0033c",
        "head": head_sha(),
        "sources": {
            "c1_summary": {"path": str(c1_path), "sha256": file_sha256(c1_path)},
            "f9_judge": {"path": str(f9_path), "sha256": file_sha256(f9_path)},
            "l1_rerun_summary": {"path": str(l1_path), "sha256": file_sha256(l1_path)},
        },
        "selection_rule": {
            "high_value": HIGH_VALUE,
            "promoted_by_l1_le_l0": sorted(set(promoted)),
            "f9_included": f9_in,
            "f9_passed_iter_0033a": f9_in,
            "reporting_only": REPORTING_ONLY,
            "conditional_excluded": conditional_excluded,
            "conditional_promoted": conditional_promoted,
        },
        "fixtures_pair_eligible": pair_eligible_sorted,
        "gate3_threshold_count": gate3_threshold,
        "gate3_total": gate3_total,
        "manifest_sha256": "",
    }
    # The stamp is computed over the manifest with an empty stamp field, then
    # written into that field.
    manifest["manifest_sha256"] = canonical_manifest_sha256(manifest)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

    print(f"[manifest] wrote {out_path}")
    print(f"[manifest] pair-eligible: {pair_eligible_sorted} "
          f"(gate3 ≥ {gate3_threshold} / {gate3_total})")
    print(f"[manifest] sha256: {manifest['manifest_sha256']}")
    return 0
171
+
172
+
173
+ if __name__ == "__main__":
174
+ sys.exit(main())
@@ -0,0 +1,256 @@
1
+ #!/usr/bin/env python3
2
+ """F9 variant/solo arm artifact + transcript fingerprint check.
3
+
4
+ Out-of-band per Codex R0.5 §B (iter-0033a): expected.json.verification_commands
5
+ apply to ALL arms (run-fixture.sh:472), so a `docs/specs/**` check there would
6
+ punish bare. This script runs AFTER run-fixture.sh and asserts variant/solo
7
+ arms produced the artifacts the 2-skill ideate→resolve chain should emit.
8
+
9
+ Bare arm is exempt by construction.
10
+
11
+ Usage:
12
+ check-f9-artifacts.py --result-dir <results/<run_id>/F9-e2e-ideate-to-resolve/<arm>>
13
+
14
+ Exits:
15
+ 0 — all checks pass (or bare arm — exempt).
16
+ 1 — variant/solo arm but artifact contract violated.
17
+ 2 — invalid invocation (missing args, missing dir).
18
+
19
+ Emits a small JSON report at <result-dir>/check-f9-artifacts.json.
20
+ """
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import sys
26
+ from pathlib import Path
27
+
28
+
29
+ VARIANT_ARMS = {"variant", "solo_claude", "l2_gated", "l2_forced"}
30
+ EXEMPT_ARMS = {"bare"}
31
+
32
+ SPEC_DIR_GLOB = "docs/specs/*/spec.md"
33
+ SPEC_EXPECTED_GLOB = "docs/specs/*/spec.expected.json"
34
+
35
+ # Transcript fingerprint regexes (negative checks only — `claude -p`
36
+ # transcript captures only the agent's final reply, not intermediate
37
+ # tool calls; positive resolve invocation evidence lives in state).
38
+ RE_AUTO_RESOLVE = re.compile(r"/devlyn:auto-resolve\b")
39
+ RE_PREFLIGHT = re.compile(r"/devlyn:preflight\b")
40
+
41
+
42
def main() -> int:
    """Run the F9 artifact and transcript checks for one result directory.

    The arm and fixture names are taken from the last two path components of
    ``--result-dir``. For non-exempt arms the function inspects the work
    directory named in ``timing.json``, the pipeline state file, and
    ``transcript.txt``, recording one entry per check in a report that is
    written next to the results.

    Returns:
        0 when every check passes (or the arm is exempt), 1 when an artifact
        check fails, 2 on invalid invocation (missing dir, wrong fixture,
        unknown arm).
    """
    # The module docstring is None under `python -OO`; fall back to an empty
    # description instead of crashing on None.split().
    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n", 1)[0])
    parser.add_argument("--result-dir", required=True,
                        help="Path to results/<run_id>/<fixture>/<arm>/")
    args = parser.parse_args()

    result_dir = Path(args.result_dir)
    if not result_dir.is_dir():
        print(f"error: result dir not found: {result_dir}", file=sys.stderr)
        return 2

    arm = result_dir.name
    fixture = result_dir.parent.name

    if fixture != "F9-e2e-ideate-to-resolve":
        print(f"error: this script is F9-only (got fixture={fixture})", file=sys.stderr)
        return 2

    report = {
        "fixture": fixture,
        "arm": arm,
        "checks": [],
        "exempt": False,
        "pass": True,
    }

    if arm in EXEMPT_ARMS:
        report["exempt"] = True
        report["checks"].append({"name": "arm-is-bare-exempt", "pass": True})
        _write_report(result_dir, report)
        return 0

    if arm not in VARIANT_ARMS:
        print(f"error: unknown arm '{arm}' (expected one of {VARIANT_ARMS | EXEMPT_ARMS})",
              file=sys.stderr)
        return 2

    # The fixture's work-dir is referenced from result_dir/timing.json. The
    # arm produced files inside that work-dir; we glob from there.
    #
    # A missing/unparseable timing.json or an empty work_dir must NOT be
    # turned into Path(""): that is Path("."), whose is_dir() is True, which
    # would silently run every check against the current directory.
    timing_path = result_dir / "timing.json"
    work_dir_raw = None
    if timing_path.is_file():
        try:
            timing = json.loads(timing_path.read_text(encoding="utf-8"))
            work_dir_raw = timing.get("work_dir")
        except (OSError, ValueError, AttributeError):
            # Unreadable file, invalid JSON, or a non-object top level.
            work_dir_raw = None
    work_dir = None
    if isinstance(work_dir_raw, str) and work_dir_raw.strip():
        work_dir = Path(work_dir_raw)

    if work_dir is None or not work_dir.is_dir():
        report["checks"].append({
            "name": "work-dir-resolvable",
            "pass": False,
            "reason": f"work_dir from timing.json not usable: {work_dir_raw!r}",
        })
        report["pass"] = False
        _write_report(result_dir, report)
        return 1

    # Check 1: docs/specs/<id>-<slug>/spec.md exists.
    specs_root = work_dir / "docs" / "specs"
    spec_md_files = list(specs_root.glob("*/spec.md")) if specs_root.is_dir() else []
    spec_md_present = bool(spec_md_files)
    report["checks"].append({
        "name": "spec.md-exists-under-docs/specs",
        "pass": spec_md_present,
        "matched": [str(f.relative_to(work_dir)) for f in spec_md_files],
    })
    if not spec_md_present:
        report["pass"] = False

    # Check 2: spec.expected.json exists at the same dir.
    spec_exp_files = list(specs_root.glob("*/spec.expected.json")) if specs_root.is_dir() else []
    spec_exp_present = bool(spec_exp_files)
    report["checks"].append({
        "name": "spec.expected.json-exists-under-docs/specs",
        "pass": spec_exp_present,
        "matched": [str(f.relative_to(work_dir)) for f in spec_exp_files],
    })
    if not spec_exp_present:
        report["pass"] = False

    # Path-shape regression: the parent dir name should be `<id>-<slug>` shape.
    # Both id and slug are kebab-case, so the dir must contain at least one
    # hyphen. Bare `<id>/spec.md` (no hyphen) is the legacy shape we reject.
    if spec_md_files:
        bad_shapes = [f for f in spec_md_files if "-" not in f.parent.name]
        report["checks"].append({
            "name": "path-shape-id-slug",
            "pass": not bad_shapes,
            "non_conforming": [str(f.relative_to(work_dir)) for f in bad_shapes],
        })
        if bad_shapes:
            report["pass"] = False

    # Resolve invocation evidence — primary source is pipeline.state.json,
    # NOT transcript.txt. `claude -p` only emits the agent's final reply to
    # stdout; intermediate Skill / Agent / Bash tool calls do not appear in
    # transcript.txt. The authoritative evidence that resolve ran in --spec
    # mode is `state.mode == "spec"` plus `state.source.type == "spec"` plus
    # a populated `state.source.spec_path` pointing under `docs/specs/`.
    # Look for the archive first (preferred), then fall back to the live
    # in-flight location. NEW resolve currently lands artifacts directly in
    # `.devlyn/` and may skip the move-to-runs/ archive step (TODO: separate
    # iter to fix archive); both locations carry the same state shape.
    archived_paths = list(work_dir.glob(".devlyn/runs/*/pipeline.state.json"))
    live_path = work_dir / ".devlyn" / "pipeline.state.json"
    state_paths = archived_paths if archived_paths else (
        [live_path] if live_path.is_file() else []
    )
    if not state_paths:
        report["checks"].append({
            "name": "pipeline.state.json-present",
            "pass": False,
            "reason": "neither .devlyn/runs/*/pipeline.state.json nor .devlyn/pipeline.state.json found in work_dir",
        })
        report["pass"] = False
    else:
        # Read the most recent run (the lexicographically last path).
        state_path = sorted(state_paths)[-1]
        try:
            state = json.loads(state_path.read_text(encoding="utf-8"))
            if not isinstance(state, dict):
                raise ValueError(f"expected a JSON object, got {type(state).__name__}")
        except (OSError, ValueError) as exc:
            report["checks"].append({
                "name": "pipeline.state.json-parses",
                "pass": False,
                "reason": f"{exc.__class__.__name__}: {exc}",
            })
            report["pass"] = False
            state = None

        if state is not None:
            # state_paths comes from the archive glob whenever that glob
            # matched, so this is exact. A substring test on the full path
            # would misfire when work_dir itself lives under a `runs/` dir.
            archived = bool(archived_paths)
            report["checks"].append({
                "name": "pipeline.state.json-present",
                "pass": True,
                "path": str(state_path.relative_to(work_dir)),
                "archived_to_runs_dir": archived,
            })
            if not archived:
                # Not a fail — note for harness developer that NEW resolve
                # is skipping the archive step in this run.
                report["checks"].append({
                    "name": "archive-step-completed",
                    "pass": True,
                    "warning": "NEW resolve left artifacts in .devlyn/ instead of .devlyn/runs/<id>/ — archive step skipped (separate iter for harness fix)",
                })
            mode = state.get("mode")
            source = state.get("source")
            if not isinstance(source, dict):
                # null or malformed `source` is a failed check, not a crash.
                source = {}
            src_type = source.get("type")
            spec_path = source.get("spec_path") or ""
            spec_under_specs = (
                isinstance(spec_path, str)
                and spec_path.startswith("docs/specs/")
                and spec_path.endswith("spec.md")
            )
            mode_ok = mode == "spec"
            src_ok = src_type == "spec"
            state_ok = mode_ok and src_ok and spec_under_specs
            report["checks"].append({
                "name": "state.mode-and-source-spec",
                "pass": state_ok,
                "mode": mode,
                "source.type": src_type,
                "source.spec_path": spec_path,
            })
            if not state_ok:
                report["pass"] = False

    # Transcript fingerprint — negative checks only. transcript.txt records
    # the agent's final reply; if the agent (or any subagent) had invoked
    # /devlyn:auto-resolve or /devlyn:preflight, the prompt-following gate
    # should still surface the name in the summary. Positive resolve
    # evidence lives in state above; here we just rule out the deprecated
    # 3-skill chain names.
    transcript_path = result_dir / "transcript.txt"
    if not transcript_path.is_file():
        report["checks"].append({
            "name": "transcript-readable",
            "pass": False,
            "reason": f"transcript.txt missing at {transcript_path}",
        })
        report["pass"] = False
        _write_report(result_dir, report)
        return 1

    # Undecodable bytes are replaced rather than raised; the patterns searched
    # for are pure ASCII.
    transcript = transcript_path.read_text(encoding="utf-8", errors="replace")

    auto_resolve_hits = RE_AUTO_RESOLVE.findall(transcript)
    report["checks"].append({
        "name": "transcript-no-auto-resolve",
        "pass": len(auto_resolve_hits) == 0,
        "count": len(auto_resolve_hits),
    })
    if auto_resolve_hits:
        report["pass"] = False

    preflight_hits = RE_PREFLIGHT.findall(transcript)
    report["checks"].append({
        "name": "transcript-no-preflight",
        "pass": len(preflight_hits) == 0,
        "count": len(preflight_hits),
    })
    if preflight_hits:
        report["pass"] = False

    _write_report(result_dir, report)
    return 0 if report["pass"] else 1
248
+
249
+
250
def _write_report(result_dir: Path, report: dict) -> None:
    """Write *report* to ``check-f9-artifacts.json`` inside *result_dir*.

    The file holds the report as 2-space-indented JSON followed by a single
    trailing newline; an existing file is overwritten.
    """
    rendered = f"{json.dumps(report, indent=2)}\n"
    (result_dir / "check-f9-artifacts.json").write_text(rendered)
253
+
254
+
255
+ if __name__ == "__main__":
256
+ sys.exit(main())