devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,130 @@
1
+ # Fixture Schema
2
+
3
+ Every fixture is a directory under `benchmark/auto-resolve/fixtures/F<N>-<slug>/` with these files. **All six files are required** (setup.sh may be empty when the starting `test-repo` copy needs no modification).
4
+
5
+ ## metadata.json
6
+
7
+ ```json
8
+ {
9
+ "id": "F2-cli-medium-subcommand",
10
+ "category": "medium",
11
+ "difficulty": "medium",
12
+ "timeout_seconds": 1200,
13
+ "required_tools": ["node"],
14
+ "browser": false,
15
+ "deps_change_expected": false,
16
+ "intent": "One-sentence plain-language statement of the work, the SINGLE source of truth for spec.md and task.txt."
17
+ }
18
+ ```
19
+
20
+ - **id** — matches directory name, used across artifacts.
21
+ - **category** — one of `trivial | medium | high-risk | stress | edge | e2e`. Drives which ship-gate rule applies.
22
+ - **difficulty** — expected difficulty independent of category. Rubric uses this only for saturation detection (when both arms > 95 for two versions, flag fixture for rotation).
23
+ - **timeout_seconds** — per-arm hard timeout. Runner kills the arm at this limit and marks result `TIMEOUT`.
24
+ - **required_tools** — binaries the arm's environment must provide. Runner checks before invocation.
25
+ - **browser** — true if arm must be able to run Playwright. Runner uses this to decide whether `test-repo`'s Playwright deps get installed before the arm starts.
26
+ - **deps_change_expected** — true if the task involves modifying `package.json` / lockfiles. Variant's CRITIC security sub-pass is expected to trigger native `security-review` dep audit when true.
27
+ - **intent** — **load-bearing**. A short plain-language statement shared by both arms. `spec.md` formalizes it into auto-resolve-ready form; `task.txt` renders it as a direct prompt. A CI lint ensures both derive from this field and stay in sync.
28
+
29
+ ## spec.md
30
+
31
+ Auto-resolve-ready spec for the pipeline arm. Same format `/devlyn:ideate` produces:
32
+
33
+ ```markdown
34
+ ---
35
+ id: "<fixture-id>"
36
+ title: "<short title>"
37
+ status: planned
38
+ complexity: medium
39
+ depends-on: []
40
+ ---
41
+
42
+ # <fixture-id> <Title>
43
+
44
+ ## Context
45
+ 2-3 sentences describing WHY (not HOW). Must be traceable back to `metadata.intent`.
46
+
47
+ ## Requirements
48
+ - [ ] Specific, testable, scoped.
49
+ - [ ] ...
50
+
51
+ ## Constraints
52
+ - Concrete, with reasoning for each (not bare).
53
+
54
+ ## Out of Scope
55
+ - Explicit "must NOT build" list. Audited by preflight as anti-commitments.
56
+
57
+ ## Verification
58
+ - Concrete commands whose expected behavior is named.
59
+ ```
60
+
61
+ ## task.txt
62
+
63
+ Bare-arm input. Plain English, same intent, but framed as a user request rather than a formal spec. Intentionally lacks the structured Requirements/Constraints/Out-of-Scope sections — bare must make those calls itself. Must not leak "use the devlyn skill" hints.
64
+
65
+ ## expected.json
66
+
67
+ Machine-readable acceptance criteria used by both `run-fixture.sh` verification and the judge's rubric anchor.
68
+
69
+ ```json
70
+ {
71
+ "verification_commands": [
72
+ {
73
+ "cmd": "node bin/cli.js doctor",
74
+ "exit_code": 0,
75
+ "stdout_contains": ["doctor: "],
76
+ "stdout_not_contains": ["undefined"]
77
+ }
78
+ ],
79
+ "forbidden_patterns": [
80
+ {
81
+ "pattern": "catch\\s*\\(\\s*[a-zA-Z_]*\\s*\\)\\s*\\{\\s*return",
82
+ "description": "silent catch returning a fallback value — violates no-silent-catches policy",
83
+ "files": ["bin/cli.js"],
84
+ "severity": "disqualifier"
85
+ }
86
+ ],
87
+ "required_files": ["bin/cli.js"],
88
+ "forbidden_files": [],
89
+ "max_deps_added": 0
90
+ }
91
+ ```
92
+
93
+ - **verification_commands** — runner executes each. Each command's pass/fail contributes to the arm's `verify_score`.
94
+ - **forbidden_patterns** — regexes scanned across `diff.patch`. Match at `severity: "disqualifier"` is a hard-floor fail. Match at `severity: "warning"` goes into the judge's critical-findings report.
95
+ - **required_files** — must exist after the arm runs.
96
+ - **forbidden_files** — must NOT appear in the arm's diff.
97
+ - **max_deps_added** — count of new entries under `dependencies`/`devDependencies` in `package.json`. Exceeds → hard-floor fail.
98
+
99
+ ## NOTES.md
100
+
101
+ Human-readable explanation of why this fixture exists. Must answer:
102
+
103
+ 1. What specific failure mode does this fixture detect?
104
+ 2. What pipeline phase(s) is this testing?
105
+ 3. Why can't another fixture cover this?
106
+ 4. When should this fixture be retired or replaced?
107
+
108
+ Notes are read during suite design review, not during runs.
109
+
110
+ ## setup.sh
111
+
112
+ Deterministic starting state. Runs against a fresh copy of `benchmark/auto-resolve/fixtures/test-repo/` before either arm starts. Common uses:
113
+
114
+ - Install extra deps (`npm install --prefix . something`).
115
+ - Apply a `.patch` that introduces a bug to fix.
116
+ - Create pre-existing files referenced by the spec.
117
+
118
+ Script must be idempotent when re-applied. A no-op script (just `#!/usr/bin/env bash\nset -e\n`) counts as "empty" and is valid when no setup is needed.
119
+
120
+ ---
121
+
122
+ ## Drift Prevention
123
+
124
+ A CI lint step (`scripts/lint-fixtures.sh`) verifies:
125
+
126
+ - All six files present per fixture.
127
+ - `metadata.intent` is reflected in both `spec.md::Context` and `task.txt` (checked as ≥ 60% token overlap with each, using simple tokenization).
128
+ - `spec.md` frontmatter `id` matches directory name.
129
+ - `expected.json` is valid JSON.
130
+ - `setup.sh` is executable.
@@ -0,0 +1,27 @@
1
+ # bench-test-repo
2
+
3
+ Deterministic base Node project used by every devlyn-cli auto-resolve
4
+ benchmark fixture. Fixtures extend this skeleton via `setup.sh` patches.
5
+
6
+ ## What's in it
7
+
8
+ - `bin/cli.js` — tiny CLI (`hello`, `version`)
9
+ - `server/index.js` — tiny Express app (`/health`, `/items`, `/items/:id`)
10
+ - `web/index.html` — minimal static page with a click interaction
11
+ - `tests/cli.test.js`, `tests/server.test.js` — node:test test suites for the CLI and the server
12
+ - `playwright.config.js` — used by web/browser fixtures only
13
+ - `package.json` — `express` dep, `engines: node >= 18`
14
+
15
+ ## How it's used
16
+
17
+ `run-fixture.sh` copies this directory to a temp path per run, applies the
18
+ fixture's `setup.sh`, then invokes the arm (variant or bare) against that
19
+ copy. No fixture modifies this source tree — modifications happen only in
20
+ the per-run temp copies.
21
+
22
+ ## Keep it minimal
23
+
24
+ Adding features to `test-repo` enlarges the surface every fixture works
25
+ against. Add only when an existing fixture can't express itself against the
26
+ current baseline. Preferred path: push complexity into the fixture's
27
+ `setup.sh`, not into this base.
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env node
2
+ // bench-test-repo — tiny CLI used as the deterministic base for benchmark fixtures.
3
+ // Fixtures extend or modify this file; keep the baseline minimal and obvious.
4
+
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+
8
+ const USAGE = `Usage: bench-cli <command> [options]
9
+
10
+ Commands:
11
+ hello [--name NAME] Print a greeting (default name: "world")
12
+ version Print the CLI version from package.json
13
+ --help, -h Show this help
14
+
15
+ Examples:
16
+ bench-cli hello
17
+ bench-cli hello --name alice
18
+ bench-cli version
19
+ `;
20
+
21
/**
 * Look up the CLI version recorded in the package manifest.
 *
 * The manifest is the `package.json` located one directory above this
 * script. It is read synchronously as UTF-8 and parsed as JSON.
 *
 * @returns {*} The manifest's `version` value (`undefined` when the field is absent).
 * @throws {Error} When the manifest cannot be read or is not valid JSON.
 */
function readPackageVersion() {
  const manifestFile = path.join(__dirname, '..', 'package.json');
  const { version } = JSON.parse(fs.readFileSync(manifestFile, 'utf8'));
  return version;
}
26
+
27
/**
 * Extract the value supplied to the first `--name` flag.
 *
 * @param {string[]} argv - Arguments that follow the command word.
 * @returns {string} The token right after `--name`, or `'world'` when the
 *   flag does not appear at all.
 *
 * When `--name` is present but the next token is missing, empty, or begins
 * with `-`, an error line is printed to stderr and the process is terminated
 * with exit status 1.
 */
function parseNameFlag(argv) {
  const flagPos = argv.indexOf('--name');
  if (flagPos < 0) {
    return 'world';
  }

  const candidate = argv[flagPos + 1];
  // A following token that looks like another option is not accepted as a name.
  const isUsable = Boolean(candidate) && !candidate.startsWith('-');
  if (!isUsable) {
    console.error('--name requires a value');
    process.exit(1);
  }
  return candidate;
}
37
+
38
/**
 * CLI entry point: dispatch on the first argument.
 *
 * - no command, `--help` or `-h`: write the usage text to stdout.
 * - `hello`: print `Hello, <name>!`, where the name comes from `parseNameFlag`.
 * - `version`: print the value returned by `readPackageVersion`.
 * - anything else: report the unknown command and the usage text on stderr
 *   and make the process finish with exit status 1.
 *
 * @param {string[]} argv - Command-line arguments without the node binary
 *   and script path.
 * @returns {void}
 */
function main(argv) {
  const [command, ...rest] = argv;

  if (!command || command === '--help' || command === '-h') {
    process.stdout.write(USAGE);
    return;
  }

  switch (command) {
    case 'hello': {
      const name = parseNameFlag(rest);
      console.log(`Hello, ${name}!`);
      return;
    }
    case 'version': {
      console.log(readPackageVersion());
      return;
    }
    default:
      console.error(`Unknown command: ${command}`);
      process.stderr.write(USAGE);
      // Set the exit status instead of calling process.exit(): a forced exit
      // can cut off stderr writes that are still pending (for example when
      // stderr is a pipe), losing part of the message above.
      process.exitCode = 1;
  }
}
62
+
63
+ main(process.argv.slice(2));