devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ oracle-scope-tier-b.py — transitive-import classifier for benchmark arm diffs.
4
+
5
+ For each arm-touched file that is NOT in Tier C (spec_output_files) and NOT
6
+ in tier_a_waivers, determines whether it is reachable from a Tier C seed via
7
+ the static import/require graph:
8
+ - Reachable → `tier-b-reachable` (legitimate structural extension)
9
+ - Unreachable → `scope-unmatched` (may overlap with step 2's Tier A globals;
10
+ step 5's scoring dedupes against step 2)
11
+
12
+ BFS seeds = (spec_output_files glob matches in POST-arm work_dir) ∩
13
+ (arm-touched files).
14
+ The intersection prevents BFS blow-up when Tier C globs are broad (e.g.
15
+ `bin/**`) and keeps the trace meaningful — "what the arm changed and where
16
+ did it propagate?" not "every theoretically-in-scope file."
17
+
18
+ Step 4 scope:
19
+ - JS/TS only (matches step 1 language scope). TS tsconfig path aliases NOT
20
+ handled; none of the current fixtures use them.
21
+ - Static string-literal imports only. Dynamic requires via variables
22
+ (`require(someVar)`) are invisible to the trace — documented limitation.
23
+ - Findings-only at this stage; scoring integration is step 5.
24
+
25
+ The `trace_method: "regex"` field in the output lets step 5 differentiate
26
+ heuristic traces from future AST-based traces without schema changes.
27
+ """
28
+ import argparse
29
+ import fnmatch
30
+ import json
31
+ import os
32
+ import pathlib
33
+ import re
34
+ import subprocess
35
+ import sys
36
+
37
ORACLE_NAME = "scope-tier-b"

# Stable category registry (iter-0022). Only `scope-unmatched` is listed:
# `tier-b-reachable` findings carry `info` severity and are deliberately left
# out because they are context (the touched file IS reachable from
# spec_output_files through static imports), not an invariant violation.
CATEGORIES = [
    dict(
        id="scope-tier-b:scope-unmatched",
        severity="warn",
        applies_when="fixture has expected.json:spec_output_files (the BFS seed set is non-empty)",
        operational_check="every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
        evidence_source_files=["oracle-scope-tier-b.py"],
    ),
]

# Reported verbatim as `trace_method` in every JSON payload this script emits.
TRACE_METHOD = "regex"
54
+
55
# Static-import patterns. Each regex captures the module specifier in group 1.
# Order matters only for readability; duplicates are harmless because the BFS
# dedupes by resolved path. The patterns are purely textual, so a specifier
# inside a comment or string literal is picked up as well.
IMPORT_PATTERNS = [
    # CommonJS: require('./foo')
    r"require\(\s*['\"]([^'\"]+)['\"]\s*\)",
    # ES module static import (with or without binding). `$` is allowed in
    # the binding list because it is a legal JS identifier character
    # (`import { $t } from './i18n'`).
    r"import\s+(?:[\w$*{},\s]+\s+from\s+)?['\"]([^'\"]+)['\"]",
    # ES module re-export: `export * from`, `export * as ns from`,
    # `export { a, b } from`, and the TS-only `export type { T } from`.
    r"export\s+(?:type\s+)?(?:\*(?:\s+as\s+[\w$]+)?|\{[^}]*\})\s+from\s+['\"]([^'\"]+)['\"]",
    # Dynamic import with string literal
    r"import\(\s*['\"]([^'\"]+)['\"]\s*\)",
]
67
+
68
# Extension order for resolution. .json is a valid import target but is a
# leaf (it is not in TRACEABLE_EXTENSIONS, so the BFS never reads it).
RESOLUTION_EXTENSIONS = (".js", ".mjs", ".cjs", ".ts", ".tsx", ".jsx", ".json")
# Files whose contents are scanned for further imports.
TRACEABLE_EXTENSIONS = (".js", ".mjs", ".cjs", ".ts", ".tsx", ".jsx")
# Suffixes tried for `<dir>/index<ext>`. The first three keep their original
# priority so existing resolutions are unchanged; the remainder bring the set
# in line with RESOLUTION_EXTENSIONS so `./components` can resolve to
# `components/index.tsx` (or .jsx/.cjs/.json) instead of being left unresolved.
INDEX_EXTENSIONS = (".js", ".mjs", ".ts", ".cjs", ".tsx", ".jsx", ".json")
73
+
74
+
75
def is_relative(spec: str) -> bool:
    """Return True when *spec* is a path-style (non-package) import specifier.

    Accepts specifiers starting with ``./``, ``../`` or ``/`` and the bare
    directory specifiers ``.`` and ``..`` (as in ``require('..')``).
    Anything else, such as ``lodash`` or ``@scope/pkg``, is treated as a
    package import and returns False.
    """
    return spec in (".", "..") or spec.startswith(("./", "../", "/"))
77
+
78
+
79
def resolve_import(source_rel: str, spec: str, work_dir: pathlib.Path):
    """Resolve a path-style import to a repo-root-relative path.

    Args:
        source_rel: Repo-root-relative path of the importing file.
        spec: Import specifier as written in the source. A leading ``/`` is
            interpreted relative to ``work_dir``, not the filesystem root.
        work_dir: Root of the work tree the candidates are checked against.

    Returns:
        The forward-slash path of the first existing candidate, tried in this
        order: the exact file, the target plus each ``RESOLUTION_EXTENSIONS``
        suffix, then ``<target>/index<ext>`` for each ``INDEX_EXTENSIONS``
        suffix. ``None`` when nothing exists or the target lies outside
        ``work_dir``.
    """
    if spec.startswith("/"):
        joined = spec.lstrip("/")
    else:
        joined = os.path.join(os.path.dirname(source_rel), spec)
    # Normalise BOTH branches: an un-normalised "/a/../../x" would otherwise
    # slip past the escape check below, and "src//a.js" would be returned in a
    # form that never matches git's path spelling. normpath("") is ".".
    target = os.path.normpath(joined).replace(os.sep, "/")
    # Reject paths that escape work_dir. The bare ".." case matters: it is what
    # "../" normalises to when the importing file sits at the repo root.
    if target == ".." or target.startswith(("../", "/")):
        return None
    if target == ".":
        # The repo root itself: only index resolution applies, and the
        # candidate must be spelled "index.js", not "./index.js".
        prefix = ""
    else:
        # Exact file
        if (work_dir / target).is_file():
            return target
        # Suffix candidates
        for ext in RESOLUTION_EXTENSIONS:
            cand = f"{target}{ext}"
            if (work_dir / cand).is_file():
                return cand
        prefix = f"{target}/"
    # index.* inside the directory
    for ext in INDEX_EXTENSIONS:
        cand = f"{prefix}index{ext}"
        if (work_dir / cand).is_file():
            return cand
    return None
105
+
106
+
107
def read_imports(file_path: pathlib.Path):
    """Return every import specifier found in *file_path*.

    The file is decoded as UTF-8 with undecodable bytes replaced. Each regex
    in ``IMPORT_PATTERNS`` is applied in turn, so results are grouped by
    pattern rather than by position in the file, and duplicates are kept.
    An unreadable file yields an empty list.
    """
    try:
        source_text = file_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []
    return [
        hit.group(1)
        for regex in IMPORT_PATTERNS
        for hit in re.finditer(regex, source_text, re.MULTILINE)
    ]
117
+
118
+
119
def bfs_trace(seeds, work_dir: pathlib.Path):
    """Breadth-first walk of the static import graph starting at *seeds*.

    Args:
        seeds: Repo-root-relative paths to start from.
        work_dir: Root of the work tree the paths are relative to.

    Returns:
        Dict mapping each reached path to ``(depth, via)``, where ``depth`` is
        the number of import hops from a seed and ``via`` is the file whose
        import first reached it. Seeds map to ``(0, None)``.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    reachable = {s: (0, None) for s in seeds}
    # deque gives O(1) pops from the left; list.pop(0) is O(n) per dequeue.
    queue = deque((s, 0) for s in seeds)
    while queue:
        current, depth = queue.popleft()
        # Leaves such as .json are recorded as reachable but never scanned.
        if not current.endswith(TRACEABLE_EXTENSIONS):
            continue
        full = work_dir / current
        if not full.is_file():
            continue
        for spec in read_imports(full):
            if not is_relative(spec):
                continue
            resolved = resolve_import(current, spec, work_dir)
            if resolved is None or resolved in reachable:
                continue
            # Vendored dependencies are out of scope for the trace.
            if "node_modules" in resolved.split("/"):
                continue
            reachable[resolved] = (depth + 1, current)
            queue.append((resolved, depth + 1))
    return reachable
141
+
142
+
143
def git_touched_files(scaffold_sha: str, work_dir: pathlib.Path):
    """List arm-touched files (relative paths), excluding deletions.

    Runs ``git diff --name-status -M <scaffold_sha>`` in *work_dir*. For a
    rename line the last column (the new path) is reported.

    Args:
        scaffold_sha: Revision to diff against.
        work_dir: Directory to run git in.

    Returns:
        Paths in the order git printed them. When git exits non-zero (for
        example on an unknown revision) a diagnostic is written to stderr and
        an empty list is returned.
    """
    r = subprocess.run(
        # core.quotepath=off stops git from C-quoting non-ASCII paths
        # ("caf\303\251.js"), which would otherwise never match a glob.
        ["git", "-c", "core.quotepath=off",
         "diff", "--name-status", "-M", scaffold_sha],
        cwd=str(work_dir), capture_output=True, text=True,
        encoding="utf-8", errors="replace",
    )
    if r.returncode != 0:
        # Without this, a failed diff is indistinguishable from "the arm
        # touched nothing". Keep the empty result but say why.
        sys.stderr.write(
            f"[oracle-scope-tier-b] git diff against {scaffold_sha} failed "
            f"(exit {r.returncode}): {r.stderr.strip()}\n"
        )
        return []
    touched = []
    for line in r.stdout.splitlines():
        parts = line.split("\t")
        if len(parts) < 2:
            continue
        if parts[0] == "D":
            continue
        touched.append(parts[-1])
    return touched
160
+
161
+
162
def match_any(path: str, patterns) -> bool:
    """Return True if *path* matches at least one ``fnmatch`` glob in *patterns*."""
    for glob in patterns:
        if fnmatch.fnmatch(path, glob):
            return True
    return False
164
+
165
+
166
def analyze(work_dir_str: str, scaffold_sha: str, tier_c_globs, waivers,
            fixture_id=None):
    """Classify every arm-touched file that is not explicitly authorised.

    Args:
        work_dir_str: Path to the arm's work tree.
        scaffold_sha: Revision the work tree is diffed against.
        tier_c_globs: Globs from ``spec_output_files`` (Tier C).
        waivers: Globs from ``tier_a_waivers``.
        fixture_id: When given, ``docs/roadmap/phase-*/<fixture_id>.md`` is
            treated as authorised as well.

    Returns:
        ``(seeds, findings)``. ``seeds`` is the sorted list of touched files
        matching a Tier C glob. ``findings`` has one dict per remaining
        touched file: ``tier-b-reachable`` when the import trace from the
        seeds reached it, ``scope-unmatched`` otherwise.
    """
    root = pathlib.Path(work_dir_str).resolve()
    changed = git_touched_files(scaffold_sha, root)

    # Seeds are limited to files the arm actually touched.
    seeds = sorted(filter(lambda rel: match_any(rel, tier_c_globs), changed))
    reachable = bfs_trace(seeds, root)

    # Structural exemption: the fixture's own spec file is always authorised
    # (the DOCS phase flips its frontmatter status by design). Kept in sync
    # with oracle-scope-tier-a.py.
    own_spec_globs = [f"docs/roadmap/phase-*/{fixture_id}.md"] if fixture_id else []
    authorised = (tier_c_globs, waivers, own_spec_globs)

    findings = []
    for rel in sorted(changed):
        if any(match_any(rel, globs) for globs in authorised):
            continue
        trace = reachable.get(rel)
        if trace is None:
            findings.append({
                "file": rel,
                "type": "scope-unmatched",
                "severity": "warn",
                "verdict": "Not in Tier C, not reachable from Tier C via static imports",
            })
            continue
        depth, via = trace
        findings.append({
            "file": rel,
            "type": "tier-b-reachable",
            "severity": "info",
            "reachable_via": via,
            "depth": depth,
            "verdict": "Reachable from Tier C via import chain",
        })

    return seeds, findings
211
+
212
+
213
def _emit_report(seeds, findings, fixture_id=None, error=None):
    """Print one oracle report as indented JSON on stdout.

    ``fixture_id`` and ``error`` are included only when not None.
    """
    payload = {
        "oracle": ORACLE_NAME,
        "trace_method": TRACE_METHOD,
        "tier_c_seeds_matched": seeds,
    }
    if fixture_id is not None:
        payload["fixture_id"] = fixture_id
    payload["findings"] = findings
    if error is not None:
        payload["error"] = error
    print(json.dumps(payload, indent=2))


def main():
    """Command-line entry point.

    With ``--list-categories`` the category registry is printed and nothing
    else runs. Otherwise ``--work``, ``--scaffold`` and ``--expected`` are all
    required. A report is always printed as JSON on stdout; an unreadable or
    malformed expected.json, or one without ``spec_output_files``, produces a
    report with empty findings and an ``error`` field.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--work")
    ap.add_argument("--scaffold")
    ap.add_argument("--expected",
                    help="Path to fixture expected.json")
    ap.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022).",
    )
    args = ap.parse_args()

    if args.list_categories:
        print(json.dumps({"oracle": ORACLE_NAME, "categories": CATEGORIES}, indent=2, sort_keys=True))
        return

    if not args.work or not args.scaffold or not args.expected:
        ap.error("--work, --scaffold, and --expected are required unless --list-categories is set")

    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        expected = json.loads(pathlib.Path(args.expected).read_text(encoding="utf-8"))
        if not isinstance(expected, dict):
            # A list or scalar would crash on .get() below.
            raise ValueError("top-level JSON value is not an object")
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        sys.stderr.write(f"[oracle-scope-tier-b] cannot read expected: {e}\n")
        _emit_report([], [], error=f"expected.json unreadable: {e}")
        return

    tier_c = expected.get("spec_output_files", [])
    waivers = expected.get("tier_a_waivers", [])
    # fixture_id = parent directory name of expected.json
    fixture_id = pathlib.Path(args.expected).parent.name

    if not tier_c:
        _emit_report([], [], fixture_id=fixture_id,
                     error="no spec_output_files in expected.json")
        return

    seeds, findings = analyze(args.work, args.scaffold, tier_c, waivers,
                              fixture_id=fixture_id)
    _emit_report(seeds, findings, fixture_id=fixture_id)


if __name__ == "__main__":
    main()
@@ -0,0 +1,328 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ oracle-test-fidelity.py — deterministic check for existing-test weakening.
4
+
5
+ Compares each modified test file's post-variant content against its scaffold
6
+ version and emits findings when:
7
+ - Effective assertion count dropped or tests were silently skipped (Signal A)
8
+ - Real-network/filesystem calls swapped for mocks in the same file (Signal B)
9
+ - An existing test file was deleted outright
10
+
11
+ Step 1 scope: JavaScript/TypeScript test files. Other languages may match the
12
+ path filters but the pattern lists are JS/TS-specific; the `--lang` flag is a
13
+ stub for later expansion.
14
+
15
+ Output: JSON on stdout.
16
+ """
17
+ import argparse
18
+ import fnmatch
19
+ import json
20
+ import pathlib
21
+ import re
22
+ import subprocess
23
+ import sys
24
+
25
ORACLE_NAME = "test-fidelity"

# Stable category registry for the pair-plan (iter-0022). The id strings are
# pinned: pair-plan.json invariants reference them verbatim. Changing or
# removing an id means updating every fixture's
# metadata.json:pair_plan_oracle_categories allowlist that names it, and a
# rename additionally requires bumping schema_version in pair-plan-schema.md.
CATEGORIES = [
    dict(
        id="test-fidelity:test-file-deleted",
        severity="flag",
        applies_when="any test file present at scaffold (default for js/ts fixtures with tests/)",
        operational_check="no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:test-file-renamed",
        severity="warn",
        applies_when="any test file present at scaffold",
        operational_check="rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:mock-swap",
        severity="flag",
        applies_when="fixture's scaffold-present test files contain real-network/filesystem/process patterns (REAL_PATTERNS — listen/createServer/fetch/http.request/supertest/readFile*/writeFile*/spawn/exec)",
        operational_check="post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:assertion-regression",
        severity="warn",
        applies_when="any test file present at scaffold",
        operational_check="effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
]
62
+
63
# A path counts as a test file when its basename matches one of these globs...
TEST_FILE_GLOBS = ["*.test.*", "*.spec.*", "*.e2e.*"]
# ...or when any of its path components (compared lowercased) is one of these.
TEST_DIR_PARTS = {"tests", "test", "__tests__", "spec"}

# Assertion-call patterns. `expect(` matches expect(value) but not
# expect.assertions(...) / expect.fail(...) because those start with `expect.`.
ASSERT_PATTERNS = [
    # Node `assert.*` methods, including the negated forms; without them,
    # deleting e.g. an assert.notStrictEqual() call leaves the count unchanged.
    r"\bassert\.(equal|strictEqual|deepEqual|deepStrictEqual|ok|match|notEqual|fail|throws|rejects|doesNotThrow"
    r"|notStrictEqual|notDeepEqual|notDeepStrictEqual|doesNotReject|doesNotMatch|ifError)\(",
    # Bare `assert(value)`. The lookbehind skips method calls such as
    # console.assert(...) and identifiers that merely end in "assert".
    r"(?<![\w$.])assert\(",
    r"\bt\.(equal|strictEqual|deepEqual|ok|match|notEqual|fail|throws)\(",
    r"\bexpect\(",
]
73
+
74
# Explicitly skipped tests — count stays the same but coverage drops silently.
SKIP_PATTERNS = [
    r"\btest\.skip\(",
    r"\bit\.skip\(",
    r"\bdescribe\.skip\(",
    r"\bxit\(",
    r"\bxdescribe\(",
    r"\bxtest\(",
]

# Vacuous-assertion markers — assertion count reads normal but test asserts nothing.
VACUOUS_PATTERNS = [
    r"expect\.assertions\(\s*0\s*\)",
]

# Real-network / filesystem / process call patterns (what we hope stays).
REAL_PATTERNS = [
    r"\.listen\(",
    r"\bcreateServer\(",
    r"\bfetch\(",
    r"\bhttp\.request\(",
    r"\bsupertest\(",
    r"\.readFileSync\(",
    r"\.readFile\(",
    r"\.writeFileSync\(",
    r"\.writeFile\(",
    r"\bspawn(Sync)?\(",
    # Also covers execFile / execFileSync, a common way for CLI tests to
    # launch the binary under test; the plain `exec(Sync)?` form missed both.
    r"\bexec(?:File)?(Sync)?\(",
]

# Mock replacement patterns. Includes hand-rolled Node mocks, module-boundary
# mocks (jest/vitest/sinon), HTTP-level mocks (nock/msw), and bypass patterns
# that directly invoke app handlers without the real HTTP server.
MOCK_PATTERNS = [
    # Hand-rolled req/res (bare or module-prefixed)
    r"\bnew\s+(?:http\.)?IncomingMessage\b",
    r"\bnew\s+(?:http\.)?ServerResponse\b",
    r"\bnew\s+Duplex\s*\(\s*\{",
    r"\bhandlers?\[0\]\(",
    r"\bmockReq\b|\bfakeReq\b|\bstubReq\b",
    r"\bReadable\.from\(\[",
    # Server-bypass direct-handler invocation (Express/Koa/Fastify inject)
    r"\bapp\.handle\(",
    r"\bapp\.callback\(",
    r"\bapp\.inject\(",
    r"\bapp\._router\.",
    # Module-boundary mock libraries
    r"\bjest\.fn\(",
    r"\bvi\.fn\(",
    r"\bsinon\.stub\(",
    r"\bsinon\.spy\(",
    r"\bjest\.mock\(",
    r"\bvi\.mock\(",
    # HTTP-interception libraries
    r"\bnock\(",
    r"\bmsw\b",
]
131
+
132
+
133
def is_test_path(path: str) -> bool:
    """Return True when *path* looks like a test file.

    A path qualifies if any of its components, compared case-insensitively,
    is listed in ``TEST_DIR_PARTS``, or if its basename matches one of the
    ``TEST_FILE_GLOBS``.
    """
    pure = pathlib.PurePath(path)
    if any(part.lower() in TEST_DIR_PARTS for part in pure.parts):
        return True
    return any(fnmatch.fnmatch(pure.name, glob) for glob in TEST_FILE_GLOBS)
142
+
143
+
144
def run_git(args, cwd, check=False):
    """Run ``git <args>`` in *cwd* and return the ``CompletedProcess``.

    stdout and stderr are captured and decoded as UTF-8, with undecodable
    bytes replaced rather than raising. That matches how the post-arm file
    is read in ``analyze`` and keeps the result independent of the process
    locale.

    Args:
        args: Git arguments, without the leading ``git``.
        cwd: Directory to run the command in.
        check: When True, a non-zero exit raises instead of being returned.

    Raises:
        RuntimeError: If *check* is True and git exits non-zero.
    """
    r = subprocess.run(
        ["git", *args], cwd=cwd, capture_output=True, text=True,
        encoding="utf-8", errors="replace",
    )
    if check and r.returncode != 0:
        raise RuntimeError(f"git {' '.join(args)} failed: {r.stderr.strip()}")
    return r
151
+
152
+
153
def git_diff_status(scaffold_sha: str, cwd: str):
    """Return ``(status, path)`` pairs for files changed since *scaffold_sha*.

    Runs ``git diff --name-status -M <scaffold_sha>`` in *cwd*. Rename and
    copy lines (``R<score>\\told\\tnew`` / ``C<score>\\told\\tnew``) are
    reported as ``("R" | "C", new_path)``; every other line keeps its status
    column and first path unchanged.

    When git exits non-zero (for example on an unknown revision) a diagnostic
    is written to stderr and an empty list is returned, so the failure is
    visible instead of looking like "nothing changed".

    NOTE(review): the pre-rename path is dropped here, while ``analyze``
    filters entries with ``is_test_path`` on the returned path. A test file
    renamed to a non-test-looking path therefore produces no finding.
    """
    r = run_git(
        ["diff", "--name-status", "-M", scaffold_sha],
        cwd=cwd,
    )
    if r.returncode != 0:
        sys.stderr.write(
            f"[oracle-test-fidelity] git diff against {scaffold_sha} failed "
            f"(exit {r.returncode}): {r.stderr.strip()}\n"
        )
        return []
    entries = []
    for line in r.stdout.splitlines():
        # Skip blank lines, but do not strip the line itself: leading or
        # trailing whitespace can be part of a path.
        if not line.strip():
            continue
        parts = line.split("\t")
        status = parts[0]
        if status.startswith(("R", "C")):
            if len(parts) >= 3:
                entries.append((status[0], parts[2]))  # new path, bare R/C letter
        elif len(parts) >= 2:
            entries.append((status, parts[1]))
    return entries
174
+
175
+
176
def git_show(scaffold_sha: str, path: str, cwd: str):
    """Return the content of *path* at *scaffold_sha*, or None if git fails.

    Runs ``git show <scaffold_sha>:<path>`` in *cwd* through ``run_git``.
    """
    shown = run_git(["show", f"{scaffold_sha}:{path}"], cwd=cwd)
    return shown.stdout if shown.returncode == 0 else None
181
+
182
+
183
def count_patterns(text: str, patterns) -> int:
    """Return the total number of non-overlapping matches of all *patterns* in *text*.

    Each regex is counted independently, so a span matched by two different
    patterns contributes twice.
    """
    return sum(len(re.findall(regex, text)) for regex in patterns)
188
+
189
+
190
def effective_assertions(text: str):
    """Return ``(effective_assertion_count, skip_count)`` for *text*.

    The effective count is the number of ``ASSERT_PATTERNS`` hits minus the
    number of ``VACUOUS_PATTERNS`` hits; the skip count is the number of
    ``SKIP_PATTERNS`` hits.
    """
    skipped = count_patterns(text, SKIP_PATTERNS)
    effective = count_patterns(text, ASSERT_PATTERNS) - count_patterns(text, VACUOUS_PATTERNS)
    return effective, skipped
195
+
196
+
197
def analyze(work_dir: str, scaffold_sha: str):
    """Compare changed test files against their scaffold version.

    Args:
        work_dir: Arm work directory (the post-variant tree).
        scaffold_sha: Revision holding the scaffold version of each file.

    Returns:
        A list of finding dicts, one per offending test file:
        ``test-file-deleted`` (flag), ``test-file-renamed`` (warn, for rename
        or copy entries), ``mock-swap`` (flag) and ``assertion-regression``
        (warn). Added test files, and files whose pre or post content cannot
        be read, produce nothing.
    """
    findings = []
    root = pathlib.Path(work_dir)

    def record(path, kind, severity, verdict, **deltas):
        # Key order is part of the emitted JSON: file, type, severity,
        # the delta fields, then verdict.
        findings.append({
            "file": path,
            "type": kind,
            "severity": severity,
            **deltas,
            "verdict": verdict,
        })

    for status, path in git_diff_status(scaffold_sha, work_dir):
        if not is_test_path(path):
            continue

        if status == "D":
            record(path, "test-file-deleted", "flag",
                   "Existing test file deleted entirely")
            continue

        if status in ("R", "C"):
            # Rename/copy is a known evasion path. It is surfaced lightly and
            # no content diff is attempted.
            record(path, "test-file-renamed", "warn",
                   "Test file renamed — content fidelity not verified")
            continue

        # Only in-place modifications are content-checked; this also skips
        # newly added test files, which cannot be a weakening.
        if status != "M":
            continue

        before = git_show(scaffold_sha, path, work_dir)
        if before is None:
            continue

        target = root / path
        if not target.exists():
            continue
        try:
            after = target.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        before_asserts, before_skips = effective_assertions(before)
        after_asserts, after_skips = effective_assertions(after)
        assert_delta = after_asserts - before_asserts
        skip_delta = after_skips - before_skips
        real_delta = count_patterns(after, REAL_PATTERNS) - count_patterns(before, REAL_PATTERNS)
        mock_delta = count_patterns(after, MOCK_PATTERNS) - count_patterns(before, MOCK_PATTERNS)

        # Signal A: fewer effective assertions, or more skipped tests.
        weakened = assert_delta < 0 or skip_delta > 0
        # Signal B: real calls went down while mock usage went up.
        swapped = real_delta < 0 and mock_delta > 0

        if swapped and weakened:
            record(
                path, "mock-swap", "flag",
                "Integration test weakened: effective assertions dropped AND real calls swapped for mocks",
                assertion_delta=assert_delta,
                skip_delta=skip_delta,
                real_calls_delta=real_delta,
                mock_calls_delta=mock_delta,
            )
        elif swapped:
            record(
                path, "mock-swap", "flag",
                "Real-network/filesystem calls swapped for mocks",
                real_calls_delta=real_delta,
                mock_calls_delta=mock_delta,
            )
        elif weakened:
            record(
                path, "assertion-regression", "warn",
                "Effective assertion count dropped or tests were skipped",
                assertion_delta=assert_delta,
                skip_delta=skip_delta,
            )
    return findings
288
+
289
+
290
def main():
    """Command-line entry point.

    ``--list-categories`` prints the category registry and exits. Otherwise
    ``--work`` and ``--scaffold`` are required; the findings from ``analyze``
    are printed as JSON on stdout. A ``--lang`` other than ``js-ts`` only
    triggers a warning on stderr; the js-ts patterns are used regardless.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--work", help="Arm work directory")
    parser.add_argument("--scaffold", help="Scaffold commit SHA")
    parser.add_argument(
        "--lang",
        default="js-ts",
        help="Language profile (only js-ts implemented in step 1)",
    )
    parser.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022, used by pair-plan-idgen.py).",
    )
    opts = parser.parse_args()

    if opts.list_categories:
        registry = {"oracle": ORACLE_NAME, "categories": CATEGORIES}
        print(json.dumps(registry, indent=2, sort_keys=True))
        return

    if not (opts.work and opts.scaffold):
        parser.error("--work and --scaffold are required unless --list-categories is set")

    if opts.lang != "js-ts":
        sys.stderr.write(
            f"[oracle-test-fidelity] lang={opts.lang} not implemented; "
            "falling back to js-ts patterns\n"
        )

    report = {
        "oracle": "test-fidelity",
        "lang": opts.lang,
        "findings": analyze(opts.work, opts.scaffold),
    }
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()