devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
oracle-scope-tier-b.py — transitive-import classifier for benchmark arm diffs.
|
|
4
|
+
|
|
5
|
+
For each arm-touched file that is NOT in Tier C (spec_output_files) and NOT
|
|
6
|
+
in tier_a_waivers, determines whether it is reachable from a Tier C seed via
|
|
7
|
+
the static import/require graph:
|
|
8
|
+
- Reachable → `tier-b-reachable` (legitimate structural extension)
|
|
9
|
+
- Unreachable → `scope-unmatched` (may overlap with step 2's Tier A globals;
|
|
10
|
+
step 5's scoring dedupes against step 2)
|
|
11
|
+
|
|
12
|
+
BFS seeds = (spec_output_files glob matches in POST-arm work_dir) ∩
|
|
13
|
+
(arm-touched files).
|
|
14
|
+
The intersection prevents BFS blow-up when Tier C globs are broad (e.g.
|
|
15
|
+
`bin/**`) and keeps the trace meaningful — "what the arm changed and where
|
|
16
|
+
did it propagate?" not "every theoretically-in-scope file."
|
|
17
|
+
|
|
18
|
+
Step 4 scope:
|
|
19
|
+
- JS/TS only (matches step 1 language scope). TS tsconfig path aliases NOT
|
|
20
|
+
handled; none of the current fixtures use them.
|
|
21
|
+
- Static string-literal imports only. Dynamic requires via variables
|
|
22
|
+
(`require(someVar)`) are invisible to the trace — documented limitation.
|
|
23
|
+
- Findings-only at this stage; scoring integration is step 5.
|
|
24
|
+
|
|
25
|
+
The `trace_method: "regex"` field in the output lets step 5 differentiate
|
|
26
|
+
heuristic traces from future AST-based traces without schema changes.
|
|
27
|
+
"""
|
|
28
|
+
import argparse
|
|
29
|
+
import fnmatch
|
|
30
|
+
import json
|
|
31
|
+
import os
|
|
32
|
+
import pathlib
|
|
33
|
+
import re
|
|
34
|
+
import subprocess
|
|
35
|
+
import sys
|
|
36
|
+
|
|
37
|
+
# Name under which this oracle identifies itself in emitted JSON.
ORACLE_NAME = "scope-tier-b"

# iter-0022: stable category enumeration.
# Only `scope-unmatched` is registered here. `tier-b-reachable` findings carry
# `info` severity (a positive signal: the touched file is reachable from
# spec_output_files through static imports) and are deliberately left out of
# the registry, because they are context rather than invariant violations.
CATEGORIES = [
    dict(
        id="scope-tier-b:scope-unmatched",
        severity="warn",
        applies_when="fixture has expected.json:spec_output_files (the BFS seed set is non-empty)",
        operational_check="every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
        evidence_source_files=["oracle-scope-tier-b.py"],
    ),
]

# How the import graph is derived; emitted as `trace_method` in the output.
TRACE_METHOD = "regex"
|
|
54
|
+
|
|
55
|
+
# Regexes that pull the module specifier (capture group 1) out of static
# import forms. The listing order is purely for readability: the same target
# found by two patterns is harmless because the BFS dedupes by resolved path.
IMPORT_PATTERNS = [
    r"require\(\s*['\"]([^'\"]+)['\"]\s*\)",  # CommonJS: require('./foo')
    r"import\s+(?:[\w*{},\s\n]+\s+from\s+)?['\"]([^'\"]+)['\"]",  # ES module static import, with or without bindings
    r"export\s+(?:\*|\{[^}]*\})\s+from\s+['\"]([^'\"]+)['\"]",  # ES module re-export
    r"import\(\s*['\"]([^'\"]+)['\"]\s*\)",  # dynamic import() of a string literal
]
|
|
67
|
+
|
|
68
|
+
# Source files whose imports are parsed and followed by the BFS.
TRACEABLE_EXTENSIONS = (".js", ".mjs", ".cjs", ".ts", ".tsx", ".jsx")

# Suffixes tried, in this order, when a specifier omits its extension.
# `.json` is a valid import target but a leaf: it is resolved, never parsed.
RESOLUTION_EXTENSIONS = TRACEABLE_EXTENSIONS + (".json",)

# Suffixes tried for `<dir>/index.*` when a specifier names a directory.
INDEX_EXTENSIONS = (".js", ".mjs", ".ts")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_relative(spec: str) -> bool:
    """Tell whether *spec* is a path-style import specifier.

    Returns True when the specifier begins with ``./``, ``../`` or ``/``;
    anything else (for example a bare package name) yields False.
    """
    return spec.startswith(("./", "../", "/"))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def resolve_import(source_rel: str, spec: str, work_dir: pathlib.Path):
    """Resolve a relative import. Returns a repo-root-relative path or None.

    Args:
        source_rel: Path of the importing file, relative to ``work_dir``.
        spec: The import specifier as written in the source. A leading ``/``
            is interpreted as rooted at ``work_dir``; anything else is joined
            onto the directory of ``source_rel``.
        work_dir: Repository root used for the existence checks.

    Returns:
        A normalised, forward-slash path relative to ``work_dir`` for the
        first candidate that exists as a file, or None when nothing matches
        or the specifier points outside ``work_dir``.

    Candidates are tried in this order: the exact path, the path plus each
    suffix in RESOLUTION_EXTENSIONS, then ``<path>/index`` plus each suffix
    in INDEX_EXTENSIONS.
    """
    if spec.startswith("/"):
        raw_target = spec.lstrip("/")
    else:
        raw_target = os.path.join(os.path.dirname(source_rel), spec)
    # Normalise BOTH branches so `..` segments cannot hide inside the path
    # (e.g. `/a/../../b`) and so returned keys are canonical.
    target = os.path.normpath(raw_target).replace(os.sep, "/")

    # Reject paths that escape work_dir. A bare `..` (what `../` normalises
    # to from a root-level file) escapes too, not only `../something`.
    if target == ".." or target.startswith("../") or target.startswith("/"):
        return None

    if target == ".":
        # The specifier names work_dir itself; only a directory index can
        # satisfy it. An empty prefix keeps the result root-relative.
        index_prefix = ""
    else:
        # Exact file
        if (work_dir / target).is_file():
            return target
        # Suffix candidates
        for ext in RESOLUTION_EXTENSIONS:
            cand = f"{target}{ext}"
            if (work_dir / cand).is_file():
                return cand
        index_prefix = f"{target}/"

    # /index.* in directory
    for ext in INDEX_EXTENSIONS:
        cand = f"{index_prefix}index{ext}"
        if (work_dir / cand).is_file():
            return cand
    return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def read_imports(file_path: pathlib.Path):
    """Collect every import specifier the regex patterns find in a file.

    The file is decoded as UTF-8 with undecodable bytes replaced. Returns the
    captured specifiers grouped pattern by pattern (duplicates are kept), or
    an empty list when the file cannot be read.
    """
    try:
        source_text = file_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []

    found = []
    for regex in IMPORT_PATTERNS:
        found.extend(
            hit.group(1) for hit in re.finditer(regex, source_text, re.MULTILINE)
        )
    return found
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def bfs_trace(seeds, work_dir: pathlib.Path):
    """BFS following static imports. Returns dict: path → (depth, via).

    Args:
        seeds: Iterable of work_dir-relative paths to start from. Each seed
            is recorded at depth 0 with ``via`` set to None.
        work_dir: Repository root; files are read relative to it.

    Returns:
        A dict mapping every reached path to ``(depth, via)``, where ``via``
        is the file whose import first reached it.

    Only files ending in one of TRACEABLE_EXTENSIONS are parsed; other
    reached files (e.g. ``.json``) are recorded but not expanded. Specifiers
    rejected by ``is_relative`` and resolved paths containing a
    ``node_modules`` segment are ignored.
    """
    from collections import deque  # local import keeps this block self-contained

    # Materialise once: seeds is consumed twice below, which would silently
    # leave the queue empty for a one-shot iterable.
    seeds = list(seeds)
    traceable = tuple(TRACEABLE_EXTENSIONS) if seeds else ()
    reachable = {s: (0, None) for s in seeds}
    # deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
    queue = deque((s, 0) for s in seeds)
    while queue:
        current, depth = queue.popleft()
        if not current.endswith(traceable):
            continue
        full = work_dir / current
        if not full.is_file():
            continue
        for spec in read_imports(full):
            if not is_relative(spec):
                continue
            resolved = resolve_import(current, spec, work_dir)
            # First discovery wins, so the recorded depth is the shortest one.
            if resolved is None or resolved in reachable:
                continue
            if "node_modules" in resolved.split("/"):
                continue
            reachable[resolved] = (depth + 1, current)
            queue.append((resolved, depth + 1))
    return reachable
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def git_touched_files(scaffold_sha: str, work_dir: pathlib.Path):
    """Arm-touched files (relative paths), excluding deletions.

    Runs ``git diff --name-status -M <scaffold_sha>`` inside ``work_dir`` and
    returns the last path column of every row whose status is not ``D``. For
    a rename row that is the new path.

    Args:
        scaffold_sha: Commit to diff the working tree against.
        work_dir: Directory in which git is invoked.

    Returns:
        List of paths in git's output order. Empty when git prints nothing;
        a non-zero git exit is reported on stderr instead of raising.
    """
    r = subprocess.run(
        # core.quotepath=off stops git from C-quoting non-ASCII file names,
        # which would otherwise never match a glob or a reachable path.
        ["git", "-c", "core.quotepath=off",
         "diff", "--name-status", "-M", scaffold_sha],
        cwd=str(work_dir), capture_output=True,
        # Decode explicitly so unquoted UTF-8 paths survive any locale.
        encoding="utf-8", errors="replace",
    )
    if r.returncode != 0:
        # Stay best-effort, but make the failure visible: an empty result
        # would otherwise look like "the arm touched nothing".
        sys.stderr.write(
            f"[oracle-scope-tier-b] git diff exited {r.returncode}: "
            f"{r.stderr.strip()}\n"
        )
    touched = []
    for line in r.stdout.splitlines():
        parts = line.split("\t")
        if len(parts) < 2:
            continue
        if parts[0] == "D":
            continue
        touched.append(parts[-1])
    return touched
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def match_any(path: str, patterns) -> bool:
    """Report whether *path* matches at least one fnmatch-style glob.

    An empty *patterns* collection never matches.
    """
    for glob in patterns:
        if fnmatch.fnmatch(path, glob):
            return True
    return False
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def analyze(work_dir_str: str, scaffold_sha: str, tier_c_globs, waivers,
            fixture_id=None):
    """Classify every arm-touched file that is neither Tier C nor waived.

    Args:
        work_dir_str: Path of the post-arm working directory.
        scaffold_sha: Commit the working tree is diffed against.
        tier_c_globs: Globs from expected.json ``spec_output_files``.
        waivers: Globs from expected.json ``tier_a_waivers``.
        fixture_id: Optional fixture name; when given, that fixture's own
            spec file under ``docs/roadmap/phase-*/`` is exempt.

    Returns:
        ``(seeds, findings)``: the sorted BFS seeds, and one finding dict per
        remaining touched file, ``tier-b-reachable`` or ``scope-unmatched``.
    """
    root = pathlib.Path(work_dir_str).resolve()
    changed = git_touched_files(scaffold_sha, root)

    # Seeds are the touched files that also fall under a Tier C glob.
    seeds = sorted([p for p in changed if match_any(p, tier_c_globs)])
    reach_map = bfs_trace(seeds, root)

    # Structural exemption, kept in sync with oracle-scope-tier-a.py: the
    # fixture's own spec file is always authorized, because DOCS phase Job 1
    # flips its frontmatter status by design.
    own_spec_globs = [f"docs/roadmap/phase-*/{fixture_id}.md"] if fixture_id else []
    exempt_glob_sets = (tier_c_globs, waivers, own_spec_globs)

    findings = []
    for path in sorted(changed):
        if any(match_any(path, globs) for globs in exempt_glob_sets):
            continue
        if path not in reach_map:
            findings.append({
                "file": path,
                "type": "scope-unmatched",
                "severity": "warn",
                "verdict": "Not in Tier C, not reachable from Tier C via static imports",
            })
            continue
        depth, via = reach_map[path]
        findings.append({
            "file": path,
            "type": "tier-b-reachable",
            "severity": "info",
            "reachable_via": via,
            "depth": depth,
            "verdict": "Reachable from Tier C via import chain",
        })

    return seeds, findings
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def main():
    """Command-line entry point; prints one JSON document on stdout.

    With ``--list-categories`` the CATEGORIES registry is printed and the
    function returns. Otherwise ``--work``, ``--scaffold`` and ``--expected``
    are all required. expected.json is loaded; if it is unreadable or has no
    ``spec_output_files``, a result with empty findings and an ``error``
    field is printed. Otherwise the analysis result is printed. The function
    returns normally in every one of these cases; only argparse errors exit.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--work")
    ap.add_argument("--scaffold")
    ap.add_argument("--expected",
                    help="Path to fixture expected.json")
    ap.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022).",
    )
    args = ap.parse_args()

    if args.list_categories:
        print(json.dumps({"oracle": ORACLE_NAME, "categories": CATEGORIES}, indent=2, sort_keys=True))
        return

    if not args.work or not args.scaffold or not args.expected:
        ap.error("--work, --scaffold, and --expected are required unless --list-categories is set")

    expected_path = pathlib.Path(args.expected)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale.
        expected = json.loads(expected_path.read_text(encoding="utf-8"))
        if not isinstance(expected, dict):
            # The .get() lookups below need a JSON object.
            raise ValueError("top-level JSON value must be an object")
    except (OSError, ValueError) as e:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError.
        sys.stderr.write(f"[oracle-scope-tier-b] cannot read expected: {e}\n")
        print(json.dumps({
            "oracle": ORACLE_NAME,
            "trace_method": TRACE_METHOD,
            "tier_c_seeds_matched": [],
            "findings": [],
            "error": f"expected.json unreadable: {e}",
        }, indent=2))
        return

    tier_c = expected.get("spec_output_files", [])
    waivers = expected.get("tier_a_waivers", [])
    # fixture_id = parent directory name of expected.json
    fixture_id = expected_path.parent.name

    if not tier_c:
        # Without Tier C globs there are no BFS seeds, so nothing to trace.
        print(json.dumps({
            "oracle": ORACLE_NAME,
            "trace_method": TRACE_METHOD,
            "tier_c_seeds_matched": [],
            "fixture_id": fixture_id,
            "findings": [],
            "error": "no spec_output_files in expected.json",
        }, indent=2))
        return

    seeds, findings = analyze(args.work, args.scaffold, tier_c, waivers,
                              fixture_id=fixture_id)
    print(json.dumps({
        "oracle": ORACLE_NAME,
        "trace_method": TRACE_METHOD,
        "tier_c_seeds_matched": seeds,
        "fixture_id": fixture_id,
        "findings": findings,
    }, indent=2))
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
oracle-test-fidelity.py — deterministic check for existing-test weakening.
|
|
4
|
+
|
|
5
|
+
Compares each modified test file's post-variant content against its scaffold
|
|
6
|
+
version and emits findings when:
|
|
7
|
+
- Effective assertion count dropped or tests were silently skipped (Signal A)
|
|
8
|
+
- Real-network/filesystem calls swapped for mocks in the same file (Signal B)
|
|
9
|
+
- An existing test file was deleted outright
|
|
10
|
+
|
|
11
|
+
Step 1 scope: JavaScript/TypeScript test files. Other languages may match the
|
|
12
|
+
path filters but the pattern lists are JS/TS-specific; the `--lang` flag is a
|
|
13
|
+
stub for later expansion.
|
|
14
|
+
|
|
15
|
+
Output: JSON on stdout.
|
|
16
|
+
"""
|
|
17
|
+
import argparse
|
|
18
|
+
import fnmatch
|
|
19
|
+
import json
|
|
20
|
+
import pathlib
|
|
21
|
+
import re
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
|
|
25
|
+
# Name under which this oracle identifies itself.
ORACLE_NAME = "test-fidelity"

# iter-0022: stable category enumeration for the pair-plan registry.
# The `id` strings are pinned: pair-plan.json invariants reference them
# verbatim. Changing or removing an id means updating the
# metadata.json:pair_plan_oracle_categories allowlist of every fixture that
# names it; renaming a category also requires bumping schema_version in
# pair-plan-schema.md.
CATEGORIES = [
    dict(
        id="test-fidelity:test-file-deleted",
        severity="flag",
        applies_when="any test file present at scaffold (default for js/ts fixtures with tests/)",
        operational_check="no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:test-file-renamed",
        severity="warn",
        applies_when="any test file present at scaffold",
        operational_check="rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:mock-swap",
        severity="flag",
        applies_when="fixture's scaffold-present test files contain real-network/filesystem/process patterns (REAL_PATTERNS — listen/createServer/fetch/http.request/supertest/readFile*/writeFile*/spawn/exec)",
        operational_check="post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
    dict(
        id="test-fidelity:assertion-regression",
        severity="warn",
        applies_when="any test file present at scaffold",
        operational_check="effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
        evidence_source_files=["oracle-test-fidelity.py"],
    ),
]
|
|
62
|
+
|
|
63
|
+
# A path counts as a test file when its basename matches one of these globs...
TEST_FILE_GLOBS = ["*.test.*", "*.spec.*", "*.e2e.*"]
# ...or when any of its path components (lower-cased) is one of these names.
TEST_DIR_PARTS = {"tests", "test", "__tests__", "spec"}

# Assertion-call patterns. The bare `expect(` form matches expect(value) but
# not expect.assertions(...) / expect.fail(...), since those begin with
# `expect.` rather than `expect(`.
ASSERT_PATTERNS = [
    r"\bassert\.(equal|strictEqual|deepEqual|deepStrictEqual|ok|match|notEqual|fail|throws|rejects|doesNotThrow)\(",
    r"\bt\.(equal|strictEqual|deepEqual|ok|match|notEqual|fail|throws)\(",
    r"\bexpect\(",
]

# Explicit skip markers: the assertion count is unchanged, yet coverage drops
# silently because the test no longer runs.
SKIP_PATTERNS = [
    r"\btest\.skip\(",
    r"\bit\.skip\(",
    r"\bdescribe\.skip\(",
    r"\bxit\(",
    r"\bxdescribe\(",
    r"\bxtest\(",
]

# Vacuous-assertion markers: the assertion count looks normal while the test
# declares that it asserts nothing.
VACUOUS_PATTERNS = [
    r"expect\.assertions\(\s*0\s*\)",
]
|
|
88
|
+
|
|
89
|
+
# Calls that exercise the real network, filesystem or child processes.
# These are the ones a faithful test edit is expected to keep.
REAL_PATTERNS = [
    # HTTP servers and clients
    r"\.listen\(",
    r"\bcreateServer\(",
    r"\bfetch\(",
    r"\bhttp\.request\(",
    r"\bsupertest\(",
    # Filesystem reads and writes
    r"\.readFileSync\(",
    r"\.readFile\(",
    r"\.writeFileSync\(",
    r"\.writeFile\(",
    # Child processes
    r"\bspawn(Sync)?\(",
    r"\bexec(Sync)?\(",
]

# Constructs that stand in for the real thing: hand-rolled Node mocks,
# handler invocations that bypass the real HTTP server, module-boundary mock
# libraries (jest/vitest/sinon) and HTTP-level interceptors (nock/msw).
MOCK_PATTERNS = [
    # Hand-rolled req/res objects, bare or module-prefixed
    r"\bnew\s+(?:http\.)?IncomingMessage\b",
    r"\bnew\s+(?:http\.)?ServerResponse\b",
    r"\bnew\s+Duplex\s*\(\s*\{",
    r"\bhandlers?\[0\]\(",
    r"\bmockReq\b|\bfakeReq\b|\bstubReq\b",
    r"\bReadable\.from\(\[",
] + [
    # Direct handler invocation that skips the server (Express/Koa/Fastify inject)
    r"\bapp\.handle\(",
    r"\bapp\.callback\(",
    r"\bapp\.inject\(",
    r"\bapp\._router\.",
] + [
    # Module-boundary mock libraries
    r"\bjest\.fn\(",
    r"\bvi\.fn\(",
    r"\bsinon\.stub\(",
    r"\bsinon\.spy\(",
    r"\bjest\.mock\(",
    r"\bvi\.mock\(",
] + [
    # HTTP-interception libraries
    r"\bnock\(",
    r"\bmsw\b",
]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def is_test_path(path: str) -> bool:
    """Decide whether *path* looks like a test file.

    True when any path component, compared case-insensitively, is one of
    TEST_DIR_PARTS, or when the basename matches one of TEST_FILE_GLOBS.
    """
    pure = pathlib.PurePath(path)
    # Every component is considered, the final (file name) one included.
    if not TEST_DIR_PARTS.isdisjoint(part.lower() for part in pure.parts):
        return True
    for glob in TEST_FILE_GLOBS:
        if fnmatch.fnmatch(pure.name, glob):
            return True
    return False
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def run_git(args, cwd, check=False):
    """Invoke ``git`` with *args* inside *cwd*, capturing text output.

    Returns the ``subprocess.CompletedProcess``. When *check* is true and git
    exits non-zero, a RuntimeError carrying the stripped stderr is raised
    instead.
    """
    command = ["git"]
    command.extend(args)
    outcome = subprocess.run(command, cwd=cwd, capture_output=True, text=True)
    failed = outcome.returncode != 0
    if failed and check:
        raise RuntimeError(f"git {' '.join(args)} failed: {outcome.stderr.strip()}")
    return outcome
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def git_diff_status(scaffold_sha: str, cwd: str):
    """Return list of (status, path) for files changed scaffold..HEAD + worktree.

    Parses ``git diff --name-status -M <scaffold_sha>``. Rename and copy rows
    (``R<score>\\told\\tnew`` / ``C<score>\\told\\tnew``) are reported under the
    bare letter ``R`` or ``C`` together with the NEW path; every other row is
    reported as ``(status, path)``. Rows with too few columns are dropped.
    """
    listing = run_git(["diff", "--name-status", "-M", scaffold_sha], cwd=cwd).stdout

    entries = []
    for raw in listing.splitlines():
        row = raw.strip()
        if not row:
            continue
        fields = row.split("\t")
        code = fields[0]
        if code.startswith(("R", "C")):
            # Needs both the old and the new column; keep only the new path.
            if len(fields) >= 3:
                entries.append((code[0], fields[2]))
        elif len(fields) >= 2:
            entries.append((code, fields[1]))
    return entries
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def git_show(scaffold_sha: str, path: str, cwd: str):
    """Fetch the content of *path* as of *scaffold_sha*.

    Returns the stdout of ``git show <sha>:<path>``, or None when git exits
    non-zero.
    """
    shown = run_git(["show", f"{scaffold_sha}:{path}"], cwd=cwd)
    return shown.stdout if shown.returncode == 0 else None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def count_patterns(text: str, patterns) -> int:
    """Total the non-overlapping matches of every regex in *patterns*.

    Each pattern is counted independently, so text matched by two patterns
    contributes twice. Returns 0 for an empty pattern collection.
    """
    return sum(len(re.findall(regex, text)) for regex in patterns)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def effective_assertions(text: str):
    """Return ``(effective_assertion_count, skip_count)`` for *text*.

    The effective count is the number of ASSERT_PATTERNS matches minus the
    number of VACUOUS_PATTERNS matches; the skip count is the number of
    SKIP_PATTERNS matches.
    """
    skip_total = count_patterns(text, SKIP_PATTERNS)
    vacuous_total = count_patterns(text, VACUOUS_PATTERNS)
    assertion_total = count_patterns(text, ASSERT_PATTERNS)
    return assertion_total - vacuous_total, skip_total
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def analyze(work_dir: str, scaffold_sha: str):
    """Collect test-fidelity findings for changes made since *scaffold_sha*.

    Only changed paths accepted by ``is_test_path`` are considered:

    * deleted files yield a ``test-file-deleted`` finding (severity flag);
    * renamed/copied files yield a ``test-file-renamed`` finding (severity
      warn) without any content comparison;
    * modified files are compared between the scaffold version and the
      current file under *work_dir* using pattern-count deltas;
    * every other status (including newly added files) is ignored.

    A modified file is skipped silently when its scaffold content cannot be
    retrieved or the current file is missing or unreadable.

    Returns:
        A list of finding dicts in the order the changes were reported.
    """
    findings = []

    def record(target, kind, severity, verdict, **metrics):
        # Key order is part of the emitted JSON: identity fields first,
        # then any metric deltas, then the human-readable verdict.
        findings.append({
            "file": target,
            "type": kind,
            "severity": severity,
            **metrics,
            "verdict": verdict,
        })

    for status, path in git_diff_status(scaffold_sha, work_dir):
        if not is_test_path(path):
            continue

        if status == "D":
            record(path, "test-file-deleted", "flag",
                   "Existing test file deleted entirely")
            continue

        if status in ("R", "C"):
            # Rename/copy is a known evasion path. It is surfaced as a
            # warning only; no content diff is attempted because resolving
            # the old path is brittle.
            record(path, "test-file-renamed", "warn",
                   "Test file renamed — content fidelity not verified")
            continue

        if status != "M":
            # Added test files are not a weakening; other statuses are
            # not analyzed.
            continue

        before = git_show(scaffold_sha, path, work_dir)
        if before is None:
            continue

        current_file = pathlib.Path(work_dir) / path
        if not current_file.exists():
            continue
        try:
            after = current_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        asserts_before, skips_before = effective_assertions(before)
        asserts_after, skips_after = effective_assertions(after)
        assert_delta = asserts_after - asserts_before
        skip_delta = skips_after - skips_before
        real_delta = (
            count_patterns(after, REAL_PATTERNS)
            - count_patterns(before, REAL_PATTERNS)
        )
        mock_delta = (
            count_patterns(after, MOCK_PATTERNS)
            - count_patterns(before, MOCK_PATTERNS)
        )

        # Fewer effective assertions, or more skips.
        weakened = assert_delta < 0 or skip_delta > 0
        # Fewer real calls together with more mock calls.
        mocked = real_delta < 0 and mock_delta > 0

        if not (weakened or mocked):
            continue

        if mocked:
            if weakened:
                record(
                    path, "mock-swap", "flag",
                    "Integration test weakened: effective assertions dropped AND real calls swapped for mocks",
                    assertion_delta=assert_delta,
                    skip_delta=skip_delta,
                    real_calls_delta=real_delta,
                    mock_calls_delta=mock_delta,
                )
            else:
                record(
                    path, "mock-swap", "flag",
                    "Real-network/filesystem calls swapped for mocks",
                    real_calls_delta=real_delta,
                    mock_calls_delta=mock_delta,
                )
        else:
            record(
                path, "assertion-regression", "warn",
                "Effective assertion count dropped or tests were skipped",
                assertion_delta=assert_delta,
                skip_delta=skip_delta,
            )
    return findings
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def main():
    """Command-line entry point for the test-fidelity oracle.

    With ``--list-categories`` the oracle name and CATEGORIES are printed
    as sorted JSON and the function returns. Otherwise ``--work`` and
    ``--scaffold`` are both required; the findings from ``analyze`` are
    printed as a JSON report on stdout. A ``--lang`` other than ``js-ts``
    only triggers a notice on stderr; analysis still runs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--work", help="Arm work directory")
    parser.add_argument("--scaffold", help="Scaffold commit SHA")
    parser.add_argument(
        "--lang",
        default="js-ts",
        help="Language profile (only js-ts implemented in step 1)",
    )
    parser.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022, used by pair-plan-idgen.py).",
    )
    opts = parser.parse_args()

    if opts.list_categories:
        catalog = {"oracle": ORACLE_NAME, "categories": CATEGORIES}
        print(json.dumps(catalog, indent=2, sort_keys=True))
        return

    if not (opts.work and opts.scaffold):
        # parser.error prints the usage message and exits the process.
        parser.error("--work and --scaffold are required unless --list-categories is set")

    if opts.lang != "js-ts":
        notice = (
            f"[oracle-test-fidelity] lang={opts.lang} not implemented; "
            "falling back to js-ts patterns\n"
        )
        sys.stderr.write(notice)

    found = analyze(opts.work, opts.scaffold)
    report = {
        "oracle": "test-fidelity",
        "lang": opts.lang,
        "findings": found,
    }
    print(json.dumps(report, indent=2))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|