@maintainabilityai/research-runner 0.1.43 → 0.1.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/dist/runner/skills.js +400 -53
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -77,6 +77,43 @@ LLM provider/model, token count, cost, grounding score, and audit chain hash.
|
|
|
77
77
|
Auditors verify the artifact by re-running the chain against the recorded
|
|
78
78
|
mesh sha.
|
|
79
79
|
|
|
80
|
+
## Versioning + workflow-template pin scheme
|
|
81
|
+
|
|
82
|
+
The mesh-deployed workflow templates pin this package with a **tilde range**:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
npx -y @maintainabilityai/research-runner@~0.1.42 skill-<name>
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
`~0.1.42` allows patch releases (`0.1.43`, `0.1.44`, …) but not minor
|
|
89
|
+
or major bumps. The reasons:
|
|
90
|
+
|
|
91
|
+
1. **Auto-publish bumps patch on every merge.** The
|
|
92
|
+
`npm-publish-research-runner.yml` workflow runs `npm version patch`
|
|
93
|
+
when anything under `packages/research-runner/**` changes. A new
|
|
94
|
+
patch is published within minutes of merge.
|
|
95
|
+
2. **Templates pinned exactly would force a follow-up edit on every
|
|
96
|
+
patch.** With `@0.1.42` (exact), every patch bump would leave the
|
|
97
|
+
templates stale until someone edited them. With `~0.1.42`, the
|
|
98
|
+
templates carry on transparently.
|
|
99
|
+
3. **A minor bump is a deliberate review event.** When the runner ships
|
|
100
|
+
a contract change (new event field, new skill API shape, removed
|
|
101
|
+
field), bump `version` from `0.1.x` to `0.2.0` and update the
|
|
102
|
+
templates in the same PR. A `phaseSpec.test.ts` parity test fails
|
|
103
|
+
loudly when the templates' major.minor doesn't match `package.json`.
|
|
104
|
+
|
|
105
|
+
**When you change anything under `packages/research-runner/**`:** you
|
|
106
|
+
do NOT need to edit workflow templates. The auto-publish handles it.
|
|
107
|
+
**When you ship a contract-breaking change:** bump the minor version
|
|
108
|
+
in `packages/research-runner/package.json` AND update every
|
|
109
|
+
`@maintainabilityai/research-runner@~0.X.Y` reference in
|
|
110
|
+
`vscode-extension/code-templates/**` to match. Tests enforce this.
|
|
111
|
+
|
|
112
|
+
The tilde range eliminates an off-by-one risk: a developer trying to
|
|
113
|
+
mentally compute "what patch will the auto-publish produce" and pinning
|
|
114
|
+
to the wrong value. With tilde, the patch resolves at run-time from
|
|
115
|
+
npm, and the mental math goes away.
|
|
116
|
+
|
|
80
117
|
## License
|
|
81
118
|
|
|
82
119
|
MIT
|
package/dist/runner/skills.js
CHANGED
|
@@ -902,6 +902,11 @@ const handleSelfReviewCodeSecurity = makeCodeReviewHandler('code-security');
|
|
|
902
902
|
// ─────────────────────────────────────────────────────────────────────
|
|
903
903
|
const KnowledgeCodeInput = zod_1.z.object({
|
|
904
904
|
okrId: zod_1.z.string().min(1),
|
|
905
|
+
// Bug-Q phase 2 — `runId` is the cache key for the clone retained
|
|
906
|
+
// between this skill and `knowledge-code-read`. Falls back to the
|
|
907
|
+
// RUN_ID env var when omitted (the runner already sets it from
|
|
908
|
+
// session context); failing both yields a clear error.
|
|
909
|
+
runId: zod_1.z.string().min(1).optional(),
|
|
905
910
|
repoUrl: zod_1.z.string().min(1),
|
|
906
911
|
repoStatus: zod_1.z.enum(['connected', 'not-connected', 'create', 'unreachable']),
|
|
907
912
|
ref: zod_1.z.string().optional(),
|
|
@@ -980,34 +985,187 @@ function walkRepo(rootDir, maxFiles) {
|
|
|
980
985
|
recurse(rootDir, '');
|
|
981
986
|
return out;
|
|
982
987
|
}
|
|
988
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
989
|
+
// Bug-Q phase 2 — brownfield clone cache.
|
|
990
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
991
|
+
// Until phase 2, `knowledge-code` cloned + classified + deleted the
|
|
992
|
+
// tree in one invocation. That left the agent with structural metadata
|
|
993
|
+
// only — no way to read actual file contents to ground its design.
|
|
994
|
+
// Codex audit round 2 (B1) flagged this: the prompt asks for
|
|
995
|
+
// `src/state/profileStore.ts`-level paths against a substrate that
|
|
996
|
+
// returns only top-dirs and language counts.
|
|
997
|
+
//
|
|
998
|
+
// Phase 2 splits the lifecycle:
|
|
999
|
+
// 1. `knowledge-code` clones + walks + classifies + RETAINS the clone
|
|
1000
|
+
// in a per-runId tmpdir cache.
|
|
1001
|
+
// 2. `knowledge-code-read` reads files FROM that cache (with a
|
|
1002
|
+
// ref-keyed re-clone fallback so a stale or expired
|
|
1003
|
+
// cache doesn't break the agent).
|
|
1004
|
+
//
|
|
1005
|
+
// The cache key is `(runId, owner, name)` — one clone per (session, repo)
|
|
1006
|
+
// pair. Workflow runners are sandboxed per-job so tmpdir starts empty,
|
|
1007
|
+
// which rules out cross-run pollution. Local dev / tests can leave a stale
|
|
1008
|
+
// cache; `.cache-meta.json` carries `ref` + `sha` so the read skill can
|
|
1009
|
+
// detect staleness and re-clone.
|
|
1010
|
+
//
|
|
1011
|
+
// SECURITY: `knowledge-code-read` enforces a path perimeter — relative
|
|
1012
|
+
// paths only, no `..` segments, resolved path must be a child of the
|
|
1013
|
+
// clone root. Any escape attempt is rejected without reading bytes.
|
|
1014
|
+
/**
 * Build the directory that holds the cached clone for one (session, repo) pair.
 *
 * The result is always `<tmpdir>/knowledge-code-cache/<runId>/<owner>-<name>`.
 * Every dynamic segment is reduced to `[A-Za-z0-9._-]` first, because `runId`
 * can come straight from skill input or the environment and the caller runs a
 * recursive delete on the returned path: a value such as `../../x` must not be
 * able to address anything outside the cache root.
 *
 * @param {string} runId Session identifier (first cache-key component).
 * @param {string} owner Repository owner.
 * @param {string} name  Repository name.
 * @returns {string} Absolute path of the cache directory (not created here).
 */
function knowledgeCodeCacheDir(runId, owner, name) {
    const toSafeSegment = (value) => {
        const cleaned = String(value).replace(/[^A-Za-z0-9._-]/g, '_');
        // '', '.' and '..' would still resolve to the parent chain — prefix them.
        return /^\.*$/.test(cleaned) ? `_${cleaned}` : cleaned;
    };
    return path.join(os.tmpdir(), 'knowledge-code-cache', toSafeSegment(runId), toSafeSegment(`${owner}-${name}`));
}
|
|
1019
|
+
/**
 * Return a usable shallow clone of `repoUrl` for this session, reusing the
 * cached checkout when `.cache-meta.json` records the same `ref`.
 *
 * On a cache miss (no meta, unreadable meta, or a different ref) the cache
 * directory is removed and the repo is cloned again, then the resolved HEAD
 * sha and the ref are written back to `.cache-meta.json`.
 *
 * @param {string} runId   Session identifier; part of the cache key.
 * @param {string} repoUrl URL passed to `git clone`.
 * @param {string} ref     Branch/tag to clone; `'HEAD'` or empty clones the remote default.
 * @param {string} owner   Repository owner (cache key + recorded in meta).
 * @param {string} name    Repository name (cache key + recorded in meta).
 * @returns {{ ok: boolean, path: string, sha: string, reused: boolean, error?: string }}
 *   `ok: false` carries the failure message in `error`; `sha` may be empty
 *   when `git rev-parse` fails after a successful clone.
 */
function ensureClone(runId, repoUrl, ref, owner, name) {
    const cacheDir = knowledgeCodeCacheDir(runId, owner, name);
    const metaPath = path.join(cacheDir, '.cache-meta.json');
    // Cache hit if meta exists AND ref matches.
    if (fs.existsSync(metaPath)) {
        try {
            const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
            if (meta.ref === ref && typeof meta.sha === 'string') {
                return { ok: true, path: cacheDir, sha: meta.sha, reused: true };
            }
        }
        catch { /* unreadable meta — re-clone */ }
    }
    const discardCache = () => {
        try {
            fs.rmSync(cacheDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    };
    // Clean out a stale cache before re-cloning to avoid mixing two refs.
    discardCache();
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { execFileSync } = require('node:child_process');
    const cloneArgs = ['clone', '--depth=1', '--filter=blob:limit=10m'];
    if (ref && ref !== 'HEAD') {
        cloneArgs.push('--branch', ref);
    }
    // `--` ends option parsing so neither the URL nor the target directory can
    // ever be interpreted as a git option.
    cloneArgs.push('--', repoUrl, cacheDir);
    try {
        // Inside the try so a failure to create the parent directory is
        // reported through the `{ ok: false }` result instead of throwing.
        fs.mkdirSync(path.dirname(cacheDir), { recursive: true });
        execFileSync('git', cloneArgs, { stdio: ['ignore', 'pipe', 'pipe'], timeout: 60_000 });
    }
    catch (err) {
        discardCache();
        return { ok: false, path: '', sha: '', reused: false, error: err instanceof Error ? err.message : String(err) };
    }
    let sha = '';
    try {
        sha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd: cacheDir, encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] }).trim();
    }
    catch { /* sha stays empty */ }
    try {
        fs.writeFileSync(metaPath, JSON.stringify({ owner, name, ref, sha, clonedAt: new Date().toISOString() }), 'utf8');
    }
    catch { /* meta write failure is non-fatal — next call will just re-clone */ }
    return { ok: true, path: cacheDir, sha, reused: false };
}
|
|
1066
|
+
/**
 * Assign a coarse role to a repo-relative file path.
 *
 * Rules are checked in order and the first match wins:
 * `test` → `route` → `doc` → `config` → `source` → `other`.
 * Matching is done on the lower-cased path and expects `/` separators.
 *
 * @param {string} filePath Repo-relative path.
 * @param {string} lang     Detected language, or `'unknown'`.
 * @returns {'test'|'route'|'doc'|'config'|'source'|'other'}
 */
function classifyRole(filePath, lang) {
    const lower = filePath.toLowerCase();
    // Tests — `__tests__/` dir, `.test.` / `.spec.` suffixes, pytest-style
    // `test_*.py` / `*_test.py`, Go `*_test.go`, or top-level `test/` / `tests/`.
    const testPatterns = [
        /(^|\/)__tests__\//,
        /\.(test|spec)\.(t|j)sx?$/,
        /\.(test|spec)\.py$/,
        /(^|\/)test_[^/]+\.py$/,
        /_test\.(py|go)$/,
        /^test(s)?\//,
    ];
    if (testPatterns.some((re) => re.test(lower))) {
        return 'test';
    }
    // Routes — files in `routes/`, `pages/` (Next), `app/` (Next/Nuxt
    // app router), or files named `*.route(s).*`.
    if (/(^|\/)(routes|pages|app)\//.test(lower) || /\.routes?\.(t|j)sx?$/.test(lower)) {
        return 'route';
    }
    // Docs — `.md`, or anything in top-level `docs/` / `doc/`.
    if (lower.endsWith('.md') || /^docs?\//.test(lower)) {
        return 'doc';
    }
    // Config — known manifests, dot-files (matched on basename, at any depth),
    // `tsconfig*.json`, and `*.config.{js,jsx,ts,tsx}`.
    const base = path.basename(filePath);
    if (MANIFEST_FILES.has(base)
        || /^\.[\w.-]+$/.test(base) // .eslintrc, .gitignore, …
        || /(^|\/)tsconfig(\.[^.]+)?\.json$/.test(lower)
        || /(^|\/)[^/]+\.config\.(t|j)sx?$/.test(lower)) {
        return 'config';
    }
    return lang && lang !== 'unknown' ? 'source' : 'other';
}
|
|
983
1101
|
/**
 * Summarise a flat list of repo-relative file paths into the structural
 * inventory the agent + workflow gate use to ground brownfield decisions.
 *
 * Bug-Q phase 2 (Codex audit round 2 / B1) extended the return shape
 * with `files[]`, `tests[]`, `routes[]`, `modules[]`. Before phase 2,
 * the only structural outputs were `topDirs` + `languages` + manifest
 * list — enough for the agent to KNOW what kind of repo it was, not
 * enough to GROUND specific file-level design choices.
 *
 * @param {string[]} filesRaw Repo-relative paths using `/` separators.
 * @returns {{
 *   topDirs: string[],
 *   languages: Record<string, number>,
 *   packageManifests: string[],
 *   files: Array<{ path: string, lang: string, role: string }>,
 *   tests: string[],
 *   routes: string[],
 *   modules: Array<{ name: string, fileCount: number }>,
 * }} `modules` is sorted by descending `fileCount`; the string lists are
 *   sorted; `files` keeps the input order.
 */
function classifyRepo(filesRaw) {
    // Top-level dirs that are clearly not modules (tests, docs, config
    // dirs, infra dirs) and must not be counted as such.
    const nonModuleTopDirs = new Set(['tests', 'test', '__tests__', 'docs', 'doc', '.github', '.vscode', 'scripts']);
    const topDirs = new Set();
    const languages = {};
    const packageManifests = [];
    const files = [];
    const tests = [];
    const routes = [];
    // A Map, not a plain object: directory names come from the cloned repo,
    // and names such as `constructor` or `__proto__` would collide with
    // Object.prototype members and corrupt the counts.
    const moduleCounts = new Map();
    const bumpModule = (moduleName) => {
        moduleCounts.set(moduleName, (moduleCounts.get(moduleName) ?? 0) + 1);
    };
    for (const f of filesRaw) {
        const slashIdx = f.indexOf('/');
        const topDir = slashIdx > 0 ? f.slice(0, slashIdx) : null;
        if (topDir !== null) {
            topDirs.add(topDir);
        }
        const ext = path.extname(f).toLowerCase();
        const knownLang = LANG_EXTS[ext];
        const lang = knownLang ?? 'unknown';
        if (knownLang) {
            languages[lang] = (languages[lang] ?? 0) + 1;
        }
        if (MANIFEST_FILES.has(path.basename(f))) {
            packageManifests.push(f);
        }
        const role = classifyRole(f, lang);
        files.push({ path: f, lang, role });
        if (role === 'test') {
            tests.push(f);
        }
        if (role === 'route') {
            routes.push(f);
        }
        // Modules — top-level subdirectory of `src/` if present, otherwise
        // top-level repo subdir. Files at the repo root are skipped (those
        // aren't module-organized).
        const srcMatch = /^src\/([^/]+)\//.exec(f);
        if (srcMatch) {
            bumpModule(srcMatch[1]);
        }
        else if (topDir !== null && !nonModuleTopDirs.has(topDir)) {
            bumpModule(topDir);
        }
    }
    const modules = Array.from(moduleCounts, ([name, fileCount]) => ({ name, fileCount }))
        .sort((a, b) => b.fileCount - a.fileCount);
    return {
        topDirs: Array.from(topDirs).sort(),
        languages,
        packageManifests: packageManifests.sort(),
        files,
        tests: tests.sort(),
        routes: routes.sort(),
        modules,
    };
}
|
|
1013
1171
|
/**
|
|
@@ -1078,55 +1236,42 @@ const handleKnowledgeCode = async (input) => {
|
|
|
1078
1236
|
};
|
|
1079
1237
|
}
|
|
1080
1238
|
// ─── Brownfield branch (connected) ─────────────────────────────────
|
|
1081
|
-
//
|
|
1082
|
-
//
|
|
1083
|
-
//
|
|
1084
|
-
//
|
|
1239
|
+
// Bug-Q phase 2 — uses the per-runId clone cache (`ensureClone`)
|
|
1240
|
+
// so `knowledge-code-read` can read the same files later in the
|
|
1241
|
+
// session without re-cloning. The cache stays for the runner-job
|
|
1242
|
+
// tmpdir lifetime (workflow runners get a clean tmpdir per job, so
|
|
1243
|
+
// cross-run pollution is impossible).
|
|
1085
1244
|
if (!gh) {
|
|
1086
1245
|
return { ok: false, reason: 'repo-url-not-github', repo: repoUrl };
|
|
1087
1246
|
}
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
const
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
let cloneError = '';
|
|
1099
|
-
try {
|
|
1100
|
-
execFileSync('git', cloneArgs, { stdio: ['ignore', 'pipe', 'pipe'], timeout: 60_000 });
|
|
1101
|
-
}
|
|
1102
|
-
catch (err) {
|
|
1103
|
-
cloneOk = false;
|
|
1104
|
-
cloneError = err instanceof Error ? err.message : String(err);
|
|
1247
|
+
// Resolve the session runId — explicit input wins; fall back to
|
|
1248
|
+
// RUN_ID env var (the runner sets this from session context).
|
|
1249
|
+
const runId = parsed.data.runId ?? process.env.RUN_ID;
|
|
1250
|
+
if (!runId) {
|
|
1251
|
+
return {
|
|
1252
|
+
ok: false,
|
|
1253
|
+
reason: 'missing-run-id',
|
|
1254
|
+
repo: repoSlug,
|
|
1255
|
+
remediation: "knowledge-code needs a session runId to scope the clone cache. Either pass `runId` in the skill input, or set the RUN_ID env var before invoking (the agent does this automatically via session-context export — see agent.md step 1b).",
|
|
1256
|
+
};
|
|
1105
1257
|
}
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
fs.rmSync(tmpRoot, { recursive: true, force: true });
|
|
1110
|
-
}
|
|
1111
|
-
catch { /* ignore */ }
|
|
1258
|
+
const cloneRef = ref ?? 'HEAD';
|
|
1259
|
+
const cloneResult = ensureClone(runId, repoUrl, cloneRef, gh.owner, gh.name);
|
|
1260
|
+
if (!cloneResult.ok) {
|
|
1112
1261
|
const auditMetadata = { phase: 'what', repo: repoSlug, mode: 'brownfield-clone-failed', repo_status: 'connected', okr_id: okrId };
|
|
1113
1262
|
return {
|
|
1114
1263
|
ok: false,
|
|
1115
1264
|
reason: 'clone-failed',
|
|
1116
1265
|
repo: repoSlug,
|
|
1117
|
-
remediation: `git clone failed for ${repoUrl}. Verify the GitHub App install is approved on this repo and the ref (${cloneRef}) exists. Underlying error: ${
|
|
1266
|
+
remediation: `git clone failed for ${repoUrl}. Verify the GitHub App install is approved on this repo and the ref (${cloneRef}) exists. Underlying error: ${cloneResult.error ?? 'unknown'}`,
|
|
1118
1267
|
auditMetadata,
|
|
1119
1268
|
};
|
|
1120
1269
|
}
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
try {
|
|
1124
|
-
sha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd: cloneTarget, encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] }).trim();
|
|
1125
|
-
}
|
|
1126
|
-
catch { /* sha stays empty */ }
|
|
1270
|
+
const cloneTarget = cloneResult.path;
|
|
1271
|
+
const sha = cloneResult.sha;
|
|
1127
1272
|
const cap = maxFiles ?? 200;
|
|
1128
|
-
const
|
|
1129
|
-
const structure = classifyRepo(
|
|
1273
|
+
const filesRaw = walkRepo(cloneTarget, cap);
|
|
1274
|
+
const structure = classifyRepo(filesRaw);
|
|
1130
1275
|
// Best-effort entrypoint detection from the most-common manifest +
|
|
1131
1276
|
// top-level layout. Conservative: only mark something as an entrypoint
|
|
1132
1277
|
// when we have positive signal (manifest field OR conventional path).
|
|
@@ -1166,13 +1311,16 @@ const handleKnowledgeCode = async (input) => {
|
|
|
1166
1311
|
catch { /* manifest unreadable / non-JSON; skip */ }
|
|
1167
1312
|
}
|
|
1168
1313
|
}
|
|
1169
|
-
//
|
|
1170
|
-
//
|
|
1171
|
-
|
|
1172
|
-
fs.rmSync(tmpRoot, { recursive: true, force: true });
|
|
1173
|
-
}
|
|
1174
|
-
catch { /* ignore */ }
|
|
1314
|
+
// Bug-Q phase 2 — DO NOT delete the clone here. `knowledge-code-read`
|
|
1315
|
+
// will reuse it through `ensureClone`. Workflow-runner tmpdir is wiped
|
|
1316
|
+
// when the job ends, so cleanup happens for free at the right scope.
|
|
1175
1317
|
const primaryLanguage = Object.entries(structure.languages).sort((a, b) => b[1] - a[1])[0]?.[0] ?? 'unknown';
|
|
1318
|
+
// Bug-Q phase 2 — surface the file/test/route/module inventory in the
|
|
1319
|
+
// audit payload so the workflow path-citation gate can cross-check
|
|
1320
|
+
// every brownfield path cited in code-design.md against what actually
|
|
1321
|
+
// exists in the clone. `inventory_paths` is the flat list of file
|
|
1322
|
+
// paths (sorted) the workflow uses as its membership set.
|
|
1323
|
+
const inventoryPaths = structure.files.map(f => f.path).sort();
|
|
1176
1324
|
const auditMetadata = {
|
|
1177
1325
|
phase: 'what',
|
|
1178
1326
|
repo: repoSlug,
|
|
@@ -1180,9 +1328,15 @@ const handleKnowledgeCode = async (input) => {
|
|
|
1180
1328
|
repo_status: 'connected',
|
|
1181
1329
|
okr_id: okrId,
|
|
1182
1330
|
sha: sha.slice(0, 12),
|
|
1183
|
-
file_count:
|
|
1331
|
+
file_count: filesRaw.length,
|
|
1184
1332
|
primary_language: primaryLanguage,
|
|
1185
1333
|
manifests: structure.packageManifests.length,
|
|
1334
|
+
test_count: structure.tests.length,
|
|
1335
|
+
route_count: structure.routes.length,
|
|
1336
|
+
module_count: structure.modules.length,
|
|
1337
|
+
// Inventory: flat path list — bounded by the `maxFiles` cap above.
|
|
1338
|
+
// Workflow gate consumes this to validate cited paths.
|
|
1339
|
+
inventory_paths: inventoryPaths,
|
|
1186
1340
|
};
|
|
1187
1341
|
return {
|
|
1188
1342
|
ok: true,
|
|
@@ -1194,6 +1348,147 @@ const handleKnowledgeCode = async (input) => {
|
|
|
1194
1348
|
};
|
|
1195
1349
|
};
|
|
1196
1350
|
// ─────────────────────────────────────────────────────────────────────
|
|
1351
|
+
// knowledge-code-read — Bug-Q phase 2 (Codex audit round 2 / B1).
|
|
1352
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
1353
|
+
// `knowledge-code` returns structural metadata; this skill returns
|
|
1354
|
+
// bounded file CONTENTS so the agent can ground design with real code,
|
|
1355
|
+
// not paraphrased guesses. Same session-scoped clone cache as
|
|
1356
|
+
// `knowledge-code` — the read is essentially free after the initial
|
|
1357
|
+
// clone.
|
|
1358
|
+
//
|
|
1359
|
+
// SECURITY PERIMETER: the runner only reads paths that resolve INSIDE
|
|
1360
|
+
// the cloned repo. Path-traversal attempts (`../`, absolute paths) are
|
|
1361
|
+
// rejected without reading bytes. The clone is a shallow git clone in
|
|
1362
|
+
// an isolated tmpdir; even if a malicious file in the repo contained
|
|
1363
|
+
// a symlink to /etc/passwd, the `realpath` check below would refuse.
|
|
1364
|
+
//
|
|
1365
|
+
// CONTENT BOUNDS: max 10 KB per response; binary files (a NUL byte in the first 8 KB)
|
|
1366
|
+
// rejected. The agent is meant to read CODE, not blobs.
|
|
1367
|
+
//
|
|
1368
|
+
// AUDIT: every read auto-emits a skill_call event with file + bytes
|
|
1369
|
+
// returned, so the chain captures exactly which files the agent
|
|
1370
|
+
// consulted while writing the design.
|
|
1371
|
+
const KnowledgeCodeReadInput = zod_1.z.object({
    okrId: zod_1.z.string().min(1),
    runId: zod_1.z.string().min(1).optional(),
    repoUrl: zod_1.z.string().min(1),
    ref: zod_1.z.string().optional(),
    filePath: zod_1.z.string().min(1),
});
const KNOWLEDGE_CODE_READ_MAX_BYTES = 10_240; // 10 KB cap per response
/**
 * Skill handler: return bounded text content of one file from the session's
 * cached clone.
 *
 * Flow: validate input → resolve runId (input, then RUN_ID env) → reject
 * absolute / parent-escaping paths → `ensureClone` → realpath containment
 * check → reject directories and binary files → return at most
 * `KNOWLEDGE_CODE_READ_MAX_BYTES` bytes decoded as UTF-8.
 *
 * Every failure is reported as `{ ok: false, reason, ... }`.
 * `bytesReturned` / `bytes_returned` are BYTE counts of the returned slice
 * (not string length), so they are directly comparable with `bytesTotal`.
 *
 * @param {unknown} input Raw skill input; validated against `KnowledgeCodeReadInput`.
 * @returns {Promise<object>} Result envelope; on success includes `content`,
 *   `lang`, `lineCount`, `truncated`, byte counts and `auditMetadata`.
 */
const handleKnowledgeCodeRead = async (input) => {
    const parsed = KnowledgeCodeReadInput.safeParse(input);
    if (!parsed.success) {
        return { ok: false, reason: `bad-input: ${parsed.error.message}` };
    }
    const { okrId, repoUrl, ref, filePath } = parsed.data;
    const gh = parseGithubUrl(repoUrl);
    if (!gh) {
        return { ok: false, reason: 'repo-url-not-github', repo: repoUrl };
    }
    const repoSlug = `${gh.owner}/${gh.name}`;
    const runId = parsed.data.runId ?? process.env.RUN_ID;
    if (!runId) {
        return {
            ok: false,
            reason: 'missing-run-id',
            remediation: "knowledge-code-read needs a session runId to find the clone cache shared with knowledge-code. Pass `runId` in input or set the RUN_ID env var (the agent does this via session-context export).",
        };
    }
    // Security perimeter — reject obvious escape attempts BEFORE touching
    // the filesystem so the audit chain captures the rejection cleanly.
    if (path.isAbsolute(filePath)) {
        return { ok: false, reason: `path-rejected: absolute paths are forbidden (${filePath})` };
    }
    // `path.normalize` collapses inner `..` segments (`foo/../../bar` →
    // `../bar`), so an escaping path ends up with a leading `..` SEGMENT.
    // Match the segment, not the prefix, so names like `..foo` stay readable.
    const normalized = path.normalize(filePath);
    const escapesRoot = normalized === '..'
        || normalized.startsWith(`..${path.sep}`)
        || normalized.includes(`${path.sep}..${path.sep}`);
    if (escapesRoot) {
        return { ok: false, reason: `path-rejected: path-traversal segments forbidden (${filePath} -> ${normalized})` };
    }
    // Reuse the cached clone from knowledge-code; clone fresh if missing
    // (e.g. agent called knowledge-code-read without calling knowledge-
    // code first — supported but slower).
    const cloneResult = ensureClone(runId, repoUrl, ref ?? 'HEAD', gh.owner, gh.name);
    if (!cloneResult.ok) {
        return {
            ok: false,
            reason: 'clone-failed',
            repo: repoSlug,
            remediation: `Could not access clone for ${repoUrl}. Underlying error: ${cloneResult.error ?? 'unknown'}`,
        };
    }
    const absPath = path.join(cloneResult.path, normalized);
    // Final paranoia check — resolve the real path and verify it's still
    // a child of the clone root. Defends against symlink-shaped escapes
    // (an attacker-controlled file in the repo that's a symlink to /etc).
    let realPath;
    try {
        realPath = fs.realpathSync.native(absPath);
    }
    catch {
        return { ok: false, reason: `file-not-found: ${filePath} not in ${repoSlug}@${cloneResult.sha.slice(0, 12)}` };
    }
    let realClone;
    try {
        realClone = fs.realpathSync.native(cloneResult.path);
    }
    catch (err) {
        // The clone root vanished between ensureClone and here; report it
        // through the result envelope rather than rejecting the promise.
        return { ok: false, reason: `read-failed: ${err instanceof Error ? err.message : String(err)}` };
    }
    if (!realPath.startsWith(realClone + path.sep) && realPath !== realClone) {
        return { ok: false, reason: `path-escape: resolved path falls outside the cloned repo (${filePath} -> ${realPath})` };
    }
    let stat;
    try {
        stat = fs.statSync(realPath);
    }
    catch {
        return { ok: false, reason: `file-not-found: ${filePath}` };
    }
    if (stat.isDirectory()) {
        return { ok: false, reason: `path-is-directory: ${filePath} is a directory; knowledge-code-read returns file contents only` };
    }
    // Read + truncate + reject binary.
    let buf;
    try {
        buf = fs.readFileSync(realPath);
    }
    catch (err) {
        return { ok: false, reason: `read-failed: ${err instanceof Error ? err.message : String(err)}` };
    }
    // Heuristic: a NUL byte in the first 8 KB is a strong binary signal.
    // Strings of bytes that legitimately contain NUL bytes (gzip, images,
    // wasm) are not source code; refuse them.
    if (buf.subarray(0, Math.min(buf.length, 8192)).includes(0)) {
        return { ok: false, reason: `binary-file: ${filePath} contains NUL bytes; knowledge-code-read returns text only` };
    }
    const totalBytes = buf.length;
    const truncated = totalBytes > KNOWLEDGE_CODE_READ_MAX_BYTES;
    let bytesReturned = totalBytes;
    if (truncated) {
        bytesReturned = KNOWLEDGE_CODE_READ_MAX_BYTES;
        // Don't cut through a multi-byte UTF-8 sequence: if the first dropped
        // byte is a continuation byte (10xxxxxx), back up to the lead byte.
        // Bounded to 3 steps (max UTF-8 sequence length is 4 bytes).
        for (let step = 0; step < 3 && bytesReturned > 0 && (buf[bytesReturned] & 0xc0) === 0x80; step += 1) {
            bytesReturned -= 1;
        }
    }
    const content = buf.subarray(0, bytesReturned).toString('utf8');
    const lang = LANG_EXTS[path.extname(filePath).toLowerCase()] ?? 'unknown';
    const lineCount = content.split('\n').length;
    const auditMetadata = {
        phase: 'what',
        repo: repoSlug,
        file: normalized,
        sha: cloneResult.sha.slice(0, 12),
        bytes_returned: bytesReturned,
        bytes_total: totalBytes,
        truncated,
        lang,
        okr_id: okrId,
    };
    return {
        ok: true,
        repo: repoSlug,
        file: normalized,
        sha: cloneResult.sha,
        content,
        lang,
        lineCount,
        truncated,
        bytesReturned,
        bytesTotal: totalBytes,
        auditMetadata,
    };
};
|
|
1491
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
1197
1492
|
// Search skills — thin wrappers over the existing search nodes
|
|
1198
1493
|
// ─────────────────────────────────────────────────────────────────────
|
|
1199
1494
|
const SearchQueriesInput = zod_1.z.object({
|
|
@@ -1230,6 +1525,52 @@ function detectAllQueriesFailed(envelopes, skill) {
|
|
|
1230
1525
|
// pattern matching of firewall-block vs query-quality failures.
|
|
1231
1526
|
return `all-queries-failed: ${skill} — ${firstError}`;
|
|
1232
1527
|
}
|
|
1528
|
+
/**
|
|
1529
|
+
* Bug-Q phase 3 (Codex audit follow-up / oracle evidence) — search
|
|
1530
|
+
* audit metadata now carries a bounded preview of WHICH results came
|
|
1531
|
+
* back, not just HOW MANY. Without this, a reviewer who wants to
|
|
1532
|
+
* verify "S-3 cites a real arXiv paper, not a hallucinated one"
|
|
1533
|
+
* has nothing in the chain to verify against — they'd have to trust
|
|
1534
|
+
* the agent's research-doc citations and re-run the search.
|
|
1535
|
+
*
|
|
1536
|
+
* Preview shape per hit: { provider, query, title, url, snippet?,
|
|
1537
|
+
* score?, publishedDate? } where:
|
|
1538
|
+
* - snippet is truncated to ~200 chars (the ProviderResult.content
|
|
1539
|
+
* field already caps at ~500; we shorten further for chain size)
|
|
1540
|
+
* - score is rounded to 2 decimals
|
|
1541
|
+
*
|
|
1542
|
+
* Total preview cap: 25 hits per skill_call. Search runs typically
|
|
1543
|
+
* return 10-30 results per provider before dedupe; the cap keeps the
|
|
1544
|
+
* audit JSONL compact while still proving "real evidence behind every
|
|
1545
|
+
* citation."
|
|
1546
|
+
*/
|
|
1547
|
+
const SEARCH_RESULTS_PREVIEW_CAP = 25;
const SEARCH_SNIPPET_CAP = 200;
/**
 * Build the audit-metadata payload shared by the search skills.
 *
 * @param {string[]} queries Queries that were issued.
 * @param {Array<object>} results Search hits; each may carry `provider`,
 *   `fromQuery`, `title`, `url`, `content`, `score`, `publishedDate`.
 * @returns {{ queries: string[], result_count: number, results_preview: object[] }}
 *   `result_count` is the full hit count; `results_preview` holds at most
 *   `SEARCH_RESULTS_PREVIEW_CAP` entries. Per entry, `snippet` (whitespace
 *   collapsed, capped at `SEARCH_SNIPPET_CAP` UTF-16 units plus `…`), `score`
 *   (rounded to 2 decimals) and `publishedDate` are included only when present.
 */
function buildSearchAuditMetadata(queries, results) {
    const preview = results.slice(0, SEARCH_RESULTS_PREVIEW_CAP).map((r) => {
        const flattened = (r.content || '').replace(/\s+/g, ' ').trim();
        let snippet = flattened;
        if (flattened.length > SEARCH_SNIPPET_CAP) {
            let cut = SEARCH_SNIPPET_CAP;
            // Never end the snippet on the first half of a surrogate pair —
            // that would leave an unpaired surrogate in the audit record.
            const lastUnit = flattened.charCodeAt(cut - 1);
            if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
                cut -= 1;
            }
            snippet = `${flattened.slice(0, cut)}…`;
        }
        const entry = {
            provider: r.provider,
            query: r.fromQuery,
            title: r.title,
            url: r.url,
        };
        if (snippet) {
            entry.snippet = snippet;
        }
        // Number.isFinite does not coerce, so it also rejects non-numbers.
        if (Number.isFinite(r.score)) {
            entry.score = Math.round(r.score * 100) / 100;
        }
        if (r.publishedDate) {
            entry.publishedDate = r.publishedDate;
        }
        return entry;
    });
    return { queries, result_count: results.length, results_preview: preview };
}
|
|
1233
1574
|
const handleTavilySearch = async (input) => {
|
|
1234
1575
|
const parsed = SearchQueriesInput.safeParse(input);
|
|
1235
1576
|
if (!parsed.success) {
|
|
@@ -1245,7 +1586,7 @@ const handleTavilySearch = async (input) => {
|
|
|
1245
1586
|
queries: parsed.data.queries,
|
|
1246
1587
|
maxResultsPerQuery: parsed.data.maxResults,
|
|
1247
1588
|
});
|
|
1248
|
-
const auditMetadata =
|
|
1589
|
+
const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
|
|
1249
1590
|
const failure = detectAllQueriesFailed(res.envelopes, 'tavily-search');
|
|
1250
1591
|
if (failure) {
|
|
1251
1592
|
return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
|
|
@@ -1266,7 +1607,7 @@ const handleArxivSearch = async (input) => {
|
|
|
1266
1607
|
queries: parsed.data.queries,
|
|
1267
1608
|
maxResultsPerQuery: parsed.data.maxResults,
|
|
1268
1609
|
});
|
|
1269
|
-
const auditMetadata =
|
|
1610
|
+
const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
|
|
1270
1611
|
const failure = detectAllQueriesFailed(res.envelopes, 'arxiv-search');
|
|
1271
1612
|
if (failure) {
|
|
1272
1613
|
return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
|
|
@@ -1292,7 +1633,7 @@ const handleUsptoSearch = async (input) => {
|
|
|
1292
1633
|
queries: parsed.data.queries,
|
|
1293
1634
|
maxResultsPerQuery: parsed.data.maxResults,
|
|
1294
1635
|
});
|
|
1295
|
-
const auditMetadata =
|
|
1636
|
+
const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
|
|
1296
1637
|
const failure = detectAllQueriesFailed(res.envelopes, 'uspto-search');
|
|
1297
1638
|
if (failure) {
|
|
1298
1639
|
return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
|
|
@@ -1313,7 +1654,7 @@ const handleHackerNewsSearch = async (input) => {
|
|
|
1313
1654
|
queries: parsed.data.queries,
|
|
1314
1655
|
hitsPerQuery: parsed.data.maxResults,
|
|
1315
1656
|
});
|
|
1316
|
-
const auditMetadata =
|
|
1657
|
+
const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
|
|
1317
1658
|
const failure = detectAllQueriesFailed(res.envelopes, 'hackernews-search');
|
|
1318
1659
|
if (failure) {
|
|
1319
1660
|
return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
|
|
@@ -2050,6 +2391,12 @@ exports.SKILLS = {
|
|
|
2050
2391
|
// targetCodeRepoStatus: brownfield (clone + classify), greenfield
|
|
2051
2392
|
// (scaffolding hints, no clone), refuse (not-connected / unreachable).
|
|
2052
2393
|
'knowledge-code': handleKnowledgeCode,
|
|
2394
|
+
// Bug-Q phase 2 — knowledge-code-read returns bounded file CONTENT
|
|
2395
|
+
// from the brownfield clone retained by knowledge-code. Lets the
|
|
2396
|
+
// agent ground design decisions in real code excerpts (Codex audit
|
|
2397
|
+
// round 2 / B1: agent was hallucinating brownfield file paths
|
|
2398
|
+
// because the substrate was structural metadata only).
|
|
2399
|
+
'knowledge-code-read': handleKnowledgeCodeRead,
|
|
2053
2400
|
'tavily-search': handleTavilySearch,
|
|
2054
2401
|
'arxiv-search': handleArxivSearch,
|
|
2055
2402
|
'uspto-search': handleUsptoSearch,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@maintainabilityai/research-runner",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.45",
|
|
4
4
|
"description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "MaintainabilityAI",
|