@maintainabilityai/research-runner 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,6 +77,43 @@ LLM provider/model, token count, cost, grounding score, and audit chain hash.
77
77
  Auditors verify the artifact by re-running the chain against the recorded
78
78
  mesh sha.
79
79
 
80
+ ## Versioning + workflow-template pin scheme
81
+
82
+ The mesh-deployed workflow templates pin this package with a **tilde range**:
83
+
84
+ ```
85
+ npx -y @maintainabilityai/research-runner@~0.1.42 skill-<name>
86
+ ```
87
+
88
+ `~0.1.42` allows patch releases (`0.1.43`, `0.1.44`, …) but not minor
89
+ or major bumps. The reasons:
90
+
91
+ 1. **Auto-publish bumps patch on every merge.** The
92
+ `npm-publish-research-runner.yml` workflow runs `npm version patch`
93
+ when anything under `packages/research-runner/**` changes. A new
94
+ patch is published within minutes of merge.
95
+ 2. **Templates pinned exactly would force a follow-up edit on every
96
+ patch.** With `@0.1.42` (exact), every patch bump would leave the
97
+ templates stale until someone edited them. With `~0.1.42`, the
98
+ templates carry on transparently.
99
+ 3. **A minor bump is a deliberate review event.** When the runner ships
100
+ a contract change (new event field, new skill API shape, removed
101
+ field), bump `version` from `0.1.x` to `0.2.0` and update the
102
+ templates in the same PR. A `phaseSpec.test.ts` parity test fails
103
+ loudly when the templates' major.minor doesn't match `package.json`.
104
+
105
+ **When you change anything under `packages/research-runner/**`:** you
106
+ do NOT need to edit workflow templates. The auto-publish handles it.
107
+ **When you ship a contract-breaking change:** bump the minor version
108
+ in `packages/research-runner/package.json` AND update every
109
+ `@maintainabilityai/research-runner@~0.X.Y` reference in
110
+ `vscode-extension/code-templates/**` to match. Tests enforce this.
111
+
112
+ The tilde range eliminates an off-by-one risk: a developer trying to
113
+ mentally compute "what patch will the auto-publish produce" and pinning
114
+ to the wrong value. With tilde, the patch resolves at run-time from
115
+ npm, and the mental math goes away.
116
+
80
117
  ## License
81
118
 
82
119
  MIT
@@ -902,6 +902,11 @@ const handleSelfReviewCodeSecurity = makeCodeReviewHandler('code-security');
902
902
  // ─────────────────────────────────────────────────────────────────────
903
903
  const KnowledgeCodeInput = zod_1.z.object({
904
904
  okrId: zod_1.z.string().min(1),
905
+ // Bug-Q phase 2 — `runId` is the cache key for the clone retained
906
+ // between this skill and `knowledge-code-read`. Falls back to the
907
+ // RUN_ID env var when omitted (the runner already sets it from
908
+ // session context); failing both yields a clear error.
909
+ runId: zod_1.z.string().min(1).optional(),
905
910
  repoUrl: zod_1.z.string().min(1),
906
911
  repoStatus: zod_1.z.enum(['connected', 'not-connected', 'create', 'unreachable']),
907
912
  ref: zod_1.z.string().optional(),
@@ -980,34 +985,187 @@ function walkRepo(rootDir, maxFiles) {
980
985
  recurse(rootDir, '');
981
986
  return out;
982
987
  }
988
+ // ─────────────────────────────────────────────────────────────────────
989
+ // Bug-Q phase 2 — brownfield clone cache.
990
+ // ─────────────────────────────────────────────────────────────────────
991
+ // Until phase 2, `knowledge-code` cloned + classified + deleted the
992
+ // tree in one invocation. That left the agent with structural metadata
993
+ // only — no way to read actual file contents to ground its design.
994
+ // Codex audit round 2 (B1) flagged this: the prompt asks for
995
+ // `src/state/profileStore.ts`-level paths against a substrate that
996
+ // returns only top-dirs and language counts.
997
+ //
998
+ // Phase 2 splits the lifecycle:
999
+ // 1. `knowledge-code` clones + walks + classifies + RETAINS the clone
1000
+ // in a per-runId tmpdir cache.
1001
+ // 2. `knowledge-code-read` reads files FROM that cache (with a
1002
+ // content-addressable re-clone fallback so a stale or expired
1003
+ // cache doesn't break the agent).
1004
+ //
1005
+ // The cache key is `(runId, owner, name)` — one clone per (session, repo)
1006
+ // pair. Workflow runners are sandboxed per-job so tmpdir starts empty,
1007
+ // so cross-run pollution is impossible. Local dev / tests can leave a stale
1008
+ // cache; `.cache-meta.json` carries `ref` + `sha` so the read skill can
1009
+ // detect staleness and re-clone.
1010
+ //
1011
+ // SECURITY: `knowledge-code-read` enforces a path perimeter — relative
1012
+ // paths only, no `..` segments, resolved path must be a child of the
1013
+ // clone root. Any escape attempt is rejected without reading bytes.
1014
+ function knowledgeCodeCacheDir(runId, owner, name) {
1015
+ // Cache key is built from runId / owner / name as-is (no sanitization
1016
+ // here) — NOTE(review): assumes callers pass short, filesystem-safe values.
1017
+ return path.join(os.tmpdir(), 'knowledge-code-cache', runId, `${owner}-${name}`);
1018
+ }
1019
+ function ensureClone(runId, repoUrl, ref, owner, name) {
1020
+ const cacheDir = knowledgeCodeCacheDir(runId, owner, name);
1021
+ const metaPath = path.join(cacheDir, '.cache-meta.json');
1022
+ // Cache hit if meta exists AND ref matches.
1023
+ if (fs.existsSync(metaPath)) {
1024
+ try {
1025
+ const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
1026
+ if (meta.ref === ref && typeof meta.sha === 'string') {
1027
+ return { ok: true, path: cacheDir, sha: meta.sha, reused: true };
1028
+ }
1029
+ }
1030
+ catch { /* unreadable meta — re-clone */ }
1031
+ }
1032
+ // Clean out a stale cache before re-cloning to avoid mixing two refs.
1033
+ try {
1034
+ fs.rmSync(cacheDir, { recursive: true, force: true });
1035
+ }
1036
+ catch { /* ignore */ }
1037
+ fs.mkdirSync(path.dirname(cacheDir), { recursive: true });
1038
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
1039
+ const { execFileSync } = require('node:child_process');
1040
+ const cloneArgs = ['clone', '--depth=1', '--filter=blob:limit=10m'];
1041
+ if (ref && ref !== 'HEAD') {
1042
+ cloneArgs.push('--branch', ref);
1043
+ }
1044
+ cloneArgs.push(repoUrl, cacheDir);
1045
+ try {
1046
+ execFileSync('git', cloneArgs, { stdio: ['ignore', 'pipe', 'pipe'], timeout: 60_000 });
1047
+ }
1048
+ catch (err) {
1049
+ try {
1050
+ fs.rmSync(cacheDir, { recursive: true, force: true });
1051
+ }
1052
+ catch { /* ignore */ }
1053
+ return { ok: false, path: '', sha: '', reused: false, error: err instanceof Error ? err.message : String(err) };
1054
+ }
1055
+ let sha = '';
1056
+ try {
1057
+ sha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd: cacheDir, encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] }).trim();
1058
+ }
1059
+ catch { /* sha stays empty */ }
1060
+ try {
1061
+ fs.writeFileSync(metaPath, JSON.stringify({ owner, name, ref, sha, clonedAt: new Date().toISOString() }), 'utf8');
1062
+ }
1063
+ catch { /* meta write failure is non-fatal — next call will just re-clone */ }
1064
+ return { ok: true, path: cacheDir, sha, reused: false };
1065
+ }
1066
+ function classifyRole(filePath, lang) {
1067
+ const lower = filePath.toLowerCase();
1068
+ // Tests — broadest match wins. `__tests__/` dir, `.test.`, `.spec.`,
1069
+ // or top-level `test/` / `tests/`.
1070
+ if (/(^|\/)__tests__\//.test(lower)
1071
+ || /\.(test|spec)\.(t|j)sx?$/.test(lower)
1072
+ || /\.(test|spec)\.py$/.test(lower)
1073
+ || /^test(s)?\//.test(lower)) {
1074
+ return 'test';
1075
+ }
1076
+ // Routes — files in `routes/`, `pages/` (Next), `app/` (Next/Nuxt
1077
+ // app router), or files named `*.route(s).*`.
1078
+ if (/(^|\/)(routes|pages|app)\//.test(lower)
1079
+ || /\.routes?\.(t|j)sx?$/.test(lower)) {
1080
+ return 'route';
1081
+ }
1082
+ // Docs — `.md`, or anything in `docs/` / `doc/`.
1083
+ if (/\.md$/i.test(lower)
1084
+ || /^docs?\//.test(lower)) {
1085
+ return 'doc';
1086
+ }
1087
+ // Config — top-level YAML/JSON/TOML config files, manifests,
1088
+ // dot-files (matched on basename, so at any depth).
1089
+ const base = path.basename(filePath);
1090
+ if (MANIFEST_FILES.has(base)
1091
+ || /^\.[\w.-]+$/.test(base) // .eslintrc, .gitignore, …
1092
+ || /(^|\/)tsconfig(\.[^.]+)?\.json$/.test(lower)
1093
+ || /(^|\/)[^/]+\.config\.(t|j)sx?$/.test(lower)) {
1094
+ return 'config';
1095
+ }
1096
+ if (lang && lang !== 'unknown') {
1097
+ return 'source';
1098
+ }
1099
+ return 'other';
1100
+ }
983
1101
  /**
984
1102
  * Guess the primary BAR-level language + framework from the manifest +
985
- * file mix. For greenfield scaffolding the agent can override these from
986
- * BAR-app.yaml calm-node hints; this is just the brownfield read.
1103
+ * file mix, AND surface bounded file/test/route/module inventories the
1104
+ * agent + workflow gate can use to ground brownfield decisions.
1105
+ *
1106
+ * Bug-Q phase 2 (Codex audit round 2 / B1) extended the return shape
1107
+ * with `files[]`, `tests[]`, `routes[]`, `modules[]`. Before phase 2,
1108
+ * the only structural outputs were `topDirs` + `languages` + manifest
1109
+ * count — enough for the agent to KNOW what kind of repo it was, not
1110
+ * enough to GROUND specific file-level design choices.
987
1111
  */
988
- function classifyRepo(files) {
1112
+ function classifyRepo(filesRaw) {
989
1113
  const topDirs = new Set();
990
1114
  const languages = {};
991
1115
  const packageManifests = [];
992
- for (const f of files) {
1116
+ const files = [];
1117
+ const tests = [];
1118
+ const routes = [];
1119
+ const moduleCounts = {};
1120
+ for (const f of filesRaw) {
993
1121
  const slashIdx = f.indexOf('/');
994
1122
  if (slashIdx > 0) {
995
1123
  topDirs.add(f.slice(0, slashIdx));
996
1124
  }
997
1125
  const ext = path.extname(f).toLowerCase();
998
- const lang = LANG_EXTS[ext];
999
- if (lang) {
1126
+ const lang = LANG_EXTS[ext] ?? 'unknown';
1127
+ if (LANG_EXTS[ext]) {
1000
1128
  languages[lang] = (languages[lang] ?? 0) + 1;
1001
1129
  }
1002
1130
  const base = path.basename(f);
1003
1131
  if (MANIFEST_FILES.has(base)) {
1004
1132
  packageManifests.push(f);
1005
1133
  }
1134
+ const role = classifyRole(f, lang);
1135
+ files.push({ path: f, lang, role });
1136
+ if (role === 'test') {
1137
+ tests.push(f);
1138
+ }
1139
+ if (role === 'route') {
1140
+ routes.push(f);
1141
+ }
1142
+ // Modules — top-level subdirectory of `src/` if present, otherwise
1143
+ // top-level repo subdir. Skips files at the repo root (those aren't
1144
+ // module-organized).
1145
+ const srcMatch = /^src\/([^/]+)\//.exec(f);
1146
+ if (srcMatch) {
1147
+ moduleCounts[srcMatch[1]] = (moduleCounts[srcMatch[1]] ?? 0) + 1;
1148
+ }
1149
+ else if (slashIdx > 0) {
1150
+ const topDir = f.slice(0, slashIdx);
1151
+ // Avoid double-counting top-level dirs that are clearly not
1152
+ // modules (tests, docs, config dirs, infra dirs).
1153
+ if (!['tests', 'test', '__tests__', 'docs', 'doc', '.github', '.vscode', 'scripts'].includes(topDir)) {
1154
+ moduleCounts[topDir] = (moduleCounts[topDir] ?? 0) + 1;
1155
+ }
1156
+ }
1006
1157
  }
1158
+ const modules = Object.entries(moduleCounts)
1159
+ .map(([name, fileCount]) => ({ name, fileCount }))
1160
+ .sort((a, b) => b.fileCount - a.fileCount);
1007
1161
  return {
1008
1162
  topDirs: Array.from(topDirs).sort(),
1009
1163
  languages,
1010
1164
  packageManifests: packageManifests.sort(),
1165
+ files,
1166
+ tests: tests.sort(),
1167
+ routes: routes.sort(),
1168
+ modules,
1011
1169
  };
1012
1170
  }
1013
1171
  /**
@@ -1078,55 +1236,42 @@ const handleKnowledgeCode = async (input) => {
1078
1236
  };
1079
1237
  }
1080
1238
  // ─── Brownfield branch (connected) ─────────────────────────────────
1081
- // Shallow git clone (`--depth=1`) into a tmp dir, walk + classify.
1082
- // Cleanup on exit (process-scoped tmpdir). On clone failure we degrade
1083
- // to a soft-refuse rather than crash — the agent can still attempt
1084
- // partial grounding from the SKILL response shape.
1239
+ // Bug-Q phase 2 uses the per-runId clone cache (`ensureClone`)
1240
+ // so `knowledge-code-read` can read the same files later in the
1241
+ // session without re-cloning. The cache stays for the runner-job
1242
+ // tmpdir lifetime (workflow runners get a clean tmpdir per job, so
1243
+ // cross-run pollution is impossible).
1085
1244
  if (!gh) {
1086
1245
  return { ok: false, reason: 'repo-url-not-github', repo: repoUrl };
1087
1246
  }
1088
- const { execFileSync } = await Promise.resolve().then(() => __importStar(require('node:child_process')));
1089
- const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), `knowledge-code-${gh.name}-`));
1090
- const cloneTarget = path.join(tmpRoot, gh.name);
1091
- const cloneRef = ref ?? 'HEAD';
1092
- const cloneArgs = ['clone', '--depth=1', '--filter=blob:limit=10m'];
1093
- if (ref && ref !== 'HEAD') {
1094
- cloneArgs.push('--branch', ref);
1095
- }
1096
- cloneArgs.push(repoUrl, cloneTarget);
1097
- let cloneOk = true;
1098
- let cloneError = '';
1099
- try {
1100
- execFileSync('git', cloneArgs, { stdio: ['ignore', 'pipe', 'pipe'], timeout: 60_000 });
1101
- }
1102
- catch (err) {
1103
- cloneOk = false;
1104
- cloneError = err instanceof Error ? err.message : String(err);
1247
+ // Resolve the session runId — explicit input wins; fall back to
1248
+ // RUN_ID env var (the runner sets this from session context).
1249
+ const runId = parsed.data.runId ?? process.env.RUN_ID;
1250
+ if (!runId) {
1251
+ return {
1252
+ ok: false,
1253
+ reason: 'missing-run-id',
1254
+ repo: repoSlug,
1255
+ remediation: "knowledge-code needs a session runId to scope the clone cache. Either pass `runId` in the skill input, or set the RUN_ID env var before invoking (the agent does this automatically via session-context export — see agent.md step 1b).",
1256
+ };
1105
1257
  }
1106
- if (!cloneOk) {
1107
- // Clean up the empty tmpdir before bailing.
1108
- try {
1109
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1110
- }
1111
- catch { /* ignore */ }
1258
+ const cloneRef = ref ?? 'HEAD';
1259
+ const cloneResult = ensureClone(runId, repoUrl, cloneRef, gh.owner, gh.name);
1260
+ if (!cloneResult.ok) {
1112
1261
  const auditMetadata = { phase: 'what', repo: repoSlug, mode: 'brownfield-clone-failed', repo_status: 'connected', okr_id: okrId };
1113
1262
  return {
1114
1263
  ok: false,
1115
1264
  reason: 'clone-failed',
1116
1265
  repo: repoSlug,
1117
- remediation: `git clone failed for ${repoUrl}. Verify the GitHub App install is approved on this repo and the ref (${cloneRef}) exists. Underlying error: ${cloneError}`,
1266
+ remediation: `git clone failed for ${repoUrl}. Verify the GitHub App install is approved on this repo and the ref (${cloneRef}) exists. Underlying error: ${cloneResult.error ?? 'unknown'}`,
1118
1267
  auditMetadata,
1119
1268
  };
1120
1269
  }
1121
- // Resolve the actual SHA so the response is reproducible.
1122
- let sha = '';
1123
- try {
1124
- sha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd: cloneTarget, encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] }).trim();
1125
- }
1126
- catch { /* sha stays empty */ }
1270
+ const cloneTarget = cloneResult.path;
1271
+ const sha = cloneResult.sha;
1127
1272
  const cap = maxFiles ?? 200;
1128
- const files = walkRepo(cloneTarget, cap);
1129
- const structure = classifyRepo(files);
1273
+ const filesRaw = walkRepo(cloneTarget, cap);
1274
+ const structure = classifyRepo(filesRaw);
1130
1275
  // Best-effort entrypoint detection from the most-common manifest +
1131
1276
  // top-level layout. Conservative: only mark something as an entrypoint
1132
1277
  // when we have positive signal (manifest field OR conventional path).
@@ -1166,13 +1311,16 @@ const handleKnowledgeCode = async (input) => {
1166
1311
  catch { /* manifest unreadable / non-JSON; skip */ }
1167
1312
  }
1168
1313
  }
1169
- // Clean up the cloned tree — the SKILL is a one-shot read, no need to
1170
- // keep ~10MB of git data per invocation.
1171
- try {
1172
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1173
- }
1174
- catch { /* ignore */ }
1314
+ // Bug-Q phase 2 — DO NOT delete the clone here. `knowledge-code-read`
1315
+ // will reuse it through `ensureClone`. Workflow-runner tmpdir is wiped
1316
+ // when the job ends, so cleanup happens for free at the right scope.
1175
1317
  const primaryLanguage = Object.entries(structure.languages).sort((a, b) => b[1] - a[1])[0]?.[0] ?? 'unknown';
1318
+ // Bug-Q phase 2 — surface the file/test/route/module inventory in the
1319
+ // audit payload so the workflow path-citation gate can cross-check
1320
+ // every brownfield path cited in code-design.md against what actually
1321
+ // exists in the clone. `inventory_paths` is the flat list of file
1322
+ // paths (sorted) the workflow uses as its membership set.
1323
+ const inventoryPaths = structure.files.map(f => f.path).sort();
1176
1324
  const auditMetadata = {
1177
1325
  phase: 'what',
1178
1326
  repo: repoSlug,
@@ -1180,9 +1328,15 @@ const handleKnowledgeCode = async (input) => {
1180
1328
  repo_status: 'connected',
1181
1329
  okr_id: okrId,
1182
1330
  sha: sha.slice(0, 12),
1183
- file_count: files.length,
1331
+ file_count: filesRaw.length,
1184
1332
  primary_language: primaryLanguage,
1185
1333
  manifests: structure.packageManifests.length,
1334
+ test_count: structure.tests.length,
1335
+ route_count: structure.routes.length,
1336
+ module_count: structure.modules.length,
1337
+ // Inventory: flat path list — bounded by the `maxFiles` cap above.
1338
+ // Workflow gate consumes this to validate cited paths.
1339
+ inventory_paths: inventoryPaths,
1186
1340
  };
1187
1341
  return {
1188
1342
  ok: true,
@@ -1194,6 +1348,147 @@ const handleKnowledgeCode = async (input) => {
1194
1348
  };
1195
1349
  };
1196
1350
  // ─────────────────────────────────────────────────────────────────────
1351
+ // knowledge-code-read — Bug-Q phase 2 (Codex audit round 2 / B1).
1352
+ // ─────────────────────────────────────────────────────────────────────
1353
+ // `knowledge-code` returns structural metadata; this skill returns
1354
+ // bounded file CONTENTS so the agent can ground design with real code,
1355
+ // not paraphrased guesses. Same session-scoped clone cache as
1356
+ // `knowledge-code` — the read is essentially free after the initial
1357
+ // clone.
1358
+ //
1359
+ // SECURITY PERIMETER: the runner only reads paths that resolve INSIDE
1360
+ // the cloned repo. Path-traversal attempts (`../`, absolute paths) are
1361
+ // rejected without reading bytes. The clone is a shallow git clone in
1362
+ // an isolated tmpdir; even if a malicious file in the repo contained
1363
+ // a symlink to /etc/passwd, the `realpath` check below would refuse.
1364
+ //
1365
+ // CONTENT BOUNDS: max 10 KB per response; binary files (NUL in first 8 KB)
1366
+ // rejected. The agent is meant to read CODE, not blobs.
1367
+ //
1368
+ // AUDIT: every read auto-emits a skill_call event with file + bytes
1369
+ // returned, so the chain captures exactly which files the agent
1370
+ // consulted while writing the design.
1371
+ const KnowledgeCodeReadInput = zod_1.z.object({
1372
+ okrId: zod_1.z.string().min(1),
1373
+ runId: zod_1.z.string().min(1).optional(),
1374
+ repoUrl: zod_1.z.string().min(1),
1375
+ ref: zod_1.z.string().optional(),
1376
+ filePath: zod_1.z.string().min(1),
1377
+ });
1378
+ const KNOWLEDGE_CODE_READ_MAX_BYTES = 10_240; // 10 KB cap per response
1379
+ const handleKnowledgeCodeRead = async (input) => {
1380
+ const parsed = KnowledgeCodeReadInput.safeParse(input);
1381
+ if (!parsed.success) {
1382
+ return { ok: false, reason: `bad-input: ${parsed.error.message}` };
1383
+ }
1384
+ const { okrId, repoUrl, ref, filePath } = parsed.data;
1385
+ const gh = parseGithubUrl(repoUrl);
1386
+ if (!gh) {
1387
+ return { ok: false, reason: 'repo-url-not-github', repo: repoUrl };
1388
+ }
1389
+ const runId = parsed.data.runId ?? process.env.RUN_ID;
1390
+ if (!runId) {
1391
+ return {
1392
+ ok: false,
1393
+ reason: 'missing-run-id',
1394
+ remediation: "knowledge-code-read needs a session runId to find the clone cache shared with knowledge-code. Pass `runId` in input or set the RUN_ID env var (the agent does this via session-context export).",
1395
+ };
1396
+ }
1397
+ // Security perimeter — reject obvious escape attempts BEFORE touching
1398
+ // the filesystem so the audit chain captures the rejection cleanly.
1399
+ if (path.isAbsolute(filePath)) {
1400
+ return { ok: false, reason: `path-rejected: absolute paths are forbidden (${filePath})` };
1401
+ }
1402
+ // Normalize and re-check — a path like `foo/../../bar` would escape
1403
+ // the clone root even though the literal string contains no leading
1404
+ // `../`. `path.normalize` collapses it; we then reject if it starts
1405
+ // with `..`.
1406
+ const normalized = path.normalize(filePath);
1407
+ if (normalized.startsWith('..') || normalized === '..' || normalized.includes(`${path.sep}..${path.sep}`)) {
1408
+ return { ok: false, reason: `path-rejected: path-traversal segments forbidden (${filePath} -> ${normalized})` };
1409
+ }
1410
+ // Reuse the cached clone from knowledge-code; clone fresh if missing
1411
+ // (e.g. agent called knowledge-code-read without calling knowledge-
1412
+ // code first — supported but slower).
1413
+ const cloneResult = ensureClone(runId, repoUrl, ref ?? 'HEAD', gh.owner, gh.name);
1414
+ if (!cloneResult.ok) {
1415
+ return {
1416
+ ok: false,
1417
+ reason: 'clone-failed',
1418
+ repo: `${gh.owner}/${gh.name}`,
1419
+ remediation: `Could not access clone for ${repoUrl}. Underlying error: ${cloneResult.error ?? 'unknown'}`,
1420
+ };
1421
+ }
1422
+ const absPath = path.join(cloneResult.path, normalized);
1423
+ // Final paranoia check — resolve the real path and verify it's still
1424
+ // a child of the clone root. Defends against symlink-shaped escapes
1425
+ // (an attacker-controlled file in the repo that's a symlink to /etc).
1426
+ let realPath;
1427
+ try {
1428
+ realPath = fs.realpathSync.native(absPath);
1429
+ }
1430
+ catch {
1431
+ return { ok: false, reason: `file-not-found: ${filePath} not in ${gh.owner}/${gh.name}@${cloneResult.sha.slice(0, 12)}` };
1432
+ }
1433
+ const realClone = fs.realpathSync.native(cloneResult.path);
1434
+ if (!realPath.startsWith(realClone + path.sep) && realPath !== realClone) {
1435
+ return { ok: false, reason: `path-escape: resolved path falls outside the cloned repo (${filePath} -> ${realPath})` };
1436
+ }
1437
+ let stat;
1438
+ try {
1439
+ stat = fs.statSync(realPath);
1440
+ }
1441
+ catch {
1442
+ return { ok: false, reason: `file-not-found: ${filePath}` };
1443
+ }
1444
+ if (stat.isDirectory()) {
1445
+ return { ok: false, reason: `path-is-directory: ${filePath} is a directory; knowledge-code-read returns file contents only` };
1446
+ }
1447
+ // Read + truncate + reject binary.
1448
+ let buf;
1449
+ try {
1450
+ buf = fs.readFileSync(realPath);
1451
+ }
1452
+ catch (err) {
1453
+ return { ok: false, reason: `read-failed: ${err instanceof Error ? err.message : String(err)}` };
1454
+ }
1455
+ // Heuristic: a NUL byte in the first 8 KB is a strong binary signal.
1456
+ // Strings of bytes that legitimately contain NUL bytes (gzip, images,
1457
+ // wasm) are not source code; refuse them.
1458
+ if (buf.slice(0, Math.min(buf.length, 8192)).includes(0)) {
1459
+ return { ok: false, reason: `binary-file: ${filePath} contains NUL bytes; knowledge-code-read returns text only` };
1460
+ }
1461
+ const totalBytes = buf.length;
1462
+ const truncated = totalBytes > KNOWLEDGE_CODE_READ_MAX_BYTES;
1463
+ const content = (truncated ? buf.subarray(0, KNOWLEDGE_CODE_READ_MAX_BYTES) : buf).toString('utf8');
1464
+ const lang = LANG_EXTS[path.extname(filePath).toLowerCase()] ?? 'unknown';
1465
+ const lineCount = content.split('\n').length;
1466
+ const auditMetadata = {
1467
+ phase: 'what',
1468
+ repo: `${gh.owner}/${gh.name}`,
1469
+ file: normalized,
1470
+ sha: cloneResult.sha.slice(0, 12),
1471
+ bytes_returned: content.length,
1472
+ bytes_total: totalBytes,
1473
+ truncated,
1474
+ lang,
1475
+ okr_id: okrId,
1476
+ };
1477
+ return {
1478
+ ok: true,
1479
+ repo: `${gh.owner}/${gh.name}`,
1480
+ file: normalized,
1481
+ sha: cloneResult.sha,
1482
+ content,
1483
+ lang,
1484
+ lineCount,
1485
+ truncated,
1486
+ bytesReturned: content.length,
1487
+ bytesTotal: totalBytes,
1488
+ auditMetadata,
1489
+ };
1490
+ };
1491
+ // ─────────────────────────────────────────────────────────────────────
1197
1492
  // Search skills — thin wrappers over the existing search nodes
1198
1493
  // ─────────────────────────────────────────────────────────────────────
1199
1494
  const SearchQueriesInput = zod_1.z.object({
@@ -1230,6 +1525,52 @@ function detectAllQueriesFailed(envelopes, skill) {
1230
1525
  // pattern matching of firewall-block vs query-quality failures.
1231
1526
  return `all-queries-failed: ${skill} — ${firstError}`;
1232
1527
  }
1528
+ /**
1529
+ * Bug-Q phase 3 (Codex audit follow-up / oracle evidence) — search
1530
+ * audit metadata now carries a bounded preview of WHICH results came
1531
+ * back, not just HOW MANY. Without this, a reviewer who wants to
1532
+ * verify "S-3 cites a real arXiv paper, not a hallucinated one"
1533
+ * has nothing in the chain to verify against — they'd have to trust
1534
+ * the agent's research-doc citations or re-run the search.
1535
+ *
1536
+ * Preview shape per hit: { provider, query, title, url, snippet?,
1537
+ * score?, publishedDate? } where:
1538
+ * - snippet is truncated to ~200 chars (the ProviderResult.content
1539
+ * field already caps at ~500; we shorten further for chain size)
1540
+ * - score is rounded to 2 decimals
1541
+ *
1542
+ * Total preview cap: 25 hits per skill_call. Search runs typically
1543
+ * return 10-30 results per provider before dedupe; the cap keeps the
1544
+ * audit JSONL compact while still proving "real evidence behind every
1545
+ * citation."
1546
+ */
1547
+ const SEARCH_RESULTS_PREVIEW_CAP = 25;
1548
+ const SEARCH_SNIPPET_CAP = 200;
1549
+ function buildSearchAuditMetadata(queries, results) {
1550
+ const preview = results.slice(0, SEARCH_RESULTS_PREVIEW_CAP).map((r) => {
1551
+ const snippet = (r.content || '').replace(/\s+/g, ' ').trim();
1552
+ const truncated = snippet.length > SEARCH_SNIPPET_CAP
1553
+ ? snippet.slice(0, SEARCH_SNIPPET_CAP) + '…'
1554
+ : snippet;
1555
+ const entry = {
1556
+ provider: r.provider,
1557
+ query: r.fromQuery,
1558
+ title: r.title,
1559
+ url: r.url,
1560
+ };
1561
+ if (truncated) {
1562
+ entry.snippet = truncated;
1563
+ }
1564
+ if (typeof r.score === 'number' && isFinite(r.score)) {
1565
+ entry.score = Math.round(r.score * 100) / 100;
1566
+ }
1567
+ if (r.publishedDate) {
1568
+ entry.publishedDate = r.publishedDate;
1569
+ }
1570
+ return entry;
1571
+ });
1572
+ return { queries, result_count: results.length, results_preview: preview };
1573
+ }
1233
1574
  const handleTavilySearch = async (input) => {
1234
1575
  const parsed = SearchQueriesInput.safeParse(input);
1235
1576
  if (!parsed.success) {
@@ -1245,7 +1586,7 @@ const handleTavilySearch = async (input) => {
1245
1586
  queries: parsed.data.queries,
1246
1587
  maxResultsPerQuery: parsed.data.maxResults,
1247
1588
  });
1248
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1589
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1249
1590
  const failure = detectAllQueriesFailed(res.envelopes, 'tavily-search');
1250
1591
  if (failure) {
1251
1592
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1266,7 +1607,7 @@ const handleArxivSearch = async (input) => {
1266
1607
  queries: parsed.data.queries,
1267
1608
  maxResultsPerQuery: parsed.data.maxResults,
1268
1609
  });
1269
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1610
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1270
1611
  const failure = detectAllQueriesFailed(res.envelopes, 'arxiv-search');
1271
1612
  if (failure) {
1272
1613
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1292,7 +1633,7 @@ const handleUsptoSearch = async (input) => {
1292
1633
  queries: parsed.data.queries,
1293
1634
  maxResultsPerQuery: parsed.data.maxResults,
1294
1635
  });
1295
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1636
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1296
1637
  const failure = detectAllQueriesFailed(res.envelopes, 'uspto-search');
1297
1638
  if (failure) {
1298
1639
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1313,7 +1654,7 @@ const handleHackerNewsSearch = async (input) => {
1313
1654
  queries: parsed.data.queries,
1314
1655
  hitsPerQuery: parsed.data.maxResults,
1315
1656
  });
1316
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1657
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1317
1658
  const failure = detectAllQueriesFailed(res.envelopes, 'hackernews-search');
1318
1659
  if (failure) {
1319
1660
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -2050,6 +2391,12 @@ exports.SKILLS = {
2050
2391
  // targetCodeRepoStatus: brownfield (clone + classify), greenfield
2051
2392
  // (scaffolding hints, no clone), refuse (not-connected / unreachable).
2052
2393
  'knowledge-code': handleKnowledgeCode,
2394
+ // Bug-Q phase 2 — knowledge-code-read returns bounded file CONTENT
2395
+ // from the brownfield clone retained by knowledge-code. Lets the
2396
+ // agent ground design decisions in real code excerpts (Codex audit
2397
+ // round 2 / B1: agent was hallucinating brownfield file paths
2398
+ // because the substrate was structural metadata only).
2399
+ 'knowledge-code-read': handleKnowledgeCodeRead,
2053
2400
  'tavily-search': handleTavilySearch,
2054
2401
  'arxiv-search': handleArxivSearch,
2055
2402
  'uspto-search': handleUsptoSearch,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@maintainabilityai/research-runner",
3
- "version": "0.1.43",
3
+ "version": "0.1.45",
4
4
  "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
5
5
  "license": "MIT",
6
6
  "author": "MaintainabilityAI",