kc-beta 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kc-beta",
3
- "version": "0.6.0",
3
+ "version": "0.6.2",
4
4
  "description": "KC Agent — LLM document verification agent (pure Node.js CLI)",
5
5
  "type": "module",
6
6
  "bin": {
@@ -69,6 +69,19 @@ export const NEXT_PHASE = {
69
69
  [Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
70
70
  };
71
71
 
72
+ // v0.6.2 J2: explicit linear order so `_advancePhase` can detect rollback
73
+ // direction (target index < current index → rollback). Mirrors NEXT_PHASE
74
+ // but ordered, plus FINALIZATION at the end as the terminal phase.
75
+ export const PHASE_ORDER = [
76
+ Phase.BOOTSTRAP,
77
+ Phase.EXTRACTION,
78
+ Phase.SKILL_AUTHORING,
79
+ Phase.SKILL_TESTING,
80
+ Phase.DISTILLATION,
81
+ Phase.PRODUCTION_QC,
82
+ Phase.FINALIZATION,
83
+ ];
84
+
72
85
  /**
73
86
  * The KC Agent conversation engine.
74
87
  *
@@ -150,7 +163,7 @@ export class AgentEngine {
150
163
  });
151
164
 
152
165
  // Session state persistence
153
- this.sessionState = new SessionState(this.workspace.cwd, { statePath });
166
+ this.sessionState = new SessionState(this.workspace.cwd, { statePath, workspace: this.workspace });
154
167
 
155
168
  // Task manager (ralph-loop) — sub-agents don't queue further sub-tasks,
156
169
  // so they don't get a TaskManager.
@@ -165,7 +178,7 @@ export class AgentEngine {
165
178
  this.pipelines = {
166
179
  [Phase.BOOTSTRAP]: new ProjectInitializer(this.workspace),
167
180
  [Phase.EXTRACTION]: new RuleExtractionPipeline(this.workspace),
168
- [Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace),
181
+ [Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace, this.taskManager),
169
182
  [Phase.SKILL_TESTING]: new SkillTestingPipeline(this.workspace),
170
183
  [Phase.DISTILLATION]: new DistillationPipeline(this.workspace),
171
184
  [Phase.PRODUCTION_QC]: new ProductionQCPipeline(this.workspace),
@@ -223,6 +236,11 @@ export class AgentEngine {
223
236
  historyLen: this.history?.messages?.length ?? 0,
224
237
  tasksPending: this.taskManager?.progress?.pending ?? 0,
225
238
  tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
239
+ // v0.6.2 K1: per-component breakdown so heap-analyze.js can
240
+ // attribute growth (history vs subagents vs event log vs cache).
241
+ // All values in MB. Failures inside _sampleComponents are caught
242
+ // and the row gets `componentsErr` instead.
243
+ components: this._sampleComponents(),
226
244
  };
227
245
  fs.mkdirSync(logDir, { recursive: true });
228
246
  fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
@@ -240,6 +258,89 @@ export class AgentEngine {
240
258
  };
241
259
  }
242
260
 
261
+ /**
262
+ * v0.6.2 K1: per-component heap accounting. Each value is in MB,
263
+ * rounded. Every measurement below runs in its own try/catch, so a
263
+ * failure leaves that field at 0 rather than throwing; this keeps the
264
+ * sampler diagnostic (never load-bearing).
266
+ *
267
+ * Components measured (by source):
268
+ * - history: in-memory `this.history.messages` content sizes (sum of
269
+ * JSON-stringified content)
270
+ * - eventLog: disk size of `logs/events.jsonl`
271
+ * - toolResults: disk size of `logs/tool_results/` (offloaded tool
272
+ * output, summed top-level files only — the dir is one level deep)
273
+ * - subagents: disk size of `sub_agents/` (one level — each subagent
274
+ * has its own directory tree but we just want the order of magnitude)
275
+ * - bundleCache: disk size of `cache/bundles/`
276
+ */
277
+ _sampleComponents() {
278
+ const out = { historyMB: 0, eventLogMB: 0, toolResultsMB: 0, subagentsMB: 0, bundleCacheMB: 0 };
279
+ const cwd = this.workspace?.cwd;
280
+ if (!cwd) return out;
281
+ // history: walk messages, sum content string lengths (UTF-16 → bytes
282
+ // approx 2× length; we conservatively count length itself since most
283
+ // content is ASCII-heavy JSON tool output)
284
+ try {
285
+ const msgs = this.history?.messages || [];
286
+ let bytes = 0;
287
+ for (const m of msgs) {
288
+ const c = m?.content;
289
+ if (typeof c === "string") bytes += c.length;
290
+ else if (Array.isArray(c)) {
291
+ for (const part of c) {
292
+ if (typeof part === "string") bytes += part.length;
293
+ else if (part?.text) bytes += String(part.text).length;
294
+ else if (part?.content) bytes += String(part.content).length;
295
+ else if (part?.input) bytes += JSON.stringify(part.input).length;
296
+ }
297
+ } else if (c && typeof c === "object") {
298
+ bytes += JSON.stringify(c).length;
299
+ }
300
+ }
301
+ out.historyMB = Math.round(bytes / 1024 / 1024);
302
+ } catch { /* skip */ }
303
+ // events.jsonl — single file size
304
+ try {
305
+ const p = path.join(cwd, "logs", "events.jsonl");
306
+ out.eventLogMB = Math.round(fs.statSync(p).size / 1024 / 1024);
307
+ } catch { /* skip */ }
308
+ // logs/tool_results/ — sum file sizes one level deep (it's flat)
309
+ try {
310
+ const dir = path.join(cwd, "logs", "tool_results");
311
+ let total = 0;
312
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
313
+ if (e.isFile()) {
314
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
315
+ }
316
+ }
317
+ out.toolResultsMB = Math.round(total / 1024 / 1024);
318
+ } catch { /* skip */ }
319
+ // sub_agents/ — sum top-level entries (each is a dir, statSync returns
320
+ // dir-block size, not contents — that's fine for an order-of-magnitude
321
+ // signal; recursive walk would be too expensive for the sampler)
322
+ try {
323
+ const dir = path.join(cwd, "sub_agents");
324
+ let total = 0;
325
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
326
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
327
+ }
328
+ out.subagentsMB = Math.round(total / 1024 / 1024);
329
+ } catch { /* skip */ }
330
+ // cache/bundles/
331
+ try {
332
+ const dir = path.join(cwd, "cache", "bundles");
333
+ let total = 0;
334
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
335
+ if (e.isFile()) {
336
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
337
+ }
338
+ }
339
+ out.bundleCacheMB = Math.round(total / 1024 / 1024);
340
+ } catch { /* skip */ }
341
+ return out;
342
+ }
343
+
243
344
  /** Stop background diagnostics. Call on graceful shutdown. */
244
345
  stop() {
245
346
  try { this._heapSamplerStop?.(); } catch { /* ignore */ }
@@ -280,6 +381,14 @@ export class AgentEngine {
280
381
  new PhaseAdvanceTool(
281
382
  (to, reason, opts) => this._advancePhase(to, reason, opts),
282
383
  () => this.currentPhase, // H1: tool reads phase BEFORE its own call
384
+ // v0.6.2 J1: surface running subagents so the tool can refuse
385
+ // advance until the agent explicitly acknowledges them.
386
+ () => {
387
+ try {
388
+ const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
389
+ return agentTool?.getRunningTaskIds?.() || [];
390
+ } catch { return []; }
391
+ },
283
392
  ),
284
393
  new DocumentParseTool(this.workspace, {
285
394
  mineruApiUrl: this.config.mineruApiUrl,
@@ -311,7 +420,11 @@ export class AgentEngine {
311
420
  // Distillation+ only (DISTILL mode)
312
421
  distill: [
313
422
  workerLlm,
314
- new WorkflowRunTool(this.workspace, this.versionManager, this.confidence),
423
+ new WorkflowRunTool(this.workspace, this.versionManager, this.confidence, {
424
+ // v0.6.1 A6: hook engine-emitted milestones so phase gates see workflow runs
425
+ recordMilestone: (phase, key, value) => this._recordMilestone(phase, key, value),
426
+ getCurrentPhase: () => this.currentPhase,
427
+ }),
315
428
  new TierDowngradeTool(this.workspace, workerLlm),
316
429
  new QCSampleTool(this.workspace),
317
430
  ],
@@ -1057,12 +1170,33 @@ export class AgentEngine {
1057
1170
  return false;
1058
1171
  }
1059
1172
 
1060
- const phaseSummary = `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${reason}${force && nextPhase !== expected ? " (forced)" : ""}`;
1173
+ // v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
1174
+ // of all phases; if target index < current index, this is a rollback
1175
+ // (e.g., production_qc → skill_authoring after gates revealed gaps).
1176
+ const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
1177
+ const toIdx = PHASE_ORDER.indexOf(nextPhase);
1178
+ const direction = (fromIdx >= 0 && toIdx >= 0 && toIdx < fromIdx)
1179
+ ? "rollback" : "forward";
1180
+
1181
+ // v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
1182
+ // detection so the LLM-narrated reason can be cross-checked against
1183
+ // ground-truth telemetry. Phase summaries become diagnostic, not just
1184
+ // narrative.
1185
+ const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
1186
+ const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
1187
+ const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
1188
+ const phaseSummary =
1189
+ `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
1190
+ (force && nextPhase !== expected ? " (forced)" : "") +
1191
+ (engineCounts ? `\n (engine) ${engineCounts}` : "");
1061
1192
  this._phaseSummaries.push(phaseSummary);
1062
1193
  this.eventLog.append("phase_transition", {
1063
1194
  from: this.currentPhase,
1064
1195
  to: nextPhase,
1065
1196
  reason,
1197
+ direction,
1198
+ engineCounts: engineCounts || null,
1199
+ possibleMismatch: !!mismatchPrefix,
1066
1200
  forced: force && nextPhase !== expected,
1067
1201
  });
1068
1202
  const fromPhase = this.currentPhase;
@@ -1070,6 +1204,17 @@ export class AgentEngine {
1070
1204
  this._registerToolsForPhase(this.currentPhase);
1071
1205
  this.workspace.setPhase(this.currentPhase);
1072
1206
  this._createTasksForPhase(this.currentPhase);
1207
+
1208
+ // v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
1209
+ // edge-trigger so that if the agent revisits it and re-flips
1210
+ // exit-criteria true, _maybeAutoAdvance will fire correctly. Without
1211
+ // this, the auto-advance edge trigger stays latched true and the
1212
+ // moment the agent returns to fromPhase the engine immediately
1213
+ // bounces them back out — defeating the rollback.
1214
+ if (direction === "rollback" && this._lastReady) {
1215
+ this._lastReady[fromPhase] = false;
1216
+ }
1217
+
1073
1218
  this.saveState();
1074
1219
 
1075
1220
  // B8: Soft signal — surface any sub-agents left running from the prior
@@ -1093,6 +1238,172 @@ export class AgentEngine {
1093
1238
  return true;
1094
1239
  }
1095
1240
 
1241
+ /**
1242
+ * v0.6.1 A6: Single chokepoint for engine-emitted milestone updates.
1243
+ * Tools call this on successful execution to bump pipeline counters that
1244
+ * the phase-gate hardening (A2-A5) depends on. Without engine emission,
1245
+ * gates fall back to filesystem scans which can miss work that didn't
1246
+ * follow canonical output paths (E2E #4: `unified_qc.py` wrote to
1247
+ * `output/results/`, production-qc only scanned `output/qc/`).
1248
+ *
1249
+ * The mutation routes through the pipeline's existing internal state, so
1250
+ * exportState/importState round-trips work unchanged and the gate sees a
1251
+ * unified view of (filesystem-scanned + engine-emitted) signals.
1252
+ *
1253
+ * Three modes inferred from value shape:
1254
+ * - increment counter: pipeline[key] is number, value is number → add
1255
+ * - set in dict-by-id: pipeline[key] is object, value is { id, value? } → assign
1256
+ * - dedupe-add to array: pipeline[key] is array, value is string → push if absent
1257
+ *
1258
+ * @param {string} phase - Pipeline name (e.g., "distillation")
1259
+ * @param {string} key - Field on the pipeline (e.g., "workflowsTested")
1260
+ * @param {*} value - Shape varies by target type (see modes above)
1261
+ * @returns {boolean} true if a write happened
1262
+ */
1263
+ _recordMilestone(phase, key, value) {
1264
+ const pipeline = this.pipelines?.[phase];
1265
+ if (!pipeline) return false;
1266
+ const target = pipeline[key];
1267
+ // increment counter
1268
+ if (typeof target === "number" && typeof value === "number") {
1269
+ pipeline[key] = target + value;
1270
+ return true;
1271
+ }
1272
+ // set on dict-by-id
1273
+ if (target && typeof target === "object" && !Array.isArray(target)
1274
+ && value && typeof value === "object" && "id" in value) {
1275
+ target[value.id] = "value" in value ? value.value : true;
1276
+ return true;
1277
+ }
1278
+ // dedupe-add to array
1279
+ if (Array.isArray(target) && typeof value === "string") {
1280
+ if (!target.includes(value)) target.push(value);
1281
+ return true;
1282
+ }
1283
+ return false;
1284
+ }
1285
+
1286
+ /**
1287
+ * v0.6.1 B1: build a one-line "engine counts" block summarizing the
1288
+ * pipeline's ground-truth telemetry at the moment of phase advance.
1289
+ * Different phases surface different metrics; we keep this short so the
1290
+ * appended summary line stays readable.
1291
+ *
1292
+ * @param {string} fromPhase - The phase being LEFT (we summarize its work)
1293
+ * @returns {string} block text, or "" if pipeline has nothing to report
1294
+ */
1295
+ _buildEngineCountsBlock(fromPhase) {
1296
+ const pipeline = this.pipelines?.[fromPhase];
1297
+ if (!pipeline) return "";
1298
+ const parts = [];
1299
+ try {
1300
+ switch (fromPhase) {
1301
+ case "extraction": {
1302
+ const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
1303
+ parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
1304
+ parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
1305
+ parts.push(`rulesWithTests: ${pipeline.rulesWithTests?.length ?? 0}`);
1306
+ parts.push(`coverageAudited: ${pipeline.coverageAudited ? "yes" : "no"}`);
1307
+ break;
1308
+ }
1309
+ case "skill_authoring": {
1310
+ const totalRules = pipeline.totalRules?.length ?? 0;
1311
+ const covered = pipeline.ruleIdsCovered?.size ?? 0;
1312
+ parts.push(`rulesCovered: ${covered}/${totalRules}`);
1313
+ parts.push(`skillDirsAuthored: ${pipeline.skillsAuthored?.length ?? 0}`);
1314
+ if (this.taskManager) {
1315
+ const t = this.taskManager.countByPhase("skill_authoring");
1316
+ const d = this.taskManager.countByPhase("skill_authoring", "completed");
1317
+ const f = this.taskManager.countByPhase("skill_authoring", "failed");
1318
+ parts.push(`tasksCompleted: ${d}/${t}${f > 0 ? ` (+${f} failed)` : ""}`);
1319
+ }
1320
+ break;
1321
+ }
1322
+ case "skill_testing": {
1323
+ const total = pipeline.skillsToTest?.length ?? 0;
1324
+ const tested = Object.keys(pipeline.skillsTested || {}).length;
1325
+ const passing = pipeline.skillsPassing?.length ?? 0;
1326
+ parts.push(`skillsTested: ${tested}/${total}`);
1327
+ parts.push(`skillsPassing: ${passing}`);
1328
+ parts.push(`iterations: ${pipeline.iterationCount ?? 0}`);
1329
+ break;
1330
+ }
1331
+ case "distillation": {
1332
+ const total = pipeline.skillsToDistill?.length ?? 0;
1333
+ const created = Object.keys(pipeline.workflowsCreated || {}).length;
1334
+ const tested = Object.keys(pipeline.workflowsTested || {}).length;
1335
+ const passing = pipeline.workflowsPassing?.length ?? 0;
1336
+ parts.push(`workflowsCreated: ${created}/${total}`);
1337
+ parts.push(`workflowsTested: ${tested}/${total}`);
1338
+ parts.push(`workflowsPassing: ${passing}/${total}`);
1339
+ break;
1340
+ }
1341
+ case "production_qc": {
1342
+ parts.push(`batchesProcessed: ${pipeline.batchesProcessed ?? 0}`);
1343
+ parts.push(`documentsReviewed: ${pipeline.documentsReviewed ?? 0}`);
1344
+ parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
1345
+ break;
1346
+ }
1347
+ // bootstrap / finalization: no specific counters, fall through
1348
+ }
1349
+ } catch { /* never let summary build break phase advance */ }
1350
+ return parts.join(", ");
1351
+ }
1352
+
1353
+ /**
1354
+ * v0.6.1 B1: heuristic mismatch detection. Conservative regex over the
1355
+ * LLM's free-form reason for percentages and counts, compared against
1356
+ * engine truth. INFORMATIONAL only — never blocks the transition. False
1357
+ * positives are acceptable (the warning is a hint to the human reviewer,
1358
+ * not a hard signal). False negatives are also acceptable (this catches
1359
+ * the loud, numerical claims; subtle ones still slip through).
1360
+ *
1361
+ * Returns true if the agent's reason mentions a count or percentage that
1362
+ * doesn't match engine state.
1363
+ */
1364
+ _detectSummaryMismatch(reason, fromPhase) {
1365
+ if (!reason || typeof reason !== "string") return false;
1366
+ const pipeline = this.pipelines?.[fromPhase];
1367
+ if (!pipeline) return false;
1368
+ try {
1369
+ // Match "N/M" fractions (standalone counts are handled by countMatches)
1370
+ const fractionMatches = [...reason.matchAll(/(\d+)\s*\/\s*(\d+)/g)];
1371
+ // Match "N rules / skills / workflows / tasks"
1372
+ const countMatches = [...reason.matchAll(/(\d+)\s*(rules?|skills?|workflows?|tasks?|条规则|个技能)/gi)];
1373
+ // Match percentage claims like "95%" or "99.5%" (a bare "0.95" with no % sign is not matched)
1374
+ const pctMatches = [...reason.matchAll(/(\d+(?:\.\d+)?)\s*%/g)];
1375
+
1376
+ // Phase-specific cross-checks (cheap conservative comparisons)
1377
+ if (fromPhase === "skill_authoring" && this.taskManager) {
1378
+ const completed = this.taskManager.countByPhase("skill_authoring", "completed");
1379
+ const total = this.taskManager.countByPhase("skill_authoring");
1380
+ for (const m of fractionMatches) {
1381
+ const claimedDone = parseInt(m[1], 10);
1382
+ const claimedTotal = parseInt(m[2], 10);
1383
+ if (claimedTotal === total && claimedDone > completed + 5) return true;
1384
+ }
1385
+ }
1386
+ if (fromPhase === "skill_testing") {
1387
+ const tested = Object.keys(pipeline.skillsTested || {}).length;
1388
+ const passing = pipeline.skillsPassing?.length ?? 0;
1389
+ for (const m of pctMatches) {
1390
+ const claimed = parseFloat(m[1]);
1391
+ // If claimed >= 50% but engine sees 0 tested and 0 passing, that's suspicious
1392
+ if (claimed >= 50 && tested === 0 && passing === 0) return true;
1393
+ }
1394
+ }
1395
+ if (fromPhase === "production_qc") {
1396
+ const batches = pipeline.batchesProcessed ?? 0;
1397
+ // Any large-count (> 10) or high-percentage (> 50%) claim while batches==0 is suspicious
1398
+ if (batches === 0) {
1399
+ if (countMatches.some((m) => parseInt(m[1], 10) > 10)) return true;
1400
+ if (pctMatches.some((m) => parseFloat(m[1]) > 50)) return true;
1401
+ }
1402
+ }
1403
+ } catch { /* informational only — never block */ }
1404
+ return false;
1405
+ }
1406
+
1096
1407
  /**
1097
1408
  * Bug 4 trigger (1) auto-detect, edge-triggered (Bug 5): only fires on a
1098
1409
  * fresh false → true flip in `exitCriteriaMet()`. Sessions resumed in an
@@ -40,6 +40,13 @@ export class DistillationEngine extends Pipeline {
40
40
  }
41
41
 
42
42
  _scanWorkflows() {
43
+ // v0.6.1 A6: preserve engine-emitted entries across filesystem rescans.
44
+ // workflow_run hook bumps workflowsTested[ruleId] and adds to
45
+ // workflowsPassing on success — without this preservation, those entries
46
+ // get clobbered on the next describeState() / onToolResult() rescan.
47
+ const engineWfTested = { ...this.workflowsTested };
48
+ const engineWfPassing = [...this.workflowsPassing];
49
+
43
50
  this.workflowsCreated = {};
44
51
  this.workflowsTested = {};
45
52
  this.workflowsPassing = [];
@@ -68,6 +75,14 @@ export class DistillationEngine extends Pipeline {
68
75
  this.workflowsCreated[path.parse(e.name).name] = 1;
69
76
  }
70
77
  }
78
+
79
+ // Re-merge engine-emitted entries on top of filesystem-derived state
80
+ for (const [k, v] of Object.entries(engineWfTested)) {
81
+ if (!(k in this.workflowsTested)) this.workflowsTested[k] = v;
82
+ }
83
+ for (const id of engineWfPassing) {
84
+ if (!this.workflowsPassing.includes(id)) this.workflowsPassing.push(id);
85
+ }
71
86
  }
72
87
 
73
88
  describeState() {
@@ -11,6 +11,11 @@ export class RuleExtractionPipeline extends Pipeline {
11
11
  this.rulesExtracted = [];
12
12
  this.rulesWithTests = [];
13
13
  this.coverageAudited = false;
14
+ // v0.6.1 A1: track which rules in catalog.json have non-empty
15
+ // source_chunk_ids — D1 grounded skill_authoring prompts on these but
16
+ // exit didn't require them, so a sloppy extraction could leave rules
17
+ // unmoored.
18
+ this.rulesWithChunkRefs = [];
14
19
  this._scanWorkspace();
15
20
  }
16
21
 
@@ -28,11 +33,21 @@ export class RuleExtractionPipeline extends Pipeline {
28
33
 
29
34
  _scanRules() {
30
35
  this.rulesExtracted = [];
36
+ this.rulesWithChunkRefs = [];
31
37
  const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
32
38
  if (fs.existsSync(catalogPath)) {
33
39
  try {
34
40
  const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
35
- if (Array.isArray(data)) this.rulesExtracted = data.map((r, i) => r.id || `rule_${i}`);
41
+ if (Array.isArray(data)) {
42
+ this.rulesExtracted = data.map((r, i) => r.id || `rule_${i}`);
43
+ // A1: collect ids whose entry has non-empty source_chunk_ids
44
+ for (const r of data) {
45
+ const ids = r?.source_chunk_ids;
46
+ if (Array.isArray(ids) && ids.length > 0 && r?.id) {
47
+ this.rulesWithChunkRefs.push(r.id);
48
+ }
49
+ }
50
+ }
36
51
  } catch { /* skip */ }
37
52
  }
38
53
  const skillsDir = path.join(this._workspace.cwd, "rule_skills");
@@ -67,10 +82,43 @@ export class RuleExtractionPipeline extends Pipeline {
67
82
  parts.push("### Exit\nExtraction complete. Proceed to SKILL_AUTHORING.");
68
83
  }
69
84
 
70
- parts.push(`### Exit criteria\n- [${this.regulationsScanned ? "x" : " "}] All regulations read\n- [${this.rulesExtracted.length > 0 ? "x" : " "}] Rules decomposed into atomic units\n- [${this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) ? "x" : " "}] >=80% of rules have test stubs\n- [${this.coverageAudited ? "x" : " "}] Coverage audit completed`);
85
+ const chunkRefsOk = this._chunkRefsCriterionMet();
86
+ parts.push(
87
+ `### Exit criteria\n` +
88
+ `- [${this.regulationsScanned ? "x" : " "}] All regulations read\n` +
89
+ `- [${this.rulesExtracted.length > 0 ? "x" : " "}] Rules decomposed into atomic units\n` +
90
+ `- [${this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) ? "x" : " "}] >=80% of rules have test stubs\n` +
91
+ `- [${this.coverageAudited ? "x" : " "}] Coverage audit completed\n` +
92
+ `- [${chunkRefsOk ? "x" : " "}] Every rule has source_chunk_ids in catalog.json (${this.rulesWithChunkRefs.length}/${this._catalogRuleCount()})`,
93
+ );
71
94
  return parts.join("\n\n");
72
95
  }
73
96
 
97
+ /**
98
+ * v0.6.1 A1: number of rules currently in catalog.json (not the union with
99
+ * rule_skills/ dirs that rulesExtracted carries). Used by the chunk-refs
100
+ * gate so we compare apples to apples.
101
+ */
102
+ _catalogRuleCount() {
103
+ const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
104
+ if (!fs.existsSync(catalogPath)) return 0;
105
+ try {
106
+ const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
107
+ return Array.isArray(data) ? data.length : 0;
108
+ } catch { return 0; }
109
+ }
110
+
111
+ /**
112
+ * v0.6.1 A1: pass when every rule in catalog.json has a non-empty
113
+ * source_chunk_ids array. Empty catalog (legacy / pre-D1 sessions) passes
114
+ * trivially so resume of v0.6.0 sessions doesn't get trapped.
115
+ */
116
+ _chunkRefsCriterionMet() {
117
+ const total = this._catalogRuleCount();
118
+ if (total === 0) return true; // backwards-compat for sessions pre-D1
119
+ return this.rulesWithChunkRefs.length >= total;
120
+ }
121
+
74
122
  onToolResult(toolName, toolInput, result) {
75
123
  if (result.isError) return null;
76
124
  const wasReady = this.exitCriteriaMet();
@@ -85,7 +133,12 @@ export class RuleExtractionPipeline extends Pipeline {
85
133
 
86
134
  exitCriteriaMet() {
87
135
  return this.regulationsScanned && this.rulesExtracted.length > 0 &&
88
- this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) && this.coverageAudited;
136
+ this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) &&
137
+ this.coverageAudited &&
138
+ // v0.6.1 A1: hard tracking — D1 source-context auto-attach requires
139
+ // catalog.json entries to carry source_chunk_ids. Without them the
140
+ // skill_authoring prompts are blind.
141
+ this._chunkRefsCriterionMet();
89
142
  }
90
143
 
91
144
  exportState() {
@@ -93,6 +146,7 @@ export class RuleExtractionPipeline extends Pipeline {
93
146
  regulationsScanned: this.regulationsScanned,
94
147
  rulesExtracted: this.rulesExtracted,
95
148
  rulesWithTests: this.rulesWithTests,
149
+ rulesWithChunkRefs: this.rulesWithChunkRefs,
96
150
  coverageAudited: this.coverageAudited,
97
151
  };
98
152
  }
@@ -107,5 +161,8 @@ export class RuleExtractionPipeline extends Pipeline {
107
161
  if (Array.isArray(data.rulesWithTests) && data.rulesWithTests.length > this.rulesWithTests.length) {
108
162
  this.rulesWithTests = data.rulesWithTests;
109
163
  }
164
+ if (Array.isArray(data.rulesWithChunkRefs) && data.rulesWithChunkRefs.length > this.rulesWithChunkRefs.length) {
165
+ this.rulesWithChunkRefs = data.rulesWithChunkRefs;
166
+ }
110
167
  }
111
168
  }
@@ -36,6 +36,11 @@ export class ProductionQCPipeline extends Pipeline {
36
36
  }
37
37
 
38
38
  _scanQcResults() {
39
+ // v0.6.1 A5/A6: don't reset documentsReviewed if engine emission has
40
+ // bumped it since last scan — workflow_run hooks call _recordMilestone
41
+ // and the increment lives in this same field. Other counters (batches,
42
+ // accuracy, issues) come solely from filesystem scan and reset cleanly.
43
+ const engineDocsReviewed = this.documentsReviewed;
39
44
  this.batchesProcessed = 0;
40
45
  this.totalDocuments = 0;
41
46
  this.documentsReviewed = 0;
@@ -43,23 +48,57 @@ export class ProductionQCPipeline extends Pipeline {
43
48
  this.confidenceDistribution = { low: 0, medium: 0, high: 0 };
44
49
  this.issuesFound = [];
45
50
 
51
+ // Existing canonical path: output/qc/*.json (formal QC batch reports)
46
52
  const qcDir = path.join(this._workspace.cwd, "output", "qc");
47
- if (!fs.existsSync(qcDir)) return;
53
+ if (fs.existsSync(qcDir)) {
54
+ for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).sort()) {
55
+ try {
56
+ const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
57
+ this.batchesProcessed++;
58
+ this.totalDocuments += typeof data.documents === "number" ? data.documents : (data.total || 0);
59
+ this.documentsReviewed += data.reviewed || 0;
60
+ if (data.accuracy_by_rule) Object.assign(this.accuracyByRule, data.accuracy_by_rule);
61
+ if (data.confidence) {
62
+ for (const band of ["low", "medium", "high"]) this.confidenceDistribution[band] += data.confidence[band] || 0;
63
+ }
64
+ if (Array.isArray(data.issues)) this.issuesFound.push(...data.issues);
65
+ } catch { /* skip */ }
66
+ }
67
+ }
48
68
 
49
- for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).sort()) {
50
- try {
51
- const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
69
+ // v0.6.1 A5: also pick up batch-style results in output/results/. E2E #4
70
+ // showed agents writing batch QC outputs to output/results/qc_*.json
71
+ // (e.g. unified_qc.py) instead of output/qc/, so the formal scanner
72
+ // missed them. Heuristic match: filename starts with "qc_" or contains
73
+ // "_batch_". Each match counts as one batch; sample_count (else documents, else total) → totalDocuments.
74
+ const resultsDir = path.join(this._workspace.cwd, "output", "results");
75
+ if (fs.existsSync(resultsDir)) {
76
+ const seen = new Set();
77
+ for (const f of fs.readdirSync(resultsDir).filter((f) => f.endsWith(".json"))) {
78
+ const lower = f.toLowerCase();
79
+ if (!(lower.startsWith("qc_") || lower.includes("_batch_"))) continue;
80
+ // Dedupe filenames that differ only by a _YYYYMMDD_HHMMSS timestamp
81
+ // suffix: the key is the filename with every such suffix (and .json)
82
+ // removed, so qc_pt_x.json and qc_pt_x_<ts>.json count as one batch.
83
+ // NOTE(review): qc_full_batch_20260424_141642.json and _141921.json also
84
+ // share a key, so only the first counts even if both are real batches.
85
+ const key = f.replace(/_\d{8}_\d{6}/g, "").replace(/\.json$/, "");
86
+ if (seen.has(key)) continue;
87
+ seen.add(key);
52
88
  this.batchesProcessed++;
53
- this.totalDocuments += typeof data.documents === "number" ? data.documents : (data.total || 0);
54
- this.documentsReviewed += data.reviewed || 0;
55
- if (data.accuracy_by_rule) Object.assign(this.accuracyByRule, data.accuracy_by_rule);
56
- if (data.confidence) {
57
- for (const band of ["low", "medium", "high"]) this.confidenceDistribution[band] += data.confidence[band] || 0;
58
- }
59
- if (Array.isArray(data.issues)) this.issuesFound.push(...data.issues);
60
- } catch { /* skip */ }
89
+ try {
90
+ const data = JSON.parse(fs.readFileSync(path.join(resultsDir, f), "utf-8"));
91
+ // Best-effort metric extraction; tolerate missing keys
92
+ this.totalDocuments += typeof data.sample_count === "number" ? data.sample_count
93
+ : typeof data.documents === "number" ? data.documents
94
+ : typeof data.total === "number" ? data.total : 0;
95
+ } catch { /* skip */ }
96
+ }
61
97
  }
62
98
 
99
+ // Restore engine-emitted documentsReviewed if filesystem reported less
100
+ if (engineDocsReviewed > this.documentsReviewed) this.documentsReviewed = engineDocsReviewed;
101
+
63
102
  // Determine monitoring phase
64
103
  if (this.batchesProcessed < 3) this.monitoringPhase = "initial";
65
104
  else if (this.issuesFound.length > 0) this.monitoringPhase = "active";
@@ -93,7 +132,18 @@ export class ProductionQCPipeline extends Pipeline {
93
132
  return null;
94
133
  }
95
134
 
96
- exitCriteriaMet() { return this.monitoringPhase === "stable"; }
135
+ /**
136
+ * v0.6.1 A5: gate requires at least one batch processed (real telemetry)
137
+ * AND the legacy stable-monitoring criterion. Without the batch floor, the
138
+ * agent could declare PRODUCTION_QC done from a clean session-state file
139
+ * (E2E #4: phase advanced into PRODUCTION_QC, agent ran 6,930 checks via
140
+ * sandbox_exec to non-canonical paths, batchesProcessed stayed 0, exit
141
+ * fired anyway because monitoringPhase defaults can flip to "stable" with
142
+ * empty accuracyByRule + zero issues).
143
+ */
144
+ exitCriteriaMet() {
145
+ return this.batchesProcessed > 0 && this.monitoringPhase === "stable";
146
+ }
97
147
 
98
148
  exportState() {
99
149
  return {