kc-beta 0.5.6 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37):
  1. package/QUICKSTART.md +17 -4
  2. package/README.md +58 -11
  3. package/bin/kc-beta.js +35 -1
  4. package/package.json +1 -1
  5. package/src/agent/bundle-tree.js +553 -0
  6. package/src/agent/context.js +40 -1
  7. package/src/agent/engine.js +828 -31
  8. package/src/agent/llm-client.js +67 -18
  9. package/src/agent/pipelines/distillation.js +15 -0
  10. package/src/agent/pipelines/extraction.js +60 -3
  11. package/src/agent/pipelines/finalization.js +186 -0
  12. package/src/agent/pipelines/index.js +8 -0
  13. package/src/agent/pipelines/initializer.js +40 -0
  14. package/src/agent/pipelines/production-qc.js +63 -13
  15. package/src/agent/pipelines/skill-authoring.js +136 -7
  16. package/src/agent/skill-loader.js +54 -4
  17. package/src/agent/task-manager.js +81 -3
  18. package/src/agent/tools/agent-tool.js +283 -35
  19. package/src/agent/tools/bundle-search.js +146 -0
  20. package/src/agent/tools/document-chunk.js +246 -0
  21. package/src/agent/tools/document-classify.js +311 -0
  22. package/src/agent/tools/document-parse.js +8 -1
  23. package/src/agent/tools/phase-advance.js +30 -7
  24. package/src/agent/tools/registry.js +10 -0
  25. package/src/agent/tools/rule-catalog.js +17 -3
  26. package/src/agent/tools/sandbox-exec.js +30 -0
  27. package/src/agent/tools/workflow-run.js +34 -1
  28. package/src/agent/workspace.js +168 -14
  29. package/src/cli/components.js +165 -17
  30. package/src/cli/index.js +166 -19
  31. package/src/cli/meme.js +58 -0
  32. package/src/config.js +39 -2
  33. package/src/providers.js +26 -0
  34. package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
  35. package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
  36. package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
  37. package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0
@@ -19,6 +19,9 @@ import { ReleaseTool } from "./tools/release.js";
19
19
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
20
20
  import { DocumentParseTool } from "./tools/document-parse.js";
21
21
  import { DocumentSearchTool } from "./tools/document-search.js";
22
+ import { DocumentChunkTool } from "./tools/document-chunk.js";
23
+ import { BundleSearchTool } from "./tools/bundle-search.js";
24
+ import { DocumentClassifyTool } from "./tools/document-classify.js";
22
25
  import { WorkerLLMCallTool } from "./tools/worker-llm-call.js";
23
26
  import { WorkflowRunTool } from "./tools/workflow-run.js";
24
27
  import { RuleCatalogTool } from "./tools/rule-catalog.js";
@@ -38,6 +41,7 @@ import { SkillAuthoringPipeline } from "./pipelines/skill-authoring.js";
38
41
  import { SkillTestingPipeline } from "./pipelines/skill-testing.js";
39
42
  import { DistillationEngine as DistillationPipeline } from "./pipelines/distillation.js";
40
43
  import { ProductionQCPipeline } from "./pipelines/production-qc.js";
44
+ import { FinalizationPipeline } from "./pipelines/finalization.js";
41
45
  import { EventLog } from "./event-log.js";
42
46
  import { ContextWindow } from "./context-window.js";
43
47
  import { SessionState } from "./session-state.js";
@@ -48,8 +52,10 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
48
52
  // or kc_max_tokens in the global config.
49
53
  const DEFAULT_KC_MAX_TOKENS = 65536;
50
54
 
51
- // Phases where worker LLM tools are available (DISTILL mode)
52
- const DISTILL_PHASES = new Set([Phase.DISTILLATION, Phase.PRODUCTION_QC]);
55
+ // Phases where worker LLM tools are available (DISTILL mode).
56
+ // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
57
+ // runs + dashboard_render + workflow_run stay usable during packaging.
58
+ const DISTILL_PHASES = new Set([Phase.DISTILLATION, Phase.PRODUCTION_QC, Phase.FINALIZATION]);
53
59
 
54
60
  // Linear phase order — used by auto-advance (Bug 4). Last phase has no successor.
55
61
  // Exported so the TUI's /phase slash command (src/cli/index.js) can call
@@ -60,6 +66,7 @@ export const NEXT_PHASE = {
60
66
  [Phase.SKILL_AUTHORING]: Phase.SKILL_TESTING,
61
67
  [Phase.SKILL_TESTING]: Phase.DISTILLATION,
62
68
  [Phase.DISTILLATION]: Phase.PRODUCTION_QC,
69
+ [Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
63
70
  };
64
71
 
65
72
  /**
@@ -158,10 +165,11 @@ export class AgentEngine {
158
165
  this.pipelines = {
159
166
  [Phase.BOOTSTRAP]: new ProjectInitializer(this.workspace),
160
167
  [Phase.EXTRACTION]: new RuleExtractionPipeline(this.workspace),
161
- [Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace),
168
+ [Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace, this.taskManager),
162
169
  [Phase.SKILL_TESTING]: new SkillTestingPipeline(this.workspace),
163
170
  [Phase.DISTILLATION]: new DistillationPipeline(this.workspace),
164
171
  [Phase.PRODUCTION_QC]: new ProductionQCPipeline(this.workspace),
172
+ [Phase.FINALIZATION]: new FinalizationPipeline(this.workspace), // E1
165
173
  };
166
174
 
167
175
  // Skill discovery (Claude Code pattern: index in context, full content on demand)
@@ -181,6 +189,61 @@ export class AgentEngine {
181
189
  this._lastReady = Object.fromEntries(
182
190
  Object.keys(this.pipelines).map((p) => [p, false]),
183
191
  );
192
+
193
+ // B0.1: Heap sampler. Parent engines only — sub-agents share a process
194
+ // with the parent and would double-log. Writes a single JSONL line
195
+ // per minute to <workspace>/logs/heap.jsonl with the numbers needed
196
+ // to diagnose RSS creep (heapUsed/heapTotal/external/rss/arrayBuffers,
197
+ // plus pending/in-progress task counts and history length). Always on; one
198
+ // JSON row of roughly 200 bytes per minute to disk.
199
+ this._heapSamplerStop = this._isSubagent ? null : this._startHeapSampler();
200
+ }
201
+
202
+ /**
203
+ * Start sampling process.memoryUsage() every 60 s into logs/heap.jsonl.
204
+ * Returns a stop fn. Timer is .unref()'d so it never keeps the process
205
+ * alive by itself. Failures are silently suppressed — this is a
206
+ * diagnostic, not a correctness feature.
207
+ */
208
+ _startHeapSampler() {
209
+ const logDir = path.join(this.workspace.cwd, "logs");
210
+ const logPath = path.join(logDir, "heap.jsonl");
211
+ const sample = () => {
212
+ try {
213
+ const mem = process.memoryUsage();
214
+ const row = {
215
+ t: new Date().toISOString(),
216
+ seq: this.eventLog?.currentSeq ?? 0,
217
+ phase: this.currentPhase,
218
+ rssMB: Math.round(mem.rss / 1024 / 1024),
219
+ heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
220
+ heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
221
+ externalMB: Math.round((mem.external || 0) / 1024 / 1024),
222
+ arrayBuffersMB: Math.round((mem.arrayBuffers || 0) / 1024 / 1024),
223
+ historyLen: this.history?.messages?.length ?? 0,
224
+ tasksPending: this.taskManager?.progress?.pending ?? 0,
225
+ tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
226
+ };
227
+ fs.mkdirSync(logDir, { recursive: true });
228
+ fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
229
+ } catch { /* never fatal */ }
230
+ };
231
+ // Record one sample at startup so we have a baseline even on short runs.
232
+ sample();
233
+ const timer = setInterval(sample, 60_000);
234
+ timer.unref?.();
235
+ return () => {
236
+ try {
237
+ clearInterval(timer);
238
+ sample(); // one final sample on shutdown
239
+ } catch { /* ignore */ }
240
+ };
241
+ }
242
+
243
+ /** Stop background diagnostics. Call on graceful shutdown. */
244
+ stop() {
245
+ try { this._heapSamplerStop?.(); } catch { /* ignore */ }
246
+ this._heapSamplerStop = null;
184
247
  }
185
248
 
186
249
  /**
@@ -214,7 +277,10 @@ export class AgentEngine {
214
277
  new ArchiveFileTool(this.workspace),
215
278
  new ScheduleFetchTool(this.workspace),
216
279
  new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
217
- new PhaseAdvanceTool((to, reason, opts) => this._advancePhase(to, reason, opts)),
280
+ new PhaseAdvanceTool(
281
+ (to, reason, opts) => this._advancePhase(to, reason, opts),
282
+ () => this.currentPhase, // H1: tool reads phase BEFORE its own call
283
+ ),
218
284
  new DocumentParseTool(this.workspace, {
219
285
  mineruApiUrl: this.config.mineruApiUrl,
220
286
  mineruApiKey: this.config.mineruApiKey,
@@ -223,6 +289,12 @@ export class AgentEngine {
223
289
  ocrModel: vlmModel,
224
290
  }),
225
291
  new DocumentSearchTool(this.workspace),
292
+ // Group C — chunker/RAG infrastructure ported from AMC app. Core
293
+ // tools (not phase-gated): useful from BOOTSTRAP through FINALIZATION
294
+ // for any doc-heavy project, not just rule extraction.
295
+ new DocumentChunkTool(this.workspace),
296
+ new BundleSearchTool(this.workspace),
297
+ new DocumentClassifyTool(this.workspace, this.config),
226
298
  new RuleCatalogTool(this.workspace),
227
299
  new EvolutionCycleTool(this.workspace, this.cornerCases),
228
300
  new DashboardRenderTool(this.workspace),
@@ -239,7 +311,11 @@ export class AgentEngine {
239
311
  // Distillation+ only (DISTILL mode)
240
312
  distill: [
241
313
  workerLlm,
242
- new WorkflowRunTool(this.workspace, this.versionManager, this.confidence),
314
+ new WorkflowRunTool(this.workspace, this.versionManager, this.confidence, {
315
+ // v0.6.1 A6: hook engine-emitted milestones so phase gates see workflow runs
316
+ recordMilestone: (phase, key, value) => this._recordMilestone(phase, key, value),
317
+ getCurrentPhase: () => this.currentPhase,
318
+ }),
243
319
  new TierDowngradeTool(this.workspace, workerLlm),
244
320
  new QCSampleTool(this.workspace),
245
321
  ],
@@ -313,7 +389,7 @@ export class AgentEngine {
313
389
  getContextStats() {
314
390
  const systemPrompt = this.context.build({
315
391
  agentMd: this._readAgentMd(),
316
- skillIndex: this._skillLoader.formatForContext(),
392
+ skillIndex: this._skillLoader.formatForContext(this.currentPhase),
317
393
  pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
318
394
  workspaceState: this._buildWorkspaceState(),
319
395
  });
@@ -353,21 +429,37 @@ export class AgentEngine {
353
429
 
354
430
  // Heap-pressure diagnostic. The TUI has its own virtualization + tool-
355
431
  // output truncation (Bug 3 fixes), so Ink itself should never OOM. If we
356
- // still see high heap usage, something else is leaking — log it once per
357
- // pressure-crossing so operators can investigate without flooding logs.
432
+ // still see high heap usage, something else is leaking.
433
+ //
434
+ // A9: Original design logged once per pressure-crossing (edge-triggered),
435
+ // which went silent for 17h during E2E #3 while RSS climbed to 3.8GB.
436
+ // Now: still edge-trigger on entry (noisy otherwise), but ALSO re-emit
437
+ // every 15min while we're still above the threshold, so an operator
438
+ // watching logs after hour 4 still sees the signal. Drops to silent on
439
+ // recovery below 0.60.
358
440
  try {
359
441
  const mem = process.memoryUsage();
360
442
  const frac = mem.heapUsed / (mem.heapTotal || 1);
361
- if (frac > 0.80 && !this._memPressureLogged) {
362
- this._memPressureLogged = true;
363
- this.eventLog.append("memory_pressure", {
364
- heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
365
- heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
366
- rssMB: Math.round(mem.rss / 1024 / 1024),
367
- historyLength: this.history.messages.length,
368
- });
443
+ const now = Date.now();
444
+ const REPRESS_INTERVAL_MS = 15 * 60 * 1000;
445
+ if (frac > 0.80) {
446
+ const firstCrossing = !this._memPressureLogged;
447
+ const dueForRepress = this._memPressureLastEmittedAt &&
448
+ (now - this._memPressureLastEmittedAt) >= REPRESS_INTERVAL_MS;
449
+ if (firstCrossing || dueForRepress) {
450
+ this._memPressureLogged = true;
451
+ this._memPressureLastEmittedAt = now;
452
+ this.eventLog.append("memory_pressure", {
453
+ heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
454
+ heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
455
+ rssMB: Math.round(mem.rss / 1024 / 1024),
456
+ historyLength: this.history.messages.length,
457
+ kind: firstCrossing ? "crossing" : "sustained",
458
+ });
459
+ }
369
460
  } else if (frac < 0.60 && this._memPressureLogged) {
370
461
  this._memPressureLogged = false; // re-arm for next crossing
462
+ this._memPressureLastEmittedAt = null;
371
463
  }
372
464
  } catch { /* process.memoryUsage failures are non-fatal */ }
373
465
  }
@@ -701,7 +793,7 @@ export class AgentEngine {
701
793
 
702
794
  const systemPrompt = this.context.build({
703
795
  agentMd: this._readAgentMd(),
704
- skillIndex: this._skillLoader.formatForContext(),
796
+ skillIndex: this._skillLoader.formatForContext(this.currentPhase),
705
797
  pipelineState,
706
798
  workspaceState: this._buildWorkspaceState(),
707
799
  });
@@ -781,6 +873,30 @@ export class AgentEngine {
781
873
  });
782
874
 
783
875
  if (toolCallsAcc.size === 0) {
876
+ // A3: Empty-response guard. If the LLM returned no content AND no
877
+ // tool calls, count it. Two in a row almost always means the
878
+ // provider is silently failing (context exceeded, rate-limit
879
+ // corruption, auth expired) and continuing wastes tokens + time.
880
+ // Reset on any non-empty turn. Reason-tagged so /status can
881
+ // surface the running rate.
882
+ if (!collectedText || !collectedText.trim()) {
883
+ this._consecutiveEmptyResponses = (this._consecutiveEmptyResponses || 0) + 1;
884
+ this._totalEmptyResponses = (this._totalEmptyResponses || 0) + 1;
885
+ if (this._consecutiveEmptyResponses >= 2) {
886
+ const message =
887
+ `LLM returned empty response ${this._consecutiveEmptyResponses}× in a row — ` +
888
+ `likely context-length exceeded or provider-side silent failure. ` +
889
+ `Stopping this turn to prevent runaway API spend.`;
890
+ this.eventLog.append("error", { message, kind: "empty_response_streak" });
891
+ yield new AgentEvent({ type: "error", message });
892
+ this._consecutiveEmptyResponses = 0; // reset so next /run isn't blocked
893
+ return;
894
+ }
895
+ } else {
896
+ this._consecutiveEmptyResponses = 0;
897
+ }
898
+ this._totalTurns = (this._totalTurns || 0) + 1;
899
+
784
900
  // Bug 4 trigger (1): re-check phase criteria at end of every turn —
785
901
  // KC may have advanced state via conversation alone, without any
786
902
  // tool that the pipeline narrowly watches.
@@ -793,6 +909,10 @@ export class AgentEngine {
793
909
  return;
794
910
  }
795
911
 
912
+ // A3: A turn with tool_calls or content is not empty — reset streak.
913
+ this._consecutiveEmptyResponses = 0;
914
+ this._totalTurns = (this._totalTurns || 0) + 1;
915
+
796
916
  // Tool execution loop
797
917
  for (const tc of toolCallsAcc.values()) {
798
918
  let inputData = {};
@@ -803,6 +923,12 @@ export class AgentEngine {
803
923
  this.eventLog.append("tool_start", { name: tc.name, input: inputData });
804
924
  yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
805
925
 
926
+ // A1: Capture phase BEFORE tool execution. Some tools — notably
927
+ // phase_advance — mutate this.currentPhase via a callback without
928
+ // yielding any AgentEvent, so the TUI's status bar never gets the
929
+ // signal. We diff after execute() and emit a synthetic
930
+ // pipeline_event so subscribers can sync.
931
+ const beforePhase = this.currentPhase;
806
932
  const result = await this.toolRegistry.execute(tc.name, inputData);
807
933
 
808
934
  // Tool-call offloading: large outputs go to logs/tool_results/<traceId>.txt;
@@ -817,6 +943,29 @@ export class AgentEngine {
817
943
  isError: result.isError,
818
944
  traceId: offload?.traceId || null,
819
945
  });
946
+
947
+ // D3a: trace skill invocations. When a successful workspace_file or
948
+ // sandbox_exec call references a skill directory or SKILL.md in its path or
949
+ // command (skills are progressively-disclosed markdown), emit a skill_invoked event.
950
+ // Makes "which skills did KC actually consult?" answerable in post-run
951
+ // analysis — before this, skills were opaque to the event log.
952
+ try {
953
+ if (
954
+ !result.isError &&
955
+ (tc.name === "workspace_file" || tc.name === "sandbox_exec")
956
+ ) {
957
+ const p = String(inputData?.path || inputData?.command || "");
958
+ const skillMatch = p.match(/(?:template\/)?skills\/[a-z-]+\/(?:meta-meta|meta|skill-creator)\/([a-zA-Z0-9_-]+)(?:\/SKILL\.md|\/)?|\bSKILL\.md\b/);
959
+ if (skillMatch) {
960
+ const skillName = skillMatch[1] || "(unknown)";
961
+ this.eventLog.append("skill_invoked", {
962
+ skill: skillName,
963
+ via_tool: tc.name,
964
+ phase: this.currentPhase,
965
+ });
966
+ }
967
+ }
968
+ } catch { /* never let tracing break a tool call */ }
820
969
  yield new AgentEvent({
821
970
  type: "tool_result",
822
971
  name: tc.name,
@@ -837,6 +986,22 @@ export class AgentEngine {
837
986
  // user saw "CTX: 210% / stream terminated" with no recovery.
838
987
  this._maybeWindowAfterToolResult();
839
988
 
989
+ // A1: If the tool mutated the phase (e.g. phase_advance), emit the
990
+ // signal the TUI and pipelines need to re-sync state. Runs BEFORE
991
+ // pipeline.onToolResult so the fresh phase is active if the pipeline
992
+ // itself wants to react to the transition.
993
+ if (this.currentPhase !== beforePhase) {
994
+ yield new AgentEvent({
995
+ type: "pipeline_event",
996
+ data: {
997
+ type: "phase_changed",
998
+ from: beforePhase,
999
+ nextPhase: this.currentPhase,
1000
+ reason: `via ${tc.name}`,
1001
+ },
1002
+ });
1003
+ }
1004
+
840
1005
  // Pipeline controller: update state and re-register tools on phase change
841
1006
  if (pipeline?.onToolResult) {
842
1007
  const pEvent = pipeline.onToolResult(tc.name, inputData, result);
@@ -857,8 +1022,15 @@ export class AgentEngine {
857
1022
  if (ev) yield ev;
858
1023
 
859
1024
  } catch (err) {
860
- this.eventLog.append("error", { message: err.message });
861
- yield new AgentEvent({ type: "error", message: err.message });
1025
+ // A8: If the LLM client tagged the stream termination reason, pass
1026
+ // it through. Upstream log consumers + the TUI can then distinguish
1027
+ // "provider returned 429" from "socket died mid-token" from "SSE
1028
+ // buffer exploded" — today they're all just "Error: ...".
1029
+ const payload = { message: err.message };
1030
+ if (err.streamTermination) payload.kind = err.streamTermination;
1031
+ if (err.status) payload.status = err.status;
1032
+ this.eventLog.append("error", payload);
1033
+ yield new AgentEvent({ type: "error", message: err.message, ...payload });
862
1034
  return;
863
1035
  }
864
1036
  }
@@ -889,22 +1061,219 @@ export class AgentEngine {
889
1061
  return false;
890
1062
  }
891
1063
 
892
- const phaseSummary = `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${reason}${force && nextPhase !== expected ? " (forced)" : ""}`;
1064
+ // v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
1065
+ // detection so the LLM-narrated reason can be cross-checked against
1066
+ // ground-truth telemetry. Phase summaries become diagnostic, not just
1067
+ // narrative.
1068
+ const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
1069
+ const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
1070
+ const phaseSummary =
1071
+ `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
1072
+ (force && nextPhase !== expected ? " (forced)" : "") +
1073
+ (engineCounts ? `\n (engine) ${engineCounts}` : "");
893
1074
  this._phaseSummaries.push(phaseSummary);
894
1075
  this.eventLog.append("phase_transition", {
895
1076
  from: this.currentPhase,
896
1077
  to: nextPhase,
897
1078
  reason,
1079
+ engineCounts: engineCounts || null,
1080
+ possibleMismatch: !!mismatchPrefix,
898
1081
  forced: force && nextPhase !== expected,
899
1082
  });
1083
+ const fromPhase = this.currentPhase;
900
1084
  this.currentPhase = nextPhase;
901
1085
  this._registerToolsForPhase(this.currentPhase);
902
1086
  this.workspace.setPhase(this.currentPhase);
903
1087
  this._createTasksForPhase(this.currentPhase);
904
1088
  this.saveState();
1089
+
1090
+ // B8: Soft signal — surface any sub-agents left running from the prior
1091
+ // phase so the main agent's next turn can decide whether to kill them.
1092
+ // NOT automated: phase_advance can fire from _maybeAutoAdvance on a
1093
+ // criteria-flip, and auto-killing would couple lifecycle with blast
1094
+ // radius. This just informs.
1095
+ try {
1096
+ const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
1097
+ const runningIds = agentTool?.getRunningTaskIds?.() || [];
1098
+ if (runningIds.length > 0) {
1099
+ this.eventLog.append("stale_subagents", {
1100
+ from_phase: fromPhase,
1101
+ to_phase: nextPhase,
1102
+ running_task_ids: runningIds,
1103
+ hint: "These sub-agents were dispatched during the prior phase. Consider operation=poll to check status, or operation=kill to abort if stale.",
1104
+ });
1105
+ }
1106
+ } catch { /* never let signal emission break phase advance */ }
1107
+
905
1108
  return true;
906
1109
  }
907
1110
 
1111
+ /**
1112
+ * v0.6.1 A6: Single chokepoint for engine-emitted milestone updates.
1113
+ * Tools call this on successful execution to bump pipeline counters that
1114
+ * the phase-gate hardening (A2-A5) depends on. Without engine emission,
1115
+ * gates fall back to filesystem scans which can miss work that didn't
1116
+ * follow canonical output paths (E2E #4: `unified_qc.py` wrote to
1117
+ * `output/results/`, production-qc only scanned `output/qc/`).
1118
+ *
1119
+ * The mutation routes through the pipeline's existing internal state, so
1120
+ * exportState/importState round-trips work unchanged and the gate sees a
1121
+ * unified view of (filesystem-scanned + engine-emitted) signals.
1122
+ *
1123
+ * Three modes inferred from value shape:
1124
+ * - increment counter: pipeline[key] is number, value is number → add
1125
+ * - set in dict-by-id: pipeline[key] is object, value is { id, value? } → assign
1126
+ * - dedupe-add to array: pipeline[key] is array, value is string → push if absent
1127
+ *
1128
+ * @param {string} phase - Pipeline name (e.g., "distillation")
1129
+ * @param {string} key - Field on the pipeline (e.g., "workflowsTested")
1130
+ * @param {*} value - Shape varies by target type (see modes above)
1131
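+ * @returns {boolean} true if (target, value) matched one of the modes above — including a dedupe-add whose value was already present; false otherwise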
1132
+ */
1133
+ _recordMilestone(phase, key, value) {
1134
+ const pipeline = this.pipelines?.[phase];
1135
+ if (!pipeline) return false;
1136
+ const target = pipeline[key];
1137
+ // increment counter
1138
+ if (typeof target === "number" && typeof value === "number") {
1139
+ pipeline[key] = target + value;
1140
+ return true;
1141
+ }
1142
+ // set on dict-by-id
1143
+ if (target && typeof target === "object" && !Array.isArray(target)
1144
+ && value && typeof value === "object" && "id" in value) {
1145
+ target[value.id] = "value" in value ? value.value : true;
1146
+ return true;
1147
+ }
1148
+ // dedupe-add to array
1149
+ if (Array.isArray(target) && typeof value === "string") {
1150
+ if (!target.includes(value)) target.push(value);
1151
+ return true;
1152
+ }
1153
+ return false;
1154
+ }
1155
+
1156
+ /**
1157
+ * v0.6.1 B1: build a one-line "engine counts" block summarizing the
1158
+ * pipeline's ground-truth telemetry at the moment of phase advance.
1159
+ * Different phases surface different metrics; we keep this short so the
1160
+ * appended summary line stays readable.
1161
+ *
1162
+ * @param {string} fromPhase - The phase being LEFT (we summarize its work)
1163
+ * @returns {string} block text, or "" if pipeline has nothing to report
1164
+ */
1165
+ _buildEngineCountsBlock(fromPhase) {
1166
+ const pipeline = this.pipelines?.[fromPhase];
1167
+ if (!pipeline) return "";
1168
+ const parts = [];
1169
+ try {
1170
+ switch (fromPhase) {
1171
+ case "extraction": {
1172
+ const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
1173
+ parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
1174
+ parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
1175
+ parts.push(`rulesWithTests: ${pipeline.rulesWithTests?.length ?? 0}`);
1176
+ parts.push(`coverageAudited: ${pipeline.coverageAudited ? "yes" : "no"}`);
1177
+ break;
1178
+ }
1179
+ case "skill_authoring": {
1180
+ const totalRules = pipeline.totalRules?.length ?? 0;
1181
+ const covered = pipeline.ruleIdsCovered?.size ?? 0;
1182
+ parts.push(`rulesCovered: ${covered}/${totalRules}`);
1183
+ parts.push(`skillDirsAuthored: ${pipeline.skillsAuthored?.length ?? 0}`);
1184
+ if (this.taskManager) {
1185
+ const t = this.taskManager.countByPhase("skill_authoring");
1186
+ const d = this.taskManager.countByPhase("skill_authoring", "completed");
1187
+ const f = this.taskManager.countByPhase("skill_authoring", "failed");
1188
+ parts.push(`tasksCompleted: ${d}/${t}${f > 0 ? ` (+${f} failed)` : ""}`);
1189
+ }
1190
+ break;
1191
+ }
1192
+ case "skill_testing": {
1193
+ const total = pipeline.skillsToTest?.length ?? 0;
1194
+ const tested = Object.keys(pipeline.skillsTested || {}).length;
1195
+ const passing = pipeline.skillsPassing?.length ?? 0;
1196
+ parts.push(`skillsTested: ${tested}/${total}`);
1197
+ parts.push(`skillsPassing: ${passing}`);
1198
+ parts.push(`iterations: ${pipeline.iterationCount ?? 0}`);
1199
+ break;
1200
+ }
1201
+ case "distillation": {
1202
+ const total = pipeline.skillsToDistill?.length ?? 0;
1203
+ const created = Object.keys(pipeline.workflowsCreated || {}).length;
1204
+ const tested = Object.keys(pipeline.workflowsTested || {}).length;
1205
+ const passing = pipeline.workflowsPassing?.length ?? 0;
1206
+ parts.push(`workflowsCreated: ${created}/${total}`);
1207
+ parts.push(`workflowsTested: ${tested}/${total}`);
1208
+ parts.push(`workflowsPassing: ${passing}/${total}`);
1209
+ break;
1210
+ }
1211
+ case "production_qc": {
1212
+ parts.push(`batchesProcessed: ${pipeline.batchesProcessed ?? 0}`);
1213
+ parts.push(`documentsReviewed: ${pipeline.documentsReviewed ?? 0}`);
1214
+ parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
1215
+ break;
1216
+ }
1217
+ // bootstrap / finalization: no case matches, so parts stays empty and "" is returned
1218
+ }
1219
+ } catch { /* never let summary build break phase advance */ }
1220
+ return parts.join(", ");
1221
+ }
1222
+
1223
+ /**
1224
+ * v0.6.1 B1: heuristic mismatch detection. Conservative regex over the
1225
+ * LLM's free-form reason for percentages and counts, compared against
1226
+ * engine truth. INFORMATIONAL only — never blocks the transition. False
1227
+ * positives are acceptable (the warning is a hint to the human reviewer,
1228
+ * not a hard signal). False negatives are also acceptable (this catches
1229
+ * the loud, numerical claims; subtle ones still slip through).
1230
+ *
1231
+ * Returns true if the agent's reason mentions a count or percentage that
1232
+ * doesn't match engine state.
1233
+ */
1234
+ _detectSummaryMismatch(reason, fromPhase) {
1235
+ if (!reason || typeof reason !== "string") return false;
1236
+ const pipeline = this.pipelines?.[fromPhase];
1237
+ if (!pipeline) return false;
1238
+ try {
1239
+ // Match "N/M" fractions, e.g. "45/50" (standalone counts are handled by the next pattern)
1240
+ const fractionMatches = [...reason.matchAll(/(\d+)\s*\/\s*(\d+)/g)];
1241
+ // Match a count followed by a unit: "N rules / skills / workflows / tasks" (also 条规则 / 个技能)
1242
+ const countMatches = [...reason.matchAll(/(\d+)\s*(rules?|skills?|workflows?|tasks?|条规则|个技能)/gi)];
1243
+ // Match percentage claims like "95%" or "99.5%"; bare decimals such as "0.95" are NOT matched (the "%" is required)
1244
+ const pctMatches = [...reason.matchAll(/(\d+(?:\.\d+)?)\s*%/g)];
1245
+
1246
+ // Phase-specific cross-checks (cheap conservative comparisons)
1247
+ if (fromPhase === "skill_authoring" && this.taskManager) {
1248
+ const completed = this.taskManager.countByPhase("skill_authoring", "completed");
1249
+ const total = this.taskManager.countByPhase("skill_authoring");
1250
+ for (const m of fractionMatches) {
1251
+ const claimedDone = parseInt(m[1], 10);
1252
+ const claimedTotal = parseInt(m[2], 10);
1253
+ if (claimedTotal === total && claimedDone > completed + 5) return true;
1254
+ }
1255
+ }
1256
+ if (fromPhase === "skill_testing") {
1257
+ const tested = Object.keys(pipeline.skillsTested || {}).length;
1258
+ const passing = pipeline.skillsPassing?.length ?? 0;
1259
+ for (const m of pctMatches) {
1260
+ const claimed = parseFloat(m[1]);
1261
+ // If claimed >= 50% but the engine sees 0 skills tested and 0 passing, that's suspicious
1262
+ if (claimed >= 50 && tested === 0 && passing === 0) return true;
1263
+ }
1264
+ }
1265
+ if (fromPhase === "production_qc") {
1266
+ const batches = pipeline.batchesProcessed ?? 0;
1267
+ // Any claim of more than 10 rules/skills/workflows/tasks, or of more than 50%, while batches == 0 is suspicious
1268
+ if (batches === 0) {
1269
+ if (countMatches.some((m) => parseInt(m[1], 10) > 10)) return true;
1270
+ if (pctMatches.some((m) => parseFloat(m[1]) > 50)) return true;
1271
+ }
1272
+ }
1273
+ } catch { /* informational only — never block */ }
1274
+ return false;
1275
+ }
1276
+
908
1277
  /**
909
1278
  * Bug 4 trigger (1) auto-detect, edge-triggered (Bug 5): only fires on a
910
1279
  * fresh false → true flip in `exitCriteriaMet()`. Sessions resumed in an
@@ -972,6 +1341,16 @@ export class AgentEngine {
972
1341
  /**
973
1342
  * Create per-rule tasks when entering a new phase.
974
1343
  * Reads the rule catalog and creates one task per rule for the given phase.
1344
+ *
1345
+ * D6: For skill_authoring / skill_testing, filter rules via the bundle
1346
+ * classification cache (`cache/bundles/<hash>.classification.json`,
1347
+ * written by document_classify). Rules whose `applicable_product_types`
1348
+ * or `report_types` don't overlap with the bundle's classification get
1349
+ * SKIPPED at task-creation time — we don't mutate catalog.json to mark
1350
+ * them not_applicable, we just keep them out of the task queue. The
1351
+ * finalization phase (Group E) will report them in the coverage
1352
+ * artifact as "not applicable to this bundle." Conservative default:
1353
+ * if no classification exists, include all rules (pre-B9 behavior).
975
1354
  */
976
1355
  _createTasksForPhase(phase) {
977
1356
  if (!this.taskManager) return; // Sub-agents don't manage tasks
@@ -980,28 +1359,258 @@ export class AgentEngine {
980
1359
 
981
1360
  try {
982
1361
  const catalog = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
983
- const rules = normalizeRuleCatalog(catalog);
984
- if (rules.length > 0) {
985
- this.taskManager.createRuleTasks(rules, phase);
1362
+ let rules = normalizeRuleCatalog(catalog);
1363
+ if (rules.length === 0) return;
1364
+
1365
+ // D6: applicability pre-filter (skill phases only — bootstrap/extraction
1366
+ // have no task creation here per A6).
1367
+ if (phase === "skill_authoring" || phase === "skill_testing") {
1368
+ const classification = this._loadBundleClassification();
1369
+ if (classification) {
1370
+ const before = rules.length;
1371
+ rules = rules.filter((r) => this._ruleAppliesToBundle(r, classification));
1372
+ if (rules.length < before) {
1373
+ this.eventLog.append("applicability_prefilter", {
1374
+ phase,
1375
+ classification: {
1376
+ product_type: classification.product_type,
1377
+ report_type: classification.report_type,
1378
+ source: classification.source,
1379
+ },
1380
+ rules_before: before,
1381
+ rules_after: rules.length,
1382
+ skipped: before - rules.length,
1383
+ });
1384
+ }
1385
+ }
986
1386
  }
1387
+ this.taskManager.createRuleTasks(rules, phase);
987
1388
  } catch { /* skip if catalog can't be read */ }
988
1389
  }
989
1390
 
1391
+ /**
1392
+ * D6: Load the most recent bundle classification cache, if one exists.
1393
+ * Written by the `document_classify` tool. Returns null if no cache or
1394
+ * unreadable — callers must treat null as "all rules apply."
1395
+ */
1396
+ _loadBundleClassification() {
1397
+ const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
1398
+ if (!fs.existsSync(cacheDir)) return null;
1399
+ let entries;
1400
+ try { entries = fs.readdirSync(cacheDir); }
1401
+ catch { return null; }
1402
+ const files = entries
1403
+ .filter((n) => n.endsWith(".classification.json"))
1404
+ .map((n) => {
1405
+ const p = path.join(cacheDir, n);
1406
+ try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
1407
+ catch { return null; }
1408
+ })
1409
+ .filter(Boolean)
1410
+ .sort((a, b) => b.mtime - a.mtime);
1411
+ if (files.length === 0) return null;
1412
+ try { return JSON.parse(fs.readFileSync(files[0].path, "utf-8")); }
1413
+ catch { return null; }
1414
+ }
1415
+
1416
+ /**
1417
+ * D6: Rule-applicability check mirroring the AMC app's `applies_to`.
1418
+ * Conservative: returns true when we don't have enough info to
1419
+ * confidently skip (missing fields on rule, or classification with
1420
+ * empty product/report).
1421
+ */
1422
+ _ruleAppliesToBundle(rule, classification) {
1423
+ const docProduct = classification?.product_type || "";
1424
+ const docReport = classification?.report_type || "";
1425
+ const ruleProducts = rule.applicable_product_types || rule.applicable_sections || [];
1426
+ const ruleReports = rule.report_types || [];
1427
+
1428
+ const allProducts = ruleProducts.length === 0 ||
1429
+ ruleProducts.some((x) => x === "全部" || x === "all" || x === "");
1430
+ const allReports = ruleReports.length === 0 ||
1431
+ ruleReports.some((x) => x === "全部" || x === "all" || x === "");
1432
+ if (allProducts && allReports) return true;
1433
+
1434
+ const productOk = allProducts || (
1435
+ docProduct && ruleProducts.some((rp) => rp.includes(docProduct) || docProduct.includes(rp))
1436
+ );
1437
+ const reportOk = allReports || (
1438
+ docReport && ruleReports.some((rr) => rr.includes(docReport) || docReport.includes(rr))
1439
+ );
1440
+
1441
+ // Unknown classification → don't prefilter, let the agent judge.
1442
+ if (!docProduct && !docReport) return true;
1443
+ return productOk && reportOk;
1444
+ }
1445
+
1446
+ /**
1447
+ * D1: Enrich a skill_authoring / skill_testing task prompt with the
1448
+ * rule's source context — reads `source_chunk_ids` back-refs from
1449
+ * catalog.json (populated by extraction) and fetches chunk text from
1450
+ * the most recent BundleTree cache. Falls back to the minimal prompt
1451
+ * when catalog / cache aren't available.
1452
+ *
1453
+ * Previously the task prompt was ONE line — "Continue with next task:
1454
+ * ${title}" — leaving the skill-author agent to re-read the rule and
1455
+ * re-find its evidence per task. Auto-attach saves the LLM turn
1456
+ * needed for document_search on every task, and ensures the author
1457
+ * sees the exact regulation text the extractor used to justify the
1458
+ * rule.
1459
+ *
1460
+ * @param {{id: string, title: string, ruleId?: string, phase: string}} task
1461
+ * @returns {string}
1462
+ */
1463
+ _buildEnrichedTaskPrompt(task) {
1464
+ const fallback = `Continue with next task: ${task.title}` +
1465
+ (task.ruleId ? ` (rule: ${task.ruleId})` : "");
1466
+
1467
+ // Only enrich for rule-anchored phases
1468
+ if (task.phase !== "skill_authoring" && task.phase !== "skill_testing") {
1469
+ return fallback;
1470
+ }
1471
+ if (!task.ruleId) return fallback;
1472
+
1473
+ // Find the rule in catalog.json
1474
+ const catalogPath = path.join(this.workspace.cwd, "rules", "catalog.json");
1475
+ if (!fs.existsSync(catalogPath)) return fallback;
1476
+ let rules;
1477
+ try {
1478
+ rules = normalizeRuleCatalog(JSON.parse(fs.readFileSync(catalogPath, "utf-8")));
1479
+ } catch { return fallback; }
1480
+ const rule = rules.find((r) => r.id === task.ruleId);
1481
+ if (!rule) return fallback;
1482
+
1483
+ // Assemble the enriched brief. Every section is optional — when a
1484
+ // back-ref or cache is missing, just skip that section rather than
1485
+ // failing back to the minimal prompt.
1486
+ const lines = [];
1487
+ lines.push(`# Task: ${task.title}`);
1488
+ lines.push("");
1489
+ lines.push(`## Rule ${rule.id}`);
1490
+ if (rule.source_ref) lines.push(`Source: ${rule.source_ref}`);
1491
+ if (rule.severity) lines.push(`Severity: ${rule.severity}`);
1492
+ if (rule.description) lines.push(`\n${rule.description}`);
1493
+ if (rule.falsifiability_statement) lines.push(`\n**Falsifiability**: ${rule.falsifiability_statement}`);
1494
+ if (rule.test_case_stub) lines.push(`**Test stub**: ${rule.test_case_stub}`);
1495
+
1496
+ // D1: if rule has source_chunk_ids AND a BundleTree cache exists,
1497
+ // pull chunk text inline so the author doesn't need to call
1498
+ // bundle_search manually. Bounded to ~3000 tokens total to avoid
1499
+ // blowing the author's context budget.
1500
+ const chunkIds = Array.isArray(rule.source_chunk_ids) ? rule.source_chunk_ids : [];
1501
+ if (chunkIds.length > 0) {
1502
+ const chunks = this._loadChunksFromBundleCache(chunkIds);
1503
+ if (chunks.length > 0) {
1504
+ lines.push("");
1505
+ lines.push("## Source context");
1506
+ let totalChars = 0;
1507
+ const MAX_CHARS = 7500; // ~3000 CJK tokens
1508
+ for (const ch of chunks) {
1509
+ const header = `### ${ch.title || ch.chunk_id} · ${ch.source_file} p.${(ch.page_range || [1, 1]).join("-")}`;
1510
+ const body = (ch.content || "").trim();
1511
+ const block = `${header}\n${body}\n`;
1512
+ if (totalChars + block.length > MAX_CHARS) {
1513
+ lines.push(`\n[…${chunks.length - chunks.indexOf(ch)} more source chunks truncated; use bundle_search to retrieve them…]`);
1514
+ break;
1515
+ }
1516
+ lines.push("");
1517
+ lines.push(block);
1518
+ totalChars += block.length;
1519
+ }
1520
+ }
1521
+ }
1522
+
1523
+ // Sibling rules (same source_ref prefix) — helps the author see the
1524
+ // surrounding catalog and avoid re-implementing cross-referenced logic.
1525
+ const siblings = this._findSiblingRuleIds(rule, rules);
1526
+ if (siblings.length > 0) {
1527
+ lines.push("");
1528
+ lines.push(`## Sibling rules (same regulation section)`);
1529
+ lines.push(siblings.map((id) => `- ${id}`).join("\n"));
1530
+ }
1531
+
1532
+ lines.push("");
1533
+ lines.push("Write the skill to `rule_skills/<rule_id>/SKILL.md` + detect script. Prefer 1 rule = 1 skill dir (use `check_rNNN_rMMM.py` naming ONLY when rules share evidence and fail together).");
1534
+
1535
+ return lines.join("\n");
1536
+ }
1537
+
1538
+ /** D1: Load chunk text from the most recent BundleTree cache. */
1539
+ _loadChunksFromBundleCache(chunkIds) {
1540
+ const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
1541
+ if (!fs.existsSync(cacheDir)) return [];
1542
+ let entries;
1543
+ try { entries = fs.readdirSync(cacheDir); }
1544
+ catch { return []; }
1545
+ const candidates = entries
1546
+ .filter((n) => n.endsWith(".json") && !n.endsWith(".classification.json"))
1547
+ .map((n) => {
1548
+ const p = path.join(cacheDir, n);
1549
+ try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
1550
+ catch { return null; }
1551
+ })
1552
+ .filter(Boolean)
1553
+ .sort((a, b) => b.mtime - a.mtime);
1554
+ if (candidates.length === 0) return [];
1555
+ let tree;
1556
+ try { tree = JSON.parse(fs.readFileSync(candidates[0].path, "utf-8")); }
1557
+ catch { return []; }
1558
+ const out = [];
1559
+ for (const cid of chunkIds) {
1560
+ const ch = tree.chunks?.[cid];
1561
+ if (ch) out.push(ch);
1562
+ }
1563
+ return out;
1564
+ }
1565
+
1566
+ /** D1: Rules that share the same regulation article (naive: source_ref prefix). */
1567
+ _findSiblingRuleIds(rule, allRules) {
1568
+ if (!rule.source_ref) return [];
1569
+ const prefix = rule.source_ref.split(/[第条款项]/)[0].trim();
1570
+ if (!prefix) return [];
1571
+ return allRules
1572
+ .filter((r) => r.id !== rule.id && (r.source_ref || "").startsWith(prefix))
1573
+ .slice(0, 8)
1574
+ .map((r) => r.id);
1575
+ }
1576
+
990
1577
  /**
991
1578
  * Ralph-loop: run a turn, then auto-continue through pending tasks.
992
1579
  * Compacts context aggressively between tasks to prevent context blowup.
993
1580
  * If no tasks exist, behaves identically to runTurn().
994
1581
  *
995
1582
  * @param {string} userMessage
1583
+ * @param {{parallelism?: number}} [opts] — B1: optional parallel mode.
1584
+ * N > 1 dispatches tasks through N concurrent subagents (using the
1585
+ * agent_tool infrastructure from B8). Clamped to `effectiveParallelism`
1586
+ * from config.js — which silently downgrades to 1 unless
1587
+ * KC_PARALLELISM_VERIFIED=1 is set AND heap.jsonl shows flat RSS
1588
+ * (B0.6 guard; prevents accidental $100+ runaway runs).
996
1589
  * @yields {AgentEvent}
997
1590
  */
998
- async *runTaskLoop(userMessage) {
1591
+ async *runTaskLoop(userMessage, opts = {}) {
999
1592
  // Sub-agents don't run task loops — they execute one task and exit
1000
1593
  if (!this.taskManager) {
1001
1594
  yield* this.runTurn(userMessage);
1002
1595
  return;
1003
1596
  }
1004
1597
 
1598
+ // B1: resolve effective parallelism. Caller opts override config.
1599
+ const requested = Number.isFinite(opts.parallelism)
1600
+ ? Math.max(1, Math.min(8, opts.parallelism))
1601
+ : (this.config.effectiveParallelism?.() ?? 1);
1602
+
1603
+ if (requested > 1) {
1604
+ yield* this._runTaskLoopParallel(userMessage, requested);
1605
+ return;
1606
+ }
1607
+
1608
+ yield* this._runTaskLoopSerial(userMessage);
1609
+ }
1610
+
1611
+ /** B1: original serial ralph-loop path — one task at a time, shared
1612
+ * conversation history. Unchanged from pre-v0.6.0 behavior. */
1613
+ async *_runTaskLoopSerial(userMessage) {
1005
1614
  // Run the initial turn (user's request)
1006
1615
  yield* this.runTurn(userMessage);
1007
1616
 
@@ -1015,8 +1624,11 @@ export class AgentEngine {
1015
1624
  await this.compact({ recentCount: 8 });
1016
1625
  }
1017
1626
 
1018
- const task = this.taskManager.getNextPending();
1019
- this.taskManager.updateTask(task.id, { status: "in_progress" });
1627
+ // B2: atomic claim — for serial we could use getNextPending, but
1628
+ // using claimNextPending gives us consistent state fields (worker
1629
+ // label, startedAt) whether in serial or parallel mode.
1630
+ const task = this.taskManager.claimNextPending("serial");
1631
+ if (!task) break;
1020
1632
 
1021
1633
  // Yield task progress event for TUI
1022
1634
  yield new AgentEvent({
@@ -1030,14 +1642,15 @@ export class AgentEngine {
1030
1642
  },
1031
1643
  });
1032
1644
 
1033
- // Synthesize a task-focused prompt
1034
- const taskPrompt = `Continue with next task: ${task.title}` +
1035
- (task.ruleId ? ` (rule: ${task.ruleId})` : "");
1645
+ // D1: synthesize a task-focused prompt, enriched with rule source
1646
+ // context (rule NL + source_ref + chunk text + sibling ids) when
1647
+ // the catalog + BundleTree cache are available. Falls back to the
1648
+ // minimal "Continue with next task" line otherwise.
1649
+ const taskPrompt = this._buildEnrichedTaskPrompt(task);
1036
1650
 
1037
1651
  yield* this.runTurn(taskPrompt);
1038
1652
 
1039
- this.taskManager.updateTask(task.id, { status: "completed" });
1040
- this.taskManager.save();
1653
+ this.taskManager.markDone(task.id);
1041
1654
  this.saveState();
1042
1655
 
1043
1656
  yield new AgentEvent({
@@ -1074,6 +1687,190 @@ export class AgentEngine {
1074
1687
  }
1075
1688
  }
1076
1689
 
1690
+ /**
1691
+ * B1: Parallel ralph-loop — N concurrent subagents each executing one
1692
+ * task at a time, claimed atomically from TaskManager.
1693
+ *
1694
+ * Implementation: leverages B8's agent_tool infrastructure. Each worker
1695
+ * slot is a sub-engine with its own heap-isolated history; workspace
1696
+ * writes are serialized through B9's file locks. The main engine acts
1697
+ * as dispatcher — it claims tasks and spawns subagents, then waits.
1698
+ *
1699
+ * Chosen over in-process history-forking because: (a) sub-engines are
1700
+ * already heap-isolated (good under B0's RSS-safety regime); (b)
1701
+ * kill authority from B8 applies uniformly; (c) no runTurn refactor
1702
+ * needed — the engine's conversation-state assumptions stay intact.
1703
+ * Trade-off: each task pays a cold-start cost (re-read AGENT.md,
1704
+ * skill index, pipeline state). For 100+ task sessions this is
1705
+ * amortized against the 2-4× wall-clock speedup.
1706
+ */
1707
+ async *_runTaskLoopParallel(userMessage, parallelism) {
1708
+ // Initial turn: main agent reads user request, creates tasks.
1709
+ yield* this.runTurn(userMessage);
1710
+
1711
+ const agentTool = this._buildTools.core.find((t) => t?.name === "agent_tool");
1712
+ if (!agentTool) {
1713
+ // Shouldn't happen (agent_tool is core), but fall back safely.
1714
+ yield new AgentEvent({
1715
+ type: "error",
1716
+ message: "agent_tool not registered; parallel mode requires it. Falling back to serial.",
1717
+ });
1718
+ yield* this._runTaskLoopSerial("");
1719
+ return;
1720
+ }
1721
+
1722
+ // Event queue so concurrent workers can yield progress through a
1723
+ // single async-generator consumer. push-style with a notifier.
1724
+ const eventQueue = [];
1725
+ let notify = null;
1726
+ const enq = (ev) => {
1727
+ eventQueue.push(ev);
1728
+ if (notify) { const n = notify; notify = null; n(); }
1729
+ };
1730
+
1731
+ // In-flight: subagent task_id → { task, promise }
1732
+ const inFlight = new Map();
1733
+
1734
+ const dispatch = async () => {
1735
+ while (inFlight.size < parallelism) {
1736
+ const task = this.taskManager.claimNextPending(`pool${inFlight.size}`);
1737
+ if (!task) return;
1738
+
1739
+ const workerLabel = `pool${[...inFlight.keys()].length}`;
1740
+ const subId = `pool_${task.id}`.replace(/[^A-Za-z0-9_-]/g, "_").slice(0, 60);
1741
+
1742
+ // D1: build the enriched brief with source context. Parallel workers
1743
+ // are subagents — each with zero conversation history, so the brief
1744
+ // must carry everything they need. Even more important to have
1745
+ // source context inline vs. expecting them to call document_search.
1746
+ const enriched = this._buildEnrichedTaskPrompt(task);
1747
+ const brief =
1748
+ enriched +
1749
+ `\n\nNOTE (parallel worker): write outputs via workspace_file or ` +
1750
+ `rule_catalog — do NOT write to shared coordination files ` +
1751
+ `(rules/catalog.json, rules/manifest.json) via sandbox_exec; they're ` +
1752
+ `lock-protected and bypassing the lock will race with other workers.`;
1753
+
1754
+ enq(new AgentEvent({
1755
+ type: "task_progress",
1756
+ data: {
1757
+ taskId: task.id, title: task.title, ruleId: task.ruleId,
1758
+ status: "in_progress", worker: workerLabel,
1759
+ progress: this.taskManager.progress,
1760
+ },
1761
+ }));
1762
+
1763
+ // Spawn via the tool's public API. agent_tool writes status.txt,
1764
+ // abort controller, etc. We read _runningTasks to get a promise
1765
+ // handle we can await.
1766
+ const spawnRes = await agentTool.execute({
1767
+ operation: "spawn",
1768
+ task_description: brief,
1769
+ task_id: subId,
1770
+ });
1771
+
1772
+ if (spawnRes.isError) {
1773
+ this.taskManager.markFailed(task.id, `spawn failed: ${spawnRes.content}`);
1774
+ enq(new AgentEvent({
1775
+ type: "task_progress",
1776
+ data: { taskId: task.id, status: "failed", worker: workerLabel },
1777
+ }));
1778
+ continue;
1779
+ }
1780
+
1781
+ const entry = agentTool._runningTasks.get(subId);
1782
+ if (!entry) {
1783
+ // Sub-agent completed synchronously (no events) — mark done.
1784
+ this.taskManager.markDone(task.id);
1785
+ enq(new AgentEvent({
1786
+ type: "task_progress",
1787
+ data: { taskId: task.id, status: "completed", worker: workerLabel },
1788
+ }));
1789
+ continue;
1790
+ }
1791
+
1792
+ const trackedPromise = entry.promise.then(
1793
+ () => ({ taskId: task.id, subId, ok: true }),
1794
+ (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
1795
+ );
1796
+ inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
1797
+ }
1798
+ };
1799
+
1800
+ // Prime the pool
1801
+ await dispatch();
1802
+
1803
+ // Drain events + replenish until queue is empty and all in-flight done.
1804
+ while (inFlight.size > 0 || eventQueue.length > 0) {
1805
+ // Drain all queued events first
1806
+ while (eventQueue.length > 0) yield eventQueue.shift();
1807
+
1808
+ if (inFlight.size === 0) break;
1809
+
1810
+ // Wait for either the next event OR a worker to complete
1811
+ const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
1812
+ const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
1813
+ const winner = await Promise.race([
1814
+ workerCompletion.then((done) => ({ kind: "worker", done })),
1815
+ eventArrival.then(() => ({ kind: "event" })),
1816
+ ]);
1817
+
1818
+ if (winner.kind === "worker") {
1819
+ const { taskId, subId, ok, error } = winner.done;
1820
+ const entry = inFlight.get(subId);
1821
+ inFlight.delete(subId);
1822
+
1823
+ if (ok) {
1824
+ this.taskManager.markDone(taskId);
1825
+ enq(new AgentEvent({
1826
+ type: "task_progress",
1827
+ data: {
1828
+ taskId, status: "completed",
1829
+ worker: entry?.workerLabel,
1830
+ progress: this.taskManager.progress,
1831
+ },
1832
+ }));
1833
+ } else {
1834
+ this.taskManager.markFailed(taskId, error);
1835
+ enq(new AgentEvent({
1836
+ type: "task_progress",
1837
+ data: {
1838
+ taskId, status: "failed",
1839
+ worker: entry?.workerLabel,
1840
+ error,
1841
+ progress: this.taskManager.progress,
1842
+ },
1843
+ }));
1844
+ }
1845
+
1846
+ // Refill the pool. If no pending tasks left, in-flight drains naturally.
1847
+ await dispatch();
1848
+ }
1849
+ // event winner: loop re-iterates and drains eventQueue
1850
+ }
1851
+
1852
+ this.saveState();
1853
+
1854
+ // After all workers done, check for phase auto-advance (same as serial path).
1855
+ if (this._allCurrentPhaseTasksComplete()) {
1856
+ const pipeline = this.pipelines[this.currentPhase];
1857
+ let exitMet = false;
1858
+ try { exitMet = !!pipeline?.exitCriteriaMet?.(); } catch { exitMet = false; }
1859
+ if (exitMet) {
1860
+ const next = NEXT_PHASE[this.currentPhase];
1861
+ if (next) {
1862
+ const advanced = this._advancePhase(next, "all parallel tasks completed + exit criteria met");
1863
+ if (advanced) {
1864
+ yield new AgentEvent({
1865
+ type: "pipeline_event",
1866
+ data: { type: "phase_ready", nextPhase: next, message: "all phase tasks done; exit criteria met" },
1867
+ });
1868
+ }
1869
+ }
1870
+ }
1871
+ }
1872
+ }
1873
+
1077
1874
  /**
1078
1875
  * True when every task tagged with the current phase is in a terminal state
1079
1876
  * (completed | failed | skipped) and at least one such task exists. Used by