kc-beta 0.5.5 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/QUICKSTART.md +17 -4
  2. package/README.md +58 -11
  3. package/bin/kc-beta.js +35 -1
  4. package/package.json +1 -1
  5. package/src/agent/bundle-tree.js +553 -0
  6. package/src/agent/context.js +40 -1
  7. package/src/agent/engine.js +644 -28
  8. package/src/agent/llm-client.js +67 -18
  9. package/src/agent/pipelines/finalization.js +186 -0
  10. package/src/agent/pipelines/index.js +8 -0
  11. package/src/agent/pipelines/initializer.js +40 -0
  12. package/src/agent/pipelines/skill-authoring.js +100 -6
  13. package/src/agent/skill-loader.js +54 -4
  14. package/src/agent/task-manager.js +66 -3
  15. package/src/agent/tools/agent-tool.js +283 -35
  16. package/src/agent/tools/bundle-search.js +146 -0
  17. package/src/agent/tools/document-chunk.js +246 -0
  18. package/src/agent/tools/document-classify.js +311 -0
  19. package/src/agent/tools/document-parse.js +8 -1
  20. package/src/agent/tools/phase-advance.js +30 -7
  21. package/src/agent/tools/registry.js +10 -0
  22. package/src/agent/tools/rule-catalog.js +17 -3
  23. package/src/agent/tools/sandbox-exec.js +30 -0
  24. package/src/agent/workspace.js +168 -14
  25. package/src/cli/components.js +165 -17
  26. package/src/cli/index.js +166 -19
  27. package/src/cli/meme.js +58 -0
  28. package/src/config.js +39 -2
  29. package/src/model-tiers.json +3 -2
  30. package/src/providers.js +34 -1
  31. package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
  32. package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
  33. package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
  34. package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0
@@ -19,6 +19,9 @@ import { ReleaseTool } from "./tools/release.js";
19
19
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
20
20
  import { DocumentParseTool } from "./tools/document-parse.js";
21
21
  import { DocumentSearchTool } from "./tools/document-search.js";
22
+ import { DocumentChunkTool } from "./tools/document-chunk.js";
23
+ import { BundleSearchTool } from "./tools/bundle-search.js";
24
+ import { DocumentClassifyTool } from "./tools/document-classify.js";
22
25
  import { WorkerLLMCallTool } from "./tools/worker-llm-call.js";
23
26
  import { WorkflowRunTool } from "./tools/workflow-run.js";
24
27
  import { RuleCatalogTool } from "./tools/rule-catalog.js";
@@ -38,6 +41,7 @@ import { SkillAuthoringPipeline } from "./pipelines/skill-authoring.js";
38
41
  import { SkillTestingPipeline } from "./pipelines/skill-testing.js";
39
42
  import { DistillationEngine as DistillationPipeline } from "./pipelines/distillation.js";
40
43
  import { ProductionQCPipeline } from "./pipelines/production-qc.js";
44
+ import { FinalizationPipeline } from "./pipelines/finalization.js";
41
45
  import { EventLog } from "./event-log.js";
42
46
  import { ContextWindow } from "./context-window.js";
43
47
  import { SessionState } from "./session-state.js";
@@ -48,8 +52,10 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
48
52
  // or kc_max_tokens in the global config.
49
53
  const DEFAULT_KC_MAX_TOKENS = 65536;
50
54
 
51
- // Phases where worker LLM tools are available (DISTILL mode)
52
- const DISTILL_PHASES = new Set([Phase.DISTILLATION, Phase.PRODUCTION_QC]);
55
+ // Phases where worker LLM tools are available (DISTILL mode).
56
+ // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
57
+ // runs + dashboard_render + workflow_run stay usable during packaging.
58
+ const DISTILL_PHASES = new Set([Phase.DISTILLATION, Phase.PRODUCTION_QC, Phase.FINALIZATION]);
53
59
 
54
60
  // Linear phase order — used by auto-advance (Bug 4). Last phase has no successor.
55
61
  // Exported so the TUI's /phase slash command (src/cli/index.js) can call
@@ -60,6 +66,7 @@ export const NEXT_PHASE = {
60
66
  [Phase.SKILL_AUTHORING]: Phase.SKILL_TESTING,
61
67
  [Phase.SKILL_TESTING]: Phase.DISTILLATION,
62
68
  [Phase.DISTILLATION]: Phase.PRODUCTION_QC,
69
+ [Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
63
70
  };
64
71
 
65
72
  /**
@@ -162,6 +169,7 @@ export class AgentEngine {
162
169
  [Phase.SKILL_TESTING]: new SkillTestingPipeline(this.workspace),
163
170
  [Phase.DISTILLATION]: new DistillationPipeline(this.workspace),
164
171
  [Phase.PRODUCTION_QC]: new ProductionQCPipeline(this.workspace),
172
+ [Phase.FINALIZATION]: new FinalizationPipeline(this.workspace), // E1
165
173
  };
166
174
 
167
175
  // Skill discovery (Claude Code pattern: index in context, full content on demand)
@@ -181,6 +189,61 @@ export class AgentEngine {
181
189
  this._lastReady = Object.fromEntries(
182
190
  Object.keys(this.pipelines).map((p) => [p, false]),
183
191
  );
192
+
193
+ // B0.1: Heap sampler. Parent engines only — sub-agents share a process
194
+ // with the parent and would double-log. Writes a single JSONL line
195
+ // per minute to <workspace>/logs/heap.jsonl with the numbers needed
196
+ // to diagnose RSS creep (heapUsed/heapTotal/external/rss/arrayBuffers,
197
+ // plus active task count and history length). Always on, ~60 bytes
198
+ // per minute to disk.
199
+ this._heapSamplerStop = this._isSubagent ? null : this._startHeapSampler();
200
+ }
201
+
202
+ /**
203
+ * Start sampling process.memoryUsage() every 60 s into logs/heap.jsonl.
204
+ * Returns a stop fn. Timer is .unref()'d so it never keeps the process
205
+ * alive by itself. Failures are silently suppressed — this is a
206
+ * diagnostic, not a correctness feature.
207
+ */
208
+ _startHeapSampler() {
209
+ const logDir = path.join(this.workspace.cwd, "logs");
210
+ const logPath = path.join(logDir, "heap.jsonl");
211
+ const sample = () => {
212
+ try {
213
+ const mem = process.memoryUsage();
214
+ const row = {
215
+ t: new Date().toISOString(),
216
+ seq: this.eventLog?.currentSeq ?? 0,
217
+ phase: this.currentPhase,
218
+ rssMB: Math.round(mem.rss / 1024 / 1024),
219
+ heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
220
+ heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
221
+ externalMB: Math.round((mem.external || 0) / 1024 / 1024),
222
+ arrayBuffersMB: Math.round((mem.arrayBuffers || 0) / 1024 / 1024),
223
+ historyLen: this.history?.messages?.length ?? 0,
224
+ tasksPending: this.taskManager?.progress?.pending ?? 0,
225
+ tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
226
+ };
227
+ fs.mkdirSync(logDir, { recursive: true });
228
+ fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
229
+ } catch { /* never fatal */ }
230
+ };
231
+ // Record one sample at startup so we have a baseline even on short runs.
232
+ sample();
233
+ const timer = setInterval(sample, 60_000);
234
+ timer.unref?.();
235
+ return () => {
236
+ try {
237
+ clearInterval(timer);
238
+ sample(); // one final sample on shutdown
239
+ } catch { /* ignore */ }
240
+ };
241
+ }
242
+
243
+ /** Stop background diagnostics. Call on graceful shutdown. */
244
+ stop() {
245
+ try { this._heapSamplerStop?.(); } catch { /* ignore */ }
246
+ this._heapSamplerStop = null;
184
247
  }
185
248
 
186
249
  /**
@@ -214,7 +277,10 @@ export class AgentEngine {
214
277
  new ArchiveFileTool(this.workspace),
215
278
  new ScheduleFetchTool(this.workspace),
216
279
  new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
217
- new PhaseAdvanceTool((to, reason, opts) => this._advancePhase(to, reason, opts)),
280
+ new PhaseAdvanceTool(
281
+ (to, reason, opts) => this._advancePhase(to, reason, opts),
282
+ () => this.currentPhase, // H1: tool reads phase BEFORE its own call
283
+ ),
218
284
  new DocumentParseTool(this.workspace, {
219
285
  mineruApiUrl: this.config.mineruApiUrl,
220
286
  mineruApiKey: this.config.mineruApiKey,
@@ -223,6 +289,12 @@ export class AgentEngine {
223
289
  ocrModel: vlmModel,
224
290
  }),
225
291
  new DocumentSearchTool(this.workspace),
292
+ // Group C — chunker/RAG infrastructure ported from AMC app. Core
293
+ // tools (not phase-gated): useful from BOOTSTRAP through FINALIZATION
294
+ // for any doc-heavy project, not just rule extraction.
295
+ new DocumentChunkTool(this.workspace),
296
+ new BundleSearchTool(this.workspace),
297
+ new DocumentClassifyTool(this.workspace, this.config),
226
298
  new RuleCatalogTool(this.workspace),
227
299
  new EvolutionCycleTool(this.workspace, this.cornerCases),
228
300
  new DashboardRenderTool(this.workspace),
@@ -313,7 +385,7 @@ export class AgentEngine {
313
385
  getContextStats() {
314
386
  const systemPrompt = this.context.build({
315
387
  agentMd: this._readAgentMd(),
316
- skillIndex: this._skillLoader.formatForContext(),
388
+ skillIndex: this._skillLoader.formatForContext(this.currentPhase),
317
389
  pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
318
390
  workspaceState: this._buildWorkspaceState(),
319
391
  });
@@ -353,21 +425,37 @@ export class AgentEngine {
353
425
 
354
426
  // Heap-pressure diagnostic. The TUI has its own virtualization + tool-
355
427
  // output truncation (Bug 3 fixes), so Ink itself should never OOM. If we
356
- // still see high heap usage, something else is leaking — log it once per
357
- // pressure-crossing so operators can investigate without flooding logs.
428
+ // still see high heap usage, something else is leaking.
429
+ //
430
+ // A9: Original design logged once per pressure-crossing (edge-triggered),
431
+ // which went silent for 17h during E2E #3 while RSS climbed to 3.8GB.
432
+ // Now: still edge-trigger on entry (noisy otherwise), but ALSO re-emit
433
+ // every 15min while we're still above the threshold, so an operator
434
+ // watching logs after hour 4 still sees the signal. Drops to silent on
435
+ // recovery below 0.60.
358
436
  try {
359
437
  const mem = process.memoryUsage();
360
438
  const frac = mem.heapUsed / (mem.heapTotal || 1);
361
- if (frac > 0.80 && !this._memPressureLogged) {
362
- this._memPressureLogged = true;
363
- this.eventLog.append("memory_pressure", {
364
- heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
365
- heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
366
- rssMB: Math.round(mem.rss / 1024 / 1024),
367
- historyLength: this.history.messages.length,
368
- });
439
+ const now = Date.now();
440
+ const REPRESS_INTERVAL_MS = 15 * 60 * 1000;
441
+ if (frac > 0.80) {
442
+ const firstCrossing = !this._memPressureLogged;
443
+ const dueForRepress = this._memPressureLastEmittedAt &&
444
+ (now - this._memPressureLastEmittedAt) >= REPRESS_INTERVAL_MS;
445
+ if (firstCrossing || dueForRepress) {
446
+ this._memPressureLogged = true;
447
+ this._memPressureLastEmittedAt = now;
448
+ this.eventLog.append("memory_pressure", {
449
+ heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
450
+ heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
451
+ rssMB: Math.round(mem.rss / 1024 / 1024),
452
+ historyLength: this.history.messages.length,
453
+ kind: firstCrossing ? "crossing" : "sustained",
454
+ });
455
+ }
369
456
  } else if (frac < 0.60 && this._memPressureLogged) {
370
457
  this._memPressureLogged = false; // re-arm for next crossing
458
+ this._memPressureLastEmittedAt = null;
371
459
  }
372
460
  } catch { /* process.memoryUsage failures are non-fatal */ }
373
461
  }
@@ -701,7 +789,7 @@ export class AgentEngine {
701
789
 
702
790
  const systemPrompt = this.context.build({
703
791
  agentMd: this._readAgentMd(),
704
- skillIndex: this._skillLoader.formatForContext(),
792
+ skillIndex: this._skillLoader.formatForContext(this.currentPhase),
705
793
  pipelineState,
706
794
  workspaceState: this._buildWorkspaceState(),
707
795
  });
@@ -781,6 +869,30 @@ export class AgentEngine {
781
869
  });
782
870
 
783
871
  if (toolCallsAcc.size === 0) {
872
+ // A3: Empty-response guard. If the LLM returned no content AND no
873
+ // tool calls, count it. Two in a row almost always means the
874
+ // provider is silently failing (context exceeded, rate-limit
875
+ // corruption, auth expired) and continuing wastes tokens + time.
876
+ // Reset on any non-empty turn. Reason-tagged so /status can
877
+ // surface the running rate.
878
+ if (!collectedText || !collectedText.trim()) {
879
+ this._consecutiveEmptyResponses = (this._consecutiveEmptyResponses || 0) + 1;
880
+ this._totalEmptyResponses = (this._totalEmptyResponses || 0) + 1;
881
+ if (this._consecutiveEmptyResponses >= 2) {
882
+ const message =
883
+ `LLM returned empty response ${this._consecutiveEmptyResponses}× in a row — ` +
884
+ `likely context-length exceeded or provider-side silent failure. ` +
885
+ `Stopping this turn to prevent runaway API spend.`;
886
+ this.eventLog.append("error", { message, kind: "empty_response_streak" });
887
+ yield new AgentEvent({ type: "error", message });
888
+ this._consecutiveEmptyResponses = 0; // reset so next /run isn't blocked
889
+ return;
890
+ }
891
+ } else {
892
+ this._consecutiveEmptyResponses = 0;
893
+ }
894
+ this._totalTurns = (this._totalTurns || 0) + 1;
895
+
784
896
  // Bug 4 trigger (1): re-check phase criteria at end of every turn —
785
897
  // KC may have advanced state via conversation alone, without any
786
898
  // tool that the pipeline narrowly watches.
@@ -793,6 +905,10 @@ export class AgentEngine {
793
905
  return;
794
906
  }
795
907
 
908
+ // A3: A turn with tool_calls or content is not empty — reset streak.
909
+ this._consecutiveEmptyResponses = 0;
910
+ this._totalTurns = (this._totalTurns || 0) + 1;
911
+
796
912
  // Tool execution loop
797
913
  for (const tc of toolCallsAcc.values()) {
798
914
  let inputData = {};
@@ -803,6 +919,12 @@ export class AgentEngine {
803
919
  this.eventLog.append("tool_start", { name: tc.name, input: inputData });
804
920
  yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
805
921
 
922
+ // A1: Capture phase BEFORE tool execution. Some tools — notably
923
+ // phase_advance — mutate this.currentPhase via a callback without
924
+ // yielding any AgentEvent, so the TUI's status bar never gets the
925
+ // signal. We diff after execute() and emit a synthetic
926
+ // pipeline_event so subscribers can sync.
927
+ const beforePhase = this.currentPhase;
806
928
  const result = await this.toolRegistry.execute(tc.name, inputData);
807
929
 
808
930
  // Tool-call offloading: large outputs go to logs/tool_results/<traceId>.txt;
@@ -817,6 +939,29 @@ export class AgentEngine {
817
939
  isError: result.isError,
818
940
  traceId: offload?.traceId || null,
819
941
  });
942
+
943
+ // D3a: trace skill invocations. When the agent reads a SKILL.md via
944
+ // workspace_file (the canonical way KC "uses" a skill, since skills
945
+ // are progressively-disclosed markdown), emit a skill_invoked event.
946
+ // Makes "which skills did KC actually consult?" answerable in post-run
947
+ // analysis — before this, skills were opaque to the event log.
948
+ try {
949
+ if (
950
+ !result.isError &&
951
+ (tc.name === "workspace_file" || tc.name === "sandbox_exec")
952
+ ) {
953
+ const p = String(inputData?.path || inputData?.command || "");
954
+ const skillMatch = p.match(/(?:template\/)?skills\/[a-z-]+\/(?:meta-meta|meta|skill-creator)\/([a-zA-Z0-9_-]+)(?:\/SKILL\.md|\/)?|\bSKILL\.md\b/);
955
+ if (skillMatch) {
956
+ const skillName = skillMatch[1] || "(unknown)";
957
+ this.eventLog.append("skill_invoked", {
958
+ skill: skillName,
959
+ via_tool: tc.name,
960
+ phase: this.currentPhase,
961
+ });
962
+ }
963
+ }
964
+ } catch { /* never let tracing break a tool call */ }
820
965
  yield new AgentEvent({
821
966
  type: "tool_result",
822
967
  name: tc.name,
@@ -837,6 +982,22 @@ export class AgentEngine {
837
982
  // user saw "CTX: 210% / stream terminated" with no recovery.
838
983
  this._maybeWindowAfterToolResult();
839
984
 
985
+ // A1: If the tool mutated the phase (e.g. phase_advance), emit the
986
+ // signal the TUI and pipelines need to re-sync state. Runs BEFORE
987
+ // pipeline.onToolResult so the fresh phase is active if the pipeline
988
+ // itself wants to react to the transition.
989
+ if (this.currentPhase !== beforePhase) {
990
+ yield new AgentEvent({
991
+ type: "pipeline_event",
992
+ data: {
993
+ type: "phase_changed",
994
+ from: beforePhase,
995
+ nextPhase: this.currentPhase,
996
+ reason: `via ${tc.name}`,
997
+ },
998
+ });
999
+ }
1000
+
840
1001
  // Pipeline controller: update state and re-register tools on phase change
841
1002
  if (pipeline?.onToolResult) {
842
1003
  const pEvent = pipeline.onToolResult(tc.name, inputData, result);
@@ -857,8 +1018,15 @@ export class AgentEngine {
857
1018
  if (ev) yield ev;
858
1019
 
859
1020
  } catch (err) {
860
- this.eventLog.append("error", { message: err.message });
861
- yield new AgentEvent({ type: "error", message: err.message });
1021
+ // A8: If the LLM client tagged the stream termination reason, pass
1022
+ // it through. Upstream log consumers + the TUI can then distinguish
1023
+ // "provider returned 429" from "socket died mid-token" from "SSE
1024
+ // buffer exploded" — today they're all just "Error: ...".
1025
+ const payload = { message: err.message };
1026
+ if (err.streamTermination) payload.kind = err.streamTermination;
1027
+ if (err.status) payload.status = err.status;
1028
+ this.eventLog.append("error", payload);
1029
+ yield new AgentEvent({ type: "error", message: err.message, ...payload });
862
1030
  return;
863
1031
  }
864
1032
  }
@@ -897,11 +1065,31 @@ export class AgentEngine {
897
1065
  reason,
898
1066
  forced: force && nextPhase !== expected,
899
1067
  });
1068
+ const fromPhase = this.currentPhase;
900
1069
  this.currentPhase = nextPhase;
901
1070
  this._registerToolsForPhase(this.currentPhase);
902
1071
  this.workspace.setPhase(this.currentPhase);
903
1072
  this._createTasksForPhase(this.currentPhase);
904
1073
  this.saveState();
1074
+
1075
+ // B8: Soft signal — surface any sub-agents left running from the prior
1076
+ // phase so the main agent's next turn can decide whether to kill them.
1077
+ // NOT automated: phase_advance can fire from _maybeAutoAdvance on a
1078
+ // criteria-flip, and auto-killing would couple lifecycle with blast
1079
+ // radius. This just informs.
1080
+ try {
1081
+ const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
1082
+ const runningIds = agentTool?.getRunningTaskIds?.() || [];
1083
+ if (runningIds.length > 0) {
1084
+ this.eventLog.append("stale_subagents", {
1085
+ from_phase: fromPhase,
1086
+ to_phase: nextPhase,
1087
+ running_task_ids: runningIds,
1088
+ hint: "These sub-agents were dispatched during the prior phase. Consider operation=poll to check status, or operation=kill to abort if stale.",
1089
+ });
1090
+ }
1091
+ } catch { /* never let signal emission break phase advance */ }
1092
+
905
1093
  return true;
906
1094
  }
907
1095
 
@@ -972,6 +1160,16 @@ export class AgentEngine {
972
1160
  /**
973
1161
  * Create per-rule tasks when entering a new phase.
974
1162
  * Reads the rule catalog and creates one task per rule for the given phase.
1163
+ *
1164
+ * D6: For skill_authoring / skill_testing, filter rules via the bundle
1165
+ * classification cache (`cache/bundles/<hash>.classification.json`,
1166
+ * written by document_classify). Rules whose `applicable_product_types`
1167
+ * or `report_types` don't overlap with the bundle's classification get
1168
+ * SKIPPED at task-creation time — we don't mutate catalog.json to mark
1169
+ * them not_applicable, we just keep them out of the task queue. The
1170
+ * finalization phase (Group E) will report them in the coverage
1171
+ * artifact as "not applicable to this bundle." Conservative default:
1172
+ * if no classification exists, include all rules (pre-B9 behavior).
975
1173
  */
976
1174
  _createTasksForPhase(phase) {
977
1175
  if (!this.taskManager) return; // Sub-agents don't manage tasks
@@ -980,28 +1178,258 @@ export class AgentEngine {
980
1178
 
981
1179
  try {
982
1180
  const catalog = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
983
- const rules = normalizeRuleCatalog(catalog);
984
- if (rules.length > 0) {
985
- this.taskManager.createRuleTasks(rules, phase);
1181
+ let rules = normalizeRuleCatalog(catalog);
1182
+ if (rules.length === 0) return;
1183
+
1184
+ // D6: applicability pre-filter (skill phases only — bootstrap/extraction
1185
+ // have no task creation here per A6).
1186
+ if (phase === "skill_authoring" || phase === "skill_testing") {
1187
+ const classification = this._loadBundleClassification();
1188
+ if (classification) {
1189
+ const before = rules.length;
1190
+ rules = rules.filter((r) => this._ruleAppliesToBundle(r, classification));
1191
+ if (rules.length < before) {
1192
+ this.eventLog.append("applicability_prefilter", {
1193
+ phase,
1194
+ classification: {
1195
+ product_type: classification.product_type,
1196
+ report_type: classification.report_type,
1197
+ source: classification.source,
1198
+ },
1199
+ rules_before: before,
1200
+ rules_after: rules.length,
1201
+ skipped: before - rules.length,
1202
+ });
1203
+ }
1204
+ }
986
1205
  }
1206
+ this.taskManager.createRuleTasks(rules, phase);
987
1207
  } catch { /* skip if catalog can't be read */ }
988
1208
  }
989
1209
 
1210
+ /**
1211
+ * D6: Load the most recent bundle classification cache, if one exists.
1212
+ * Written by the `document_classify` tool. Returns null if no cache or
1213
+ * unreadable — callers must treat null as "all rules apply."
1214
+ */
1215
+ _loadBundleClassification() {
1216
+ const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
1217
+ if (!fs.existsSync(cacheDir)) return null;
1218
+ let entries;
1219
+ try { entries = fs.readdirSync(cacheDir); }
1220
+ catch { return null; }
1221
+ const files = entries
1222
+ .filter((n) => n.endsWith(".classification.json"))
1223
+ .map((n) => {
1224
+ const p = path.join(cacheDir, n);
1225
+ try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
1226
+ catch { return null; }
1227
+ })
1228
+ .filter(Boolean)
1229
+ .sort((a, b) => b.mtime - a.mtime);
1230
+ if (files.length === 0) return null;
1231
+ try { return JSON.parse(fs.readFileSync(files[0].path, "utf-8")); }
1232
+ catch { return null; }
1233
+ }
1234
+
1235
+ /**
1236
+ * D6: Rule-applicability check mirroring the AMC app's `applies_to`.
1237
+ * Conservative: returns true when we don't have enough info to
1238
+ * confidently skip (missing fields on rule, or classification with
1239
+ * empty product/report).
1240
+ */
1241
+ _ruleAppliesToBundle(rule, classification) {
1242
+ const docProduct = classification?.product_type || "";
1243
+ const docReport = classification?.report_type || "";
1244
+ const ruleProducts = rule.applicable_product_types || rule.applicable_sections || [];
1245
+ const ruleReports = rule.report_types || [];
1246
+
1247
+ const allProducts = ruleProducts.length === 0 ||
1248
+ ruleProducts.some((x) => x === "全部" || x === "all" || x === "");
1249
+ const allReports = ruleReports.length === 0 ||
1250
+ ruleReports.some((x) => x === "全部" || x === "all" || x === "");
1251
+ if (allProducts && allReports) return true;
1252
+
1253
+ const productOk = allProducts || (
1254
+ docProduct && ruleProducts.some((rp) => rp.includes(docProduct) || docProduct.includes(rp))
1255
+ );
1256
+ const reportOk = allReports || (
1257
+ docReport && ruleReports.some((rr) => rr.includes(docReport) || docReport.includes(rr))
1258
+ );
1259
+
1260
+ // Unknown classification → don't prefilter, let the agent judge.
1261
+ if (!docProduct && !docReport) return true;
1262
+ return productOk && reportOk;
1263
+ }
1264
+
1265
+ /**
1266
+ * D1: Enrich a skill_authoring / skill_testing task prompt with the
1267
+ * rule's source context — reads `source_chunk_ids` back-refs from
1268
+ * catalog.json (populated by extraction) and fetches chunk text from
1269
+ * the most recent BundleTree cache. Falls back to the minimal prompt
1270
+ * when catalog / cache aren't available.
1271
+ *
1272
+ * Previously the task prompt was ONE line — "Continue with next task:
1273
+ * ${title}" — leaving the skill-author agent to re-read the rule and
1274
+ * re-find its evidence per task. Auto-attach saves the LLM turn
1275
+ * needed for document_search on every task, and ensures the author
1276
+ * sees the exact regulation text the extractor used to justify the
1277
+ * rule.
1278
+ *
1279
+ * @param {{id: string, title: string, ruleId?: string, phase: string}} task
1280
+ * @returns {string}
1281
+ */
1282
+ _buildEnrichedTaskPrompt(task) {
1283
+ const fallback = `Continue with next task: ${task.title}` +
1284
+ (task.ruleId ? ` (rule: ${task.ruleId})` : "");
1285
+
1286
+ // Only enrich for rule-anchored phases
1287
+ if (task.phase !== "skill_authoring" && task.phase !== "skill_testing") {
1288
+ return fallback;
1289
+ }
1290
+ if (!task.ruleId) return fallback;
1291
+
1292
+ // Find the rule in catalog.json
1293
+ const catalogPath = path.join(this.workspace.cwd, "rules", "catalog.json");
1294
+ if (!fs.existsSync(catalogPath)) return fallback;
1295
+ let rules;
1296
+ try {
1297
+ rules = normalizeRuleCatalog(JSON.parse(fs.readFileSync(catalogPath, "utf-8")));
1298
+ } catch { return fallback; }
1299
+ const rule = rules.find((r) => r.id === task.ruleId);
1300
+ if (!rule) return fallback;
1301
+
1302
+ // Assemble the enriched brief. Every section is optional — when a
1303
+ // back-ref or cache is missing, just skip that section rather than
1304
+ // failing back to the minimal prompt.
1305
+ const lines = [];
1306
+ lines.push(`# Task: ${task.title}`);
1307
+ lines.push("");
1308
+ lines.push(`## Rule ${rule.id}`);
1309
+ if (rule.source_ref) lines.push(`Source: ${rule.source_ref}`);
1310
+ if (rule.severity) lines.push(`Severity: ${rule.severity}`);
1311
+ if (rule.description) lines.push(`\n${rule.description}`);
1312
+ if (rule.falsifiability_statement) lines.push(`\n**Falsifiability**: ${rule.falsifiability_statement}`);
1313
+ if (rule.test_case_stub) lines.push(`**Test stub**: ${rule.test_case_stub}`);
1314
+
1315
+ // D1: if rule has source_chunk_ids AND a BundleTree cache exists,
1316
+ // pull chunk text inline so the author doesn't need to call
1317
+ // bundle_search manually. Bounded to ~3000 tokens total to avoid
1318
+ // blowing the author's context budget.
1319
+ const chunkIds = Array.isArray(rule.source_chunk_ids) ? rule.source_chunk_ids : [];
1320
+ if (chunkIds.length > 0) {
1321
+ const chunks = this._loadChunksFromBundleCache(chunkIds);
1322
+ if (chunks.length > 0) {
1323
+ lines.push("");
1324
+ lines.push("## Source context");
1325
+ let totalChars = 0;
1326
+ const MAX_CHARS = 7500; // ~3000 CJK tokens
1327
+ for (const ch of chunks) {
1328
+ const header = `### ${ch.title || ch.chunk_id} · ${ch.source_file} p.${(ch.page_range || [1, 1]).join("-")}`;
1329
+ const body = (ch.content || "").trim();
1330
+ const block = `${header}\n${body}\n`;
1331
+ if (totalChars + block.length > MAX_CHARS) {
1332
+ lines.push(`\n[…${chunks.length - chunks.indexOf(ch)} more source chunks truncated; use bundle_search to retrieve them…]`);
1333
+ break;
1334
+ }
1335
+ lines.push("");
1336
+ lines.push(block);
1337
+ totalChars += block.length;
1338
+ }
1339
+ }
1340
+ }
1341
+
1342
+ // Sibling rules (same source_ref prefix) — helps the author see the
1343
+ // surrounding catalog and avoid re-implementing cross-referenced logic.
1344
+ const siblings = this._findSiblingRuleIds(rule, rules);
1345
+ if (siblings.length > 0) {
1346
+ lines.push("");
1347
+ lines.push(`## Sibling rules (same regulation section)`);
1348
+ lines.push(siblings.map((id) => `- ${id}`).join("\n"));
1349
+ }
1350
+
1351
+ lines.push("");
1352
+ lines.push("Write the skill to `rule_skills/<rule_id>/SKILL.md` + detect script. Prefer 1 rule = 1 skill dir (use `check_rNNN_rMMM.py` naming ONLY when rules share evidence and fail together).");
1353
+
1354
+ return lines.join("\n");
1355
+ }
1356
+
1357
+ /** D1: Load chunk text from the most recent BundleTree cache. */
1358
+ _loadChunksFromBundleCache(chunkIds) {
1359
+ const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
1360
+ if (!fs.existsSync(cacheDir)) return [];
1361
+ let entries;
1362
+ try { entries = fs.readdirSync(cacheDir); }
1363
+ catch { return []; }
1364
+ const candidates = entries
1365
+ .filter((n) => n.endsWith(".json") && !n.endsWith(".classification.json"))
1366
+ .map((n) => {
1367
+ const p = path.join(cacheDir, n);
1368
+ try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
1369
+ catch { return null; }
1370
+ })
1371
+ .filter(Boolean)
1372
+ .sort((a, b) => b.mtime - a.mtime);
1373
+ if (candidates.length === 0) return [];
1374
+ let tree;
1375
+ try { tree = JSON.parse(fs.readFileSync(candidates[0].path, "utf-8")); }
1376
+ catch { return []; }
1377
+ const out = [];
1378
+ for (const cid of chunkIds) {
1379
+ const ch = tree.chunks?.[cid];
1380
+ if (ch) out.push(ch);
1381
+ }
1382
+ return out;
1383
+ }
1384
+
1385
+ /** D1: Rules that share the same regulation article (naive: source_ref prefix). */
1386
+ _findSiblingRuleIds(rule, allRules) {
1387
+ if (!rule.source_ref) return [];
1388
+ const prefix = rule.source_ref.split(/[第条款项]/)[0].trim();
1389
+ if (!prefix) return [];
1390
+ return allRules
1391
+ .filter((r) => r.id !== rule.id && (r.source_ref || "").startsWith(prefix))
1392
+ .slice(0, 8)
1393
+ .map((r) => r.id);
1394
+ }
1395
+
990
1396
  /**
991
1397
  * Ralph-loop: run a turn, then auto-continue through pending tasks.
992
1398
  * Compacts context aggressively between tasks to prevent context blowup.
993
1399
  * If no tasks exist, behaves identically to runTurn().
994
1400
  *
995
1401
  * @param {string} userMessage
1402
+ * @param {{parallelism?: number}} [opts] — B1: optional parallel mode.
1403
+ * N > 1 dispatches tasks through N concurrent subagents (using the
1404
+ * agent_tool infrastructure from B8). Clamped to `effectiveParallelism`
1405
+ * from config.js — which silently downgrades to 1 unless
1406
+ * KC_PARALLELISM_VERIFIED=1 is set AND heap.jsonl shows flat RSS
1407
+ * (B0.6 guard; prevents accidental $100+ runaway runs).
996
1408
  * @yields {AgentEvent}
997
1409
  */
998
- async *runTaskLoop(userMessage) {
1410
+ async *runTaskLoop(userMessage, opts = {}) {
999
1411
  // Sub-agents don't run task loops — they execute one task and exit
1000
1412
  if (!this.taskManager) {
1001
1413
  yield* this.runTurn(userMessage);
1002
1414
  return;
1003
1415
  }
1004
1416
 
1417
+ // B1: resolve effective parallelism. Caller opts override config.
1418
+ const requested = Number.isFinite(opts.parallelism)
1419
+ ? Math.max(1, Math.min(8, opts.parallelism))
1420
+ : (this.config.effectiveParallelism?.() ?? 1);
1421
+
1422
+ if (requested > 1) {
1423
+ yield* this._runTaskLoopParallel(userMessage, requested);
1424
+ return;
1425
+ }
1426
+
1427
+ yield* this._runTaskLoopSerial(userMessage);
1428
+ }
1429
+
1430
+ /** B1: original serial ralph-loop path — one task at a time, shared
1431
+ * conversation history. Unchanged from pre-v0.6.0 behavior. */
1432
+ async *_runTaskLoopSerial(userMessage) {
1005
1433
  // Run the initial turn (user's request)
1006
1434
  yield* this.runTurn(userMessage);
1007
1435
 
@@ -1015,8 +1443,11 @@ export class AgentEngine {
1015
1443
  await this.compact({ recentCount: 8 });
1016
1444
  }
1017
1445
 
1018
- const task = this.taskManager.getNextPending();
1019
- this.taskManager.updateTask(task.id, { status: "in_progress" });
1446
+ // B2: atomic claim — for serial we could use getNextPending, but
1447
+ // using claimNextPending gives us consistent state fields (worker
1448
+ // label, startedAt) whether in serial or parallel mode.
1449
+ const task = this.taskManager.claimNextPending("serial");
1450
+ if (!task) break;
1020
1451
 
1021
1452
  // Yield task progress event for TUI
1022
1453
  yield new AgentEvent({
@@ -1030,14 +1461,15 @@ export class AgentEngine {
1030
1461
  },
1031
1462
  });
1032
1463
 
1033
- // Synthesize a task-focused prompt
1034
- const taskPrompt = `Continue with next task: ${task.title}` +
1035
- (task.ruleId ? ` (rule: ${task.ruleId})` : "");
1464
+ // D1: synthesize a task-focused prompt, enriched with rule source
1465
+ // context (rule NL + source_ref + chunk text + sibling ids) when
1466
+ // the catalog + BundleTree cache are available. Falls back to the
1467
+ // minimal "Continue with next task" line otherwise.
1468
+ const taskPrompt = this._buildEnrichedTaskPrompt(task);
1036
1469
 
1037
1470
  yield* this.runTurn(taskPrompt);
1038
1471
 
1039
- this.taskManager.updateTask(task.id, { status: "completed" });
1040
- this.taskManager.save();
1472
+ this.taskManager.markDone(task.id);
1041
1473
  this.saveState();
1042
1474
 
1043
1475
  yield new AgentEvent({
@@ -1074,6 +1506,190 @@ export class AgentEngine {
1074
1506
  }
1075
1507
  }
1076
1508
 
1509
  /**
   * B1: Parallel ralph-loop — N concurrent subagents each executing one
   * task at a time, claimed atomically from TaskManager.
   *
   * Implementation: leverages B8's agent_tool infrastructure. Each worker
   * slot is a sub-engine with its own heap-isolated history; workspace
   * writes are serialized through B9's file locks. The main engine acts
   * as dispatcher — it claims tasks and spawns subagents, then waits.
   *
   * Chosen over in-process history-forking because: (a) sub-engines are
   * already heap-isolated (good under B0's RSS-safety regime); (b)
   * kill authority from B8 applies uniformly; (c) no runTurn refactor
   * needed — the engine's conversation-state assumptions stay intact.
   * Trade-off: each task pays a cold-start cost (re-read AGENT.md,
   * skill index, pipeline state). For 100+ task sessions this is
   * amortized against the 2-4× wall-clock speedup.
   *
   * @param {string} userMessage - the user's initial request; drives the
   *   first (serial) turn in which the main agent creates the task list.
   * @param {number} parallelism - max concurrent workers (>1 by the time
   *   this is called; runTaskLoop routes parallelism===1 to the serial path).
   * @yields {AgentEvent} task_progress / error / pipeline_event events,
   *   forwarded from concurrent workers through an in-memory queue.
   */
  async *_runTaskLoopParallel(userMessage, parallelism) {
    // Initial turn: main agent reads user request, creates tasks.
    yield* this.runTurn(userMessage);

    // NOTE(review): `_buildTools.core` is accessed as a property, not called —
    // presumably a pre-built tool registry; confirm it is not a method here.
    const agentTool = this._buildTools.core.find((t) => t?.name === "agent_tool");
    if (!agentTool) {
      // Shouldn't happen (agent_tool is core), but fall back safely.
      yield new AgentEvent({
        type: "error",
        message: "agent_tool not registered; parallel mode requires it. Falling back to serial.",
      });
      // Empty prompt: tasks were already created by the initial turn above.
      yield* this._runTaskLoopSerial("");
      return;
    }

    // Event queue so concurrent workers can yield progress through a
    // single async-generator consumer. Push-style with a one-shot notifier:
    // enq() wakes at most one pending await, then clears the notifier so a
    // burst of events only resolves the current eventArrival promise once.
    const eventQueue = [];
    let notify = null;
    const enq = (ev) => {
      eventQueue.push(ev);
      if (notify) { const n = notify; notify = null; n(); }
    };

    // In-flight workers: subId → { task, workerLabel, promise }.
    const inFlight = new Map();

    // Claim-and-spawn until the pool is full or no pending tasks remain.
    // Runs only from this generator's body (primed below, refilled after
    // each completion), so it never executes concurrently with itself.
    const dispatch = async () => {
      while (inFlight.size < parallelism) {
        const task = this.taskManager.claimNextPending(`pool${inFlight.size}`);
        if (!task) return;

        // NOTE(review): this recomputes the same value passed to
        // claimNextPending above ([...inFlight.keys()].length === inFlight.size);
        // labels are slot indices at claim time, not stable worker identities.
        const workerLabel = `pool${[...inFlight.keys()].length}`;
        const subId = `pool_${task.id}`.replace(/[^A-Za-z0-9_-]/g, "_").slice(0, 60);

        // D1: build the enriched brief with source context. Parallel workers
        // are subagents — each with zero conversation history, so the brief
        // must carry everything they need. Even more important to have
        // source context inline vs. expecting them to call document_search.
        const enriched = this._buildEnrichedTaskPrompt(task);
        const brief =
          enriched +
          `\n\nNOTE (parallel worker): write outputs via workspace_file or ` +
          `rule_catalog — do NOT write to shared coordination files ` +
          `(rules/catalog.json, rules/manifest.json) via sandbox_exec; they're ` +
          `lock-protected and bypassing the lock will race with other workers.`;

        enq(new AgentEvent({
          type: "task_progress",
          data: {
            taskId: task.id, title: task.title, ruleId: task.ruleId,
            status: "in_progress", worker: workerLabel,
            progress: this.taskManager.progress,
          },
        }));

        // Spawn via the tool's public API. agent_tool writes status.txt,
        // abort controller, etc. We read _runningTasks to get a promise
        // handle we can await.
        const spawnRes = await agentTool.execute({
          operation: "spawn",
          task_description: brief,
          task_id: subId,
        });

        if (spawnRes.isError) {
          this.taskManager.markFailed(task.id, `spawn failed: ${spawnRes.content}`);
          enq(new AgentEvent({
            type: "task_progress",
            data: { taskId: task.id, status: "failed", worker: workerLabel },
          }));
          continue;
        }

        // NOTE(review): reaches into agent_tool's private _runningTasks map —
        // an internal-coupling point to keep in sync with agent-tool.js.
        const entry = agentTool._runningTasks.get(subId);
        if (!entry) {
          // Sub-agent completed synchronously (no events) — mark done.
          this.taskManager.markDone(task.id);
          enq(new AgentEvent({
            type: "task_progress",
            data: { taskId: task.id, status: "completed", worker: workerLabel },
          }));
          continue;
        }

        // Convert the sub-agent promise into a never-rejecting result record
        // so Promise.race below can't throw out of the drain loop.
        const trackedPromise = entry.promise.then(
          () => ({ taskId: task.id, subId, ok: true }),
          (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
        );
        inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
      }
    };

    // Prime the pool
    await dispatch();

    // Drain events + replenish until queue is empty and all in-flight done.
    // Ordering matters: drain the queue FIRST each iteration so events
    // emitted by enq() while we were awaiting are never starved.
    while (inFlight.size > 0 || eventQueue.length > 0) {
      // Drain all queued events first
      while (eventQueue.length > 0) yield eventQueue.shift();

      if (inFlight.size === 0) break;

      // Wait for either the next event OR a worker to complete.
      // NOTE(review): a fresh eventArrival promise is created per iteration;
      // if the worker branch wins, the previous promise is abandoned
      // unresolved and `notify` may fire a stale resolver — harmless, since
      // the queued event is picked up by the drain at the top of the loop.
      const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
      const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
      const winner = await Promise.race([
        workerCompletion.then((done) => ({ kind: "worker", done })),
        eventArrival.then(() => ({ kind: "event" })),
      ]);

      if (winner.kind === "worker") {
        // Only one completion is processed per iteration; if several workers
        // finished together, their already-settled promises win the race
        // again on subsequent iterations.
        const { taskId, subId, ok, error } = winner.done;
        const entry = inFlight.get(subId);
        inFlight.delete(subId);

        if (ok) {
          this.taskManager.markDone(taskId);
          enq(new AgentEvent({
            type: "task_progress",
            data: {
              taskId, status: "completed",
              worker: entry?.workerLabel,
              progress: this.taskManager.progress,
            },
          }));
        } else {
          this.taskManager.markFailed(taskId, error);
          enq(new AgentEvent({
            type: "task_progress",
            data: {
              taskId, status: "failed",
              worker: entry?.workerLabel,
              error,
              progress: this.taskManager.progress,
            },
          }));
        }

        // Refill the pool. If no pending tasks left, in-flight drains naturally.
        await dispatch();
      }
      // event winner: loop re-iterates and drains eventQueue
    }

    this.saveState();

    // After all workers done, check for phase auto-advance (same as serial path).
    if (this._allCurrentPhaseTasksComplete()) {
      const pipeline = this.pipelines[this.currentPhase];
      let exitMet = false;
      // Best-effort probe: a throwing/absent exitCriteriaMet means "not met".
      try { exitMet = !!pipeline?.exitCriteriaMet?.(); } catch { exitMet = false; }
      if (exitMet) {
        const next = NEXT_PHASE[this.currentPhase];
        if (next) {
          const advanced = this._advancePhase(next, "all parallel tasks completed + exit criteria met");
          if (advanced) {
            yield new AgentEvent({
              type: "pipeline_event",
              data: { type: "phase_ready", nextPhase: next, message: "all phase tasks done; exit criteria met" },
            });
          }
        }
      }
    }
  }
1692
+
1077
1693
  /**
1078
1694
  * True when every task tagged with the current phase is in a terminal state
1079
1695
  * (completed | failed | skipped) and at least one such task exists. Used by