kc-beta 0.5.6 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/QUICKSTART.md +17 -4
- package/README.md +58 -11
- package/bin/kc-beta.js +35 -1
- package/package.json +1 -1
- package/src/agent/bundle-tree.js +553 -0
- package/src/agent/context.js +40 -1
- package/src/agent/engine.js +828 -31
- package/src/agent/llm-client.js +67 -18
- package/src/agent/pipelines/distillation.js +15 -0
- package/src/agent/pipelines/extraction.js +60 -3
- package/src/agent/pipelines/finalization.js +186 -0
- package/src/agent/pipelines/index.js +8 -0
- package/src/agent/pipelines/initializer.js +40 -0
- package/src/agent/pipelines/production-qc.js +63 -13
- package/src/agent/pipelines/skill-authoring.js +136 -7
- package/src/agent/skill-loader.js +54 -4
- package/src/agent/task-manager.js +81 -3
- package/src/agent/tools/agent-tool.js +283 -35
- package/src/agent/tools/bundle-search.js +146 -0
- package/src/agent/tools/document-chunk.js +246 -0
- package/src/agent/tools/document-classify.js +311 -0
- package/src/agent/tools/document-parse.js +8 -1
- package/src/agent/tools/phase-advance.js +30 -7
- package/src/agent/tools/registry.js +10 -0
- package/src/agent/tools/rule-catalog.js +17 -3
- package/src/agent/tools/sandbox-exec.js +30 -0
- package/src/agent/tools/workflow-run.js +34 -1
- package/src/agent/workspace.js +168 -14
- package/src/cli/components.js +165 -17
- package/src/cli/index.js +166 -19
- package/src/cli/meme.js +58 -0
- package/src/config.js +39 -2
- package/src/providers.js +26 -0
- package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
- package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0
package/src/agent/engine.js
CHANGED
|
@@ -19,6 +19,9 @@ import { ReleaseTool } from "./tools/release.js";
|
|
|
19
19
|
import { PhaseAdvanceTool } from "./tools/phase-advance.js";
|
|
20
20
|
import { DocumentParseTool } from "./tools/document-parse.js";
|
|
21
21
|
import { DocumentSearchTool } from "./tools/document-search.js";
|
|
22
|
+
import { DocumentChunkTool } from "./tools/document-chunk.js";
|
|
23
|
+
import { BundleSearchTool } from "./tools/bundle-search.js";
|
|
24
|
+
import { DocumentClassifyTool } from "./tools/document-classify.js";
|
|
22
25
|
import { WorkerLLMCallTool } from "./tools/worker-llm-call.js";
|
|
23
26
|
import { WorkflowRunTool } from "./tools/workflow-run.js";
|
|
24
27
|
import { RuleCatalogTool } from "./tools/rule-catalog.js";
|
|
@@ -38,6 +41,7 @@ import { SkillAuthoringPipeline } from "./pipelines/skill-authoring.js";
|
|
|
38
41
|
import { SkillTestingPipeline } from "./pipelines/skill-testing.js";
|
|
39
42
|
import { DistillationEngine as DistillationPipeline } from "./pipelines/distillation.js";
|
|
40
43
|
import { ProductionQCPipeline } from "./pipelines/production-qc.js";
|
|
44
|
+
import { FinalizationPipeline } from "./pipelines/finalization.js";
|
|
41
45
|
import { EventLog } from "./event-log.js";
|
|
42
46
|
import { ContextWindow } from "./context-window.js";
|
|
43
47
|
import { SessionState } from "./session-state.js";
|
|
@@ -48,8 +52,10 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
|
|
|
48
52
|
// or kc_max_tokens in the global config.
|
|
49
53
|
const DEFAULT_KC_MAX_TOKENS = 65536;
|
|
50
54
|
|
|
51
|
-
// Phases where worker LLM tools are available (DISTILL mode)
|
|
52
|
-
|
|
55
|
+
// Phases where worker LLM tools are available (DISTILL mode).
|
|
56
|
+
// E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
|
|
57
|
+
// runs + dashboard_render + workflow_run stay usable during packaging.
|
|
58
|
+
const DISTILL_PHASES = new Set([Phase.DISTILLATION, Phase.PRODUCTION_QC, Phase.FINALIZATION]);
|
|
53
59
|
|
|
54
60
|
// Linear phase order — used by auto-advance (Bug 4). Last phase has no successor.
|
|
55
61
|
// Exported so the TUI's /phase slash command (src/cli/index.js) can call
|
|
@@ -60,6 +66,7 @@ export const NEXT_PHASE = {
|
|
|
60
66
|
[Phase.SKILL_AUTHORING]: Phase.SKILL_TESTING,
|
|
61
67
|
[Phase.SKILL_TESTING]: Phase.DISTILLATION,
|
|
62
68
|
[Phase.DISTILLATION]: Phase.PRODUCTION_QC,
|
|
69
|
+
[Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
|
|
63
70
|
};
|
|
64
71
|
|
|
65
72
|
/**
|
|
@@ -158,10 +165,11 @@ export class AgentEngine {
|
|
|
158
165
|
this.pipelines = {
|
|
159
166
|
[Phase.BOOTSTRAP]: new ProjectInitializer(this.workspace),
|
|
160
167
|
[Phase.EXTRACTION]: new RuleExtractionPipeline(this.workspace),
|
|
161
|
-
[Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace),
|
|
168
|
+
[Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace, this.taskManager),
|
|
162
169
|
[Phase.SKILL_TESTING]: new SkillTestingPipeline(this.workspace),
|
|
163
170
|
[Phase.DISTILLATION]: new DistillationPipeline(this.workspace),
|
|
164
171
|
[Phase.PRODUCTION_QC]: new ProductionQCPipeline(this.workspace),
|
|
172
|
+
[Phase.FINALIZATION]: new FinalizationPipeline(this.workspace), // E1
|
|
165
173
|
};
|
|
166
174
|
|
|
167
175
|
// Skill discovery (Claude Code pattern: index in context, full content on demand)
|
|
@@ -181,6 +189,61 @@ export class AgentEngine {
|
|
|
181
189
|
this._lastReady = Object.fromEntries(
|
|
182
190
|
Object.keys(this.pipelines).map((p) => [p, false]),
|
|
183
191
|
);
|
|
192
|
+
|
|
193
|
+
// B0.1: Heap sampler. Parent engines only — sub-agents share a process
|
|
194
|
+
// with the parent and would double-log. Writes a single JSONL line
|
|
195
|
+
// per minute to <workspace>/logs/heap.jsonl with the numbers needed
|
|
196
|
+
// to diagnose RSS creep (heapUsed/heapTotal/external/rss/arrayBuffers,
|
|
197
|
+
// plus active task count and history length). Always on, ~60 bytes
|
|
198
|
+
// per minute to disk.
|
|
199
|
+
this._heapSamplerStop = this._isSubagent ? null : this._startHeapSampler();
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/**
|
|
203
|
+
* Start sampling process.memoryUsage() every 60 s into logs/heap.jsonl.
|
|
204
|
+
* Returns a stop fn. Timer is .unref()'d so it never keeps the process
|
|
205
|
+
* alive by itself. Failures are silently suppressed — this is a
|
|
206
|
+
* diagnostic, not a correctness feature.
|
|
207
|
+
*/
|
|
208
|
+
_startHeapSampler() {
|
|
209
|
+
const logDir = path.join(this.workspace.cwd, "logs");
|
|
210
|
+
const logPath = path.join(logDir, "heap.jsonl");
|
|
211
|
+
const sample = () => {
|
|
212
|
+
try {
|
|
213
|
+
const mem = process.memoryUsage();
|
|
214
|
+
const row = {
|
|
215
|
+
t: new Date().toISOString(),
|
|
216
|
+
seq: this.eventLog?.currentSeq ?? 0,
|
|
217
|
+
phase: this.currentPhase,
|
|
218
|
+
rssMB: Math.round(mem.rss / 1024 / 1024),
|
|
219
|
+
heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
|
|
220
|
+
heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
|
|
221
|
+
externalMB: Math.round((mem.external || 0) / 1024 / 1024),
|
|
222
|
+
arrayBuffersMB: Math.round((mem.arrayBuffers || 0) / 1024 / 1024),
|
|
223
|
+
historyLen: this.history?.messages?.length ?? 0,
|
|
224
|
+
tasksPending: this.taskManager?.progress?.pending ?? 0,
|
|
225
|
+
tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
|
|
226
|
+
};
|
|
227
|
+
fs.mkdirSync(logDir, { recursive: true });
|
|
228
|
+
fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
|
|
229
|
+
} catch { /* never fatal */ }
|
|
230
|
+
};
|
|
231
|
+
// Record one sample at startup so we have a baseline even on short runs.
|
|
232
|
+
sample();
|
|
233
|
+
const timer = setInterval(sample, 60_000);
|
|
234
|
+
timer.unref?.();
|
|
235
|
+
return () => {
|
|
236
|
+
try {
|
|
237
|
+
clearInterval(timer);
|
|
238
|
+
sample(); // one final sample on shutdown
|
|
239
|
+
} catch { /* ignore */ }
|
|
240
|
+
};
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/** Stop background diagnostics. Call on graceful shutdown. */
|
|
244
|
+
stop() {
|
|
245
|
+
try { this._heapSamplerStop?.(); } catch { /* ignore */ }
|
|
246
|
+
this._heapSamplerStop = null;
|
|
184
247
|
}
|
|
185
248
|
|
|
186
249
|
/**
|
|
@@ -214,7 +277,10 @@ export class AgentEngine {
|
|
|
214
277
|
new ArchiveFileTool(this.workspace),
|
|
215
278
|
new ScheduleFetchTool(this.workspace),
|
|
216
279
|
new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
|
|
217
|
-
new PhaseAdvanceTool(
|
|
280
|
+
new PhaseAdvanceTool(
|
|
281
|
+
(to, reason, opts) => this._advancePhase(to, reason, opts),
|
|
282
|
+
() => this.currentPhase, // H1: tool reads phase BEFORE its own call
|
|
283
|
+
),
|
|
218
284
|
new DocumentParseTool(this.workspace, {
|
|
219
285
|
mineruApiUrl: this.config.mineruApiUrl,
|
|
220
286
|
mineruApiKey: this.config.mineruApiKey,
|
|
@@ -223,6 +289,12 @@ export class AgentEngine {
|
|
|
223
289
|
ocrModel: vlmModel,
|
|
224
290
|
}),
|
|
225
291
|
new DocumentSearchTool(this.workspace),
|
|
292
|
+
// Group C — chunker/RAG infrastructure ported from AMC app. Core
|
|
293
|
+
// tools (not phase-gated): useful from BOOTSTRAP through FINALIZATION
|
|
294
|
+
// for any doc-heavy project, not just rule extraction.
|
|
295
|
+
new DocumentChunkTool(this.workspace),
|
|
296
|
+
new BundleSearchTool(this.workspace),
|
|
297
|
+
new DocumentClassifyTool(this.workspace, this.config),
|
|
226
298
|
new RuleCatalogTool(this.workspace),
|
|
227
299
|
new EvolutionCycleTool(this.workspace, this.cornerCases),
|
|
228
300
|
new DashboardRenderTool(this.workspace),
|
|
@@ -239,7 +311,11 @@ export class AgentEngine {
|
|
|
239
311
|
// Distillation+ only (DISTILL mode)
|
|
240
312
|
distill: [
|
|
241
313
|
workerLlm,
|
|
242
|
-
new WorkflowRunTool(this.workspace, this.versionManager, this.confidence
|
|
314
|
+
new WorkflowRunTool(this.workspace, this.versionManager, this.confidence, {
|
|
315
|
+
// v0.6.1 A6: hook engine-emitted milestones so phase gates see workflow runs
|
|
316
|
+
recordMilestone: (phase, key, value) => this._recordMilestone(phase, key, value),
|
|
317
|
+
getCurrentPhase: () => this.currentPhase,
|
|
318
|
+
}),
|
|
243
319
|
new TierDowngradeTool(this.workspace, workerLlm),
|
|
244
320
|
new QCSampleTool(this.workspace),
|
|
245
321
|
],
|
|
@@ -313,7 +389,7 @@ export class AgentEngine {
|
|
|
313
389
|
getContextStats() {
|
|
314
390
|
const systemPrompt = this.context.build({
|
|
315
391
|
agentMd: this._readAgentMd(),
|
|
316
|
-
skillIndex: this._skillLoader.formatForContext(),
|
|
392
|
+
skillIndex: this._skillLoader.formatForContext(this.currentPhase),
|
|
317
393
|
pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
|
|
318
394
|
workspaceState: this._buildWorkspaceState(),
|
|
319
395
|
});
|
|
@@ -353,21 +429,37 @@ export class AgentEngine {
|
|
|
353
429
|
|
|
354
430
|
// Heap-pressure diagnostic. The TUI has its own virtualization + tool-
|
|
355
431
|
// output truncation (Bug 3 fixes), so Ink itself should never OOM. If we
|
|
356
|
-
// still see high heap usage, something else is leaking
|
|
357
|
-
//
|
|
432
|
+
// still see high heap usage, something else is leaking.
|
|
433
|
+
//
|
|
434
|
+
// A9: Original design logged once per pressure-crossing (edge-triggered),
|
|
435
|
+
// which went silent for 17h during E2E #3 while RSS climbed to 3.8GB.
|
|
436
|
+
// Now: still edge-trigger on entry (noisy otherwise), but ALSO re-emit
|
|
437
|
+
// every 15min while we're still above the threshold, so an operator
|
|
438
|
+
// watching logs after hour 4 still sees the signal. Drops to silent on
|
|
439
|
+
// recovery below 0.60.
|
|
358
440
|
try {
|
|
359
441
|
const mem = process.memoryUsage();
|
|
360
442
|
const frac = mem.heapUsed / (mem.heapTotal || 1);
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
443
|
+
const now = Date.now();
|
|
444
|
+
const REPRESS_INTERVAL_MS = 15 * 60 * 1000;
|
|
445
|
+
if (frac > 0.80) {
|
|
446
|
+
const firstCrossing = !this._memPressureLogged;
|
|
447
|
+
const dueForRepress = this._memPressureLastEmittedAt &&
|
|
448
|
+
(now - this._memPressureLastEmittedAt) >= REPRESS_INTERVAL_MS;
|
|
449
|
+
if (firstCrossing || dueForRepress) {
|
|
450
|
+
this._memPressureLogged = true;
|
|
451
|
+
this._memPressureLastEmittedAt = now;
|
|
452
|
+
this.eventLog.append("memory_pressure", {
|
|
453
|
+
heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
|
|
454
|
+
heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
|
|
455
|
+
rssMB: Math.round(mem.rss / 1024 / 1024),
|
|
456
|
+
historyLength: this.history.messages.length,
|
|
457
|
+
kind: firstCrossing ? "crossing" : "sustained",
|
|
458
|
+
});
|
|
459
|
+
}
|
|
369
460
|
} else if (frac < 0.60 && this._memPressureLogged) {
|
|
370
461
|
this._memPressureLogged = false; // re-arm for next crossing
|
|
462
|
+
this._memPressureLastEmittedAt = null;
|
|
371
463
|
}
|
|
372
464
|
} catch { /* process.memoryUsage failures are non-fatal */ }
|
|
373
465
|
}
|
|
@@ -701,7 +793,7 @@ export class AgentEngine {
|
|
|
701
793
|
|
|
702
794
|
const systemPrompt = this.context.build({
|
|
703
795
|
agentMd: this._readAgentMd(),
|
|
704
|
-
skillIndex: this._skillLoader.formatForContext(),
|
|
796
|
+
skillIndex: this._skillLoader.formatForContext(this.currentPhase),
|
|
705
797
|
pipelineState,
|
|
706
798
|
workspaceState: this._buildWorkspaceState(),
|
|
707
799
|
});
|
|
@@ -781,6 +873,30 @@ export class AgentEngine {
|
|
|
781
873
|
});
|
|
782
874
|
|
|
783
875
|
if (toolCallsAcc.size === 0) {
|
|
876
|
+
// A3: Empty-response guard. If the LLM returned no content AND no
|
|
877
|
+
// tool calls, count it. Two in a row almost always means the
|
|
878
|
+
// provider is silently failing (context exceeded, rate-limit
|
|
879
|
+
// corruption, auth expired) and continuing wastes tokens + time.
|
|
880
|
+
// Reset on any non-empty turn. Reason-tagged so /status can
|
|
881
|
+
// surface the running rate.
|
|
882
|
+
if (!collectedText || !collectedText.trim()) {
|
|
883
|
+
this._consecutiveEmptyResponses = (this._consecutiveEmptyResponses || 0) + 1;
|
|
884
|
+
this._totalEmptyResponses = (this._totalEmptyResponses || 0) + 1;
|
|
885
|
+
if (this._consecutiveEmptyResponses >= 2) {
|
|
886
|
+
const message =
|
|
887
|
+
`LLM returned empty response ${this._consecutiveEmptyResponses}× in a row — ` +
|
|
888
|
+
`likely context-length exceeded or provider-side silent failure. ` +
|
|
889
|
+
`Stopping this turn to prevent runaway API spend.`;
|
|
890
|
+
this.eventLog.append("error", { message, kind: "empty_response_streak" });
|
|
891
|
+
yield new AgentEvent({ type: "error", message });
|
|
892
|
+
this._consecutiveEmptyResponses = 0; // reset so next /run isn't blocked
|
|
893
|
+
return;
|
|
894
|
+
}
|
|
895
|
+
} else {
|
|
896
|
+
this._consecutiveEmptyResponses = 0;
|
|
897
|
+
}
|
|
898
|
+
this._totalTurns = (this._totalTurns || 0) + 1;
|
|
899
|
+
|
|
784
900
|
// Bug 4 trigger (1): re-check phase criteria at end of every turn —
|
|
785
901
|
// KC may have advanced state via conversation alone, without any
|
|
786
902
|
// tool that the pipeline narrowly watches.
|
|
@@ -793,6 +909,10 @@ export class AgentEngine {
|
|
|
793
909
|
return;
|
|
794
910
|
}
|
|
795
911
|
|
|
912
|
+
// A3: A turn with tool_calls or content is not empty — reset streak.
|
|
913
|
+
this._consecutiveEmptyResponses = 0;
|
|
914
|
+
this._totalTurns = (this._totalTurns || 0) + 1;
|
|
915
|
+
|
|
796
916
|
// Tool execution loop
|
|
797
917
|
for (const tc of toolCallsAcc.values()) {
|
|
798
918
|
let inputData = {};
|
|
@@ -803,6 +923,12 @@ export class AgentEngine {
|
|
|
803
923
|
this.eventLog.append("tool_start", { name: tc.name, input: inputData });
|
|
804
924
|
yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
|
|
805
925
|
|
|
926
|
+
// A1: Capture phase BEFORE tool execution. Some tools — notably
|
|
927
|
+
// phase_advance — mutate this.currentPhase via a callback without
|
|
928
|
+
// yielding any AgentEvent, so the TUI's status bar never gets the
|
|
929
|
+
// signal. We diff after execute() and emit a synthetic
|
|
930
|
+
// pipeline_event so subscribers can sync.
|
|
931
|
+
const beforePhase = this.currentPhase;
|
|
806
932
|
const result = await this.toolRegistry.execute(tc.name, inputData);
|
|
807
933
|
|
|
808
934
|
// Tool-call offloading: large outputs go to logs/tool_results/<traceId>.txt;
|
|
@@ -817,6 +943,29 @@ export class AgentEngine {
|
|
|
817
943
|
isError: result.isError,
|
|
818
944
|
traceId: offload?.traceId || null,
|
|
819
945
|
});
|
|
946
|
+
|
|
947
|
+
// D3a: trace skill invocations. When the agent reads a SKILL.md via
|
|
948
|
+
// workspace_file (the canonical way KC "uses" a skill, since skills
|
|
949
|
+
// are progressively-disclosed markdown), emit a skill_invoked event.
|
|
950
|
+
// Makes "which skills did KC actually consult?" answerable in post-run
|
|
951
|
+
// analysis — before this, skills were opaque to the event log.
|
|
952
|
+
try {
|
|
953
|
+
if (
|
|
954
|
+
!result.isError &&
|
|
955
|
+
(tc.name === "workspace_file" || tc.name === "sandbox_exec")
|
|
956
|
+
) {
|
|
957
|
+
const p = String(inputData?.path || inputData?.command || "");
|
|
958
|
+
const skillMatch = p.match(/(?:template\/)?skills\/[a-z-]+\/(?:meta-meta|meta|skill-creator)\/([a-zA-Z0-9_-]+)(?:\/SKILL\.md|\/)?|\bSKILL\.md\b/);
|
|
959
|
+
if (skillMatch) {
|
|
960
|
+
const skillName = skillMatch[1] || "(unknown)";
|
|
961
|
+
this.eventLog.append("skill_invoked", {
|
|
962
|
+
skill: skillName,
|
|
963
|
+
via_tool: tc.name,
|
|
964
|
+
phase: this.currentPhase,
|
|
965
|
+
});
|
|
966
|
+
}
|
|
967
|
+
}
|
|
968
|
+
} catch { /* never let tracing break a tool call */ }
|
|
820
969
|
yield new AgentEvent({
|
|
821
970
|
type: "tool_result",
|
|
822
971
|
name: tc.name,
|
|
@@ -837,6 +986,22 @@ export class AgentEngine {
|
|
|
837
986
|
// user saw "CTX: 210% / stream terminated" with no recovery.
|
|
838
987
|
this._maybeWindowAfterToolResult();
|
|
839
988
|
|
|
989
|
+
// A1: If the tool mutated the phase (e.g. phase_advance), emit the
|
|
990
|
+
// signal the TUI and pipelines need to re-sync state. Runs BEFORE
|
|
991
|
+
// pipeline.onToolResult so the fresh phase is active if the pipeline
|
|
992
|
+
// itself wants to react to the transition.
|
|
993
|
+
if (this.currentPhase !== beforePhase) {
|
|
994
|
+
yield new AgentEvent({
|
|
995
|
+
type: "pipeline_event",
|
|
996
|
+
data: {
|
|
997
|
+
type: "phase_changed",
|
|
998
|
+
from: beforePhase,
|
|
999
|
+
nextPhase: this.currentPhase,
|
|
1000
|
+
reason: `via ${tc.name}`,
|
|
1001
|
+
},
|
|
1002
|
+
});
|
|
1003
|
+
}
|
|
1004
|
+
|
|
840
1005
|
// Pipeline controller: update state and re-register tools on phase change
|
|
841
1006
|
if (pipeline?.onToolResult) {
|
|
842
1007
|
const pEvent = pipeline.onToolResult(tc.name, inputData, result);
|
|
@@ -857,8 +1022,15 @@ export class AgentEngine {
|
|
|
857
1022
|
if (ev) yield ev;
|
|
858
1023
|
|
|
859
1024
|
} catch (err) {
|
|
860
|
-
|
|
861
|
-
|
|
1025
|
+
// A8: If the LLM client tagged the stream termination reason, pass
|
|
1026
|
+
// it through. Upstream log consumers + the TUI can then distinguish
|
|
1027
|
+
// "provider returned 429" from "socket died mid-token" from "SSE
|
|
1028
|
+
// buffer exploded" — today they're all just "Error: ...".
|
|
1029
|
+
const payload = { message: err.message };
|
|
1030
|
+
if (err.streamTermination) payload.kind = err.streamTermination;
|
|
1031
|
+
if (err.status) payload.status = err.status;
|
|
1032
|
+
this.eventLog.append("error", payload);
|
|
1033
|
+
yield new AgentEvent({ type: "error", message: err.message, ...payload });
|
|
862
1034
|
return;
|
|
863
1035
|
}
|
|
864
1036
|
}
|
|
@@ -889,22 +1061,219 @@ export class AgentEngine {
|
|
|
889
1061
|
return false;
|
|
890
1062
|
}
|
|
891
1063
|
|
|
892
|
-
|
|
1064
|
+
// v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
|
|
1065
|
+
// detection so the LLM-narrated reason can be cross-checked against
|
|
1066
|
+
// ground-truth telemetry. Phase summaries become diagnostic, not just
|
|
1067
|
+
// narrative.
|
|
1068
|
+
const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
|
|
1069
|
+
const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
|
|
1070
|
+
const phaseSummary =
|
|
1071
|
+
`[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
|
|
1072
|
+
(force && nextPhase !== expected ? " (forced)" : "") +
|
|
1073
|
+
(engineCounts ? `\n (engine) ${engineCounts}` : "");
|
|
893
1074
|
this._phaseSummaries.push(phaseSummary);
|
|
894
1075
|
this.eventLog.append("phase_transition", {
|
|
895
1076
|
from: this.currentPhase,
|
|
896
1077
|
to: nextPhase,
|
|
897
1078
|
reason,
|
|
1079
|
+
engineCounts: engineCounts || null,
|
|
1080
|
+
possibleMismatch: !!mismatchPrefix,
|
|
898
1081
|
forced: force && nextPhase !== expected,
|
|
899
1082
|
});
|
|
1083
|
+
const fromPhase = this.currentPhase;
|
|
900
1084
|
this.currentPhase = nextPhase;
|
|
901
1085
|
this._registerToolsForPhase(this.currentPhase);
|
|
902
1086
|
this.workspace.setPhase(this.currentPhase);
|
|
903
1087
|
this._createTasksForPhase(this.currentPhase);
|
|
904
1088
|
this.saveState();
|
|
1089
|
+
|
|
1090
|
+
// B8: Soft signal — surface any sub-agents left running from the prior
|
|
1091
|
+
// phase so the main agent's next turn can decide whether to kill them.
|
|
1092
|
+
// NOT automated: phase_advance can fire from _maybeAutoAdvance on a
|
|
1093
|
+
// criteria-flip, and auto-killing would couple lifecycle with blast
|
|
1094
|
+
// radius. This just informs.
|
|
1095
|
+
try {
|
|
1096
|
+
const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
|
|
1097
|
+
const runningIds = agentTool?.getRunningTaskIds?.() || [];
|
|
1098
|
+
if (runningIds.length > 0) {
|
|
1099
|
+
this.eventLog.append("stale_subagents", {
|
|
1100
|
+
from_phase: fromPhase,
|
|
1101
|
+
to_phase: nextPhase,
|
|
1102
|
+
running_task_ids: runningIds,
|
|
1103
|
+
hint: "These sub-agents were dispatched during the prior phase. Consider operation=poll to check status, or operation=kill to abort if stale.",
|
|
1104
|
+
});
|
|
1105
|
+
}
|
|
1106
|
+
} catch { /* never let signal emission break phase advance */ }
|
|
1107
|
+
|
|
905
1108
|
return true;
|
|
906
1109
|
}
|
|
907
1110
|
|
|
1111
|
+
/**
|
|
1112
|
+
* v0.6.1 A6: Single chokepoint for engine-emitted milestone updates.
|
|
1113
|
+
* Tools call this on successful execution to bump pipeline counters that
|
|
1114
|
+
* the phase-gate hardening (A2-A5) depends on. Without engine emission,
|
|
1115
|
+
* gates fall back to filesystem scans which can miss work that didn't
|
|
1116
|
+
* follow canonical output paths (E2E #4: `unified_qc.py` wrote to
|
|
1117
|
+
* `output/results/`, production-qc only scanned `output/qc/`).
|
|
1118
|
+
*
|
|
1119
|
+
* The mutation routes through the pipeline's existing internal state, so
|
|
1120
|
+
* exportState/importState round-trips work unchanged and the gate sees a
|
|
1121
|
+
* unified view of (filesystem-scanned + engine-emitted) signals.
|
|
1122
|
+
*
|
|
1123
|
+
* Three modes inferred from value shape:
|
|
1124
|
+
* - increment counter: pipeline[key] is number, value is number → add
|
|
1125
|
+
* - set in dict-by-id: pipeline[key] is object, value is { id, value? } → assign
|
|
1126
|
+
* - dedupe-add to array: pipeline[key] is array, value is string → push if absent
|
|
1127
|
+
*
|
|
1128
|
+
* @param {string} phase - Pipeline name (e.g., "distillation")
|
|
1129
|
+
* @param {string} key - Field on the pipeline (e.g., "workflowsTested")
|
|
1130
|
+
* @param {*} value - Shape varies by target type (see modes above)
|
|
1131
|
+
* @returns {boolean} true if a write happened
|
|
1132
|
+
*/
|
|
1133
|
+
_recordMilestone(phase, key, value) {
|
|
1134
|
+
const pipeline = this.pipelines?.[phase];
|
|
1135
|
+
if (!pipeline) return false;
|
|
1136
|
+
const target = pipeline[key];
|
|
1137
|
+
// increment counter
|
|
1138
|
+
if (typeof target === "number" && typeof value === "number") {
|
|
1139
|
+
pipeline[key] = target + value;
|
|
1140
|
+
return true;
|
|
1141
|
+
}
|
|
1142
|
+
// set on dict-by-id
|
|
1143
|
+
if (target && typeof target === "object" && !Array.isArray(target)
|
|
1144
|
+
&& value && typeof value === "object" && "id" in value) {
|
|
1145
|
+
target[value.id] = "value" in value ? value.value : true;
|
|
1146
|
+
return true;
|
|
1147
|
+
}
|
|
1148
|
+
// dedupe-add to array
|
|
1149
|
+
if (Array.isArray(target) && typeof value === "string") {
|
|
1150
|
+
if (!target.includes(value)) target.push(value);
|
|
1151
|
+
return true;
|
|
1152
|
+
}
|
|
1153
|
+
return false;
|
|
1154
|
+
}
|
|
1155
|
+
|
|
1156
|
+
/**
|
|
1157
|
+
* v0.6.1 B1: build a one-line "engine counts" block summarizing the
|
|
1158
|
+
* pipeline's ground-truth telemetry at the moment of phase advance.
|
|
1159
|
+
* Different phases surface different metrics; we keep this short so the
|
|
1160
|
+
* appended summary line stays readable.
|
|
1161
|
+
*
|
|
1162
|
+
* @param {string} fromPhase - The phase being LEFT (we summarize its work)
|
|
1163
|
+
* @returns {string} block text, or "" if pipeline has nothing to report
|
|
1164
|
+
*/
|
|
1165
|
+
_buildEngineCountsBlock(fromPhase) {
|
|
1166
|
+
const pipeline = this.pipelines?.[fromPhase];
|
|
1167
|
+
if (!pipeline) return "";
|
|
1168
|
+
const parts = [];
|
|
1169
|
+
try {
|
|
1170
|
+
switch (fromPhase) {
|
|
1171
|
+
case "extraction": {
|
|
1172
|
+
const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
|
|
1173
|
+
parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
|
|
1174
|
+
parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
|
|
1175
|
+
parts.push(`rulesWithTests: ${pipeline.rulesWithTests?.length ?? 0}`);
|
|
1176
|
+
parts.push(`coverageAudited: ${pipeline.coverageAudited ? "yes" : "no"}`);
|
|
1177
|
+
break;
|
|
1178
|
+
}
|
|
1179
|
+
case "skill_authoring": {
|
|
1180
|
+
const totalRules = pipeline.totalRules?.length ?? 0;
|
|
1181
|
+
const covered = pipeline.ruleIdsCovered?.size ?? 0;
|
|
1182
|
+
parts.push(`rulesCovered: ${covered}/${totalRules}`);
|
|
1183
|
+
parts.push(`skillDirsAuthored: ${pipeline.skillsAuthored?.length ?? 0}`);
|
|
1184
|
+
if (this.taskManager) {
|
|
1185
|
+
const t = this.taskManager.countByPhase("skill_authoring");
|
|
1186
|
+
const d = this.taskManager.countByPhase("skill_authoring", "completed");
|
|
1187
|
+
const f = this.taskManager.countByPhase("skill_authoring", "failed");
|
|
1188
|
+
parts.push(`tasksCompleted: ${d}/${t}${f > 0 ? ` (+${f} failed)` : ""}`);
|
|
1189
|
+
}
|
|
1190
|
+
break;
|
|
1191
|
+
}
|
|
1192
|
+
case "skill_testing": {
|
|
1193
|
+
const total = pipeline.skillsToTest?.length ?? 0;
|
|
1194
|
+
const tested = Object.keys(pipeline.skillsTested || {}).length;
|
|
1195
|
+
const passing = pipeline.skillsPassing?.length ?? 0;
|
|
1196
|
+
parts.push(`skillsTested: ${tested}/${total}`);
|
|
1197
|
+
parts.push(`skillsPassing: ${passing}`);
|
|
1198
|
+
parts.push(`iterations: ${pipeline.iterationCount ?? 0}`);
|
|
1199
|
+
break;
|
|
1200
|
+
}
|
|
1201
|
+
case "distillation": {
|
|
1202
|
+
const total = pipeline.skillsToDistill?.length ?? 0;
|
|
1203
|
+
const created = Object.keys(pipeline.workflowsCreated || {}).length;
|
|
1204
|
+
const tested = Object.keys(pipeline.workflowsTested || {}).length;
|
|
1205
|
+
const passing = pipeline.workflowsPassing?.length ?? 0;
|
|
1206
|
+
parts.push(`workflowsCreated: ${created}/${total}`);
|
|
1207
|
+
parts.push(`workflowsTested: ${tested}/${total}`);
|
|
1208
|
+
parts.push(`workflowsPassing: ${passing}/${total}`);
|
|
1209
|
+
break;
|
|
1210
|
+
}
|
|
1211
|
+
case "production_qc": {
|
|
1212
|
+
parts.push(`batchesProcessed: ${pipeline.batchesProcessed ?? 0}`);
|
|
1213
|
+
parts.push(`documentsReviewed: ${pipeline.documentsReviewed ?? 0}`);
|
|
1214
|
+
parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
|
|
1215
|
+
break;
|
|
1216
|
+
}
|
|
1217
|
+
// bootstrap / finalization: no specific counters, fall through
|
|
1218
|
+
}
|
|
1219
|
+
} catch { /* never let summary build break phase advance */ }
|
|
1220
|
+
return parts.join(", ");
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
/**
|
|
1224
|
+
* v0.6.1 B1: heuristic mismatch detection. Conservative regex over the
|
|
1225
|
+
* LLM's free-form reason for percentages and counts, compared against
|
|
1226
|
+
* engine truth. INFORMATIONAL only — never blocks the transition. False
|
|
1227
|
+
* positives are acceptable (the warning is a hint to the human reviewer,
|
|
1228
|
+
* not a hard signal). False negatives are also acceptable (this catches
|
|
1229
|
+
* the loud, numerical claims; subtle ones still slip through).
|
|
1230
|
+
*
|
|
1231
|
+
* Returns true if the agent's reason mentions a count or percentage that
|
|
1232
|
+
* doesn't match engine state.
|
|
1233
|
+
*/
|
|
1234
|
+
_detectSummaryMismatch(reason, fromPhase) {
|
|
1235
|
+
if (!reason || typeof reason !== "string") return false;
|
|
1236
|
+
const pipeline = this.pipelines?.[fromPhase];
|
|
1237
|
+
if (!pipeline) return false;
|
|
1238
|
+
try {
|
|
1239
|
+
// Match "N/M" fractions and standalone counts
|
|
1240
|
+
const fractionMatches = [...reason.matchAll(/(\d+)\s*\/\s*(\d+)/g)];
|
|
1241
|
+
// Match "N rules / skills / workflows / tasks"
|
|
1242
|
+
const countMatches = [...reason.matchAll(/(\d+)\s*(rules?|skills?|workflows?|tasks?|条规则|个技能)/gi)];
|
|
1243
|
+
// Match accuracy claims like "95%", "0.95"
|
|
1244
|
+
const pctMatches = [...reason.matchAll(/(\d+(?:\.\d+)?)\s*%/g)];
|
|
1245
|
+
|
|
1246
|
+
// Phase-specific cross-checks (cheap conservative comparisons)
|
|
1247
|
+
if (fromPhase === "skill_authoring" && this.taskManager) {
|
|
1248
|
+
const completed = this.taskManager.countByPhase("skill_authoring", "completed");
|
|
1249
|
+
const total = this.taskManager.countByPhase("skill_authoring");
|
|
1250
|
+
for (const m of fractionMatches) {
|
|
1251
|
+
const claimedDone = parseInt(m[1], 10);
|
|
1252
|
+
const claimedTotal = parseInt(m[2], 10);
|
|
1253
|
+
if (claimedTotal === total && claimedDone > completed + 5) return true;
|
|
1254
|
+
}
|
|
1255
|
+
}
|
|
1256
|
+
if (fromPhase === "skill_testing") {
|
|
1257
|
+
const tested = Object.keys(pipeline.skillsTested || {}).length;
|
|
1258
|
+
const passing = pipeline.skillsPassing?.length ?? 0;
|
|
1259
|
+
for (const m of pctMatches) {
|
|
1260
|
+
const claimed = parseFloat(m[1]);
|
|
1261
|
+
// If claimed > 50% but engine sees 0 tested, that's suspicious
|
|
1262
|
+
if (claimed >= 50 && tested === 0 && passing === 0) return true;
|
|
1263
|
+
}
|
|
1264
|
+
}
|
|
1265
|
+
if (fromPhase === "production_qc") {
|
|
1266
|
+
const batches = pipeline.batchesProcessed ?? 0;
|
|
1267
|
+
// Any "complete" or large-count claim while batches==0 is suspicious
|
|
1268
|
+
if (batches === 0) {
|
|
1269
|
+
if (countMatches.some((m) => parseInt(m[1], 10) > 10)) return true;
|
|
1270
|
+
if (pctMatches.some((m) => parseFloat(m[1]) > 50)) return true;
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
} catch { /* informational only — never block */ }
|
|
1274
|
+
return false;
|
|
1275
|
+
}
|
|
1276
|
+
|
|
908
1277
|
/**
|
|
909
1278
|
* Bug 4 trigger (1) auto-detect, edge-triggered (Bug 5): only fires on a
|
|
910
1279
|
* fresh false → true flip in `exitCriteriaMet()`. Sessions resumed in an
|
|
@@ -972,6 +1341,16 @@ export class AgentEngine {
|
|
|
972
1341
|
/**
|
|
973
1342
|
* Create per-rule tasks when entering a new phase.
|
|
974
1343
|
* Reads the rule catalog and creates one task per rule for the given phase.
|
|
1344
|
+
*
|
|
1345
|
+
* D6: For skill_authoring / skill_testing, filter rules via the bundle
|
|
1346
|
+
* classification cache (`cache/bundles/<hash>.classification.json`,
|
|
1347
|
+
* written by document_classify). Rules whose `applicable_product_types`
|
|
1348
|
+
* or `report_types` don't overlap with the bundle's classification get
|
|
1349
|
+
* SKIPPED at task-creation time — we don't mutate catalog.json to mark
|
|
1350
|
+
* them not_applicable, we just keep them out of the task queue. The
|
|
1351
|
+
* finalization phase (Group E) will report them in the coverage
|
|
1352
|
+
* artifact as "not applicable to this bundle." Conservative default:
|
|
1353
|
+
* if no classification exists, include all rules (pre-B9 behavior).
|
|
975
1354
|
*/
|
|
976
1355
|
_createTasksForPhase(phase) {
|
|
977
1356
|
if (!this.taskManager) return; // Sub-agents don't manage tasks
|
|
@@ -980,28 +1359,258 @@ export class AgentEngine {
|
|
|
980
1359
|
|
|
981
1360
|
try {
|
|
982
1361
|
const catalog = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
|
|
983
|
-
|
|
984
|
-
if (rules.length
|
|
985
|
-
|
|
1362
|
+
let rules = normalizeRuleCatalog(catalog);
|
|
1363
|
+
if (rules.length === 0) return;
|
|
1364
|
+
|
|
1365
|
+
// D6: applicability pre-filter (skill phases only — bootstrap/extraction
|
|
1366
|
+
// have no task creation here per A6).
|
|
1367
|
+
if (phase === "skill_authoring" || phase === "skill_testing") {
|
|
1368
|
+
const classification = this._loadBundleClassification();
|
|
1369
|
+
if (classification) {
|
|
1370
|
+
const before = rules.length;
|
|
1371
|
+
rules = rules.filter((r) => this._ruleAppliesToBundle(r, classification));
|
|
1372
|
+
if (rules.length < before) {
|
|
1373
|
+
this.eventLog.append("applicability_prefilter", {
|
|
1374
|
+
phase,
|
|
1375
|
+
classification: {
|
|
1376
|
+
product_type: classification.product_type,
|
|
1377
|
+
report_type: classification.report_type,
|
|
1378
|
+
source: classification.source,
|
|
1379
|
+
},
|
|
1380
|
+
rules_before: before,
|
|
1381
|
+
rules_after: rules.length,
|
|
1382
|
+
skipped: before - rules.length,
|
|
1383
|
+
});
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
986
1386
|
}
|
|
1387
|
+
this.taskManager.createRuleTasks(rules, phase);
|
|
987
1388
|
} catch { /* skip if catalog can't be read */ }
|
|
988
1389
|
}
|
|
989
1390
|
|
|
1391
|
+
/**
|
|
1392
|
+
* D6: Load the most recent bundle classification cache, if one exists.
|
|
1393
|
+
* Written by the `document_classify` tool. Returns null if no cache or
|
|
1394
|
+
* unreadable — callers must treat null as "all rules apply."
|
|
1395
|
+
*/
|
|
1396
|
+
_loadBundleClassification() {
|
|
1397
|
+
const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
|
|
1398
|
+
if (!fs.existsSync(cacheDir)) return null;
|
|
1399
|
+
let entries;
|
|
1400
|
+
try { entries = fs.readdirSync(cacheDir); }
|
|
1401
|
+
catch { return null; }
|
|
1402
|
+
const files = entries
|
|
1403
|
+
.filter((n) => n.endsWith(".classification.json"))
|
|
1404
|
+
.map((n) => {
|
|
1405
|
+
const p = path.join(cacheDir, n);
|
|
1406
|
+
try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
|
|
1407
|
+
catch { return null; }
|
|
1408
|
+
})
|
|
1409
|
+
.filter(Boolean)
|
|
1410
|
+
.sort((a, b) => b.mtime - a.mtime);
|
|
1411
|
+
if (files.length === 0) return null;
|
|
1412
|
+
try { return JSON.parse(fs.readFileSync(files[0].path, "utf-8")); }
|
|
1413
|
+
catch { return null; }
|
|
1414
|
+
}
|
|
1415
|
+
|
|
1416
|
+
/**
|
|
1417
|
+
* D6: Rule-applicability check mirroring the AMC app's `applies_to`.
|
|
1418
|
+
* Conservative: returns true when we don't have enough info to
|
|
1419
|
+
* confidently skip (missing fields on rule, or classification with
|
|
1420
|
+
* empty product/report).
|
|
1421
|
+
*/
|
|
1422
|
+
_ruleAppliesToBundle(rule, classification) {
|
|
1423
|
+
const docProduct = classification?.product_type || "";
|
|
1424
|
+
const docReport = classification?.report_type || "";
|
|
1425
|
+
const ruleProducts = rule.applicable_product_types || rule.applicable_sections || [];
|
|
1426
|
+
const ruleReports = rule.report_types || [];
|
|
1427
|
+
|
|
1428
|
+
const allProducts = ruleProducts.length === 0 ||
|
|
1429
|
+
ruleProducts.some((x) => x === "全部" || x === "all" || x === "");
|
|
1430
|
+
const allReports = ruleReports.length === 0 ||
|
|
1431
|
+
ruleReports.some((x) => x === "全部" || x === "all" || x === "");
|
|
1432
|
+
if (allProducts && allReports) return true;
|
|
1433
|
+
|
|
1434
|
+
const productOk = allProducts || (
|
|
1435
|
+
docProduct && ruleProducts.some((rp) => rp.includes(docProduct) || docProduct.includes(rp))
|
|
1436
|
+
);
|
|
1437
|
+
const reportOk = allReports || (
|
|
1438
|
+
docReport && ruleReports.some((rr) => rr.includes(docReport) || docReport.includes(rr))
|
|
1439
|
+
);
|
|
1440
|
+
|
|
1441
|
+
// Unknown classification → don't prefilter, let the agent judge.
|
|
1442
|
+
if (!docProduct && !docReport) return true;
|
|
1443
|
+
return productOk && reportOk;
|
|
1444
|
+
}
|
|
1445
|
+
|
|
1446
|
+
/**
|
|
1447
|
+
* D1: Enrich a skill_authoring / skill_testing task prompt with the
|
|
1448
|
+
* rule's source context — reads `source_chunk_ids` back-refs from
|
|
1449
|
+
* catalog.json (populated by extraction) and fetches chunk text from
|
|
1450
|
+
* the most recent BundleTree cache. Falls back to the minimal prompt
|
|
1451
|
+
* when catalog / cache aren't available.
|
|
1452
|
+
*
|
|
1453
|
+
* Previously the task prompt was ONE line — "Continue with next task:
|
|
1454
|
+
* ${title}" — leaving the skill-author agent to re-read the rule and
|
|
1455
|
+
* re-find its evidence per task. Auto-attach saves the LLM turn
|
|
1456
|
+
* needed for document_search on every task, and ensures the author
|
|
1457
|
+
* sees the exact regulation text the extractor used to justify the
|
|
1458
|
+
* rule.
|
|
1459
|
+
*
|
|
1460
|
+
* @param {{id: string, title: string, ruleId?: string, phase: string}} task
|
|
1461
|
+
* @returns {string}
|
|
1462
|
+
*/
|
|
1463
|
+
_buildEnrichedTaskPrompt(task) {
|
|
1464
|
+
const fallback = `Continue with next task: ${task.title}` +
|
|
1465
|
+
(task.ruleId ? ` (rule: ${task.ruleId})` : "");
|
|
1466
|
+
|
|
1467
|
+
// Only enrich for rule-anchored phases
|
|
1468
|
+
if (task.phase !== "skill_authoring" && task.phase !== "skill_testing") {
|
|
1469
|
+
return fallback;
|
|
1470
|
+
}
|
|
1471
|
+
if (!task.ruleId) return fallback;
|
|
1472
|
+
|
|
1473
|
+
// Find the rule in catalog.json
|
|
1474
|
+
const catalogPath = path.join(this.workspace.cwd, "rules", "catalog.json");
|
|
1475
|
+
if (!fs.existsSync(catalogPath)) return fallback;
|
|
1476
|
+
let rules;
|
|
1477
|
+
try {
|
|
1478
|
+
rules = normalizeRuleCatalog(JSON.parse(fs.readFileSync(catalogPath, "utf-8")));
|
|
1479
|
+
} catch { return fallback; }
|
|
1480
|
+
const rule = rules.find((r) => r.id === task.ruleId);
|
|
1481
|
+
if (!rule) return fallback;
|
|
1482
|
+
|
|
1483
|
+
// Assemble the enriched brief. Every section is optional — when a
|
|
1484
|
+
// back-ref or cache is missing, just skip that section rather than
|
|
1485
|
+
// failing back to the minimal prompt.
|
|
1486
|
+
const lines = [];
|
|
1487
|
+
lines.push(`# Task: ${task.title}`);
|
|
1488
|
+
lines.push("");
|
|
1489
|
+
lines.push(`## Rule ${rule.id}`);
|
|
1490
|
+
if (rule.source_ref) lines.push(`Source: ${rule.source_ref}`);
|
|
1491
|
+
if (rule.severity) lines.push(`Severity: ${rule.severity}`);
|
|
1492
|
+
if (rule.description) lines.push(`\n${rule.description}`);
|
|
1493
|
+
if (rule.falsifiability_statement) lines.push(`\n**Falsifiability**: ${rule.falsifiability_statement}`);
|
|
1494
|
+
if (rule.test_case_stub) lines.push(`**Test stub**: ${rule.test_case_stub}`);
|
|
1495
|
+
|
|
1496
|
+
// D1: if rule has source_chunk_ids AND a BundleTree cache exists,
|
|
1497
|
+
// pull chunk text inline so the author doesn't need to call
|
|
1498
|
+
// bundle_search manually. Bounded to ~3000 tokens total to avoid
|
|
1499
|
+
// blowing the author's context budget.
|
|
1500
|
+
const chunkIds = Array.isArray(rule.source_chunk_ids) ? rule.source_chunk_ids : [];
|
|
1501
|
+
if (chunkIds.length > 0) {
|
|
1502
|
+
const chunks = this._loadChunksFromBundleCache(chunkIds);
|
|
1503
|
+
if (chunks.length > 0) {
|
|
1504
|
+
lines.push("");
|
|
1505
|
+
lines.push("## Source context");
|
|
1506
|
+
let totalChars = 0;
|
|
1507
|
+
const MAX_CHARS = 7500; // ~3000 CJK tokens
|
|
1508
|
+
for (const ch of chunks) {
|
|
1509
|
+
const header = `### ${ch.title || ch.chunk_id} · ${ch.source_file} p.${(ch.page_range || [1, 1]).join("-")}`;
|
|
1510
|
+
const body = (ch.content || "").trim();
|
|
1511
|
+
const block = `${header}\n${body}\n`;
|
|
1512
|
+
if (totalChars + block.length > MAX_CHARS) {
|
|
1513
|
+
lines.push(`\n[…${chunks.length - chunks.indexOf(ch)} more source chunks truncated; use bundle_search to retrieve them…]`);
|
|
1514
|
+
break;
|
|
1515
|
+
}
|
|
1516
|
+
lines.push("");
|
|
1517
|
+
lines.push(block);
|
|
1518
|
+
totalChars += block.length;
|
|
1519
|
+
}
|
|
1520
|
+
}
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
// Sibling rules (same source_ref prefix) — helps the author see the
|
|
1524
|
+
// surrounding catalog and avoid re-implementing cross-referenced logic.
|
|
1525
|
+
const siblings = this._findSiblingRuleIds(rule, rules);
|
|
1526
|
+
if (siblings.length > 0) {
|
|
1527
|
+
lines.push("");
|
|
1528
|
+
lines.push(`## Sibling rules (same regulation section)`);
|
|
1529
|
+
lines.push(siblings.map((id) => `- ${id}`).join("\n"));
|
|
1530
|
+
}
|
|
1531
|
+
|
|
1532
|
+
lines.push("");
|
|
1533
|
+
lines.push("Write the skill to `rule_skills/<rule_id>/SKILL.md` + detect script. Prefer 1 rule = 1 skill dir (use `check_rNNN_rMMM.py` naming ONLY when rules share evidence and fail together).");
|
|
1534
|
+
|
|
1535
|
+
return lines.join("\n");
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
/** D1: Load chunk text from the most recent BundleTree cache. */
|
|
1539
|
+
_loadChunksFromBundleCache(chunkIds) {
|
|
1540
|
+
const cacheDir = path.join(this.workspace.cwd, "cache", "bundles");
|
|
1541
|
+
if (!fs.existsSync(cacheDir)) return [];
|
|
1542
|
+
let entries;
|
|
1543
|
+
try { entries = fs.readdirSync(cacheDir); }
|
|
1544
|
+
catch { return []; }
|
|
1545
|
+
const candidates = entries
|
|
1546
|
+
.filter((n) => n.endsWith(".json") && !n.endsWith(".classification.json"))
|
|
1547
|
+
.map((n) => {
|
|
1548
|
+
const p = path.join(cacheDir, n);
|
|
1549
|
+
try { return { path: p, mtime: fs.statSync(p).mtimeMs }; }
|
|
1550
|
+
catch { return null; }
|
|
1551
|
+
})
|
|
1552
|
+
.filter(Boolean)
|
|
1553
|
+
.sort((a, b) => b.mtime - a.mtime);
|
|
1554
|
+
if (candidates.length === 0) return [];
|
|
1555
|
+
let tree;
|
|
1556
|
+
try { tree = JSON.parse(fs.readFileSync(candidates[0].path, "utf-8")); }
|
|
1557
|
+
catch { return []; }
|
|
1558
|
+
const out = [];
|
|
1559
|
+
for (const cid of chunkIds) {
|
|
1560
|
+
const ch = tree.chunks?.[cid];
|
|
1561
|
+
if (ch) out.push(ch);
|
|
1562
|
+
}
|
|
1563
|
+
return out;
|
|
1564
|
+
}
|
|
1565
|
+
|
|
1566
|
+
/** D1: Rules that share the same regulation article (naive: source_ref prefix). */
|
|
1567
|
+
_findSiblingRuleIds(rule, allRules) {
|
|
1568
|
+
if (!rule.source_ref) return [];
|
|
1569
|
+
const prefix = rule.source_ref.split(/[第条款项]/)[0].trim();
|
|
1570
|
+
if (!prefix) return [];
|
|
1571
|
+
return allRules
|
|
1572
|
+
.filter((r) => r.id !== rule.id && (r.source_ref || "").startsWith(prefix))
|
|
1573
|
+
.slice(0, 8)
|
|
1574
|
+
.map((r) => r.id);
|
|
1575
|
+
}
|
|
1576
|
+
|
|
990
1577
|
/**
|
|
991
1578
|
* Ralph-loop: run a turn, then auto-continue through pending tasks.
|
|
992
1579
|
* Compacts context aggressively between tasks to prevent context blowup.
|
|
993
1580
|
* If no tasks exist, behaves identically to runTurn().
|
|
994
1581
|
*
|
|
995
1582
|
* @param {string} userMessage
|
|
1583
|
+
* @param {{parallelism?: number}} [opts] — B1: optional parallel mode.
|
|
1584
|
+
* N > 1 dispatches tasks through N concurrent subagents (using the
|
|
1585
|
+
* agent_tool infrastructure from B8). Clamped to `effectiveParallelism`
|
|
1586
|
+
* from config.js — which silently downgrades to 1 unless
|
|
1587
|
+
* KC_PARALLELISM_VERIFIED=1 is set AND heap.jsonl shows flat RSS
|
|
1588
|
+
* (B0.6 guard; prevents accidental $100+ runaway runs).
|
|
996
1589
|
* @yields {AgentEvent}
|
|
997
1590
|
*/
|
|
998
|
-
async *runTaskLoop(userMessage) {
|
|
1591
|
+
async *runTaskLoop(userMessage, opts = {}) {
|
|
999
1592
|
// Sub-agents don't run task loops — they execute one task and exit
|
|
1000
1593
|
if (!this.taskManager) {
|
|
1001
1594
|
yield* this.runTurn(userMessage);
|
|
1002
1595
|
return;
|
|
1003
1596
|
}
|
|
1004
1597
|
|
|
1598
|
+
// B1: resolve effective parallelism. Caller opts override config.
|
|
1599
|
+
const requested = Number.isFinite(opts.parallelism)
|
|
1600
|
+
? Math.max(1, Math.min(8, opts.parallelism))
|
|
1601
|
+
: (this.config.effectiveParallelism?.() ?? 1);
|
|
1602
|
+
|
|
1603
|
+
if (requested > 1) {
|
|
1604
|
+
yield* this._runTaskLoopParallel(userMessage, requested);
|
|
1605
|
+
return;
|
|
1606
|
+
}
|
|
1607
|
+
|
|
1608
|
+
yield* this._runTaskLoopSerial(userMessage);
|
|
1609
|
+
}
|
|
1610
|
+
|
|
1611
|
+
/** B1: original serial ralph-loop path — one task at a time, shared
|
|
1612
|
+
* conversation history. Unchanged from pre-v0.6.0 behavior. */
|
|
1613
|
+
async *_runTaskLoopSerial(userMessage) {
|
|
1005
1614
|
// Run the initial turn (user's request)
|
|
1006
1615
|
yield* this.runTurn(userMessage);
|
|
1007
1616
|
|
|
@@ -1015,8 +1624,11 @@ export class AgentEngine {
|
|
|
1015
1624
|
await this.compact({ recentCount: 8 });
|
|
1016
1625
|
}
|
|
1017
1626
|
|
|
1018
|
-
|
|
1019
|
-
|
|
1627
|
+
// B2: atomic claim — for serial we could use getNextPending, but
|
|
1628
|
+
// using claimNextPending gives us consistent state fields (worker
|
|
1629
|
+
// label, startedAt) whether in serial or parallel mode.
|
|
1630
|
+
const task = this.taskManager.claimNextPending("serial");
|
|
1631
|
+
if (!task) break;
|
|
1020
1632
|
|
|
1021
1633
|
// Yield task progress event for TUI
|
|
1022
1634
|
yield new AgentEvent({
|
|
@@ -1030,14 +1642,15 @@ export class AgentEngine {
|
|
|
1030
1642
|
},
|
|
1031
1643
|
});
|
|
1032
1644
|
|
|
1033
|
-
//
|
|
1034
|
-
|
|
1035
|
-
|
|
1645
|
+
// D1: synthesize a task-focused prompt, enriched with rule source
|
|
1646
|
+
// context (rule NL + source_ref + chunk text + sibling ids) when
|
|
1647
|
+
// the catalog + BundleTree cache are available. Falls back to the
|
|
1648
|
+
// minimal "Continue with next task" line otherwise.
|
|
1649
|
+
const taskPrompt = this._buildEnrichedTaskPrompt(task);
|
|
1036
1650
|
|
|
1037
1651
|
yield* this.runTurn(taskPrompt);
|
|
1038
1652
|
|
|
1039
|
-
this.taskManager.
|
|
1040
|
-
this.taskManager.save();
|
|
1653
|
+
this.taskManager.markDone(task.id);
|
|
1041
1654
|
this.saveState();
|
|
1042
1655
|
|
|
1043
1656
|
yield new AgentEvent({
|
|
@@ -1074,6 +1687,190 @@ export class AgentEngine {
|
|
|
1074
1687
|
}
|
|
1075
1688
|
}
|
|
1076
1689
|
|
|
1690
|
+
/**
 * B1: Parallel ralph-loop — N concurrent subagents each executing one
 * task at a time, claimed atomically from TaskManager.
 *
 * Implementation: leverages B8's agent_tool infrastructure. Each worker
 * slot is a sub-engine with its own heap-isolated history; workspace
 * writes are serialized through B9's file locks. The main engine acts
 * as dispatcher — it claims tasks and spawns subagents, then waits.
 *
 * Chosen over in-process history-forking because: (a) sub-engines are
 * already heap-isolated (good under B0's RSS-safety regime); (b)
 * kill authority from B8 applies uniformly; (c) no runTurn refactor
 * needed — the engine's conversation-state assumptions stay intact.
 * Trade-off: each task pays a cold-start cost (re-read AGENT.md,
 * skill index, pipeline state). For 100+ task sessions this is
 * amortized against the 2-4× wall-clock speedup.
 *
 * @param {string} userMessage - initial request; run through one normal
 *   turn first so the main agent can create/refresh tasks.
 * @param {number} parallelism - pool size (already clamped by caller).
 * @yields {AgentEvent}
 */
async *_runTaskLoopParallel(userMessage, parallelism) {
  // Initial turn: main agent reads user request, creates tasks.
  yield* this.runTurn(userMessage);

  const agentTool = this._buildTools.core.find((t) => t?.name === "agent_tool");
  if (!agentTool) {
    // Shouldn't happen (agent_tool is core), but fall back safely.
    yield new AgentEvent({
      type: "error",
      message: "agent_tool not registered; parallel mode requires it. Falling back to serial.",
    });
    yield* this._runTaskLoopSerial("");
    return;
  }

  // Event queue so concurrent workers can yield progress through a
  // single async-generator consumer. push-style with a notifier:
  // enq() wakes the consumer if it's parked in the await below.
  const eventQueue = [];
  let notify = null;
  const enq = (ev) => {
    eventQueue.push(ev);
    if (notify) { const n = notify; notify = null; n(); }
  };

  // In-flight: subagent task_id → { task, promise }
  const inFlight = new Map();

  // Claim-and-spawn until the pool is full or no pending tasks remain.
  const dispatch = async () => {
    while (inFlight.size < parallelism) {
      const task = this.taskManager.claimNextPending(`pool${inFlight.size}`);
      if (!task) return;

      // NOTE(review): [...inFlight.keys()].length === inFlight.size, so
      // workerLabel matches the claim label above; labels are slot-index
      // based and can repeat across refills — presumably cosmetic only.
      const workerLabel = `pool${[...inFlight.keys()].length}`;
      const subId = `pool_${task.id}`.replace(/[^A-Za-z0-9_-]/g, "_").slice(0, 60);

      // D1: build the enriched brief with source context. Parallel workers
      // are subagents — each with zero conversation history, so the brief
      // must carry everything they need. Even more important to have
      // source context inline vs. expecting them to call document_search.
      const enriched = this._buildEnrichedTaskPrompt(task);
      const brief =
        enriched +
        `\n\nNOTE (parallel worker): write outputs via workspace_file or ` +
        `rule_catalog — do NOT write to shared coordination files ` +
        `(rules/catalog.json, rules/manifest.json) via sandbox_exec; they're ` +
        `lock-protected and bypassing the lock will race with other workers.`;

      enq(new AgentEvent({
        type: "task_progress",
        data: {
          taskId: task.id, title: task.title, ruleId: task.ruleId,
          status: "in_progress", worker: workerLabel,
          progress: this.taskManager.progress,
        },
      }));

      // Spawn via the tool's public API. agent_tool writes status.txt,
      // abort controller, etc. We read _runningTasks to get a promise
      // handle we can await.
      const spawnRes = await agentTool.execute({
        operation: "spawn",
        task_description: brief,
        task_id: subId,
      });

      if (spawnRes.isError) {
        this.taskManager.markFailed(task.id, `spawn failed: ${spawnRes.content}`);
        enq(new AgentEvent({
          type: "task_progress",
          data: { taskId: task.id, status: "failed", worker: workerLabel },
        }));
        continue;
      }

      const entry = agentTool._runningTasks.get(subId);
      if (!entry) {
        // Sub-agent completed synchronously (no events) — mark done.
        this.taskManager.markDone(task.id);
        enq(new AgentEvent({
          type: "task_progress",
          data: { taskId: task.id, status: "completed", worker: workerLabel },
        }));
        continue;
      }

      // Normalize settlement into a plain result object so Promise.race
      // below never rejects — failures surface as { ok: false, error }.
      const trackedPromise = entry.promise.then(
        () => ({ taskId: task.id, subId, ok: true }),
        (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
      );
      inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
    }
  };

  // Prime the pool
  await dispatch();

  // Drain events + replenish until queue is empty and all in-flight done.
  while (inFlight.size > 0 || eventQueue.length > 0) {
    // Drain all queued events first
    while (eventQueue.length > 0) yield eventQueue.shift();

    if (inFlight.size === 0) break;

    // Wait for either the next event OR a worker to complete.
    // NOTE(review): if the worker branch wins, `notify` keeps pointing at
    // the now-stale resolver until the next enq() consumes it — harmless,
    // since the loop drains eventQueue at the top of every iteration.
    const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
    const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
    const winner = await Promise.race([
      workerCompletion.then((done) => ({ kind: "worker", done })),
      eventArrival.then(() => ({ kind: "event" })),
    ]);

    if (winner.kind === "worker") {
      const { taskId, subId, ok, error } = winner.done;
      const entry = inFlight.get(subId);
      inFlight.delete(subId);

      if (ok) {
        this.taskManager.markDone(taskId);
        enq(new AgentEvent({
          type: "task_progress",
          data: {
            taskId, status: "completed",
            worker: entry?.workerLabel,
            progress: this.taskManager.progress,
          },
        }));
      } else {
        this.taskManager.markFailed(taskId, error);
        enq(new AgentEvent({
          type: "task_progress",
          data: {
            taskId, status: "failed",
            worker: entry?.workerLabel,
            error,
            progress: this.taskManager.progress,
          },
        }));
      }

      // Refill the pool. If no pending tasks left, in-flight drains naturally.
      await dispatch();
    }
    // event winner: loop re-iterates and drains eventQueue
  }

  this.saveState();

  // After all workers done, check for phase auto-advance (same as serial path).
  if (this._allCurrentPhaseTasksComplete()) {
    const pipeline = this.pipelines[this.currentPhase];
    let exitMet = false;
    // exitCriteriaMet() may be absent or throw; treat either as "not met".
    try { exitMet = !!pipeline?.exitCriteriaMet?.(); } catch { exitMet = false; }
    if (exitMet) {
      const next = NEXT_PHASE[this.currentPhase];
      if (next) {
        const advanced = this._advancePhase(next, "all parallel tasks completed + exit criteria met");
        if (advanced) {
          yield new AgentEvent({
            type: "pipeline_event",
            data: { type: "phase_ready", nextPhase: next, message: "all phase tasks done; exit criteria met" },
          });
        }
      }
    }
  }
}
|
|
1873
|
+
|
|
1077
1874
|
/**
|
|
1078
1875
|
* True when every task tagged with the current phase is in a terminal state
|
|
1079
1876
|
* (completed | failed | skipped) and at least one such task exists. Used by
|