kc-beta 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +499 -20
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +511 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +103 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +152 -80
  21. package/src/agent/pipelines/skill-testing.js +67 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +35 -2
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +163 -0
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/_workflow-result-schema.js +249 -0
  29. package/src/agent/tools/document-chunk.js +21 -9
  30. package/src/agent/tools/phase-advance.js +52 -6
  31. package/src/agent/tools/release.js +51 -9
  32. package/src/agent/tools/rule-catalog.js +11 -1
  33. package/src/agent/tools/workflow-run.js +9 -4
  34. package/src/agent/tools/workspace-file.js +32 -0
  35. package/src/agent/workspace.js +61 -0
  36. package/src/cli/components.js +64 -14
  37. package/src/cli/index.js +62 -3
  38. package/src/cli/meme.js +26 -25
  39. package/src/config.js +65 -22
  40. package/src/model-tiers.json +48 -0
  41. package/src/providers.js +87 -0
  42. package/template/release/v1/README.md.tmpl +108 -0
  43. package/template/release/v1/catalog.json.tmpl +4 -0
  44. package/template/release/v1/kc_runtime/__init__.py +11 -0
  45. package/template/release/v1/kc_runtime/confidence.py +63 -0
  46. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  47. package/template/release/v1/manifest.json.tmpl +11 -0
  48. package/template/release/v1/render_dashboard.py +117 -0
  49. package/template/release/v1/run.py +212 -0
  50. package/template/release/v1/serve.sh +17 -0
  51. package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
  52. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
  53. package/template/skills/en/skill-creator/SKILL.md +1 -1
  54. package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
  55. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
  56. package/template/skills/zh/skill-creator/SKILL.md +1 -1
@@ -1,8 +1,13 @@
1
1
  import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { AgentEvent } from "./events.js";
4
+ import {
5
+ deriveSkillAuthoringMilestones,
6
+ deriveSkillTestingMilestones,
7
+ } from "./pipelines/_milestone-derive.js";
4
8
  import { ContextAssembler } from "./context.js";
5
9
  import { ConversationHistory } from "./history.js";
10
+ import { findSafeSplitPoint } from "./message-utils.js";
6
11
  import { Workspace } from "./workspace.js";
7
12
  import { normalizeRuleCatalog } from "./rule-catalog-normalize.js";
8
13
  import { VersionManager } from "./version-manager.js";
@@ -52,6 +57,45 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
52
57
  // or kc_max_tokens in the global config.
53
58
  const DEFAULT_KC_MAX_TOKENS = 65536;
54
59
 
60
/**
 * v0.6.3.1: Tolerant JSON parse for streamed tool-call arguments. When a
 * model hits max_tokens mid-arguments, the stream returns truncated JSON
 * missing closing braces, brackets or a closing quote. A strict parse fails,
 * and silently substituting {} hides the real problem from the agent.
 *
 * Strategy:
 *   1. Try strict JSON.parse (fast path, most calls).
 *   2. On failure, scan the payload once — tracking string and escape state —
 *      to work out exactly which closers are still pending, append them, and
 *      parse again.
 *   3. If that still fails, return the ORIGINAL parser error so the caller
 *      can surface it to the agent.
 *
 * The scan is string-aware on purpose: braces, brackets and escaped quotes
 * inside string values (code snippets, quoted text) must not influence the
 * closer count.
 */

/**
 * Maximum number of container closers (`}` / `]`) appended to a payload that
 * was cut OUTSIDE a string. Payloads cut inside a string are closed whatever
 * their nesting depth, which keeps every previously recoverable input
 * recoverable.
 */
const BRACE_RECOVERY_BUDGET = 4;

/**
 * Scan a (possibly truncated) JSON text and report what is still open.
 *
 * @param {string} text Raw JSON prefix.
 * @returns {{ inString: boolean, closers: string }} `inString` is true when
 *   the text ends inside a string literal; `closers` holds the `}` / `]`
 *   characters, innermost first, needed to close every open container.
 */
function pendingJsonClosers(text) {
  const open = [];
  let inString = false;
  let escaped = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      // Inside a string only the escape state and the closing quote matter.
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") open.push("}");
    else if (ch === "[") open.push("]");
    // A mismatched closer makes the text invalid anyway; the re-parse below
    // rejects it, so no separate validation is needed here.
    else if (ch === "}" || ch === "]") open.pop();
  }
  return { inString, closers: open.reverse().join("") };
}

/**
 * Parse tool-call arguments, recovering from simple truncation.
 *
 * @param {unknown} raw Accumulated `arguments` string of a tool call.
 * @returns {{ ok: true, value: any, recovered?: number } | { ok: false, error: string }}
 *   `recovered` is present only when characters had to be appended, and is
 *   the number of characters appended. A recovered value is always a plain
 *   object; anything else is reported as a failure.
 */
function parseToolArgsTolerant(raw) {
  if (typeof raw !== "string") return { ok: false, error: "arguments not a string" };
  if (raw === "") return { ok: true, value: {} };

  let strictError;
  try {
    return { ok: true, value: JSON.parse(raw) };
  } catch (err) {
    strictError = err;
  }

  const { inString, closers } = pendingJsonClosers(raw);
  const suffix = (inString ? '"' : "") + closers;
  const withinBudget = inString || closers.length <= BRACE_RECOVERY_BUDGET;
  if (suffix !== "" && withinBudget) {
    try {
      const value = JSON.parse(raw + suffix);
      // Tool arguments are an object; never hand a repaired scalar or array
      // to a tool as if it were a valid argument bag.
      if (value !== null && typeof value === "object" && !Array.isArray(value)) {
        return { ok: true, value, recovered: suffix.length };
      }
    } catch { /* fall through to the original error */ }
  }
  return { ok: false, error: strictError.message || "JSON parse failed" };
}
98
+
55
99
  // Phases where worker LLM tools are available (DISTILL mode).
56
100
  // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
57
101
  // runs + dashboard_render + workflow_run stay usable during packaging.
@@ -69,6 +113,19 @@ export const NEXT_PHASE = {
69
113
  [Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
70
114
  };
71
115
 
116
// v0.6.2 J2: every phase in its linear order, first to last.
// `_advancePhase` compares positions in this array to tell direction: a
// target index below the current index is a rollback, above it a forward
// move. Intended to follow the same sequence as NEXT_PHASE, with
// FINALIZATION last as the terminal phase.
export const PHASE_ORDER = [
  Phase.BOOTSTRAP,
  Phase.EXTRACTION,
  Phase.SKILL_AUTHORING,
  Phase.SKILL_TESTING,
  Phase.DISTILLATION,
  Phase.PRODUCTION_QC,
  Phase.FINALIZATION,
];
128
+
72
129
  /**
73
130
  * The KC Agent conversation engine.
74
131
  *
@@ -150,7 +207,7 @@ export class AgentEngine {
150
207
  });
151
208
 
152
209
  // Session state persistence
153
- this.sessionState = new SessionState(this.workspace.cwd, { statePath });
210
+ this.sessionState = new SessionState(this.workspace.cwd, { statePath, workspace: this.workspace });
154
211
 
155
212
  // Task manager (ralph-loop) — sub-agents don't queue further sub-tasks,
156
213
  // so they don't get a TaskManager.
@@ -223,6 +280,11 @@ export class AgentEngine {
223
280
  historyLen: this.history?.messages?.length ?? 0,
224
281
  tasksPending: this.taskManager?.progress?.pending ?? 0,
225
282
  tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
283
+ // v0.6.2 K1: per-component breakdown so heap-analyze.js can
284
+ // attribute growth (history vs subagents vs event log vs cache).
285
+ // All values in MB. Failures inside _sampleComponents are caught
286
+ // and the row gets `componentsErr` instead.
287
+ components: this._sampleComponents(),
226
288
  };
227
289
  fs.mkdirSync(logDir, { recursive: true });
228
290
  fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
@@ -240,6 +302,89 @@ export class AgentEngine {
240
302
  };
241
303
  }
242
304
 
305
+ /**
306
+ * v0.6.2 K1: per-component heap accounting. Each value is in MB,
307
+ * rounded. The whole function is wrapped in a single try/catch by the
308
+ * caller; failures are silently dropped to keep the sampler diagnostic
309
+ * (never load-bearing).
310
+ *
311
+ * Components measured (by source):
312
+ * - history: in-memory `this.history.messages` content sizes (sum of
313
+ * JSON-stringified content)
314
+ * - eventLog: disk size of `logs/events.jsonl`
315
+ * - toolResults: disk size of `logs/tool_results/` (offloaded tool
316
+ * output, summed top-level files only — the dir is one level deep)
317
+ * - subagents: disk size of `sub_agents/` (one level — each subagent
318
+ * has its own directory tree but we just want the order of magnitude)
319
+ * - bundleCache: disk size of `cache/bundles/`
320
+ */
321
+ _sampleComponents() {
322
+ const out = { historyMB: 0, eventLogMB: 0, toolResultsMB: 0, subagentsMB: 0, bundleCacheMB: 0 };
323
+ const cwd = this.workspace?.cwd;
324
+ if (!cwd) return out;
325
+ // history: walk messages, sum content string lengths (UTF-16 → bytes
326
+ // approx 2× length; we conservatively count length itself since most
327
+ // content is ASCII-heavy JSON tool output)
328
+ try {
329
+ const msgs = this.history?.messages || [];
330
+ let bytes = 0;
331
+ for (const m of msgs) {
332
+ const c = m?.content;
333
+ if (typeof c === "string") bytes += c.length;
334
+ else if (Array.isArray(c)) {
335
+ for (const part of c) {
336
+ if (typeof part === "string") bytes += part.length;
337
+ else if (part?.text) bytes += String(part.text).length;
338
+ else if (part?.content) bytes += String(part.content).length;
339
+ else if (part?.input) bytes += JSON.stringify(part.input).length;
340
+ }
341
+ } else if (c && typeof c === "object") {
342
+ bytes += JSON.stringify(c).length;
343
+ }
344
+ }
345
+ out.historyMB = Math.round(bytes / 1024 / 1024);
346
+ } catch { /* skip */ }
347
+ // events.jsonl — single file size
348
+ try {
349
+ const p = path.join(cwd, "logs", "events.jsonl");
350
+ out.eventLogMB = Math.round(fs.statSync(p).size / 1024 / 1024);
351
+ } catch { /* skip */ }
352
+ // logs/tool_results/ — sum file sizes one level deep (it's flat)
353
+ try {
354
+ const dir = path.join(cwd, "logs", "tool_results");
355
+ let total = 0;
356
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
357
+ if (e.isFile()) {
358
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
359
+ }
360
+ }
361
+ out.toolResultsMB = Math.round(total / 1024 / 1024);
362
+ } catch { /* skip */ }
363
+ // sub_agents/ — sum top-level entries (each is a dir, statSync returns
364
+ // dir-block size, not contents — that's fine for an order-of-magnitude
365
+ // signal; recursive walk would be too expensive for the sampler)
366
+ try {
367
+ const dir = path.join(cwd, "sub_agents");
368
+ let total = 0;
369
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
370
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
371
+ }
372
+ out.subagentsMB = Math.round(total / 1024 / 1024);
373
+ } catch { /* skip */ }
374
+ // cache/bundles/
375
+ try {
376
+ const dir = path.join(cwd, "cache", "bundles");
377
+ let total = 0;
378
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
379
+ if (e.isFile()) {
380
+ try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
381
+ }
382
+ }
383
+ out.bundleCacheMB = Math.round(total / 1024 / 1024);
384
+ } catch { /* skip */ }
385
+ return out;
386
+ }
387
+
243
388
  /** Stop background diagnostics. Call on graceful shutdown. */
244
389
  stop() {
245
390
  try { this._heapSamplerStop?.(); } catch { /* ignore */ }
@@ -280,6 +425,14 @@ export class AgentEngine {
280
425
  new PhaseAdvanceTool(
281
426
  (to, reason, opts) => this._advancePhase(to, reason, opts),
282
427
  () => this.currentPhase, // H1: tool reads phase BEFORE its own call
428
+ // v0.6.2 J1: surface running subagents so the tool can refuse
429
+ // advance until the agent explicitly acknowledges them.
430
+ () => {
431
+ try {
432
+ const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
433
+ return agentTool?.getRunningTaskIds?.() || [];
434
+ } catch { return []; }
435
+ },
283
436
  ),
284
437
  new DocumentParseTool(this.workspace, {
285
438
  mineruApiUrl: this.config.mineruApiUrl,
@@ -353,6 +506,27 @@ export class AgentEngine {
353
506
  return "";
354
507
  }
355
508
 
509
+ /**
510
+ * v0.7.0 B3: Read rules/PATTERNS.md (project memory) for surfacing in
511
+ * the system prompt. Only loaded for phases where the agent owns
512
+ * decomposition decisions (skill_authoring + skill_testing — the two
513
+ * phases the work-decomposition skill operates in). Capped at ~5 KB
514
+ * so it stays trivial token-wise; if the file is larger, we truncate
515
+ * to the first 5 KB and append a "...truncated" marker so the agent
516
+ * knows to prune.
517
+ */
518
+ _readProjectMemory() {
519
+ if (!["skill_authoring", "skill_testing"].includes(this.currentPhase)) return null;
520
+ const p = path.join(this.workspace.cwd, "rules", "PATTERNS.md");
521
+ try {
522
+ if (!fs.existsSync(p)) return null;
523
+ const raw = fs.readFileSync(p, "utf-8");
524
+ const CAP = 5 * 1024;
525
+ if (raw.length <= CAP) return raw;
526
+ return raw.slice(0, CAP) + "\n\n…truncated at 5 KB — prune the least-actionable entries (work-decomposition skill: Sizing).";
527
+ } catch { return null; }
528
+ }
529
+
356
530
  /**
357
531
  * Build the workspace/project directory state string for the system prompt.
358
532
  */
@@ -392,6 +566,7 @@ export class AgentEngine {
392
566
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
393
567
  pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
394
568
  workspaceState: this._buildWorkspaceState(),
569
+ projectMemory: this._readProjectMemory(),
395
570
  });
396
571
  const systemTokens = estimateTokens(systemPrompt);
397
572
  const messageTokens = estimateMessagesTokens(this.history.messages);
@@ -550,8 +725,18 @@ export class AgentEngine {
550
725
  async compact({ recentCount = 20 } = {}) {
551
726
  if (this.history.messages.length <= recentCount) return null;
552
727
 
553
- const olderMessages = this.history.messages.slice(0, -recentCount);
554
- const recentMessages = this.history.messages.slice(-recentCount);
728
+ // v0.6.3.1: tool-pair atomicity. Naive slice(-recentCount) can land on
729
+ // a tool message (whose assistant_with_tool_calls is in the older batch
730
+ // about to be summarized) OR put the split between an assistant with
731
+ // tool_calls and its tool results. Either creates an orphan that
732
+ // DeepSeek's strict API rejects with 400. Walk the split point forward
733
+ // until BOTH (recent[0] isn't tool) AND (older[-1] isn't
734
+ // assistant_with_tool_calls).
735
+ const desiredSplit = this.history.messages.length - recentCount;
736
+ const splitPoint = findSafeSplitPoint(this.history.messages, desiredSplit);
737
+ const olderMessages = this.history.messages.slice(0, splitPoint);
738
+ const recentMessages = this.history.messages.slice(splitPoint);
739
+ if (olderMessages.length === 0) return null; // nothing safely summarizable
555
740
 
556
741
  const CHUNK_BUDGET = 30000; // tokens per summarization request
557
742
  const chunks = this._chunkMessages(olderMessages, CHUNK_BUDGET);
@@ -684,6 +869,39 @@ export class AgentEngine {
684
869
  engine._registerToolsForPhase(engine.currentPhase);
685
870
  engine.workspace.setPhase(engine.currentPhase);
686
871
 
872
+ // v0.6.3.1: detect whether prior turns of this session used reasoning
873
+ // mode, so the field-consistency invariant continues across resume.
874
+ // Without this, the first assistant turn after resume might lack
875
+ // reasoning_content even though earlier turns have it, and DeepSeek's
876
+ // strict-mode rejects with 400.
877
+ try {
878
+ const msgs = engine.history?.messages || [];
879
+ engine._sessionUsesReasoning = msgs.some(
880
+ (m) => m?.role === "assistant" && "reasoning_content" in m,
881
+ );
882
+ // One-shot migration: backfill empty reasoning_content on assistant
883
+ // messages that are missing the field. Pre-v0.6.3.1 sessions could
884
+ // accumulate "holes" (turns where the model skipped reasoning) that
885
+ // poison the conversation for resume. A single empty string on each
886
+ // hole is enough to satisfy DeepSeek's field-consistency rule.
887
+ if (engine._sessionUsesReasoning) {
888
+ let patched = 0;
889
+ for (const m of msgs) {
890
+ if (m?.role === "assistant" && !("reasoning_content" in m)) {
891
+ m.reasoning_content = "";
892
+ patched++;
893
+ }
894
+ }
895
+ if (patched > 0) {
896
+ engine.history._save?.();
897
+ engine.eventLog.append("reasoning_content_backfilled", {
898
+ count: patched,
899
+ reason: "v0.6.3.1 migration on resume",
900
+ });
901
+ }
902
+ }
903
+ } catch { /* never let resume break on this */ }
904
+
687
905
  // Restore project directory from saved state
688
906
  if (data.projectDir) {
689
907
  if (fs.existsSync(data.projectDir)) {
@@ -796,6 +1014,7 @@ export class AgentEngine {
796
1014
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
797
1015
  pipelineState,
798
1016
  workspaceState: this._buildWorkspaceState(),
1017
+ projectMemory: this._readProjectMemory(),
799
1018
  });
800
1019
  const tools = this.toolRegistry.schemasOpenai();
801
1020
 
@@ -824,6 +1043,19 @@ export class AgentEngine {
824
1043
 
825
1044
  try {
826
1045
  let collectedText = "";
1046
+ // v0.7.0 L (#76): Anthropic-only — accumulator for the
1047
+ // signature_delta blob that proves the thinking content came
1048
+ // from Anthropic's model. Required alongside thinking text on
1049
+ // multi-turn replay.
1050
+ let collectedReasoningSignature = "";
1051
+ // v0.6.3: hybrid reasoning models (GLM-5.1, DeepSeek v4, MiMo v2.5,
1052
+ // Qwen3, ...) stream `delta.reasoning_content` separately from
1053
+ // `delta.content`. DeepSeek's strict API requires this field to be
1054
+ // round-tripped on subsequent assistant messages or it rejects the
1055
+ // request with "reasoning_content in the thinking mode must be passed
1056
+ // back". Even providers that don't enforce this (SiliconFlow) still
1057
+ // benefit from preservation — without it, prior reasoning is wasted.
1058
+ let collectedReasoning = "";
827
1059
  /** @type {Map<number, {id: string, name: string, arguments: string}>} */
828
1060
  const toolCallsAcc = new Map();
829
1061
 
@@ -843,6 +1075,22 @@ export class AgentEngine {
843
1075
  collectedText += delta.content;
844
1076
  }
845
1077
 
1078
+ // v0.6.3: capture reasoning_content from the same delta. Emit a
1079
+ // separate event type so the TUI can optionally render thinking
1080
+ // (today it's silently consumed; round-trip is the priority fix).
1081
+ if (delta.reasoning_content) {
1082
+ yield new AgentEvent({ type: "reasoning_delta", text: delta.reasoning_content });
1083
+ collectedReasoning += delta.reasoning_content;
1084
+ }
1085
+
1086
+ // v0.7.0 L (#76): Anthropic-only signature_delta. Carries the
1087
+ // opaque proof-of-thinking blob that strict-mode multi-turn
1088
+ // requires alongside the thinking text. OpenAI-shape providers
1089
+ // never emit this delta; it's a no-op for them.
1090
+ if (delta.reasoning_signature) {
1091
+ collectedReasoningSignature += delta.reasoning_signature;
1092
+ }
1093
+
846
1094
  if (delta.tool_calls) {
847
1095
  for (const tcDelta of delta.tool_calls) {
848
1096
  const idx = tcDelta.index;
@@ -859,6 +1107,31 @@ export class AgentEngine {
859
1107
 
860
1108
  // Log the complete assistant message (coalesced, not per-delta)
861
1109
  const assistantMsg = { role: "assistant", content: collectedText || null };
1110
+ // v0.6.3: persist reasoning_content on the assistant message so it
1111
+ // round-trips on the next request. history.addRaw spreads the input,
1112
+ // preserving unknown fields; OpenAI body builder doesn't strip them.
1113
+ //
1114
+ // v0.6.3.1: DeepSeek's strict-mode rule is FIELD CONSISTENCY, not
1115
+ // field content — once any assistant turn in the conversation has
1116
+ // reasoning_content, every subsequent assistant turn must also have
1117
+ // it (empty string OK; missing the field rejects with 400). Hybrid
1118
+ // reasoning models sometimes skip reasoning on trivial follow-through
1119
+ // tool calls, leaving collectedReasoning="". Track at session level:
1120
+ // once we see ANY reasoning, keep setting the field (possibly empty)
1121
+ // for the rest of the session. Providers that don't use the field
1122
+ // ignore it silently.
1123
+ if (collectedReasoning) {
1124
+ assistantMsg.reasoning_content = collectedReasoning;
1125
+ this._sessionUsesReasoning = true;
1126
+ } else if (this._sessionUsesReasoning) {
1127
+ assistantMsg.reasoning_content = "";
1128
+ }
1129
+ // v0.7.0 L (#76): persist Anthropic signature alongside thinking.
1130
+ // Always stored together — if either is missing, _buildAnthropicBody
1131
+ // skips the thinking-block replay (would be rejected as malformed).
1132
+ if (collectedReasoningSignature) {
1133
+ assistantMsg.reasoning_signature = collectedReasoningSignature;
1134
+ }
862
1135
  if (toolCallsAcc.size > 0) {
863
1136
  assistantMsg.tool_calls = Array.from(toolCallsAcc.values()).map((tc) => ({
864
1137
  id: tc.id,
@@ -915,10 +1188,61 @@ export class AgentEngine {
915
1188
 
916
1189
  // Tool execution loop
917
1190
  for (const tc of toolCallsAcc.values()) {
918
- let inputData = {};
919
- try {
920
- inputData = tc.arguments ? JSON.parse(tc.arguments) : {};
921
- } catch { /* ignore */ }
1191
+ // v0.6.3.1: tool-argument JSON parsing used to be `try { parse } catch {}`
1192
+ // — silently falling back to {} on any parse failure. E2E #5 GLM
1193
+ // session showed this firing 100+ times: SiliconFlow streaming
1194
+ // truncates GLM-5.1 tool_call arguments by ~1 closing brace
1195
+ // (likely max_tokens cutoff mid-args), the silent fallback shipped
1196
+ // {} to the tool, and the tool returned generic "(empty)" errors
1197
+ // which the agent kept retrying without understanding why.
1198
+ //
1199
+ // Fix: try strict parse, then attempt brace-balance recovery (cheap
1200
+ // — recovers from the common single-brace-truncation case), and if
1201
+ // that fails, surface a structured error to the agent so it can
1202
+ // see what it sent and self-correct.
1203
+ let inputData = null;
1204
+ let argParseError = null;
1205
+ if (tc.arguments) {
1206
+ const recovery = parseToolArgsTolerant(tc.arguments);
1207
+ if (recovery.ok) {
1208
+ inputData = recovery.value;
1209
+ if (recovery.recovered) {
1210
+ this.eventLog.append("tool_args_recovered", {
1211
+ name: tc.name,
1212
+ added_chars: recovery.recovered,
1213
+ original_len: tc.arguments.length,
1214
+ });
1215
+ }
1216
+ } else {
1217
+ argParseError = recovery.error;
1218
+ }
1219
+ } else {
1220
+ inputData = {};
1221
+ }
1222
+
1223
+ // If arguments were unparseable, skip execution and return a tool
1224
+ // result that tells the agent what went wrong. Engine's tool result
1225
+ // loop continues so the rest of the assistant's tool_calls in this
1226
+ // turn still execute.
1227
+ if (argParseError) {
1228
+ const preview = (tc.arguments || "").slice(0, 200);
1229
+ const errMsg =
1230
+ `Tool arguments were malformed JSON for ${tc.name}. ` +
1231
+ `Likely streaming truncation by the model (provider cut tokens mid-output). ` +
1232
+ `Parser error: ${argParseError}. ` +
1233
+ `First 200 chars of what was received: ${preview}${tc.arguments && tc.arguments.length > 200 ? "..." : ""}. ` +
1234
+ `Retry the call with shorter / simpler arguments — the model may have hit max_tokens partway through encoding.`;
1235
+ this.eventLog.append("tool_args_parse_failed", {
1236
+ name: tc.name,
1237
+ error: argParseError,
1238
+ raw_args_len: (tc.arguments || "").length,
1239
+ raw_preview: preview,
1240
+ });
1241
+ yield new AgentEvent({ type: "tool_start", name: tc.name, input: { _parse_error: argParseError } });
1242
+ yield new AgentEvent({ type: "tool_result", name: tc.name, output: errMsg, isError: true });
1243
+ this.history.addRaw({ role: "tool", tool_call_id: tc.id, content: errMsg });
1244
+ continue;
1245
+ }
922
1246
 
923
1247
  this.eventLog.append("tool_start", { name: tc.name, input: inputData });
924
1248
  yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
@@ -973,10 +1297,31 @@ export class AgentEngine {
973
1297
  isError: result.isError,
974
1298
  });
975
1299
 
1300
+ // v0.6.3 (#74): phase-misfit nudge. Ask the current pipeline whether
1301
+ // this tool call looks like work that belongs to a different phase.
1302
+ // If so, append a `<system-reminder>` tag to the tool result content
1303
+ // (same convention as task-tools and auto-memory reminders). The
1304
+ // agent sees this on its next turn and can self-check whether to
1305
+ // call phase_advance. Only fires for non-error results — failed
1306
+ // tool calls have their own error message and don't need the nudge.
1307
+ let nudgedContent = historyContent;
1308
+ try {
1309
+ const pipelineForPhase = this.pipelines?.[beforePhase];
1310
+ const hint = pipelineForPhase?.phaseMisfitHint?.(tc.name, inputData, result);
1311
+ if (hint && !result.isError) {
1312
+ nudgedContent = `${historyContent}\n\n<system-reminder>\nPhase-misfit detected: ${hint}\n</system-reminder>`;
1313
+ this.eventLog.append("phase_misfit_hint", {
1314
+ phase: beforePhase,
1315
+ tool: tc.name,
1316
+ hint,
1317
+ });
1318
+ }
1319
+ } catch { /* never let the nudge logic break the tool loop */ }
1320
+
976
1321
  this.history.addRaw({
977
1322
  role: "tool",
978
1323
  tool_call_id: tc.id,
979
- content: historyContent,
1324
+ content: nudgedContent,
980
1325
  });
981
1326
 
982
1327
  // Post-tool-result safety net: check for context pressure RIGHT NOW
@@ -1053,38 +1398,144 @@ export class AgentEngine {
1053
1398
 
1054
1399
  const expected = NEXT_PHASE[this.currentPhase];
1055
1400
  if (!force && nextPhase !== expected) {
1401
+ // v0.7.0 A3: event-log hint stays factual (records what the gate
1402
+ // saw) — the LLM-facing refusal text in phase-advance.js no longer
1403
+ // advertises force:true. Hint kept here for post-mortem audit.
1056
1404
  this.eventLog.append("phase_advance_refused", {
1057
1405
  from: this.currentPhase, to: nextPhase, reason,
1058
- hint: expected ? `expected next phase is '${expected}' — pass force:true to override`
1406
+ hint: expected ? `non-adjacent transition; immediate next phase is '${expected}'`
1059
1407
  : `${this.currentPhase} is the terminal phase`,
1060
1408
  });
1061
1409
  return false;
1062
1410
  }
1063
1411
 
1412
+ // v0.7.0 A5: reconcile per-rule tasks against disk artifacts before
1413
+ // checking exit criteria. Catches the E2E #5 DS pattern (tasks.json
1414
+ // showed 70/70 done while only 56 dirs / 36 with check_*.py existed):
1415
+ // markDone() is fire-and-forget today, so the agent can claim
1416
+ // completion that didn't materialize. Reconcile flips back to
1417
+ // pending if the helper-derived ruleIdsCovered set doesn't include
1418
+ // the task's ruleId. A "force"d advance bypasses reconcile too —
1419
+ // the gate already gives the agent / user that escape.
1420
+ if (!force && this.taskManager && this.workspace) {
1421
+ try {
1422
+ const sa = deriveSkillAuthoringMilestones(this.workspace);
1423
+ const covered = new Set(sa.ruleIdsCovered);
1424
+ const tm = deriveSkillTestingMilestones(this.workspace);
1425
+ const tested = new Set(tm.skillsTested);
1426
+ const r = this.taskManager.reconcileAgainstDisk((task) => {
1427
+ if (task.phase === "skill_authoring") return covered.has(task.ruleId);
1428
+ if (task.phase === "skill_testing") return tested.has(task.ruleId);
1429
+ return true; // unknown phase — leave alone
1430
+ });
1431
+ if (r.flippedBack.length > 0) {
1432
+ this.eventLog.append("tasks_reconciled", {
1433
+ from_phase: this.currentPhase,
1434
+ target_phase: nextPhase,
1435
+ flipped_back: r.flippedBack,
1436
+ count: r.flippedBack.length,
1437
+ inspected: r.reconciled,
1438
+ });
1439
+ }
1440
+ } catch { /* never let reconcile break advance */ }
1441
+ }
1442
+
1443
+ // v0.6.3: HARD-TRACKING GATE — refuse forward advance unless the source
1444
+ // phase's exit criteria are met by engine telemetry. v0.6.1 added the
1445
+ // engineCounts block to phase summaries (observation) but never wired
1446
+ // exitCriteriaMet() into the gate (enforcement). E2E #5 surfaced the
1447
+ // gap: MiMo advanced rule_extraction → skill_authoring with
1448
+ // rulesExtracted=0 in engine telemetry because rule_catalog had been
1449
+ // writing to a stranded post-rename path AND nothing checked the gate.
1450
+ //
1451
+ // Forward-only enforcement: rollbacks (_advancePhase from a later phase
1452
+ // to an earlier one with force:true) are an explicit escape, not a
1453
+ // criteria check — the rolled-from phase doesn't need to be "complete".
1454
+ // force:true also bypasses (matches existing escape pattern: user/agent
1455
+ // explicitly chose to skip).
1456
+ if (!force) {
1457
+ const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
1458
+ const toIdx = PHASE_ORDER.indexOf(nextPhase);
1459
+ const isForward = fromIdx >= 0 && toIdx >= 0 && toIdx > fromIdx;
1460
+ if (isForward) {
1461
+ const fromPipeline = this.pipelines?.[this.currentPhase];
1462
+ let criteriaMet = true;
1463
+ try { criteriaMet = !!fromPipeline?.exitCriteriaMet?.(); } catch { criteriaMet = true; }
1464
+ if (!criteriaMet) {
1465
+ const counts = this._buildEngineCountsBlock(this.currentPhase);
1466
+ this.eventLog.append("phase_advance_refused", {
1467
+ from: this.currentPhase, to: nextPhase, reason,
1468
+ hint: "exit criteria not met by engine telemetry",
1469
+ engineCounts: counts || null,
1470
+ });
1471
+ return false;
1472
+ }
1473
+ }
1474
+ }
1475
+
1476
+ // v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
1477
+ // of all phases; if target index < current index, this is a rollback
1478
+ // (e.g., production_qc → skill_authoring after gates revealed gaps).
1479
+ const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
1480
+ const toIdx = PHASE_ORDER.indexOf(nextPhase);
1481
+ const direction = (fromIdx >= 0 && toIdx >= 0 && toIdx < fromIdx)
1482
+ ? "rollback" : "forward";
1483
+
1064
1484
  // v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
1065
1485
  // detection so the LLM-narrated reason can be cross-checked against
1066
1486
  // ground-truth telemetry. Phase summaries become diagnostic, not just
1067
1487
  // narrative.
1068
1488
  const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
1069
1489
  const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
1490
+ const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
1491
+ // v0.7.0 A2: forced is now `!!force` (honest), not the old
1492
+ // `force && nextPhase !== expected` which masked every adjacent-forward
1493
+ // force in the audit log. E2E #5 had 12/12 force-bypasses but the event
1494
+ // log read 0 forced because every transition was to the immediate next
1495
+ // phase. Truth in audit logs first; refinement (forward-vs-non-adjacent
1496
+ // distinction) lives in the `direction` field.
1070
1497
  const phaseSummary =
1071
- `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
1072
- (force && nextPhase !== expected ? " (forced)" : "") +
1498
+ `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
1499
+ (force ? " (forced)" : "") +
1073
1500
  (engineCounts ? `\n (engine) ${engineCounts}` : "");
1074
1501
  this._phaseSummaries.push(phaseSummary);
1075
1502
  this.eventLog.append("phase_transition", {
1076
1503
  from: this.currentPhase,
1077
1504
  to: nextPhase,
1078
1505
  reason,
1506
+ direction,
1079
1507
  engineCounts: engineCounts || null,
1080
1508
  possibleMismatch: !!mismatchPrefix,
1081
- forced: force && nextPhase !== expected,
1509
+ forced: !!force,
1082
1510
  });
1083
1511
  const fromPhase = this.currentPhase;
1084
1512
  this.currentPhase = nextPhase;
1085
1513
  this._registerToolsForPhase(this.currentPhase);
1086
1514
  this.workspace.setPhase(this.currentPhase);
1087
1515
  this._createTasksForPhase(this.currentPhase);
1516
+
1517
+ // v0.7.0 N (#94): give the entered pipeline a chance to do
1518
+ // phase-entry setup. Used by finalization to copy the release
1519
+ // template into output/releases/v1/. Other pipelines are no-ops.
1520
+ // Wrapped so a failure here can't trap the phase advance.
1521
+ try { this.pipelines[this.currentPhase]?.onPhaseEnter?.({ fromPhase, workspace: this.workspace }); }
1522
+ catch (e) {
1523
+ this.eventLog.append("phase_enter_hook_failed", {
1524
+ phase: this.currentPhase,
1525
+ error: e?.message || String(e),
1526
+ });
1527
+ }
1528
+
1529
+ // v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
1530
+ // edge-trigger so that if the agent revisits it and re-flips
1531
+ // exit-criteria true, _maybeAutoAdvance will fire correctly. Without
1532
+ // this, the auto-advance edge trigger stays latched true and the
1533
+ // moment the agent returns to fromPhase the engine immediately
1534
+ // bounces them back out — defeating the rollback.
1535
+ if (direction === "rollback" && this._lastReady) {
1536
+ this._lastReady[fromPhase] = false;
1537
+ }
1538
+
1088
1539
  this.saveState();
1089
1540
 
1090
1541
  // B8: Soft signal — surface any sub-agents left running from the prior
@@ -1168,7 +1619,7 @@ export class AgentEngine {
1168
1619
  const parts = [];
1169
1620
  try {
1170
1621
  switch (fromPhase) {
1171
- case "extraction": {
1622
+ case "rule_extraction": {
1172
1623
  const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
1173
1624
  parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
1174
1625
  parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
@@ -1616,11 +2067,23 @@ export class AgentEngine {
1616
2067
 
1617
2068
  // Auto-continue through pending tasks
1618
2069
  while (this.taskManager.getNextPending()) {
1619
- // Context safety: force compaction if above 70%, or light compaction if history is long
2070
+ // v0.7.0 #93: budget-aware compact threshold. The old
2071
+ // `messages.length > 15` was message-count-based and frozen
2072
+ // from when KC ran on smaller contexts. With 200K+ budgets it
2073
+ // fired on every iteration of any non-trivial task — E2E #5
2074
+ // GLM saw 76 memory_pressure events and DS saw 46 because
2075
+ // compact pre-empted natural windowing. Replace with token-
2076
+ // budget threshold (default 60% of context, configurable via
2077
+ // KC_COMPACT_THRESHOLD_TOKENS) so compact runs when there's
2078
+ // actual pressure, not just when message count crossed an
2079
+ // ancient heuristic.
1620
2080
  const stats = this.getContextStats();
2081
+ const thresholdTokens = parseInt(
2082
+ process.env.KC_COMPACT_THRESHOLD_TOKENS || "0", 10,
2083
+ ) || Math.round((this.config.kcContextLimit || 200000) * 0.6);
1621
2084
  if (stats.percentage > 70) {
1622
2085
  await this.compact();
1623
- } else if (this.history.messages.length > 15) {
2086
+ } else if (stats.totalTokens > thresholdTokens) {
1624
2087
  await this.compact({ recentCount: 8 });
1625
2088
  }
1626
2089
 
@@ -1789,10 +2252,18 @@ export class AgentEngine {
1789
2252
  continue;
1790
2253
  }
1791
2254
 
1792
- const trackedPromise = entry.promise.then(
1793
- () => ({ taskId: task.id, subId, ok: true }),
1794
- (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
1795
- );
2255
+ // v0.7.0 H1: trackedPromise covers both fulfilled and rejected
2256
+ // paths (second arg). The .catch tail is belt-and-braces in case
2257
+ // the .then callbacks themselves throw — without it, a JSON
2258
+ // serialization throw inside the success-arm callback would
2259
+ // surface as UnhandledPromiseRejection and crash strict-mode
2260
+ // Node. We never want a worker error to take the engine down.
2261
+ const trackedPromise = entry.promise
2262
+ .then(
2263
+ () => ({ taskId: task.id, subId, ok: true }),
2264
+ (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
2265
+ )
2266
+ .catch((e) => ({ taskId: task.id, subId, ok: false, error: `tracked-promise threw: ${e?.message || String(e)}` }));
1796
2267
  inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
1797
2268
  }
1798
2269
  };
@@ -1807,7 +2278,15 @@ export class AgentEngine {
1807
2278
 
1808
2279
  if (inFlight.size === 0) break;
1809
2280
 
1810
- // Wait for either the next event OR a worker to complete
2281
+ // Wait for either the next event OR a worker to complete.
2282
+ //
2283
+ // v0.7.0 C1 note: losers in Promise.race() keep their .then()
2284
+ // chains active and resolve into garbage objects. That's the
2285
+ // intended JS Promise behavior — rejections are still handled,
2286
+ // memory drops at GC. The audit was overstated; no actual hang
2287
+ // or leak. Each loop iteration rebuilds the race from current
2288
+ // inFlight.values() so stale promises from prior iterations
2289
+ // are naturally re-observed (they've already resolved by then).
1811
2290
  const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
1812
2291
  const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
1813
2292
  const winner = await Promise.race([