kc-beta 0.6.2 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +382 -19
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +566 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +130 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +97 -80
  21. package/src/agent/pipelines/skill-testing.js +106 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +18 -1
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +19 -5
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/document-chunk.js +21 -9
  29. package/src/agent/tools/phase-advance.js +37 -5
  30. package/src/agent/tools/release.js +51 -9
  31. package/src/agent/tools/rule-catalog.js +11 -1
  32. package/src/agent/tools/workspace-file.js +32 -0
  33. package/src/agent/workspace.js +39 -1
  34. package/src/cli/components.js +64 -14
  35. package/src/cli/index.js +62 -3
  36. package/src/cli/meme.js +26 -25
  37. package/src/config.js +65 -22
  38. package/src/model-tiers.json +24 -8
  39. package/src/providers.js +42 -0
  40. package/template/release/v1/README.md.tmpl +108 -0
  41. package/template/release/v1/catalog.json.tmpl +4 -0
  42. package/template/release/v1/kc_runtime/__init__.py +11 -0
  43. package/template/release/v1/kc_runtime/confidence.py +63 -0
  44. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  45. package/template/release/v1/manifest.json.tmpl +11 -0
  46. package/template/release/v1/render_dashboard.py +117 -0
  47. package/template/release/v1/run.py +212 -0
  48. package/template/release/v1/serve.sh +17 -0
  49. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +326 -0
  50. package/template/skills/en/skill-creator/SKILL.md +1 -1
  51. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +321 -0
  52. package/template/skills/zh/skill-creator/SKILL.md +1 -1
@@ -1,8 +1,13 @@
1
1
  import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { AgentEvent } from "./events.js";
4
+ import {
5
+ deriveSkillAuthoringMilestones,
6
+ deriveSkillTestingMilestones,
7
+ } from "./pipelines/_milestone-derive.js";
4
8
  import { ContextAssembler } from "./context.js";
5
9
  import { ConversationHistory } from "./history.js";
10
+ import { findSafeSplitPoint } from "./message-utils.js";
6
11
  import { Workspace } from "./workspace.js";
7
12
  import { normalizeRuleCatalog } from "./rule-catalog-normalize.js";
8
13
  import { VersionManager } from "./version-manager.js";
@@ -52,6 +57,45 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
52
57
  // or kc_max_tokens in the global config.
53
58
  const DEFAULT_KC_MAX_TOKENS = 65536;
54
59
 
60
/**
 * v0.6.3.1: Tolerant JSON parse for streamed tool-call arguments. When an LLM
 * hits max_tokens mid-arguments, the stream returns truncated JSON missing its
 * trailing closers (and possibly the closing quote of a string value). A
 * strict parse fails; silently falling back to {} would mask the real issue.
 *
 * Strategy:
 *   1. Try strict JSON.parse (fast path, most calls).
 *   2. On failure, scan the text once, tracking whether we are inside a
 *      string (honouring backslash escapes) and which `{` / `[` are still
 *      open. Braces, brackets and quotes that appear INSIDE string values are
 *      ignored, so arguments carrying code snippets or escaped quotes do not
 *      skew the count.
 *   3. Close an unterminated string, then append the missing `}` / `]` in
 *      nesting order — but only when at most BRACE_RECOVERY_BUDGET structural
 *      closers are missing. Deeper truncation is reported as an error rather
 *      than "recovered" into a badly incomplete payload.
 *   4. If the repaired text still does not parse, return the ORIGINAL strict
 *      parse error so the caller can surface it to the agent.
 *
 * @param {unknown} raw - Raw `arguments` string accumulated from the stream.
 * @returns {{ok: true, value: any, recovered?: number} | {ok: false, error: string}}
 *   `recovered` is the number of characters appended during repair; it is
 *   only present when a repair was needed.
 */
const BRACE_RECOVERY_BUDGET = 4;

function parseToolArgsTolerant(raw) {
  if (typeof raw !== "string") return { ok: false, error: "arguments not a string" };
  if (raw === "") return { ok: true, value: {} };

  // Fast path: well-formed JSON.
  let strictError;
  try {
    return { ok: true, value: JSON.parse(raw) };
  } catch (err) {
    strictError = err;
  }
  const failure = { ok: false, error: strictError.message || "JSON parse failed" };

  // Single pass: work out which closers the truncated text still owes us.
  const pendingClosers = [];
  let inString = false;
  let escaped = false;
  for (const ch of raw) {
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      pendingClosers.push("}");
    } else if (ch === "[") {
      pendingClosers.push("]");
    } else if (ch === "}" || ch === "]") {
      // A stray or mismatched closer means the text is malformed, not merely
      // truncated — appending characters cannot fix it.
      if (pendingClosers.pop() !== ch) return failure;
    }
  }

  // Nothing is left open, so the parse failure has some other cause.
  if (!inString && pendingClosers.length === 0) return failure;
  if (pendingClosers.length > BRACE_RECOVERY_BUDGET) return failure;

  // A truncation that lands right after a backslash leaves half an escape
  // sequence; drop it, otherwise the closing quote we append would be escaped.
  const base = escaped ? raw.slice(0, -1) : raw;
  const suffix = (inString ? '"' : "") + pendingClosers.reverse().join("");
  try {
    return { ok: true, value: JSON.parse(base + suffix), recovered: suffix.length };
  } catch {
    return failure;
  }
}
98
+
55
99
  // Phases where worker LLM tools are available (DISTILL mode).
56
100
  // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
57
101
  // runs + dashboard_render + workflow_run stay usable during packaging.
@@ -379,7 +423,21 @@ export class AgentEngine {
379
423
  new ScheduleFetchTool(this.workspace),
380
424
  new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
381
425
  new PhaseAdvanceTool(
382
- (to, reason, opts) => this._advancePhase(to, reason, opts),
426
+ // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
427
+ // so the tool's refusal text can surface the engine telemetry
428
+ // that motivated the refusal. Internal callers of
429
+ // `_advancePhase` continue to use the bool return value
430
+ // directly; only this lambda wraps for the LLM-facing tool.
431
+ (to, reason, opts) => {
432
+ const advanced = this._advancePhase(to, reason, opts);
433
+ if (!advanced) {
434
+ let engineCounts = null;
435
+ try { engineCounts = this._buildEngineCountsBlock(this.currentPhase); }
436
+ catch { /* defensive */ }
437
+ return { advanced: false, engineCounts };
438
+ }
439
+ return { advanced: true };
440
+ },
383
441
  () => this.currentPhase, // H1: tool reads phase BEFORE its own call
384
442
  // v0.6.2 J1: surface running subagents so the tool can refuse
385
443
  // advance until the agent explicitly acknowledges them.
@@ -462,6 +520,27 @@ export class AgentEngine {
462
520
  return "";
463
521
  }
464
522
 
523
+ /**
524
+ * v0.7.0 B3: Read rules/PATTERNS.md (project memory) for surfacing in
525
+ * the system prompt. Only loaded for phases where the agent owns
526
+ * decomposition decisions (skill_authoring + skill_testing — the two
527
+ * phases the work-decomposition skill operates in). Capped at ~5 KB
528
+ * so it stays trivial token-wise; if the file is larger, we truncate
529
+ * to the first 5 KB and append a "...truncated" marker so the agent
530
+ * knows to prune.
531
+ */
532
+ _readProjectMemory() {
533
+ if (!["skill_authoring", "skill_testing"].includes(this.currentPhase)) return null;
534
+ const p = path.join(this.workspace.cwd, "rules", "PATTERNS.md");
535
+ try {
536
+ if (!fs.existsSync(p)) return null;
537
+ const raw = fs.readFileSync(p, "utf-8");
538
+ const CAP = 5 * 1024;
539
+ if (raw.length <= CAP) return raw;
540
+ return raw.slice(0, CAP) + "\n\n…truncated at 5 KB — prune the least-actionable entries (work-decomposition skill: Sizing).";
541
+ } catch { return null; }
542
+ }
543
+
465
544
  /**
466
545
  * Build the workspace/project directory state string for the system prompt.
467
546
  */
@@ -501,6 +580,7 @@ export class AgentEngine {
501
580
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
502
581
  pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
503
582
  workspaceState: this._buildWorkspaceState(),
583
+ projectMemory: this._readProjectMemory(),
504
584
  });
505
585
  const systemTokens = estimateTokens(systemPrompt);
506
586
  const messageTokens = estimateMessagesTokens(this.history.messages);
@@ -659,8 +739,18 @@ export class AgentEngine {
659
739
  async compact({ recentCount = 20 } = {}) {
660
740
  if (this.history.messages.length <= recentCount) return null;
661
741
 
662
- const olderMessages = this.history.messages.slice(0, -recentCount);
663
- const recentMessages = this.history.messages.slice(-recentCount);
742
+ // v0.6.3.1: tool-pair atomicity. Naive slice(-recentCount) can land on
743
+ // a tool message (whose assistant_with_tool_calls is in the older batch
744
+ // about to be summarized) OR put the split between an assistant with
745
+ // tool_calls and its tool results. Either creates an orphan that
746
+ // DeepSeek's strict API rejects with 400. Walk the split point forward
747
+ // until BOTH (recent[0] isn't tool) AND (older[-1] isn't
748
+ // assistant_with_tool_calls).
749
+ const desiredSplit = this.history.messages.length - recentCount;
750
+ const splitPoint = findSafeSplitPoint(this.history.messages, desiredSplit);
751
+ const olderMessages = this.history.messages.slice(0, splitPoint);
752
+ const recentMessages = this.history.messages.slice(splitPoint);
753
+ if (olderMessages.length === 0) return null; // nothing safely summarizable
664
754
 
665
755
  const CHUNK_BUDGET = 30000; // tokens per summarization request
666
756
  const chunks = this._chunkMessages(olderMessages, CHUNK_BUDGET);
@@ -793,6 +883,39 @@ export class AgentEngine {
793
883
  engine._registerToolsForPhase(engine.currentPhase);
794
884
  engine.workspace.setPhase(engine.currentPhase);
795
885
 
886
+ // v0.6.3.1: detect whether prior turns of this session used reasoning
887
+ // mode, so the field-consistency invariant continues across resume.
888
+ // Without this, the first assistant turn after resume might lack
889
+ // reasoning_content even though earlier turns have it, and DeepSeek's
890
+ // strict-mode rejects with 400.
891
+ try {
892
+ const msgs = engine.history?.messages || [];
893
+ engine._sessionUsesReasoning = msgs.some(
894
+ (m) => m?.role === "assistant" && "reasoning_content" in m,
895
+ );
896
+ // One-shot migration: backfill empty reasoning_content on assistant
897
+ // messages that are missing the field. Pre-v0.6.3.1 sessions could
898
+ // accumulate "holes" (turns where the model skipped reasoning) that
899
+ // poison the conversation for resume. A single empty string on each
900
+ // hole is enough to satisfy DeepSeek's field-consistency rule.
901
+ if (engine._sessionUsesReasoning) {
902
+ let patched = 0;
903
+ for (const m of msgs) {
904
+ if (m?.role === "assistant" && !("reasoning_content" in m)) {
905
+ m.reasoning_content = "";
906
+ patched++;
907
+ }
908
+ }
909
+ if (patched > 0) {
910
+ engine.history._save?.();
911
+ engine.eventLog.append("reasoning_content_backfilled", {
912
+ count: patched,
913
+ reason: "v0.6.3.1 migration on resume",
914
+ });
915
+ }
916
+ }
917
+ } catch { /* never let resume break on this */ }
918
+
796
919
  // Restore project directory from saved state
797
920
  if (data.projectDir) {
798
921
  if (fs.existsSync(data.projectDir)) {
@@ -905,6 +1028,7 @@ export class AgentEngine {
905
1028
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
906
1029
  pipelineState,
907
1030
  workspaceState: this._buildWorkspaceState(),
1031
+ projectMemory: this._readProjectMemory(),
908
1032
  });
909
1033
  const tools = this.toolRegistry.schemasOpenai();
910
1034
 
@@ -933,6 +1057,19 @@ export class AgentEngine {
933
1057
 
934
1058
  try {
935
1059
  let collectedText = "";
1060
+ // v0.7.0 L (#76): Anthropic-only — accumulator for the
1061
+ // signature_delta blob that proves the thinking content came
1062
+ // from Anthropic's model. Required alongside thinking text on
1063
+ // multi-turn replay.
1064
+ let collectedReasoningSignature = "";
1065
+ // v0.6.3: hybrid reasoning models (GLM-5.1, DeepSeek v4, MiMo v2.5,
1066
+ // Qwen3, ...) stream `delta.reasoning_content` separately from
1067
+ // `delta.content`. DeepSeek's strict API requires this field to be
1068
+ // round-tripped on subsequent assistant messages or it rejects the
1069
+ // request with "reasoning_content in the thinking mode must be passed
1070
+ // back". Even providers that don't enforce this (SiliconFlow) still
1071
+ // benefit from preservation — without it, prior reasoning is wasted.
1072
+ let collectedReasoning = "";
936
1073
  /** @type {Map<number, {id: string, name: string, arguments: string}>} */
937
1074
  const toolCallsAcc = new Map();
938
1075
 
@@ -952,6 +1089,22 @@ export class AgentEngine {
952
1089
  collectedText += delta.content;
953
1090
  }
954
1091
 
1092
+ // v0.6.3: capture reasoning_content from the same delta. Emit a
1093
+ // separate event type so the TUI can optionally render thinking
1094
+ // (today it's silently consumed; round-trip is the priority fix).
1095
+ if (delta.reasoning_content) {
1096
+ yield new AgentEvent({ type: "reasoning_delta", text: delta.reasoning_content });
1097
+ collectedReasoning += delta.reasoning_content;
1098
+ }
1099
+
1100
+ // v0.7.0 L (#76): Anthropic-only signature_delta. Carries the
1101
+ // opaque proof-of-thinking blob that strict-mode multi-turn
1102
+ // requires alongside the thinking text. OpenAI-shape providers
1103
+ // never emit this delta; it's a no-op for them.
1104
+ if (delta.reasoning_signature) {
1105
+ collectedReasoningSignature += delta.reasoning_signature;
1106
+ }
1107
+
955
1108
  if (delta.tool_calls) {
956
1109
  for (const tcDelta of delta.tool_calls) {
957
1110
  const idx = tcDelta.index;
@@ -968,6 +1121,31 @@ export class AgentEngine {
968
1121
 
969
1122
  // Log the complete assistant message (coalesced, not per-delta)
970
1123
  const assistantMsg = { role: "assistant", content: collectedText || null };
1124
+ // v0.6.3: persist reasoning_content on the assistant message so it
1125
+ // round-trips on the next request. history.addRaw spreads the input,
1126
+ // preserving unknown fields; OpenAI body builder doesn't strip them.
1127
+ //
1128
+ // v0.6.3.1: DeepSeek's strict-mode rule is FIELD CONSISTENCY, not
1129
+ // field content — once any assistant turn in the conversation has
1130
+ // reasoning_content, every subsequent assistant turn must also have
1131
+ // it (empty string OK; missing the field rejects with 400). Hybrid
1132
+ // reasoning models sometimes skip reasoning on trivial follow-through
1133
+ // tool calls, leaving collectedReasoning="". Track at session level:
1134
+ // once we see ANY reasoning, keep setting the field (possibly empty)
1135
+ // for the rest of the session. Providers that don't use the field
1136
+ // ignore it silently.
1137
+ if (collectedReasoning) {
1138
+ assistantMsg.reasoning_content = collectedReasoning;
1139
+ this._sessionUsesReasoning = true;
1140
+ } else if (this._sessionUsesReasoning) {
1141
+ assistantMsg.reasoning_content = "";
1142
+ }
1143
+ // v0.7.0 L (#76): persist Anthropic signature alongside thinking.
1144
+ // Always stored together — if either is missing, _buildAnthropicBody
1145
+ // skips the thinking-block replay (would be rejected as malformed).
1146
+ if (collectedReasoningSignature) {
1147
+ assistantMsg.reasoning_signature = collectedReasoningSignature;
1148
+ }
971
1149
  if (toolCallsAcc.size > 0) {
972
1150
  assistantMsg.tool_calls = Array.from(toolCallsAcc.values()).map((tc) => ({
973
1151
  id: tc.id,
@@ -1024,10 +1202,61 @@ export class AgentEngine {
1024
1202
 
1025
1203
  // Tool execution loop
1026
1204
  for (const tc of toolCallsAcc.values()) {
1027
- let inputData = {};
1028
- try {
1029
- inputData = tc.arguments ? JSON.parse(tc.arguments) : {};
1030
- } catch { /* ignore */ }
1205
+ // v0.6.3.1: tool-argument JSON parsing used to be `try { parse } catch {}`
1206
+ // — silently falling back to {} on any parse failure. E2E #5 GLM
1207
+ // session showed this firing 100+ times: SiliconFlow streaming
1208
+ // truncates GLM-5.1 tool_call arguments by ~1 closing brace
1209
+ // (likely max_tokens cutoff mid-args), the silent fallback shipped
1210
+ // {} to the tool, and the tool returned generic "(empty)" errors
1211
+ // which the agent kept retrying without understanding why.
1212
+ //
1213
+ // Fix: try strict parse, then attempt brace-balance recovery (cheap
1214
+ // — recovers from the common single-brace-truncation case), and if
1215
+ // that fails, surface a structured error to the agent so it can
1216
+ // see what it sent and self-correct.
1217
+ let inputData = null;
1218
+ let argParseError = null;
1219
+ if (tc.arguments) {
1220
+ const recovery = parseToolArgsTolerant(tc.arguments);
1221
+ if (recovery.ok) {
1222
+ inputData = recovery.value;
1223
+ if (recovery.recovered) {
1224
+ this.eventLog.append("tool_args_recovered", {
1225
+ name: tc.name,
1226
+ added_chars: recovery.recovered,
1227
+ original_len: tc.arguments.length,
1228
+ });
1229
+ }
1230
+ } else {
1231
+ argParseError = recovery.error;
1232
+ }
1233
+ } else {
1234
+ inputData = {};
1235
+ }
1236
+
1237
+ // If arguments were unparseable, skip execution and return a tool
1238
+ // result that tells the agent what went wrong. Engine's tool result
1239
+ // loop continues so the rest of the assistant's tool_calls in this
1240
+ // turn still execute.
1241
+ if (argParseError) {
1242
+ const preview = (tc.arguments || "").slice(0, 200);
1243
+ const errMsg =
1244
+ `Tool arguments were malformed JSON for ${tc.name}. ` +
1245
+ `Likely streaming truncation by the model (provider cut tokens mid-output). ` +
1246
+ `Parser error: ${argParseError}. ` +
1247
+ `First 200 chars of what was received: ${preview}${tc.arguments && tc.arguments.length > 200 ? "..." : ""}. ` +
1248
+ `Retry the call with shorter / simpler arguments — the model may have hit max_tokens partway through encoding.`;
1249
+ this.eventLog.append("tool_args_parse_failed", {
1250
+ name: tc.name,
1251
+ error: argParseError,
1252
+ raw_args_len: (tc.arguments || "").length,
1253
+ raw_preview: preview,
1254
+ });
1255
+ yield new AgentEvent({ type: "tool_start", name: tc.name, input: { _parse_error: argParseError } });
1256
+ yield new AgentEvent({ type: "tool_result", name: tc.name, output: errMsg, isError: true });
1257
+ this.history.addRaw({ role: "tool", tool_call_id: tc.id, content: errMsg });
1258
+ continue;
1259
+ }
1031
1260
 
1032
1261
  this.eventLog.append("tool_start", { name: tc.name, input: inputData });
1033
1262
  yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
@@ -1082,10 +1311,31 @@ export class AgentEngine {
1082
1311
  isError: result.isError,
1083
1312
  });
1084
1313
 
1314
+ // v0.6.3 (#74): phase-misfit nudge. Ask the current pipeline whether
1315
+ // this tool call looks like work that belongs to a different phase.
1316
+ // If so, append a `<system-reminder>` tag to the tool result content
1317
+ // (same convention as task-tools and auto-memory reminders). The
1318
+ // agent sees this on its next turn and can self-check whether to
1319
+ // call phase_advance. Only fires for non-error results — failed
1320
+ // tool calls have their own error message and don't need the nudge.
1321
+ let nudgedContent = historyContent;
1322
+ try {
1323
+ const pipelineForPhase = this.pipelines?.[beforePhase];
1324
+ const hint = pipelineForPhase?.phaseMisfitHint?.(tc.name, inputData, result);
1325
+ if (hint && !result.isError) {
1326
+ nudgedContent = `${historyContent}\n\n<system-reminder>\nPhase-misfit detected: ${hint}\n</system-reminder>`;
1327
+ this.eventLog.append("phase_misfit_hint", {
1328
+ phase: beforePhase,
1329
+ tool: tc.name,
1330
+ hint,
1331
+ });
1332
+ }
1333
+ } catch { /* never let the nudge logic break the tool loop */ }
1334
+
1085
1335
  this.history.addRaw({
1086
1336
  role: "tool",
1087
1337
  tool_call_id: tc.id,
1088
- content: historyContent,
1338
+ content: nudgedContent,
1089
1339
  });
1090
1340
 
1091
1341
  // Post-tool-result safety net: check for context pressure RIGHT NOW
@@ -1162,14 +1412,81 @@ export class AgentEngine {
1162
1412
 
1163
1413
  const expected = NEXT_PHASE[this.currentPhase];
1164
1414
  if (!force && nextPhase !== expected) {
1415
+ // v0.7.0 A3: event-log hint stays factual (records what the gate
1416
+ // saw) — the LLM-facing refusal text in phase-advance.js no longer
1417
+ // advertises force:true. Hint kept here for post-mortem audit.
1165
1418
  this.eventLog.append("phase_advance_refused", {
1166
1419
  from: this.currentPhase, to: nextPhase, reason,
1167
- hint: expected ? `expected next phase is '${expected}' — pass force:true to override`
1420
+ hint: expected ? `non-adjacent transition; immediate next phase is '${expected}'`
1168
1421
  : `${this.currentPhase} is the terminal phase`,
1169
1422
  });
1170
1423
  return false;
1171
1424
  }
1172
1425
 
1426
+ // v0.7.0 A5: reconcile per-rule tasks against disk artifacts before
1427
+ // checking exit criteria. Catches the E2E #5 DS pattern (tasks.json
1428
+ // showed 70/70 done while only 56 dirs / 36 with check_*.py existed):
1429
+ // markDone() is fire-and-forget today, so the agent can claim
1430
+ // completion that didn't materialize. Reconcile flips back to
1431
+ // pending if the helper-derived ruleIdsCovered set doesn't include
1432
+ // the task's ruleId. A "force"d advance bypasses reconcile too —
1433
+ // the gate already gives the agent / user that escape.
1434
+ if (!force && this.taskManager && this.workspace) {
1435
+ try {
1436
+ const sa = deriveSkillAuthoringMilestones(this.workspace);
1437
+ const covered = new Set(sa.ruleIdsCovered);
1438
+ const tm = deriveSkillTestingMilestones(this.workspace);
1439
+ const tested = new Set(tm.skillsTested);
1440
+ const r = this.taskManager.reconcileAgainstDisk((task) => {
1441
+ if (task.phase === "skill_authoring") return covered.has(task.ruleId);
1442
+ if (task.phase === "skill_testing") return tested.has(task.ruleId);
1443
+ return true; // unknown phase — leave alone
1444
+ });
1445
+ if (r.flippedBack.length > 0) {
1446
+ this.eventLog.append("tasks_reconciled", {
1447
+ from_phase: this.currentPhase,
1448
+ target_phase: nextPhase,
1449
+ flipped_back: r.flippedBack,
1450
+ count: r.flippedBack.length,
1451
+ inspected: r.reconciled,
1452
+ });
1453
+ }
1454
+ } catch { /* never let reconcile break advance */ }
1455
+ }
1456
+
1457
+ // v0.6.3: HARD-TRACKING GATE — refuse forward advance unless the source
1458
+ // phase's exit criteria are met by engine telemetry. v0.6.1 added the
1459
+ // engineCounts block to phase summaries (observation) but never wired
1460
+ // exitCriteriaMet() into the gate (enforcement). E2E #5 surfaced the
1461
+ // gap: MiMo advanced rule_extraction → skill_authoring with
1462
+ // rulesExtracted=0 in engine telemetry because rule_catalog had been
1463
+ // writing to a stranded post-rename path AND nothing checked the gate.
1464
+ //
1465
+ // Forward-only enforcement: rollbacks (_advancePhase from a later phase
1466
+ // to an earlier one with force:true) are an explicit escape, not a
1467
+ // criteria check — the rolled-from phase doesn't need to be "complete".
1468
+ // force:true also bypasses (matches existing escape pattern: user/agent
1469
+ // explicitly chose to skip).
1470
+ if (!force) {
1471
+ const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
1472
+ const toIdx = PHASE_ORDER.indexOf(nextPhase);
1473
+ const isForward = fromIdx >= 0 && toIdx >= 0 && toIdx > fromIdx;
1474
+ if (isForward) {
1475
+ const fromPipeline = this.pipelines?.[this.currentPhase];
1476
+ let criteriaMet = true;
1477
+ try { criteriaMet = !!fromPipeline?.exitCriteriaMet?.(); } catch { criteriaMet = true; }
1478
+ if (!criteriaMet) {
1479
+ const counts = this._buildEngineCountsBlock(this.currentPhase);
1480
+ this.eventLog.append("phase_advance_refused", {
1481
+ from: this.currentPhase, to: nextPhase, reason,
1482
+ hint: "exit criteria not met by engine telemetry",
1483
+ engineCounts: counts || null,
1484
+ });
1485
+ return false;
1486
+ }
1487
+ }
1488
+ }
1489
+
1173
1490
  // v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
1174
1491
  // of all phases; if target index < current index, this is a rollback
1175
1492
  // (e.g., production_qc → skill_authoring after gates revealed gaps).
@@ -1185,9 +1502,15 @@ export class AgentEngine {
1185
1502
  const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
1186
1503
  const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
1187
1504
  const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
1505
+ // v0.7.0 A2: forced is now `!!force` (honest), not the old
1506
+ // `force && nextPhase !== expected` which masked every adjacent-forward
1507
+ // force in the audit log. E2E #5 had 12/12 force-bypasses but the event
1508
+ // log read 0 forced because every transition was to the immediate next
1509
+ // phase. Truth in audit logs first; refinement (forward-vs-non-adjacent
1510
+ // distinction) lives in the `direction` field.
1188
1511
  const phaseSummary =
1189
1512
  `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
1190
- (force && nextPhase !== expected ? " (forced)" : "") +
1513
+ (force ? " (forced)" : "") +
1191
1514
  (engineCounts ? `\n (engine) ${engineCounts}` : "");
1192
1515
  this._phaseSummaries.push(phaseSummary);
1193
1516
  this.eventLog.append("phase_transition", {
@@ -1197,7 +1520,7 @@ export class AgentEngine {
1197
1520
  direction,
1198
1521
  engineCounts: engineCounts || null,
1199
1522
  possibleMismatch: !!mismatchPrefix,
1200
- forced: force && nextPhase !== expected,
1523
+ forced: !!force,
1201
1524
  });
1202
1525
  const fromPhase = this.currentPhase;
1203
1526
  this.currentPhase = nextPhase;
@@ -1205,6 +1528,18 @@ export class AgentEngine {
1205
1528
  this.workspace.setPhase(this.currentPhase);
1206
1529
  this._createTasksForPhase(this.currentPhase);
1207
1530
 
1531
+ // v0.7.0 N (#94): give the entered pipeline a chance to do
1532
+ // phase-entry setup. Used by finalization to copy the release
1533
+ // template into output/releases/v1/. Other pipelines are no-ops.
1534
+ // Wrapped so a failure here can't trap the phase advance.
1535
+ try { this.pipelines[this.currentPhase]?.onPhaseEnter?.({ fromPhase, workspace: this.workspace }); }
1536
+ catch (e) {
1537
+ this.eventLog.append("phase_enter_hook_failed", {
1538
+ phase: this.currentPhase,
1539
+ error: e?.message || String(e),
1540
+ });
1541
+ }
1542
+
1208
1543
  // v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
1209
1544
  // edge-trigger so that if the agent revisits it and re-flips
1210
1545
  // exit-criteria true, _maybeAutoAdvance will fire correctly. Without
@@ -1298,7 +1633,7 @@ export class AgentEngine {
1298
1633
  const parts = [];
1299
1634
  try {
1300
1635
  switch (fromPhase) {
1301
- case "extraction": {
1636
+ case "rule_extraction": {
1302
1637
  const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
1303
1638
  parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
1304
1639
  parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
@@ -1746,11 +2081,23 @@ export class AgentEngine {
1746
2081
 
1747
2082
  // Auto-continue through pending tasks
1748
2083
  while (this.taskManager.getNextPending()) {
1749
- // Context safety: force compaction if above 70%, or light compaction if history is long
2084
+ // v0.7.0 #93: budget-aware compact threshold. The old
2085
+ // `messages.length > 15` was message-count-based and frozen
2086
+ // from when KC ran on smaller contexts. With 200K+ budgets it
2087
+ // fired on every iteration of any non-trivial task — E2E #5
2088
+ // GLM saw 76 memory_pressure events and DS saw 46 because
2089
+ // compact pre-empted natural windowing. Replace with token-
2090
+ // budget threshold (default 60% of context, configurable via
2091
+ // KC_COMPACT_THRESHOLD_TOKENS) so compact runs when there's
2092
+ // actual pressure, not just when message count crossed an
2093
+ // ancient heuristic.
1750
2094
  const stats = this.getContextStats();
2095
+ const thresholdTokens = parseInt(
2096
+ process.env.KC_COMPACT_THRESHOLD_TOKENS || "0", 10,
2097
+ ) || Math.round((this.config.kcContextLimit || 200000) * 0.6);
1751
2098
  if (stats.percentage > 70) {
1752
2099
  await this.compact();
1753
- } else if (this.history.messages.length > 15) {
2100
+ } else if (stats.totalTokens > thresholdTokens) {
1754
2101
  await this.compact({ recentCount: 8 });
1755
2102
  }
1756
2103
 
@@ -1919,10 +2266,18 @@ export class AgentEngine {
1919
2266
  continue;
1920
2267
  }
1921
2268
 
1922
- const trackedPromise = entry.promise.then(
1923
- () => ({ taskId: task.id, subId, ok: true }),
1924
- (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
1925
- );
2269
+ // v0.7.0 H1: trackedPromise covers both fulfilled and rejected
2270
+ // paths (second arg). The .catch tail is belt-and-braces in case
2271
+ // the .then callbacks themselves throw — without it, a JSON
2272
+ // serialization throw inside the success-arm callback would
2273
+ // surface as UnhandledPromiseRejection and crash strict-mode
2274
+ // Node. We never want a worker error to take the engine down.
2275
+ const trackedPromise = entry.promise
2276
+ .then(
2277
+ () => ({ taskId: task.id, subId, ok: true }),
2278
+ (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
2279
+ )
2280
+ .catch((e) => ({ taskId: task.id, subId, ok: false, error: `tracked-promise threw: ${e?.message || String(e)}` }));
1926
2281
  inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
1927
2282
  }
1928
2283
  };
@@ -1937,7 +2292,15 @@ export class AgentEngine {
1937
2292
 
1938
2293
  if (inFlight.size === 0) break;
1939
2294
 
1940
- // Wait for either the next event OR a worker to complete
2295
+ // Wait for either the next event OR a worker to complete.
2296
+ //
2297
+ // v0.7.0 C1 note: losers in Promise.race() keep their .then()
2298
+ // chains active and resolve into garbage objects. That's the
2299
+ // intended JS Promise behavior — rejections are still handled,
2300
+ // memory drops at GC. The audit was overstated; no actual hang
2301
+ // or leak. Each loop iteration rebuilds the race from current
2302
+ // inFlight.values() so stale promises from prior iterations
2303
+ // are naturally re-observed (they've already resolved by then).
1941
2304
  const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
1942
2305
  const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
1943
2306
  const winner = await Promise.race([