kc-beta 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +367 -18
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +511 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +103 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +97 -80
  21. package/src/agent/pipelines/skill-testing.js +67 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +18 -1
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +19 -5
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/document-chunk.js +21 -9
  29. package/src/agent/tools/phase-advance.js +18 -3
  30. package/src/agent/tools/release.js +51 -9
  31. package/src/agent/tools/rule-catalog.js +11 -1
  32. package/src/agent/tools/workspace-file.js +32 -0
  33. package/src/agent/workspace.js +39 -1
  34. package/src/cli/components.js +64 -14
  35. package/src/cli/index.js +62 -3
  36. package/src/cli/meme.js +26 -25
  37. package/src/config.js +65 -22
  38. package/src/model-tiers.json +24 -8
  39. package/src/providers.js +42 -0
  40. package/template/release/v1/README.md.tmpl +108 -0
  41. package/template/release/v1/catalog.json.tmpl +4 -0
  42. package/template/release/v1/kc_runtime/__init__.py +11 -0
  43. package/template/release/v1/kc_runtime/confidence.py +63 -0
  44. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  45. package/template/release/v1/manifest.json.tmpl +11 -0
  46. package/template/release/v1/render_dashboard.py +117 -0
  47. package/template/release/v1/run.py +212 -0
  48. package/template/release/v1/serve.sh +17 -0
  49. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
  50. package/template/skills/en/skill-creator/SKILL.md +1 -1
  51. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
  52. package/template/skills/zh/skill-creator/SKILL.md +1 -1
@@ -1,8 +1,13 @@
1
1
  import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { AgentEvent } from "./events.js";
4
+ import {
5
+ deriveSkillAuthoringMilestones,
6
+ deriveSkillTestingMilestones,
7
+ } from "./pipelines/_milestone-derive.js";
4
8
  import { ContextAssembler } from "./context.js";
5
9
  import { ConversationHistory } from "./history.js";
10
+ import { findSafeSplitPoint } from "./message-utils.js";
6
11
  import { Workspace } from "./workspace.js";
7
12
  import { normalizeRuleCatalog } from "./rule-catalog-normalize.js";
8
13
  import { VersionManager } from "./version-manager.js";
@@ -52,6 +57,45 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
52
57
  // or kc_max_tokens in the global config.
53
58
  const DEFAULT_KC_MAX_TOKENS = 65536;
54
59
 
60
/**
 * v0.6.3.1: Tolerant JSON parse for streamed tool-call arguments. When LLMs
 * hit max_tokens mid-arguments, the stream returns truncated JSON missing
 * closing braces, brackets, or a closing quote. Strict parse fails; the old
 * code silently dropped to {} which masked the actual issue.
 *
 * Strategy:
 *   1. Try strict JSON.parse (fast path, most calls).
 *   2. On failure, scan the text once while tracking string/escape state and
 *      the stack of still-open `{` / `[`. If the only problem is that the
 *      text stops early, append the missing closers (closing an open string
 *      first) and parse again. At most BRACE_RECOVERY_BUDGET structural
 *      closers are appended.
 *   3. If still failing, return the strict-parse error so the caller can
 *      surface it to the agent.
 *
 * The scan is string-aware on purpose: braces inside string values and
 * escaped quotes (\") must not be counted as structure, otherwise inputs
 * such as {"code":"if (x) {" or {"msg":"say \"hi are wrongly judged
 * unrecoverable.
 *
 * @param {unknown} raw - Raw `arguments` text accumulated from the stream.
 * @returns {{ok: true, value: any, recovered?: number} | {ok: false, error: string}}
 *   `recovered` is present only when repair was needed and holds the number
 *   of characters that were appended.
 */
const BRACE_RECOVERY_BUDGET = 4;

/**
 * Work out what would have to be appended to `raw` to close every string,
 * object and array that is still open at the end of the text.
 *
 * @param {string} raw
 * @returns {{base: string, suffix: string, depth: number} | null}
 *   null when nothing is left open, or when a closer does not match its
 *   opener (that is corruption, not truncation).
 */
function planTruncationRepair(raw) {
  const pendingClosers = [];
  let inString = false;
  let escaped = false;
  for (const ch of raw) {
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") pendingClosers.push("}");
    else if (ch === "[") pendingClosers.push("]");
    else if (ch === "}" || ch === "]") {
      if (pendingClosers.pop() !== ch) return null;
    }
  }
  if (pendingClosers.length === 0) return null;
  const depth = pendingClosers.length;
  // A trailing lone backslash would escape the quote we are about to add.
  const base = escaped ? raw.slice(0, -1) : raw;
  const suffix = (inString ? '"' : "") + pendingClosers.reverse().join("");
  return { base, suffix, depth };
}

function parseToolArgsTolerant(raw) {
  if (typeof raw !== "string") return { ok: false, error: "arguments not a string" };
  if (raw === "") return { ok: true, value: {} };

  // Fast path
  let strictError;
  try {
    return { ok: true, value: JSON.parse(raw) };
  } catch (e) {
    strictError = e;
  }

  // Recovery: close whatever the truncation left open, within budget.
  const repair = planTruncationRepair(raw);
  if (repair && repair.depth <= BRACE_RECOVERY_BUDGET) {
    try {
      const value = JSON.parse(repair.base + repair.suffix);
      return { ok: true, value, recovered: repair.suffix.length };
    } catch { /* fall through to the strict-parse error */ }
  }

  return { ok: false, error: strictError?.message || "JSON parse failed" };
}
98
+
55
99
  // Phases where worker LLM tools are available (DISTILL mode).
56
100
  // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
57
101
  // runs + dashboard_render + workflow_run stay usable during packaging.
@@ -462,6 +506,27 @@ export class AgentEngine {
462
506
  return "";
463
507
  }
464
508
 
509
+ /**
510
+ * v0.7.0 B3: Read rules/PATTERNS.md (project memory) for surfacing in
511
+ * the system prompt. Only loaded for phases where the agent owns
512
+ * decomposition decisions (skill_authoring + skill_testing — the two
513
+ * phases the work-decomposition skill operates in). Capped at ~5 KB
514
+ * so it stays trivial token-wise; if the file is larger, we truncate
515
+ * to the first 5 KB and append a "...truncated" marker so the agent
516
+ * knows to prune.
517
+ */
518
+ _readProjectMemory() {
519
+ if (!["skill_authoring", "skill_testing"].includes(this.currentPhase)) return null;
520
+ const p = path.join(this.workspace.cwd, "rules", "PATTERNS.md");
521
+ try {
522
+ if (!fs.existsSync(p)) return null;
523
+ const raw = fs.readFileSync(p, "utf-8");
524
+ const CAP = 5 * 1024;
525
+ if (raw.length <= CAP) return raw;
526
+ return raw.slice(0, CAP) + "\n\n…truncated at 5 KB — prune the least-actionable entries (work-decomposition skill: Sizing).";
527
+ } catch { return null; }
528
+ }
529
+
465
530
  /**
466
531
  * Build the workspace/project directory state string for the system prompt.
467
532
  */
@@ -501,6 +566,7 @@ export class AgentEngine {
501
566
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
502
567
  pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
503
568
  workspaceState: this._buildWorkspaceState(),
569
+ projectMemory: this._readProjectMemory(),
504
570
  });
505
571
  const systemTokens = estimateTokens(systemPrompt);
506
572
  const messageTokens = estimateMessagesTokens(this.history.messages);
@@ -659,8 +725,18 @@ export class AgentEngine {
659
725
  async compact({ recentCount = 20 } = {}) {
660
726
  if (this.history.messages.length <= recentCount) return null;
661
727
 
662
- const olderMessages = this.history.messages.slice(0, -recentCount);
663
- const recentMessages = this.history.messages.slice(-recentCount);
728
+ // v0.6.3.1: tool-pair atomicity. Naive slice(-recentCount) can land on
729
+ // a tool message (whose assistant_with_tool_calls is in the older batch
730
+ // about to be summarized) OR put the split between an assistant with
731
+ // tool_calls and its tool results. Either creates an orphan that
732
+ // DeepSeek's strict API rejects with 400. Walk the split point forward
733
+ // until BOTH (recent[0] isn't tool) AND (older[-1] isn't
734
+ // assistant_with_tool_calls).
735
+ const desiredSplit = this.history.messages.length - recentCount;
736
+ const splitPoint = findSafeSplitPoint(this.history.messages, desiredSplit);
737
+ const olderMessages = this.history.messages.slice(0, splitPoint);
738
+ const recentMessages = this.history.messages.slice(splitPoint);
739
+ if (olderMessages.length === 0) return null; // nothing safely summarizable
664
740
 
665
741
  const CHUNK_BUDGET = 30000; // tokens per summarization request
666
742
  const chunks = this._chunkMessages(olderMessages, CHUNK_BUDGET);
@@ -793,6 +869,39 @@ export class AgentEngine {
793
869
  engine._registerToolsForPhase(engine.currentPhase);
794
870
  engine.workspace.setPhase(engine.currentPhase);
795
871
 
872
+ // v0.6.3.1: detect whether prior turns of this session used reasoning
873
+ // mode, so the field-consistency invariant continues across resume.
874
+ // Without this, the first assistant turn after resume might lack
875
+ // reasoning_content even though earlier turns have it, and DeepSeek's
876
+ // strict-mode rejects with 400.
877
+ try {
878
+ const msgs = engine.history?.messages || [];
879
+ engine._sessionUsesReasoning = msgs.some(
880
+ (m) => m?.role === "assistant" && "reasoning_content" in m,
881
+ );
882
+ // One-shot migration: backfill empty reasoning_content on assistant
883
+ // messages that are missing the field. Pre-v0.6.3.1 sessions could
884
+ // accumulate "holes" (turns where the model skipped reasoning) that
885
+ // poison the conversation for resume. A single empty string on each
886
+ // hole is enough to satisfy DeepSeek's field-consistency rule.
887
+ if (engine._sessionUsesReasoning) {
888
+ let patched = 0;
889
+ for (const m of msgs) {
890
+ if (m?.role === "assistant" && !("reasoning_content" in m)) {
891
+ m.reasoning_content = "";
892
+ patched++;
893
+ }
894
+ }
895
+ if (patched > 0) {
896
+ engine.history._save?.();
897
+ engine.eventLog.append("reasoning_content_backfilled", {
898
+ count: patched,
899
+ reason: "v0.6.3.1 migration on resume",
900
+ });
901
+ }
902
+ }
903
+ } catch { /* never let resume break on this */ }
904
+
796
905
  // Restore project directory from saved state
797
906
  if (data.projectDir) {
798
907
  if (fs.existsSync(data.projectDir)) {
@@ -905,6 +1014,7 @@ export class AgentEngine {
905
1014
  skillIndex: this._skillLoader.formatForContext(this.currentPhase),
906
1015
  pipelineState,
907
1016
  workspaceState: this._buildWorkspaceState(),
1017
+ projectMemory: this._readProjectMemory(),
908
1018
  });
909
1019
  const tools = this.toolRegistry.schemasOpenai();
910
1020
 
@@ -933,6 +1043,19 @@ export class AgentEngine {
933
1043
 
934
1044
  try {
935
1045
  let collectedText = "";
1046
+ // v0.7.0 L (#76): Anthropic-only — accumulator for the
1047
+ // signature_delta blob that proves the thinking content came
1048
+ // from Anthropic's model. Required alongside thinking text on
1049
+ // multi-turn replay.
1050
+ let collectedReasoningSignature = "";
1051
+ // v0.6.3: hybrid reasoning models (GLM-5.1, DeepSeek v4, MiMo v2.5,
1052
+ // Qwen3, ...) stream `delta.reasoning_content` separately from
1053
+ // `delta.content`. DeepSeek's strict API requires this field to be
1054
+ // round-tripped on subsequent assistant messages or it rejects the
1055
+ // request with "reasoning_content in the thinking mode must be passed
1056
+ // back". Even providers that don't enforce this (SiliconFlow) still
1057
+ // benefit from preservation — without it, prior reasoning is wasted.
1058
+ let collectedReasoning = "";
936
1059
  /** @type {Map<number, {id: string, name: string, arguments: string}>} */
937
1060
  const toolCallsAcc = new Map();
938
1061
 
@@ -952,6 +1075,22 @@ export class AgentEngine {
952
1075
  collectedText += delta.content;
953
1076
  }
954
1077
 
1078
+ // v0.6.3: capture reasoning_content from the same delta. Emit a
1079
+ // separate event type so the TUI can optionally render thinking
1080
+ // (today it's silently consumed; round-trip is the priority fix).
1081
+ if (delta.reasoning_content) {
1082
+ yield new AgentEvent({ type: "reasoning_delta", text: delta.reasoning_content });
1083
+ collectedReasoning += delta.reasoning_content;
1084
+ }
1085
+
1086
+ // v0.7.0 L (#76): Anthropic-only signature_delta. Carries the
1087
+ // opaque proof-of-thinking blob that strict-mode multi-turn
1088
+ // requires alongside the thinking text. OpenAI-shape providers
1089
+ // never emit this delta; it's a no-op for them.
1090
+ if (delta.reasoning_signature) {
1091
+ collectedReasoningSignature += delta.reasoning_signature;
1092
+ }
1093
+
955
1094
  if (delta.tool_calls) {
956
1095
  for (const tcDelta of delta.tool_calls) {
957
1096
  const idx = tcDelta.index;
@@ -968,6 +1107,31 @@ export class AgentEngine {
968
1107
 
969
1108
  // Log the complete assistant message (coalesced, not per-delta)
970
1109
  const assistantMsg = { role: "assistant", content: collectedText || null };
1110
+ // v0.6.3: persist reasoning_content on the assistant message so it
1111
+ // round-trips on the next request. history.addRaw spreads the input,
1112
+ // preserving unknown fields; OpenAI body builder doesn't strip them.
1113
+ //
1114
+ // v0.6.3.1: DeepSeek's strict-mode rule is FIELD CONSISTENCY, not
1115
+ // field content — once any assistant turn in the conversation has
1116
+ // reasoning_content, every subsequent assistant turn must also have
1117
+ // it (empty string OK; missing the field rejects with 400). Hybrid
1118
+ // reasoning models sometimes skip reasoning on trivial follow-through
1119
+ // tool calls, leaving collectedReasoning="". Track at session level:
1120
+ // once we see ANY reasoning, keep setting the field (possibly empty)
1121
+ // for the rest of the session. Providers that don't use the field
1122
+ // ignore it silently.
1123
+ if (collectedReasoning) {
1124
+ assistantMsg.reasoning_content = collectedReasoning;
1125
+ this._sessionUsesReasoning = true;
1126
+ } else if (this._sessionUsesReasoning) {
1127
+ assistantMsg.reasoning_content = "";
1128
+ }
1129
+ // v0.7.0 L (#76): persist Anthropic signature alongside thinking.
1130
+ // Always stored together — if either is missing, _buildAnthropicBody
1131
+ // skips the thinking-block replay (would be rejected as malformed).
1132
+ if (collectedReasoningSignature) {
1133
+ assistantMsg.reasoning_signature = collectedReasoningSignature;
1134
+ }
971
1135
  if (toolCallsAcc.size > 0) {
972
1136
  assistantMsg.tool_calls = Array.from(toolCallsAcc.values()).map((tc) => ({
973
1137
  id: tc.id,
@@ -1024,10 +1188,61 @@ export class AgentEngine {
1024
1188
 
1025
1189
  // Tool execution loop
1026
1190
  for (const tc of toolCallsAcc.values()) {
1027
- let inputData = {};
1028
- try {
1029
- inputData = tc.arguments ? JSON.parse(tc.arguments) : {};
1030
- } catch { /* ignore */ }
1191
+ // v0.6.3.1: tool-argument JSON parsing used to be `try { parse } catch {}`
1192
+ // — silently falling back to {} on any parse failure. E2E #5 GLM
1193
+ // session showed this firing 100+ times: SiliconFlow streaming
1194
+ // truncates GLM-5.1 tool_call arguments by ~1 closing brace
1195
+ // (likely max_tokens cutoff mid-args), the silent fallback shipped
1196
+ // {} to the tool, and the tool returned generic "(empty)" errors
1197
+ // which the agent kept retrying without understanding why.
1198
+ //
1199
+ // Fix: try strict parse, then attempt brace-balance recovery (cheap
1200
+ // — recovers from the common single-brace-truncation case), and if
1201
+ // that fails, surface a structured error to the agent so it can
1202
+ // see what it sent and self-correct.
1203
+ let inputData = null;
1204
+ let argParseError = null;
1205
+ if (tc.arguments) {
1206
+ const recovery = parseToolArgsTolerant(tc.arguments);
1207
+ if (recovery.ok) {
1208
+ inputData = recovery.value;
1209
+ if (recovery.recovered) {
1210
+ this.eventLog.append("tool_args_recovered", {
1211
+ name: tc.name,
1212
+ added_chars: recovery.recovered,
1213
+ original_len: tc.arguments.length,
1214
+ });
1215
+ }
1216
+ } else {
1217
+ argParseError = recovery.error;
1218
+ }
1219
+ } else {
1220
+ inputData = {};
1221
+ }
1222
+
1223
+ // If arguments were unparseable, skip execution and return a tool
1224
+ // result that tells the agent what went wrong. Engine's tool result
1225
+ // loop continues so the rest of the assistant's tool_calls in this
1226
+ // turn still execute.
1227
+ if (argParseError) {
1228
+ const preview = (tc.arguments || "").slice(0, 200);
1229
+ const errMsg =
1230
+ `Tool arguments were malformed JSON for ${tc.name}. ` +
1231
+ `Likely streaming truncation by the model (provider cut tokens mid-output). ` +
1232
+ `Parser error: ${argParseError}. ` +
1233
+ `First 200 chars of what was received: ${preview}${tc.arguments && tc.arguments.length > 200 ? "..." : ""}. ` +
1234
+ `Retry the call with shorter / simpler arguments — the model may have hit max_tokens partway through encoding.`;
1235
+ this.eventLog.append("tool_args_parse_failed", {
1236
+ name: tc.name,
1237
+ error: argParseError,
1238
+ raw_args_len: (tc.arguments || "").length,
1239
+ raw_preview: preview,
1240
+ });
1241
+ yield new AgentEvent({ type: "tool_start", name: tc.name, input: { _parse_error: argParseError } });
1242
+ yield new AgentEvent({ type: "tool_result", name: tc.name, output: errMsg, isError: true });
1243
+ this.history.addRaw({ role: "tool", tool_call_id: tc.id, content: errMsg });
1244
+ continue;
1245
+ }
1031
1246
 
1032
1247
  this.eventLog.append("tool_start", { name: tc.name, input: inputData });
1033
1248
  yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
@@ -1082,10 +1297,31 @@ export class AgentEngine {
1082
1297
  isError: result.isError,
1083
1298
  });
1084
1299
 
1300
+ // v0.6.3 (#74): phase-misfit nudge. Ask the current pipeline whether
1301
+ // this tool call looks like work that belongs to a different phase.
1302
+ // If so, append a `<system-reminder>` tag to the tool result content
1303
+ // (same convention as task-tools and auto-memory reminders). The
1304
+ // agent sees this on its next turn and can self-check whether to
1305
+ // call phase_advance. Only fires for non-error results — failed
1306
+ // tool calls have their own error message and don't need the nudge.
1307
+ let nudgedContent = historyContent;
1308
+ try {
1309
+ const pipelineForPhase = this.pipelines?.[beforePhase];
1310
+ const hint = pipelineForPhase?.phaseMisfitHint?.(tc.name, inputData, result);
1311
+ if (hint && !result.isError) {
1312
+ nudgedContent = `${historyContent}\n\n<system-reminder>\nPhase-misfit detected: ${hint}\n</system-reminder>`;
1313
+ this.eventLog.append("phase_misfit_hint", {
1314
+ phase: beforePhase,
1315
+ tool: tc.name,
1316
+ hint,
1317
+ });
1318
+ }
1319
+ } catch { /* never let the nudge logic break the tool loop */ }
1320
+
1085
1321
  this.history.addRaw({
1086
1322
  role: "tool",
1087
1323
  tool_call_id: tc.id,
1088
- content: historyContent,
1324
+ content: nudgedContent,
1089
1325
  });
1090
1326
 
1091
1327
  // Post-tool-result safety net: check for context pressure RIGHT NOW
@@ -1162,14 +1398,81 @@ export class AgentEngine {
1162
1398
 
1163
1399
  const expected = NEXT_PHASE[this.currentPhase];
1164
1400
  if (!force && nextPhase !== expected) {
1401
+ // v0.7.0 A3: event-log hint stays factual (records what the gate
1402
+ // saw) — the LLM-facing refusal text in phase-advance.js no longer
1403
+ // advertises force:true. Hint kept here for post-mortem audit.
1165
1404
  this.eventLog.append("phase_advance_refused", {
1166
1405
  from: this.currentPhase, to: nextPhase, reason,
1167
- hint: expected ? `expected next phase is '${expected}' — pass force:true to override`
1406
+ hint: expected ? `non-adjacent transition; immediate next phase is '${expected}'`
1168
1407
  : `${this.currentPhase} is the terminal phase`,
1169
1408
  });
1170
1409
  return false;
1171
1410
  }
1172
1411
 
1412
+ // v0.7.0 A5: reconcile per-rule tasks against disk artifacts before
1413
+ // checking exit criteria. Catches the E2E #5 DS pattern (tasks.json
1414
+ // showed 70/70 done while only 56 dirs / 36 with check_*.py existed):
1415
+ // markDone() is fire-and-forget today, so the agent can claim
1416
+ // completion that didn't materialize. Reconcile flips back to
1417
+ // pending if the helper-derived ruleIdsCovered set doesn't include
1418
+ // the task's ruleId. A "force"d advance bypasses reconcile too —
1419
+ // the gate already gives the agent / user that escape.
1420
+ if (!force && this.taskManager && this.workspace) {
1421
+ try {
1422
+ const sa = deriveSkillAuthoringMilestones(this.workspace);
1423
+ const covered = new Set(sa.ruleIdsCovered);
1424
+ const tm = deriveSkillTestingMilestones(this.workspace);
1425
+ const tested = new Set(tm.skillsTested);
1426
+ const r = this.taskManager.reconcileAgainstDisk((task) => {
1427
+ if (task.phase === "skill_authoring") return covered.has(task.ruleId);
1428
+ if (task.phase === "skill_testing") return tested.has(task.ruleId);
1429
+ return true; // unknown phase — leave alone
1430
+ });
1431
+ if (r.flippedBack.length > 0) {
1432
+ this.eventLog.append("tasks_reconciled", {
1433
+ from_phase: this.currentPhase,
1434
+ target_phase: nextPhase,
1435
+ flipped_back: r.flippedBack,
1436
+ count: r.flippedBack.length,
1437
+ inspected: r.reconciled,
1438
+ });
1439
+ }
1440
+ } catch { /* never let reconcile break advance */ }
1441
+ }
1442
+
1443
+ // v0.6.3: HARD-TRACKING GATE — refuse forward advance unless the source
1444
+ // phase's exit criteria are met by engine telemetry. v0.6.1 added the
1445
+ // engineCounts block to phase summaries (observation) but never wired
1446
+ // exitCriteriaMet() into the gate (enforcement). E2E #5 surfaced the
1447
+ // gap: MiMo advanced rule_extraction → skill_authoring with
1448
+ // rulesExtracted=0 in engine telemetry because rule_catalog had been
1449
+ // writing to a stranded post-rename path AND nothing checked the gate.
1450
+ //
1451
+ // Forward-only enforcement: rollbacks (_advancePhase from a later phase
1452
+ // to an earlier one with force:true) are an explicit escape, not a
1453
+ // criteria check — the rolled-from phase doesn't need to be "complete".
1454
+ // force:true also bypasses (matches existing escape pattern: user/agent
1455
+ // explicitly chose to skip).
1456
+ if (!force) {
1457
+ const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
1458
+ const toIdx = PHASE_ORDER.indexOf(nextPhase);
1459
+ const isForward = fromIdx >= 0 && toIdx >= 0 && toIdx > fromIdx;
1460
+ if (isForward) {
1461
+ const fromPipeline = this.pipelines?.[this.currentPhase];
1462
+ let criteriaMet = true;
1463
+ try { criteriaMet = !!fromPipeline?.exitCriteriaMet?.(); } catch { criteriaMet = true; }
1464
+ if (!criteriaMet) {
1465
+ const counts = this._buildEngineCountsBlock(this.currentPhase);
1466
+ this.eventLog.append("phase_advance_refused", {
1467
+ from: this.currentPhase, to: nextPhase, reason,
1468
+ hint: "exit criteria not met by engine telemetry",
1469
+ engineCounts: counts || null,
1470
+ });
1471
+ return false;
1472
+ }
1473
+ }
1474
+ }
1475
+
1173
1476
  // v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
1174
1477
  // of all phases; if target index < current index, this is a rollback
1175
1478
  // (e.g., production_qc → skill_authoring after gates revealed gaps).
@@ -1185,9 +1488,15 @@ export class AgentEngine {
1185
1488
  const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
1186
1489
  const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
1187
1490
  const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
1491
+ // v0.7.0 A2: forced is now `!!force` (honest), not the old
1492
+ // `force && nextPhase !== expected` which masked every adjacent-forward
1493
+ // force in the audit log. E2E #5 had 12/12 force-bypasses but the event
1494
+ // log read 0 forced because every transition was to the immediate next
1495
+ // phase. Truth in audit logs first; refinement (forward-vs-non-adjacent
1496
+ // distinction) lives in the `direction` field.
1188
1497
  const phaseSummary =
1189
1498
  `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
1190
- (force && nextPhase !== expected ? " (forced)" : "") +
1499
+ (force ? " (forced)" : "") +
1191
1500
  (engineCounts ? `\n (engine) ${engineCounts}` : "");
1192
1501
  this._phaseSummaries.push(phaseSummary);
1193
1502
  this.eventLog.append("phase_transition", {
@@ -1197,7 +1506,7 @@ export class AgentEngine {
1197
1506
  direction,
1198
1507
  engineCounts: engineCounts || null,
1199
1508
  possibleMismatch: !!mismatchPrefix,
1200
- forced: force && nextPhase !== expected,
1509
+ forced: !!force,
1201
1510
  });
1202
1511
  const fromPhase = this.currentPhase;
1203
1512
  this.currentPhase = nextPhase;
@@ -1205,6 +1514,18 @@ export class AgentEngine {
1205
1514
  this.workspace.setPhase(this.currentPhase);
1206
1515
  this._createTasksForPhase(this.currentPhase);
1207
1516
 
1517
+ // v0.7.0 N (#94): give the entered pipeline a chance to do
1518
+ // phase-entry setup. Used by finalization to copy the release
1519
+ // template into output/releases/v1/. Other pipelines are no-ops.
1520
+ // Wrapped so a failure here can't trap the phase advance.
1521
+ try { this.pipelines[this.currentPhase]?.onPhaseEnter?.({ fromPhase, workspace: this.workspace }); }
1522
+ catch (e) {
1523
+ this.eventLog.append("phase_enter_hook_failed", {
1524
+ phase: this.currentPhase,
1525
+ error: e?.message || String(e),
1526
+ });
1527
+ }
1528
+
1208
1529
  // v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
1209
1530
  // edge-trigger so that if the agent revisits it and re-flips
1210
1531
  // exit-criteria true, _maybeAutoAdvance will fire correctly. Without
@@ -1298,7 +1619,7 @@ export class AgentEngine {
1298
1619
  const parts = [];
1299
1620
  try {
1300
1621
  switch (fromPhase) {
1301
- case "extraction": {
1622
+ case "rule_extraction": {
1302
1623
  const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
1303
1624
  parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
1304
1625
  parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
@@ -1746,11 +2067,23 @@ export class AgentEngine {
1746
2067
 
1747
2068
  // Auto-continue through pending tasks
1748
2069
  while (this.taskManager.getNextPending()) {
1749
- // Context safety: force compaction if above 70%, or light compaction if history is long
2070
+ // v0.7.0 #93: budget-aware compact threshold. The old
2071
+ // `messages.length > 15` was message-count-based and frozen
2072
+ // from when KC ran on smaller contexts. With 200K+ budgets it
2073
+ // fired on every iteration of any non-trivial task — E2E #5
2074
+ // GLM saw 76 memory_pressure events and DS saw 46 because
2075
+ // compact pre-empted natural windowing. Replace with token-
2076
+ // budget threshold (default 60% of context, configurable via
2077
+ // KC_COMPACT_THRESHOLD_TOKENS) so compact runs when there's
2078
+ // actual pressure, not just when message count crossed an
2079
+ // ancient heuristic.
1750
2080
  const stats = this.getContextStats();
2081
+ const thresholdTokens = parseInt(
2082
+ process.env.KC_COMPACT_THRESHOLD_TOKENS || "0", 10,
2083
+ ) || Math.round((this.config.kcContextLimit || 200000) * 0.6);
1751
2084
  if (stats.percentage > 70) {
1752
2085
  await this.compact();
1753
- } else if (this.history.messages.length > 15) {
2086
+ } else if (stats.totalTokens > thresholdTokens) {
1754
2087
  await this.compact({ recentCount: 8 });
1755
2088
  }
1756
2089
 
@@ -1919,10 +2252,18 @@ export class AgentEngine {
1919
2252
  continue;
1920
2253
  }
1921
2254
 
1922
- const trackedPromise = entry.promise.then(
1923
- () => ({ taskId: task.id, subId, ok: true }),
1924
- (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
1925
- );
2255
+ // v0.7.0 H1: trackedPromise covers both fulfilled and rejected
2256
+ // paths (second arg). The .catch tail is belt-and-braces in case
2257
+ // the .then callbacks themselves throw without it, a JSON
2258
+ // serialization throw inside the success-arm callback would
2259
+ // surface as UnhandledPromiseRejection and crash strict-mode
2260
+ // Node. We never want a worker error to take the engine down.
2261
+ const trackedPromise = entry.promise
2262
+ .then(
2263
+ () => ({ taskId: task.id, subId, ok: true }),
2264
+ (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
2265
+ )
2266
+ .catch((e) => ({ taskId: task.id, subId, ok: false, error: `tracked-promise threw: ${e?.message || String(e)}` }));
1926
2267
  inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
1927
2268
  }
1928
2269
  };
@@ -1937,7 +2278,15 @@ export class AgentEngine {
1937
2278
 
1938
2279
  if (inFlight.size === 0) break;
1939
2280
 
1940
- // Wait for either the next event OR a worker to complete
2281
+ // Wait for either the next event OR a worker to complete.
2282
+ //
2283
+ // v0.7.0 C1 note: losers in Promise.race() keep their .then()
2284
+ // chains active and resolve into garbage objects. That's the
2285
+ // intended JS Promise behavior — rejections are still handled,
2286
+ // memory drops at GC. The audit was overstated; no actual hang
2287
+ // or leak. Each loop iteration rebuilds the race from current
2288
+ // inFlight.values() so stale promises from prior iterations
2289
+ // are naturally re-observed (they've already resolved by then).
1941
2290
  const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
1942
2291
  const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
1943
2292
  const winner = await Promise.race([