npm - kc-beta - Versions diffs - 0.6.1 → 0.7.0 - Mend

kc-beta 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/LICENSE +81 -0
package/LICENSE-COMMERCIAL.md +125 -0
package/README.md +21 -3
package/package.json +14 -5
package/src/agent/context-window.js +9 -12
package/src/agent/context.js +14 -1
package/src/agent/document-parser.js +169 -0
package/src/agent/engine.js +499 -20
package/src/agent/history/event-history.js +222 -0
package/src/agent/llm-client.js +55 -0
package/src/agent/message-utils.js +63 -0
package/src/agent/pipelines/_milestone-derive.js +511 -0
package/src/agent/pipelines/base.js +21 -0
package/src/agent/pipelines/distillation.js +28 -15
package/src/agent/pipelines/extraction.js +103 -36
package/src/agent/pipelines/finalization.js +178 -11
package/src/agent/pipelines/index.js +6 -1
package/src/agent/pipelines/initializer.js +74 -8
package/src/agent/pipelines/production-qc.js +31 -44
package/src/agent/pipelines/skill-authoring.js +152 -80
package/src/agent/pipelines/skill-testing.js +67 -23
package/src/agent/retry.js +10 -2
package/src/agent/scheduler.js +14 -2
package/src/agent/session-state.js +35 -2
package/src/agent/skill-loader.js +13 -7
package/src/agent/skill-validator.js +163 -0
package/src/agent/task-manager.js +61 -5
package/src/agent/tools/_workflow-result-schema.js +249 -0
package/src/agent/tools/document-chunk.js +21 -9
package/src/agent/tools/phase-advance.js +52 -6
package/src/agent/tools/release.js +51 -9
package/src/agent/tools/rule-catalog.js +11 -1
package/src/agent/tools/workflow-run.js +9 -4
package/src/agent/tools/workspace-file.js +32 -0
package/src/agent/workspace.js +61 -0
package/src/cli/components.js +64 -14
package/src/cli/index.js +62 -3
package/src/cli/meme.js +26 -25
package/src/config.js +65 -22
package/src/model-tiers.json +48 -0
package/src/providers.js +87 -0
package/template/release/v1/README.md.tmpl +108 -0
package/template/release/v1/catalog.json.tmpl +4 -0
package/template/release/v1/kc_runtime/__init__.py +11 -0
package/template/release/v1/kc_runtime/confidence.py +63 -0
package/template/release/v1/kc_runtime/doc_parser.py +127 -0
package/template/release/v1/manifest.json.tmpl +11 -0
package/template/release/v1/render_dashboard.py +117 -0
package/template/release/v1/run.py +212 -0
package/template/release/v1/serve.sh +17 -0
package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
package/template/skills/en/skill-creator/SKILL.md +1 -1
package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
package/template/skills/zh/skill-creator/SKILL.md +1 -1

package/src/agent/engine.js CHANGED Viewed

@@ -1,8 +1,13 @@
 import fs from "node:fs";
 import path from "node:path";
 import { AgentEvent } from "./events.js";
+import {
+  deriveSkillAuthoringMilestones,
+  deriveSkillTestingMilestones,
+} from "./pipelines/_milestone-derive.js";
 import { ContextAssembler } from "./context.js";
 import { ConversationHistory } from "./history.js";
+import { findSafeSplitPoint } from "./message-utils.js";
 import { Workspace } from "./workspace.js";
 import { normalizeRuleCatalog } from "./rule-catalog-normalize.js";
 import { VersionManager } from "./version-manager.js";
@@ -52,6 +57,45 @@ import { estimateTokens, estimateMessagesTokens } from "./token-counter.js";
 // or kc_max_tokens in the global config.
 const DEFAULT_KC_MAX_TOKENS = 65536;
+/**
+ * v0.6.3.1: Tolerant JSON parse for streamed tool-call arguments. When LLMs
+ * (esp. SiliconFlow GLM-5.1 in E2E #5) hit max_tokens mid-arguments, the
+ * stream returns truncated JSON missing N closing braces or quotes. Strict
+ * parse fails; old code silently dropped to {} which masked the actual issue.
+ *
+ * Strategy:
+ *   1. Try strict JSON.parse (fast path, most calls).
+ *   2. On failure, attempt to balance braces by appending up to
+ *      BRACE_RECOVERY_BUDGET `}` characters. Cheap; recovers the common
+ *      single-brace-truncation case.
+ *   3. If the raw text has an odd number of `"` characters, append one `"`
+ *      followed by at least one `}` and try again.
+ *   4. If still failing, return error so caller surfaces it to the agent.
+ *
+ * @param {string} raw - Accumulated tool-call argument text. A non-string
+ *   yields { ok: false }; the empty string yields { ok: true, value: {} }.
+ * @returns {{ok: true, value: any, recovered?: number} | {ok: false, error: string}}
+ *   `recovered` is present only when a repair succeeded and holds the number
+ *   of characters that were appended. `error` is the message of the FIRST
+ *   (strict) parse failure, not of the failed repair attempts.
+ *
+ * NOTE(review): a successful strict parse returns whatever JSON.parse
+ * produced, so `value` can be a non-object (null, number, array, string).
+ * NOTE(review): brace and quote counts are plain regex counts over the whole
+ * text, so braces inside string values and backslash-escaped quotes are
+ * counted too; in those cases the repair may simply not apply or not parse.
+ * NOTE(review): only `}` and `"` are ever appended; a truncated array (an
+ * unclosed `[`) is never repaired.
+ * NOTE(review): the step-3 brace count is Math.max(1, needed) and is not
+ * capped by BRACE_RECOVERY_BUDGET.
+ */
+const BRACE_RECOVERY_BUDGET = 4;
+function parseToolArgsTolerant(raw) {
+  if (typeof raw !== "string") return { ok: false, error: "arguments not a string" };
+  if (raw === "") return { ok: true, value: {} };
+  // Fast path: strict parse; e0 is kept so the original error can be reported.
+  try { return { ok: true, value: JSON.parse(raw) }; } catch (e0) {
+    // Recovery: balance braces by appending up to BRACE_RECOVERY_BUDGET `}`
+    const opens = (raw.match(/\{/g) || []).length;
+    const closes = (raw.match(/\}/g) || []).length;
+    const needed = opens - closes;
+    if (needed > 0 && needed <= BRACE_RECOVERY_BUDGET) {
+      const padded = raw + "}".repeat(needed);
+      try { return { ok: true, value: JSON.parse(padded), recovered: needed }; } catch (_) { /* fall through */ }
+    }
+    // Last-ditch: try closing an open string then balancing braces.
+    // Truncation can land mid-string-value: ..."description": "abc<EOF>
+    const quotes = (raw.match(/"/g) || []).length;
+    if (quotes % 2 === 1) {
+      const candidate = raw + '"' + "}".repeat(Math.max(1, needed));
+      try { return { ok: true, value: JSON.parse(candidate), recovered: candidate.length - raw.length }; } catch (_) { /* fall through */ }
+    }
+    return { ok: false, error: e0.message || "JSON parse failed" };
+  }
+}
 // Phases where worker LLM tools are available (DISTILL mode).
 // E1: FINALIZATION inherits worker-LLM access so one-last-pass validation
 // runs + dashboard_render + workflow_run stay usable during packaging.
@@ -69,6 +113,19 @@ export const NEXT_PHASE = {
   [Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
 };
+// v0.6.2 J2: explicit linear order so `_advancePhase` can detect rollback
+// direction (target index < current index → rollback). Mirrors NEXT_PHASE
+// but ordered, plus FINALIZATION at the end as the terminal phase.
+export const PHASE_ORDER = [ // array position is what defines forward vs rollback
+  Phase.BOOTSTRAP, // index 0: earliest phase
+  Phase.EXTRACTION,
+  Phase.SKILL_AUTHORING,
+  Phase.SKILL_TESTING,
+  Phase.DISTILLATION,
+  Phase.PRODUCTION_QC,
+  Phase.FINALIZATION, // last entry: terminal phase
+];
 /**
  * The KC Agent conversation engine.
  *
@@ -150,7 +207,7 @@ export class AgentEngine {
     });
     // Session state persistence
-    this.sessionState = new SessionState(this.workspace.cwd, { statePath });
+    this.sessionState = new SessionState(this.workspace.cwd, { statePath, workspace: this.workspace });
     // Task manager (ralph-loop) — sub-agents don't queue further sub-tasks,
     // so they don't get a TaskManager.
@@ -223,6 +280,11 @@ export class AgentEngine {
           historyLen: this.history?.messages?.length ?? 0,
           tasksPending: this.taskManager?.progress?.pending ?? 0,
           tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
+          // v0.6.2 K1: per-component breakdown so heap-analyze.js can
+          // attribute growth (history vs subagents vs event log vs cache).
+          // All values in MB. Failures inside _sampleComponents are caught
+          // and the row gets `componentsErr` instead.
+          components: this._sampleComponents(),
         };
         fs.mkdirSync(logDir, { recursive: true });
         fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
@@ -240,6 +302,89 @@ export class AgentEngine {
     };
   }
+  /**
+   * v0.6.2 K1: per-component heap accounting. Each value is in MB,
+   * rounded to the nearest whole number (so anything below 0.5 MB reports
+   * 0). The whole function is wrapped in a single try/catch by the
+   * caller; failures are silently dropped to keep the sampler diagnostic
+   * (never load-bearing). Each section below also has its own try/catch,
+   * so a component that cannot be read keeps its initial 0 without
+   * affecting the other fields.
+   *
+   * Components measured (by source):
+   *  - history: in-memory `this.history.messages` content sizes. String
+   *    content is counted by its `.length`; array content is summed part by
+   *    part (first of text / content / input that is set); any other
+   *    object content is counted by the length of its JSON.stringify form.
+   *  - eventLog: disk size of `logs/events.jsonl`
+   *  - toolResults: disk size of `logs/tool_results/` (offloaded tool
+   *    output, summed top-level files only — the dir is one level deep)
+   *  - subagents: disk size of `sub_agents/` (one level — each subagent
+   *    has its own directory tree but we just want the order of magnitude)
+   *  - bundleCache: disk size of `cache/bundles/` (top-level files only)
+   *
+   * All paths are resolved against `this.workspace.cwd`; when that is
+   * missing the all-zero object is returned immediately.
+   *
+   * @returns {{historyMB: number, eventLogMB: number, toolResultsMB: number,
+   *   subagentsMB: number, bundleCacheMB: number}}
+   */
+  _sampleComponents() {
+    const out = { historyMB: 0, eventLogMB: 0, toolResultsMB: 0, subagentsMB: 0, bundleCacheMB: 0 };
+    const cwd = this.workspace?.cwd;
+    if (!cwd) return out; // no workspace directory: report all zeros
+    // history: walk messages, sum content string lengths (UTF-16 → bytes
+    // approx 2× length; we conservatively count length itself since most
+    // content is ASCII-heavy JSON tool output)
+    try {
+      const msgs = this.history?.messages || [];
+      let bytes = 0; // really a count of string length units, used as a byte estimate
+      for (const m of msgs) {
+        const c = m?.content;
+        if (typeof c === "string") bytes += c.length;
+        else if (Array.isArray(c)) {
+          for (const part of c) {
+            if (typeof part === "string") bytes += part.length;
+            else if (part?.text) bytes += String(part.text).length;
+            else if (part?.content) bytes += String(part.content).length;
+            else if (part?.input) bytes += JSON.stringify(part.input).length;
+          }
+        } else if (c && typeof c === "object") {
+          bytes += JSON.stringify(c).length;
+        }
+      }
+      out.historyMB = Math.round(bytes / 1024 / 1024);
+    } catch { /* skip */ }
+    // events.jsonl — single file size
+    try {
+      const p = path.join(cwd, "logs", "events.jsonl");
+      out.eventLogMB = Math.round(fs.statSync(p).size / 1024 / 1024);
+    } catch { /* skip */ }
+    // logs/tool_results/ — sum file sizes one level deep (it's flat)
+    try {
+      const dir = path.join(cwd, "logs", "tool_results");
+      let total = 0;
+      for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
+        if (e.isFile()) {
+          try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
+        }
+      }
+      out.toolResultsMB = Math.round(total / 1024 / 1024);
+    } catch { /* skip */ }
+    // sub_agents/ — sum top-level entries (each is a dir, statSync returns
+    // dir-block size, not contents — that's fine for an order-of-magnitude
+    // signal; recursive walk would be too expensive for the sampler).
+    // Unlike the other two directory scans there is no isFile() filter here.
+    try {
+      const dir = path.join(cwd, "sub_agents");
+      let total = 0;
+      for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
+        try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
+      }
+      out.subagentsMB = Math.round(total / 1024 / 1024);
+    } catch { /* skip */ }
+    // cache/bundles/ — same top-level-files-only sum as logs/tool_results/
+    try {
+      const dir = path.join(cwd, "cache", "bundles");
+      let total = 0;
+      for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
+        if (e.isFile()) {
+          try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
+        }
+      }
+      out.bundleCacheMB = Math.round(total / 1024 / 1024);
+    } catch { /* skip */ }
+    return out;
+  }
   /** Stop background diagnostics. Call on graceful shutdown. */
   stop() {
     try { this._heapSamplerStop?.(); } catch { /* ignore */ }
@@ -280,6 +425,14 @@ export class AgentEngine {
         new PhaseAdvanceTool(
           (to, reason, opts) => this._advancePhase(to, reason, opts),
           () => this.currentPhase, // H1: tool reads phase BEFORE its own call
+          // v0.6.2 J1: surface running subagents so the tool can refuse
+          // advance until the agent explicitly acknowledges them.
+          () => {
+            try {
+              const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
+              return agentTool?.getRunningTaskIds?.() || [];
+            } catch { return []; }
+          },
         ),
         new DocumentParseTool(this.workspace, {
           mineruApiUrl: this.config.mineruApiUrl,
@@ -353,6 +506,27 @@ export class AgentEngine {
     return "";
   }
+  /**
+   * v0.7.0 B3: Read rules/PATTERNS.md (project memory) for surfacing in
+   * the system prompt. Only loaded for phases where the agent owns
+   * decomposition decisions (skill_authoring + skill_testing — the two
+   * phases the work-decomposition skill operates in). Capped at ~5 KB
+   * so it stays trivial token-wise; if the file is larger, we truncate
+   * to the first 5 KB and append a "...truncated" marker so the agent
+   * knows to prune.
+   *
+   * The cap is applied to the decoded string's `.length`, so "5 KB" means
+   * 5120 string length units rather than 5120 bytes on disk.
+   * NOTE(review): slice() cuts at a fixed index, so the cut could fall in
+   * the middle of a multi-unit character — confirm whether that matters.
+   *
+   * @returns {string|null} The file text (possibly truncated with the
+   *   marker appended), or null when the current phase is not one of the
+   *   two listed above, the file does not exist, or reading it throws.
+   */
+  _readProjectMemory() {
+    if (!["skill_authoring", "skill_testing"].includes(this.currentPhase)) return null;
+    const p = path.join(this.workspace.cwd, "rules", "PATTERNS.md");
+    try {
+      if (!fs.existsSync(p)) return null;
+      const raw = fs.readFileSync(p, "utf-8");
+      const CAP = 5 * 1024; // compared against raw.length below
+      if (raw.length <= CAP) return raw; // small enough: return the file unchanged
+      return raw.slice(0, CAP) + "\n\n…truncated at 5 KB — prune the least-actionable entries (work-decomposition skill: Sizing).";
+    } catch { return null; }
+  }
   /**
    * Build the workspace/project directory state string for the system prompt.
    */
@@ -392,6 +566,7 @@ export class AgentEngine {
       skillIndex: this._skillLoader.formatForContext(this.currentPhase),
       pipelineState: this.pipelines[this.currentPhase]?.describeState?.() || null,
       workspaceState: this._buildWorkspaceState(),
+      projectMemory: this._readProjectMemory(),
     });
     const systemTokens = estimateTokens(systemPrompt);
     const messageTokens = estimateMessagesTokens(this.history.messages);
@@ -550,8 +725,18 @@ export class AgentEngine {
   async compact({ recentCount = 20 } = {}) {
     if (this.history.messages.length <= recentCount) return null;
-    const olderMessages = this.history.messages.slice(0, -recentCount);
-    const recentMessages = this.history.messages.slice(-recentCount);
+    // v0.6.3.1: tool-pair atomicity. Naive slice(-recentCount) can land on
+    // a tool message (whose assistant_with_tool_calls is in the older batch
+    // about to be summarized) OR put the split between an assistant with
+    // tool_calls and its tool results. Either creates an orphan that
+    // DeepSeek's strict API rejects with 400. Walk the split point forward
+    // until BOTH (recent[0] isn't tool) AND (older[-1] isn't
+    // assistant_with_tool_calls).
+    const desiredSplit = this.history.messages.length - recentCount;
+    const splitPoint = findSafeSplitPoint(this.history.messages, desiredSplit);
+    const olderMessages = this.history.messages.slice(0, splitPoint);
+    const recentMessages = this.history.messages.slice(splitPoint);
+    if (olderMessages.length === 0) return null; // nothing safely summarizable
     const CHUNK_BUDGET = 30000; // tokens per summarization request
     const chunks = this._chunkMessages(olderMessages, CHUNK_BUDGET);
@@ -684,6 +869,39 @@ export class AgentEngine {
       engine._registerToolsForPhase(engine.currentPhase);
       engine.workspace.setPhase(engine.currentPhase);
+      // v0.6.3.1: detect whether prior turns of this session used reasoning
+      // mode, so the field-consistency invariant continues across resume.
+      // Without this, the first assistant turn after resume might lack
+      // reasoning_content even though earlier turns have it, and DeepSeek's
+      // strict-mode rejects with 400.
+      try {
+        const msgs = engine.history?.messages || [];
+        engine._sessionUsesReasoning = msgs.some(
+          (m) => m?.role === "assistant" && "reasoning_content" in m,
+        );
+        // One-shot migration: backfill empty reasoning_content on assistant
+        // messages that are missing the field. Pre-v0.6.3.1 sessions could
+        // accumulate "holes" (turns where the model skipped reasoning) that
+        // poison the conversation for resume. A single empty string on each
+        // hole is enough to satisfy DeepSeek's field-consistency rule.
+        if (engine._sessionUsesReasoning) {
+          let patched = 0;
+          for (const m of msgs) {
+            if (m?.role === "assistant" && !("reasoning_content" in m)) {
+              m.reasoning_content = "";
+              patched++;
+            }
+          }
+          if (patched > 0) {
+            engine.history._save?.();
+            engine.eventLog.append("reasoning_content_backfilled", {
+              count: patched,
+              reason: "v0.6.3.1 migration on resume",
+            });
+          }
+        }
+      } catch { /* never let resume break on this */ }
       // Restore project directory from saved state
       if (data.projectDir) {
         if (fs.existsSync(data.projectDir)) {
@@ -796,6 +1014,7 @@ export class AgentEngine {
       skillIndex: this._skillLoader.formatForContext(this.currentPhase),
       pipelineState,
       workspaceState: this._buildWorkspaceState(),
+      projectMemory: this._readProjectMemory(),
     });
     const tools = this.toolRegistry.schemasOpenai();
@@ -824,6 +1043,19 @@ export class AgentEngine {
       try {
         let collectedText = "";
+        // v0.7.0 L (#76): Anthropic-only — accumulator for the
+        // signature_delta blob that proves the thinking content came
+        // from Anthropic's model. Required alongside thinking text on
+        // multi-turn replay.
+        let collectedReasoningSignature = "";
+        // v0.6.3: hybrid reasoning models (GLM-5.1, DeepSeek v4, MiMo v2.5,
+        // Qwen3, ...) stream `delta.reasoning_content` separately from
+        // `delta.content`. DeepSeek's strict API requires this field to be
+        // round-tripped on subsequent assistant messages or it rejects the
+        // request with "reasoning_content in the thinking mode must be passed
+        // back". Even providers that don't enforce this (SiliconFlow) still
+        // benefit from preservation — without it, prior reasoning is wasted.
+        let collectedReasoning = "";
         /** @type {Map<number, {id: string, name: string, arguments: string}>} */
         const toolCallsAcc = new Map();
@@ -843,6 +1075,22 @@ export class AgentEngine {
             collectedText += delta.content;
           }
+          // v0.6.3: capture reasoning_content from the same delta. Emit a
+          // separate event type so the TUI can optionally render thinking
+          // (today it's silently consumed; round-trip is the priority fix).
+          if (delta.reasoning_content) {
+            yield new AgentEvent({ type: "reasoning_delta", text: delta.reasoning_content });
+            collectedReasoning += delta.reasoning_content;
+          }
+          // v0.7.0 L (#76): Anthropic-only signature_delta. Carries the
+          // opaque proof-of-thinking blob that strict-mode multi-turn
+          // requires alongside the thinking text. OpenAI-shape providers
+          // never emit this delta; it's a no-op for them.
+          if (delta.reasoning_signature) {
+            collectedReasoningSignature += delta.reasoning_signature;
+          }
           if (delta.tool_calls) {
             for (const tcDelta of delta.tool_calls) {
               const idx = tcDelta.index;
@@ -859,6 +1107,31 @@ export class AgentEngine {
         // Log the complete assistant message (coalesced, not per-delta)
         const assistantMsg = { role: "assistant", content: collectedText || null };
+        // v0.6.3: persist reasoning_content on the assistant message so it
+        // round-trips on the next request. history.addRaw spreads the input,
+        // preserving unknown fields; OpenAI body builder doesn't strip them.
+        //
+        // v0.6.3.1: DeepSeek's strict-mode rule is FIELD CONSISTENCY, not
+        // field content — once any assistant turn in the conversation has
+        // reasoning_content, every subsequent assistant turn must also have
+        // it (empty string OK; missing the field rejects with 400). Hybrid
+        // reasoning models sometimes skip reasoning on trivial follow-through
+        // tool calls, leaving collectedReasoning="". Track at session level:
+        // once we see ANY reasoning, keep setting the field (possibly empty)
+        // for the rest of the session. Providers that don't use the field
+        // ignore it silently.
+        if (collectedReasoning) {
+          assistantMsg.reasoning_content = collectedReasoning;
+          this._sessionUsesReasoning = true;
+        } else if (this._sessionUsesReasoning) {
+          assistantMsg.reasoning_content = "";
+        }
+        // v0.7.0 L (#76): persist Anthropic signature alongside thinking.
+        // Always stored together — if either is missing, _buildAnthropicBody
+        // skips the thinking-block replay (would be rejected as malformed).
+        if (collectedReasoningSignature) {
+          assistantMsg.reasoning_signature = collectedReasoningSignature;
+        }
         if (toolCallsAcc.size > 0) {
           assistantMsg.tool_calls = Array.from(toolCallsAcc.values()).map((tc) => ({
             id: tc.id,
@@ -915,10 +1188,61 @@ export class AgentEngine {
         // Tool execution loop
         for (const tc of toolCallsAcc.values()) {
-          let inputData = {};
-          try {
-            inputData = tc.arguments ? JSON.parse(tc.arguments) : {};
-          } catch { /* ignore */ }
+          // v0.6.3.1: tool-argument JSON parsing used to be `try { parse } catch {}`
+          // — silently falling back to {} on any parse failure. E2E #5 GLM
+          // session showed this firing 100+ times: SiliconFlow streaming
+          // truncates GLM-5.1 tool_call arguments by ~1 closing brace
+          // (likely max_tokens cutoff mid-args), the silent fallback shipped
+          // {} to the tool, and the tool returned generic "(empty)" errors
+          // which the agent kept retrying without understanding why.
+          //
+          // Fix: try strict parse, then attempt brace-balance recovery (cheap
+          // — recovers from the common single-brace-truncation case), and if
+          // that fails, surface a structured error to the agent so it can
+          // see what it sent and self-correct.
+          let inputData = null;
+          let argParseError = null;
+          if (tc.arguments) {
+            const recovery = parseToolArgsTolerant(tc.arguments);
+            if (recovery.ok) {
+              inputData = recovery.value;
+              if (recovery.recovered) {
+                this.eventLog.append("tool_args_recovered", {
+                  name: tc.name,
+                  added_chars: recovery.recovered,
+                  original_len: tc.arguments.length,
+                });
+              }
+            } else {
+              argParseError = recovery.error;
+            }
+          } else {
+            inputData = {};
+          }
+          // If arguments were unparseable, skip execution and return a tool
+          // result that tells the agent what went wrong. Engine's tool result
+          // loop continues so the rest of the assistant's tool_calls in this
+          // turn still execute.
+          if (argParseError) {
+            const preview = (tc.arguments || "").slice(0, 200);
+            const errMsg =
+              `Tool arguments were malformed JSON for ${tc.name}. ` +
+              `Likely streaming truncation by the model (provider cut tokens mid-output). ` +
+              `Parser error: ${argParseError}. ` +
+              `First 200 chars of what was received: ${preview}${tc.arguments && tc.arguments.length > 200 ? "..." : ""}. ` +
+              `Retry the call with shorter / simpler arguments — the model may have hit max_tokens partway through encoding.`;
+            this.eventLog.append("tool_args_parse_failed", {
+              name: tc.name,
+              error: argParseError,
+              raw_args_len: (tc.arguments || "").length,
+              raw_preview: preview,
+            });
+            yield new AgentEvent({ type: "tool_start", name: tc.name, input: { _parse_error: argParseError } });
+            yield new AgentEvent({ type: "tool_result", name: tc.name, output: errMsg, isError: true });
+            this.history.addRaw({ role: "tool", tool_call_id: tc.id, content: errMsg });
+            continue;
+          }
           this.eventLog.append("tool_start", { name: tc.name, input: inputData });
           yield new AgentEvent({ type: "tool_start", name: tc.name, input: inputData });
@@ -973,10 +1297,31 @@ export class AgentEngine {
             isError: result.isError,
           });
+          // v0.6.3 (#74): phase-misfit nudge. Ask the current pipeline whether
+          // this tool call looks like work that belongs to a different phase.
+          // If so, append a `<system-reminder>` tag to the tool result content
+          // (same convention as task-tools and auto-memory reminders). The
+          // agent sees this on its next turn and can self-check whether to
+          // call phase_advance. Only fires for non-error results — failed
+          // tool calls have their own error message and don't need the nudge.
+          let nudgedContent = historyContent;
+          try {
+            const pipelineForPhase = this.pipelines?.[beforePhase];
+            const hint = pipelineForPhase?.phaseMisfitHint?.(tc.name, inputData, result);
+            if (hint && !result.isError) {
+              nudgedContent = `${historyContent}\n\n<system-reminder>\nPhase-misfit detected: ${hint}\n</system-reminder>`;
+              this.eventLog.append("phase_misfit_hint", {
+                phase: beforePhase,
+                tool: tc.name,
+                hint,
+              });
+            }
+          } catch { /* never let the nudge logic break the tool loop */ }
           this.history.addRaw({
             role: "tool",
             tool_call_id: tc.id,
-            content: historyContent,
+            content: nudgedContent,
           });
           // Post-tool-result safety net: check for context pressure RIGHT NOW
@@ -1053,38 +1398,144 @@ export class AgentEngine {
     const expected = NEXT_PHASE[this.currentPhase];
     if (!force && nextPhase !== expected) {
+      // v0.7.0 A3: event-log hint stays factual (records what the gate
+      // saw) — the LLM-facing refusal text in phase-advance.js no longer
+      // advertises force:true. Hint kept here for post-mortem audit.
       this.eventLog.append("phase_advance_refused", {
         from: this.currentPhase, to: nextPhase, reason,
-        hint: expected ? `expected next phase is '${expected}' — pass force:true to override`
+        hint: expected ? `non-adjacent transition; immediate next phase is '${expected}'`
                        : `${this.currentPhase} is the terminal phase`,
       });
       return false;
     }
+    // v0.7.0 A5: reconcile per-rule tasks against disk artifacts before
+    // checking exit criteria. Catches the E2E #5 DS pattern (tasks.json
+    // showed 70/70 done while only 56 dirs / 36 with check_*.py existed):
+    // markDone() is fire-and-forget today, so the agent can claim
+    // completion that didn't materialize. Reconcile flips back to
+    // pending if the helper-derived ruleIdsCovered set doesn't include
+    // the task's ruleId. A "force"d advance bypasses reconcile too —
+    // the gate already gives the agent / user that escape.
+    if (!force && this.taskManager && this.workspace) {
+      try {
+        const sa = deriveSkillAuthoringMilestones(this.workspace);
+        const covered = new Set(sa.ruleIdsCovered);
+        const tm = deriveSkillTestingMilestones(this.workspace);
+        const tested = new Set(tm.skillsTested);
+        const r = this.taskManager.reconcileAgainstDisk((task) => {
+          if (task.phase === "skill_authoring") return covered.has(task.ruleId);
+          if (task.phase === "skill_testing") return tested.has(task.ruleId);
+          return true; // unknown phase — leave alone
+        });
+        if (r.flippedBack.length > 0) {
+          this.eventLog.append("tasks_reconciled", {
+            from_phase: this.currentPhase,
+            target_phase: nextPhase,
+            flipped_back: r.flippedBack,
+            count: r.flippedBack.length,
+            inspected: r.reconciled,
+          });
+        }
+      } catch { /* never let reconcile break advance */ }
+    }
+    // v0.6.3: HARD-TRACKING GATE — refuse forward advance unless the source
+    // phase's exit criteria are met by engine telemetry. v0.6.1 added the
+    // engineCounts block to phase summaries (observation) but never wired
+    // exitCriteriaMet() into the gate (enforcement). E2E #5 surfaced the
+    // gap: MiMo advanced rule_extraction → skill_authoring with
+    // rulesExtracted=0 in engine telemetry because rule_catalog had been
+    // writing to a stranded post-rename path AND nothing checked the gate.
+    //
+    // Forward-only enforcement: rollbacks (_advancePhase from a later phase
+    // to an earlier one with force:true) are an explicit escape, not a
+    // criteria check — the rolled-from phase doesn't need to be "complete".
+    // force:true also bypasses (matches existing escape pattern: user/agent
+    // explicitly chose to skip).
+    if (!force) {
+      const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
+      const toIdx = PHASE_ORDER.indexOf(nextPhase);
+      const isForward = fromIdx >= 0 && toIdx >= 0 && toIdx > fromIdx;
+      if (isForward) {
+        const fromPipeline = this.pipelines?.[this.currentPhase];
+        let criteriaMet = true;
+        try { criteriaMet = !!fromPipeline?.exitCriteriaMet?.(); } catch { criteriaMet = true; }
+        if (!criteriaMet) {
+          const counts = this._buildEngineCountsBlock(this.currentPhase);
+          this.eventLog.append("phase_advance_refused", {
+            from: this.currentPhase, to: nextPhase, reason,
+            hint: "exit criteria not met by engine telemetry",
+            engineCounts: counts || null,
+          });
+          return false;
+        }
+      }
+    }
+    // v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
+    // of all phases; if target index < current index, this is a rollback
+    // (e.g., production_qc → skill_authoring after gates revealed gaps).
+    const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
+    const toIdx = PHASE_ORDER.indexOf(nextPhase);
+    const direction = (fromIdx >= 0 && toIdx >= 0 && toIdx < fromIdx)
+      ? "rollback" : "forward";
     // v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
     // detection so the LLM-narrated reason can be cross-checked against
     // ground-truth telemetry. Phase summaries become diagnostic, not just
     // narrative.
     const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
     const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
+    const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
+    // v0.7.0 A2: forced is now `!!force` (honest), not the old
+    // `force && nextPhase !== expected` which masked every adjacent-forward
+    // force in the audit log. E2E #5 had 12/12 force-bypasses but the event
+    // log read 0 forced because every transition was to the immediate next
+    // phase. Truth in audit logs first; refinement (forward-vs-non-adjacent
+    // distinction) lives in the `direction` field.
     const phaseSummary =
-      `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
-      (force && nextPhase !== expected ? " (forced)" : "") +
+      `[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
+      (force ? " (forced)" : "") +
       (engineCounts ? `\n  (engine) ${engineCounts}` : "");
     this._phaseSummaries.push(phaseSummary);
     this.eventLog.append("phase_transition", {
       from: this.currentPhase,
       to: nextPhase,
       reason,
+      direction,
       engineCounts: engineCounts || null,
       possibleMismatch: !!mismatchPrefix,
-      forced: force && nextPhase !== expected,
+      forced: !!force,
     });
     const fromPhase = this.currentPhase;
     this.currentPhase = nextPhase;
     this._registerToolsForPhase(this.currentPhase);
     this.workspace.setPhase(this.currentPhase);
     this._createTasksForPhase(this.currentPhase);
+    // v0.7.0 N (#94): give the entered pipeline a chance to do
+    // phase-entry setup. Used by finalization to copy the release
+    // template into output/releases/v1/. Other pipelines are no-ops.
+    // Wrapped so a failure here can't trap the phase advance.
+    try { this.pipelines[this.currentPhase]?.onPhaseEnter?.({ fromPhase, workspace: this.workspace }); }
+    catch (e) {
+      this.eventLog.append("phase_enter_hook_failed", {
+        phase: this.currentPhase,
+        error: e?.message || String(e),
+      });
+    }
+    // v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
+    // edge-trigger so that if the agent revisits it and re-flips
+    // exit-criteria true, _maybeAutoAdvance will fire correctly. Without
+    // this, the auto-advance edge trigger stays latched true and the
+    // moment the agent returns to fromPhase the engine immediately
+    // bounces them back out — defeating the rollback.
+    if (direction === "rollback" && this._lastReady) {
+      this._lastReady[fromPhase] = false;
+    }
     this.saveState();
     // B8: Soft signal — surface any sub-agents left running from the prior
@@ -1168,7 +1619,7 @@ export class AgentEngine {
     const parts = [];
     try {
       switch (fromPhase) {
-        case "extraction": {
+        case "rule_extraction": {
           const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
           parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
           parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
@@ -1616,11 +2067,23 @@ export class AgentEngine {
     // Auto-continue through pending tasks
     while (this.taskManager.getNextPending()) {
-      // Context safety: force compaction if above 70%, or light compaction if history is long
+      // v0.7.0 #93: budget-aware compact threshold. The old
+      // `messages.length > 15` check was message-count-based and frozen
+      // from when KC ran on smaller contexts. With 200K+ budgets it
+      // fired on every iteration of any non-trivial task — E2E #5
+      // GLM saw 76 memory_pressure events and DS saw 46 because
+      // compact pre-empted natural windowing. It is replaced with a
+      // token-budget threshold (default 60% of the context limit,
+      // configurable via KC_COMPACT_THRESHOLD_TOKENS) so the light
+      // compact runs when there's actual pressure, not just when the
+      // message count crossed an ancient heuristic. Above 70% usage a
+      // full compact is still forced.
       const stats = this.getContextStats();
+      const thresholdTokens = parseInt(
+        process.env.KC_COMPACT_THRESHOLD_TOKENS || "0", 10,
+      ) || Math.round((this.config.kcContextLimit || 200000) * 0.6);
       if (stats.percentage > 70) {
         await this.compact();
-      } else if (this.history.messages.length > 15) {
+      } else if (stats.totalTokens > thresholdTokens) {
         await this.compact({ recentCount: 8 });
       }
@@ -1789,10 +2252,18 @@ export class AgentEngine {
           continue;
         }
-        const trackedPromise = entry.promise.then(
-          () => ({ taskId: task.id, subId, ok: true }),
-          (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
-        );
+        // v0.7.0 H1: the two-argument .then() already maps both the
+        // fulfilled and the rejected outcome to a result object. The
+        // .catch tail is belt-and-braces in case the .then callbacks
+        // themselves throw — without it, e.g. a JSON serialization
+        // throw inside the success-arm callback would surface as an
+        // unhandled promise rejection, which terminates Node under
+        // --unhandled-rejections=strict (and under the default "throw"
+        // mode since Node 15). We never want a worker error to take
+        // the engine down.
+        const trackedPromise = entry.promise
+          .then(
+            () => ({ taskId: task.id, subId, ok: true }),
+            (e) => ({ taskId: task.id, subId, ok: false, error: e?.message || String(e) }),
+          )
+          .catch((e) => ({ taskId: task.id, subId, ok: false, error: `tracked-promise threw: ${e?.message || String(e)}` }));
         inFlight.set(subId, { task, workerLabel, promise: trackedPromise });
       }
     };
@@ -1807,7 +2278,15 @@ export class AgentEngine {
       if (inFlight.size === 0) break;
-      // Wait for either the next event OR a worker to complete
+      // Wait for either the next event OR a worker to complete.
+      //
+      // v0.7.0 C1 note: losers in Promise.race() keep their .then()
+      // chains active and resolve into garbage objects. That's the
+      // intended JS Promise behavior — rejections are still handled,
+      // memory drops at GC. The audit was overstated; no actual hang
+      // or leak. Each loop iteration rebuilds the race from current
+      // inFlight.values() so stale promises from prior iterations
+      // are naturally re-observed (they've already resolved by then).
       const workerCompletion = Promise.race([...inFlight.values()].map((v) => v.promise));
       const eventArrival = new Promise((resolve) => { notify = () => resolve("event"); });
       const winner = await Promise.race([