npm - kc-beta - Versions diffs - 0.5.6 → 0.6.0 - Mend

kc-beta 0.5.6 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/QUICKSTART.md +17 -4
package/README.md +58 -11
package/bin/kc-beta.js +35 -1
package/package.json +1 -1
package/src/agent/bundle-tree.js +553 -0
package/src/agent/context.js +40 -1
package/src/agent/engine.js +644 -28
package/src/agent/llm-client.js +67 -18
package/src/agent/pipelines/finalization.js +186 -0
package/src/agent/pipelines/index.js +8 -0
package/src/agent/pipelines/initializer.js +40 -0
package/src/agent/pipelines/skill-authoring.js +100 -6
package/src/agent/skill-loader.js +54 -4
package/src/agent/task-manager.js +66 -3
package/src/agent/tools/agent-tool.js +283 -35
package/src/agent/tools/bundle-search.js +146 -0
package/src/agent/tools/document-chunk.js +246 -0
package/src/agent/tools/document-classify.js +311 -0
package/src/agent/tools/document-parse.js +8 -1
package/src/agent/tools/phase-advance.js +30 -7
package/src/agent/tools/registry.js +10 -0
package/src/agent/tools/rule-catalog.js +17 -3
package/src/agent/tools/sandbox-exec.js +30 -0
package/src/agent/workspace.js +168 -14
package/src/cli/components.js +165 -17
package/src/cli/index.js +166 -19
package/src/cli/meme.js +58 -0
package/src/config.js +39 -2
package/src/providers.js +26 -0
package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0

package/src/cli/index.js CHANGED Viewed

@@ -15,6 +15,7 @@ import {
   HRule,
   InputPrompt,
 } from "./components.js";
+import { MemeOverlay } from "./meme.js"; // F6
 const h = React.createElement;
@@ -30,6 +31,17 @@ const VISIBLE_WINDOW = 50;
 // Older ToolBlocks show header only. Both still persist full output to disk.
 const RECENT_TOOL_WINDOW = 10;
+// B0.3: Hard cap on the React `messages` array. Without this, the array
+// grows forever (setMessages((prev) => [...prev, msg]) via addMessage) —
+// the VISIBLE_WINDOW virtualization hides old entries from render but
+// they still sit in state. Over a 17 h session with 2-4 messages per
+// turn, that's 1000s of entries holding tool-result digest strings and
+// pipeline messages. /compact resets messages to a 1-item summary, so
+// this cap is really a safety net between compacts. On cap hit, drop
+// oldest non-system entries (system messages carry session-level
+// context — pipeline transitions, errors — that users want retained).
+const MAX_RETAINED_MESSAGES = 500;
 /**
  * Main KC Agent CLI App using Ink (React for terminals).
  */
@@ -43,6 +55,7 @@ function App({ engine, config }) {
   const [sessionId, setSessionId] = useState(engine.workspace.sessionId);
   const [phase, setPhase] = useState(engine.currentPhase);
   const [showWelcome, setShowWelcome] = useState(true);
+  const [showMeme, setShowMeme] = useState(false); // F6
   const [spinnerStatus, setSpinnerStatus] = useState(null);
   const [contextTokens, setContextTokens] = useState(0);
   const [contextLimit, setContextLimit] = useState(config.kcContextLimit || 200000);
@@ -63,7 +76,16 @@ function App({ engine, config }) {
   }, []);
   const addMessage = useCallback((msg) => {
-    setMessages((prev) => [...prev, msg]);
+    setMessages((prev) => {
+      if (prev.length < MAX_RETAINED_MESSAGES) return [...prev, msg];
+      // Cap hit: drop the oldest non-system entry. If everything is system
+      // (unlikely but possible), fall back to dropping the very oldest.
+      const dropIdx = prev.findIndex((m) => m.role !== "system");
+      const next = dropIdx >= 0
+        ? [...prev.slice(0, dropIdx), ...prev.slice(dropIdx + 1), msg]
+        : [...prev.slice(1), msg];
+      return next;
+    });
   }, []);
   const runTurn = useCallback(async (text) => {
@@ -76,7 +98,9 @@ function App({ engine, config }) {
     let accumulated = "";
     try {
-      for await (const event of engineRef.current.runTaskLoop(text)) {
+      for await (const event of engineRef.current.runTaskLoop(text, {
+        parallelism: config.effectiveParallelism?.() ?? 1,
+      })) {
         switch (event.type) {
           case "text_delta":
             accumulated += event.text ?? "";
@@ -117,6 +141,13 @@ function App({ engine, config }) {
             });
             setCurrentTool(null);
             setSpinnerStatus("Analyzing results...");
+            // H4: Refresh the CTX indicator after every tool_result. Without
+            // this, contextTokens only updates on turn_complete, which never
+            // fires in long tool-heavy sessions — we observed 908 events with
+            // zero turn_complete in session 6304673afaa0, CTX stuck at 0/131k
+            // for 30+ minutes. getContextStats() is a cheap pure calc over
+            // the history array; safe to call on every tool call.
+            updateContextStats();
             break;
           case "pipeline_event": {
@@ -153,7 +184,10 @@ function App({ engine, config }) {
     // Process queue
     if (queueRef.current.length > 0) {
       const next = queueRef.current.shift();
+      setQueueSize(queueRef.current.length); // F2
       runTurn(next);
+    } else {
+      setQueueSize(0); // F2
     }
   }, [addMessage, updateContextStats]);
@@ -173,6 +207,8 @@ function App({ engine, config }) {
             "  /tasks               Show task progress\n" +
             "  /phase [sub]         advance | status | <name> — manual phase override\n" +
             "  /schedule            Show scheduled ingestion jobs and recent log lines\n" +
+            "  /tools               List all registered tools and which phase gates them\n" +
+            "  /parallelism [N]     Show or set parallel ralph-loop worker count (1-8)\n" +
             "  /clear               Clear conversation history (keep workspace)\n" +
             "  /compact             Summarize older messages to reduce context\n" +
             "  /sessions            List all sessions\n" +
@@ -184,19 +220,90 @@ function App({ engine, config }) {
       case "/status": {
         const stats = engineRef.current.getContextStats();
+        const par = config.effectiveParallelism?.() ?? 1;
+        const parLine = par > 1
+          ? `${par} (verified)`
+          : `${config.parallelismRequested || 1} requested` +
+            (config.parallelismRequested > 1 && !config.parallelismVerified
+              ? ` — clamped to 1 (KC_PARALLELISM_VERIFIED not set; run heap baseline first)`
+              : "");
+        addMessage({
+          role: "system",
+          content:
+            `Session:     ${engineRef.current.workspace.sessionId}\n` +
+            `Phase:       ${engineRef.current.currentPhase.toUpperCase()}\n` +
+            `Model:       ${config.kcModel}\n` +
+            `Provider:    ${config.provider || "unknown"}\n` +
+            `LLM URL:     ${config.llmBaseUrl}\n` +
+            `Project:     ${engineRef.current.workspace.projectDir || "(none)"}\n` +
+            `Workspace:   ${engineRef.current.workspace.cwd}\n` +
+            `Tools:       ${engineRef.current.toolRegistry.size} registered\n` +
+            `History:     ${engineRef.current.history.messages.length} messages\n` +
+            `Context:     ~${stats.totalTokens} tokens (${stats.percentage}% of ${stats.limit})\n` +
+            `Parallelism: ${parLine}`,
+        });
+        return true;
+      }
+      case "/meme":
+        // F6: easter egg. Not in /help.
+        setShowMeme(true);
+        return true;
+      case "/tools": {
+        // F5: list all registered tools + which phase gates them. Reads
+        // from the live toolRegistry so what you see is what the agent
+        // currently has available. Also names the distill-only tools
+        // explicitly so users understand why some tools "come and go"
+        // as phases advance.
+        const reg = engineRef.current.toolRegistry;
+        const names = reg?.names?.() || [];
+        const core = engineRef.current._buildTools?.core?.map((t) => t?.name).filter(Boolean) || [];
+        const distill = engineRef.current._buildTools?.distill?.map((t) => t?.name).filter(Boolean) || [];
+        const phase = engineRef.current.currentPhase.toUpperCase();
+        const lines = [
+          `Tools registered for phase ${phase}: ${names.length}`,
+          "",
+          `Core (always available, ${core.length}):`,
+          ...core.map((n) => `  • ${n}${names.includes(n) ? "" : " [not currently registered]"}`),
+        ];
+        if (distill.length > 0) {
+          lines.push("", `Distill-only (DISTILLATION / PRODUCTION_QC / FINALIZATION, ${distill.length}):`);
+          for (const n of distill) {
+            lines.push(`  • ${n}${names.includes(n) ? "" : " [gated out of this phase]"}`);
+          }
+        }
+        lines.push("", "Tools are not separately installable — they ship with the KC release. To see what each tool does, invoke it or ask the agent.");
+        addMessage({ role: "system", content: lines.join("\n") });
+        return true;
+      }
+      case "/parallelism": {
+        // B3: set parallelism at runtime. Respects the B0.6 guard —
+        // takes effect only if KC_PARALLELISM_VERIFIED is already set.
+        const n = parseInt(arg, 10);
+        if (!Number.isFinite(n) || n < 1) {
+          addMessage({
+            role: "system",
+            content:
+              `Usage: /parallelism <N> (1-8)\n` +
+              `Current: requested=${config.parallelismRequested || 1}, ` +
+              `effective=${config.effectiveParallelism?.() ?? 1}. ` +
+              (config.parallelismVerified
+                ? "Verified — new value takes effect next /run."
+                : "Unverified — clamped to 1. Set KC_PARALLELISM_VERIFIED=1 after a clean 2h heap-baseline run."),
+          });
+          return true;
+        }
+        const clamped = Math.min(Math.max(n, 1), 8);
+        config.parallelismRequested = clamped;
         addMessage({
           role: "system",
           content:
-            `Session:   ${engineRef.current.workspace.sessionId}\n` +
-            `Phase:     ${engineRef.current.currentPhase.toUpperCase()}\n` +
-            `Model:     ${config.kcModel}\n` +
-            `Provider:  ${config.provider || "unknown"}\n` +
-            `LLM URL:   ${config.llmBaseUrl}\n` +
-            `Project:   ${engineRef.current.workspace.projectDir || "(none)"}\n` +
-            `Workspace: ${engineRef.current.workspace.cwd}\n` +
-            `Tools:     ${engineRef.current.toolRegistry.size} registered\n` +
-            `History:   ${engineRef.current.history.messages.length} messages\n` +
-            `Context:   ~${stats.totalTokens} tokens (${stats.percentage}% of ${stats.limit})`,
+            `Parallelism requested=${clamped}. ` +
+            (config.parallelismVerified
+              ? `Effective=${config.effectiveParallelism()} (verified).`
+              : `Effective=1 (verified flag not set — see /status).`),
         });
         return true;
       }
@@ -339,10 +446,25 @@ function App({ engine, config }) {
           } catch (err) {
             addMessage({ role: "system", content: `Compact failed: ${err.message}` });
           } finally {
+            // F8: Spinner-race fix. If a queued task is about to kick off
+            // via runTurn(next), DO NOT clear the streaming/spinner state
+            // here — runTurn's own entry sets streamingRef=true + spinner
+            // immediately, but there's a brief React-render window between
+            // our `setStreaming(false)` and its `setStreaming(true)` where
+            // the TUI paints "no spinner, no streaming" for 1-2 frames.
+            // Over long sessions that looked like a dead TUI when a user
+            // watched the moment /compact auto-chained to the next task.
+            // Order now: IF next task is queued, let runTurn(next) set all
+            // streaming state in one atomic render; we just reset the ref
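+            // flags to avoid the input-is-locked issue. Otherwise do the
+            // full clear (idle-TUI case).
+            // NOTE(review): the queued branch below shifts queueRef without
+            // calling setQueueSize, so the "(N queued)" indicator can stay
+            // stale until the next queue update — confirm and sync if so.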
+            const hasQueuedWork = queueRef.current.length > 0;
             streamingRef.current = false;
-            setStreaming(false);
-            setSpinnerStatus(null);
-            if (queueRef.current.length > 0) {
+            if (!hasQueuedWork) {
+              setStreaming(false);
+              setSpinnerStatus(null);
+            }
+            if (hasQueuedWork) {
               const next = queueRef.current.shift();
               runTurn(next);
             }
@@ -436,8 +558,9 @@ function App({ engine, config }) {
       case "/exit":
       case "/quit":
-        // Save state before exit
+        // Save state + stop diagnostics before exit
         try { engineRef.current.saveState(); } catch { /* ignore */ }
+        try { engineRef.current.stop(); } catch { /* ignore */ }
         exit();
         return true;
@@ -446,6 +569,8 @@ function App({ engine, config }) {
     }
   }, [addMessage, config, exit, updateContextStats]);
+  const [queueSize, setQueueSize] = useState(0); // F2: count for TUI indicator
   const handleSubmit = useCallback((text) => {
     const trimmed = text.trim();
     setInputValue("");
@@ -460,6 +585,11 @@ function App({ engine, config }) {
     if (streamingRef.current) {
       queueRef.current.push(trimmed);
+      setQueueSize(queueRef.current.length); // F2
+      addMessage({
+        role: "system",
+        content: `⏳ Queued (${queueRef.current.length} waiting). Will be sent to KC on next turn boundary.`,
+      });
     } else {
       runTurn(trimmed);
     }
@@ -473,15 +603,23 @@ function App({ engine, config }) {
         addMessage({ role: "system", content: "[Queue cleared]" });
       } else {
         try { engineRef.current.saveState(); } catch { /* ignore */ }
+        try { engineRef.current.stop(); } catch { /* ignore */ }
         exit();
       }
     }
     if (key.ctrl && input === "d") {
       try { engineRef.current.saveState(); } catch { /* ignore */ }
+      try { engineRef.current.stop(); } catch { /* ignore */ }
       exit();
     }
   });
+  // F6: /meme overlay short-circuits the rest of the UI until dismissed.
+  // Its own useInput handler owns ESC / Enter while it's up.
+  if (showMeme) {
+    return h(MemeOverlay, { onDismiss: () => setShowMeme(false) });
+  }
   return h(Box, { flexDirection: "column" },
     // Welcome banner
     showWelcome ? h(WelcomeBanner, {
@@ -558,11 +696,16 @@ function App({ engine, config }) {
     // Separator + Input
     h(HRule),
+    // F2: Input stays active during streaming. Submissions while the
+    // agent is busy get queued (handleSubmit checks streamingRef) and
+    // flushed at the next natural turn boundary. Matches Claude Code's
+    // type-ahead behavior.
     h(InputPrompt, {
       value: inputValue,
       onChange: setInputValue,
       onSubmit: handleSubmit,
-      isActive: !streaming,
+      isActive: true,
+      placeholderRight: queueSize > 0 ? `(${queueSize} queued)` : null,
     }),
     h(HRule),
     h(StatusBar, { sessionId, phase, contextTokens, contextLimit }),
@@ -611,8 +754,12 @@ export async function main({ languageOverride } = {}) {
   const engine = new AgentEngine({ client, config });
-  // Save state on process exit
-  const saveOnExit = () => { try { engine.saveState(); } catch { /* ignore */ } };
+  // Save state on process exit + stop background diagnostics (B0.1 heap
+  // sampler). saveState is idempotent; stop() is safe to call twice.
+  const saveOnExit = () => {
+    try { engine.saveState(); } catch { /* ignore */ }
+    try { engine.stop(); } catch { /* ignore */ }
+  };
   process.on("SIGINT", saveOnExit);
   process.on("SIGTERM", saveOnExit);

package/src/cli/meme.js ADDED Viewed

@@ -0,0 +1,58 @@
+import React, { useState } from "react";
+import { Box, Text, useInput } from "ink";
+const h = React.createElement;
+// F6: /meme easter egg. Intentionally not listed in /help — discovery
+// is the point. Press ESC or Enter to dismiss. Content per the v0.6.0
+// plan (item 15) — lyrics + team credit.
+const LYRICS = [ // Lyric lines for the /meme overlay; MemeOverlay renders one Text row per entry, in this order.
+  "I'll wait and soon",
+  "We're stranded on the beach",
+  "In our dream",
+  "We part too soon",
+  "But in our lies",
+  "There's a truth to find",
+  "The end is new",
+  "A tomorrow we must reach for",
+  "To be heard",
+];
+const TEAM = [ // Team handles for the credit block; MemeOverlay renders them in this order, separated by ",  ".
+  "@kitchen-engineer42", "@Xigua", "@Amelia", "@01Fish",
+  "@zyxthetroll", "@theon", "@DivisionDirectorXu",
+  "@AnselKocen", "@CarolineCRL", "@GraceGuo",
+  "@XY🌟", "@HalfM", "@GreenOrange",
+  "@LilyHuang", "@Qianlili", "@songmao",
+  "@zoezoe", "@yhhm",
+];
+export function MemeOverlay({ onDismiss }) { // /meme easter-egg panel (lyrics + team credit); calls onDismiss() with no arguments when ESC or Enter is pressed.
+  useInput((input, key) => { // `input` is unused here; only the `key` flags are inspected
+    if (key.escape || key.return) onDismiss();
+  });
+  return h(Box, { flexDirection: "column", borderStyle: "round", borderColor: "magenta", paddingLeft: 2, paddingRight: 2, paddingTop: 1, paddingBottom: 1, marginTop: 1, marginBottom: 1 }, // outer frame: column layout, round magenta border
+    // Lyrics block: one cyan, italic Text per LYRICS entry, keyed by index
+    h(Box, { flexDirection: "column" },
+      ...LYRICS.map((line, i) =>
+        h(Text, { key: `l-${i}`, color: "cyan", italic: true }, line),
+      ),
+    ),
+    h(Text, null, ""), // empty Text — presumably a blank spacer row
+    h(Text, { dimColor: true }, "─".repeat(60)), // dim divider made of 60 "─" characters
+    h(Text, null, ""),
+    // Team credit: bold yellow heading followed by the TEAM handles in green
+    h(Text, { color: "yellow", bold: true },
+      "Here's to all the smart minds that are/were part of our team:"),
+    h(Text, null, ""),
+    h(Box, { flexWrap: "wrap" },
+      ...TEAM.map((handle, i) =>
+        h(Text, { key: `t-${i}`, color: "green" }, `${handle}${i < TEAM.length - 1 ? ",  " : ""}`), // ",  " follows every handle except the last
+      ),
+    ),
+    h(Text, null, ""),
+    h(Text, { dimColor: true }, "Press ESC or Enter to dismiss."), // dismissal hint matching the useInput handler above
+  );
+}

package/src/config.js CHANGED Viewed

@@ -109,8 +109,17 @@ export function loadSettings(workspacePath) {
     // Web search
     tavilyApiKey: env.TAVILY_API_KEY || gc.tavily_api_key || "",
-    // Context management
-    kcContextLimit: parseInt(env.KC_CONTEXT_LIMIT || "200000", 10),
+    // Context management — A2: prefer per-provider cap from providers.js
+    // over the generic 200000 default. KC_CONTEXT_LIMIT env still wins.
+    // gc.kc_context_limit (global config) is next. Then providerDef.contextLimit.
+    // Then a safe 200000 fallback for unknown/custom providers.
+    kcContextLimit: parseInt(
+      env.KC_CONTEXT_LIMIT ||
+        gc.kc_context_limit?.toString() ||
+        providerDef?.contextLimit?.toString() ||
+        "200000",
+      10,
+    ),
     toolOutputOffloadTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_TOKENS || gc.tool_output_offload_tokens?.toString() || "2000", 10),
     toolOutputOffloadErrorTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_ERROR_TOKENS || gc.tool_output_offload_error_tokens?.toString() || "500", 10),
     maxMessageTokens: parseInt(env.MAX_MESSAGE_TOKENS || gc.max_message_tokens?.toString() || "60000", 10),
@@ -123,8 +132,36 @@ export function loadSettings(workspacePath) {
     // Language
     language: env.LANGUAGE || gc.language || "en",
+    // B0.6: Parallel ralph-loop guard. Parallelism > 1 is a LOADED footgun
+    // until the heap-safety conformance gate (B0.7) passes. Unsetting the
+    // verified flag forces serial execution — KC_PARALLELISM_VERIFIED must
+    // be set explicitly after heap.jsonl shows a flat RSS trajectory over
+    // ≥ 2h. This prevents accidental $100+ runaway runs.
+    //
+    // Source priority (highest first): process.env (B3 CLI flag sets this)
+    // → workspace .env → global config. Parsed here; the actual effective
+    // value is computed by a helper below that downgrades to 1 if the
+    // verified flag isn't set.
+    parallelismVerified: (() => {
+      const raw = (process.env.KC_PARALLELISM_VERIFIED ||
+        env.KC_PARALLELISM_VERIFIED || gc.parallelism_verified || "").toString();
+      return raw === "1" || raw.toLowerCase() === "true";
+    })(),
+    parallelismRequested: (() => {
+      const raw = process.env.KC_PARALLELISM || env.KC_PARALLELISM || gc.parallelism;
+      const n = Number.parseInt(raw, 10);
+      if (!Number.isFinite(n) || n < 1) return 1;
+      return Math.min(n, 8); // max 8 per plan — prevents API-spend runaway
+    })(),
   };
+  // Effective parallelism is silently clamped to 1 unless KC_PARALLELISM_VERIFIED
+  // is set. Callers (engine.runTaskLoop, /parallelism slash command, CLI flag)
+  // should read this instead of parallelismRequested.
+  settings.effectiveParallelism = () =>
+    settings.parallelismVerified ? settings.parallelismRequested : 1;
   // Effective worker config (falls back to conductor config)
   settings.effectiveWorkerProvider = () => settings.workerProvider || settings.provider;
   settings.effectiveWorkerApiKey = () => settings.workerApiKey || settings.llmApiKey;

package/src/providers.js CHANGED Viewed

@@ -28,6 +28,16 @@ function getTierConfig(providerId) {
   return MODEL_TIERS[providerId] || { conductor: "", llm: {}, vlm: {} };
 }
+// A2: Per-provider context-window caps. Without these, every provider
+// inherited the generic 200000-token default from config.js, which caused
+// silent empty-response failures on smaller-window models (xfyun
+// astron-code-latest behaves like it has ~32K during E2E #3). The
+// _maybeWindowAfterToolResult threshold only fires around 70% of budget, so
+// with a 200K budget on a 32K-limit model windowing never fires in time.
+// These numbers are conservative minimums — users can still override via
+// KC_CONTEXT_LIMIT env or kc_context_limit in global config.
+const DEFAULT_CONTEXT_LIMIT = 200000;
 const PROVIDERS = [
   {
     id: "siliconflow",
@@ -36,6 +46,7 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: "/models",
+    contextLimit: 200000, // GLM-5.1, Kimi-K2.5 — 200K native
     defaultModel: getTierConfig("siliconflow").conductor || "glm-5",
     defaultTiers: getTierConfig("siliconflow").llm,
     defaultVlm: getTierConfig("siliconflow").vlm,
@@ -54,6 +65,7 @@ const PROVIDERS = [
     apiFormat: "openai",
     modelsEndpoint: null, // Aliyun coding plan doesn't support /models
     supportsCodingPlanKey: true,
+    contextLimit: 131072, // Qwen3.x family — 128K on the coding plan
     defaultModel: getTierConfig("aliyun").conductor || "qwen3.6-plus",
     defaultTiers: getTierConfig("aliyun").llm,
     defaultVlm: getTierConfig("aliyun").vlm,
@@ -86,6 +98,7 @@ const PROVIDERS = [
     apiFormat: "openai",
     modelsEndpoint: null, // VolcanoCloud — use curated list
     supportsCodingPlanKey: true,
+    contextLimit: 200000, // H2: glm-5.1 on coding plan has 200K native
     defaultModel: getTierConfig("volcanocloud").conductor || "doubao-seed-2-0-pro-260215",
     defaultTiers: getTierConfig("volcanocloud").llm,
     defaultVlm: getTierConfig("volcanocloud").vlm,
@@ -114,6 +127,10 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: null,
+    // xfyun astron-code-latest — empirical ~32K-64K window per E2E #3. Set
+    // conservatively at 32K so windowing fires early and the provider never
+    // sees a request it will silently fail on.
+    contextLimit: 32768,
     defaultModel: getTierConfig("xfyun").conductor || "astron-code-latest",
     defaultTiers: getTierConfig("xfyun").llm,
     defaultVlm: getTierConfig("xfyun").vlm,
@@ -132,6 +149,7 @@ const PROVIDERS = [
     authType: "x-api-key",
     apiFormat: "anthropic",
     modelsEndpoint: null, // Use curated list
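+    contextLimit: 400000, // Claude 4.x family — 400K on current long-context tier. NOTE(review): the Bedrock entry for the same Claude models is set to 200K; confirm 400K is valid for the default model.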
     defaultModel: getTierConfig("anthropic").conductor || "claude-sonnet-4-20250514",
     defaultTiers: getTierConfig("anthropic").llm,
     defaultVlm: getTierConfig("anthropic").vlm,
@@ -152,6 +170,7 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: "/models",
+    contextLimit: 128000, // gpt-4o — 128K
     defaultModel: getTierConfig("openai").conductor || "gpt-4o",
     defaultTiers: getTierConfig("openai").llm,
     defaultVlm: getTierConfig("openai").vlm,
@@ -167,6 +186,7 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: "/models",
+    contextLimit: 200000, // GLM official (bigmodel.cn) — 200K on GLM-4.x/5.x tiers
     defaultModel: getTierConfig("zhipu").conductor || "glm-4-plus",
     defaultTiers: getTierConfig("zhipu").llm,
     defaultVlm: getTierConfig("zhipu").vlm,
@@ -182,6 +202,7 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: "/models",
+    contextLimit: 245760, // MiniMax-M2.5 — 240K
     defaultModel: getTierConfig("minimax").conductor || "MiniMax-M2.5",
     defaultTiers: getTierConfig("minimax").llm,
     defaultVlm: getTierConfig("minimax").vlm,
@@ -197,6 +218,10 @@ const PROVIDERS = [
     authType: "bearer",
     apiFormat: "openai",
     modelsEndpoint: "/models",
+    // OpenRouter proxies many models; defaulting to 200K matches the underlying
+    // frontier Anthropic/Google routes most users pick. For lower-context models
+    // behind OpenRouter, windowing is sized for 200K and may fire too late — set
+    // KC_CONTEXT_LIMIT to the model's real window.
+    contextLimit: 200000,
     defaultModel: getTierConfig("openrouter").conductor || "anthropic/claude-sonnet-4-20250514",
     defaultTiers: getTierConfig("openrouter").llm,
     defaultVlm: getTierConfig("openrouter").vlm,
@@ -212,6 +237,7 @@ const PROVIDERS = [
     authType: "aws-sigv4",
     apiFormat: "anthropic",
     modelsEndpoint: null,
+    contextLimit: 200000, // Bedrock Anthropic routes mirror native Claude 200K
     defaultModel: getTierConfig("bedrock").conductor || "anthropic.claude-sonnet-4-20250514-v1:0",
     defaultTiers: getTierConfig("bedrock").llm,
     defaultVlm: getTierConfig("bedrock").vlm,

package/template/skills/en/meta-meta/evolution-loop/SKILL.md CHANGED Viewed

@@ -163,12 +163,24 @@ Track three metrics per iteration to know when to stop:
 ### Stopping Criteria
-Stop the loop when ALL three conditions hold for one iteration:
+Stop the loop when **ALL** three conditions hold for one iteration:
 1. Correction volume < 5% of total test cases.
 2. New pattern count = 0.
 3. Regression count = 0.
+**OR** when the standalone accuracy-convergence condition holds (D5, added
+2026-04-23):
+4. Overall accuracy changed by less than 1 percentage point between the
+   last two iterations — i.e. `|accuracy[N+1] - accuracy[N]| < 0.01`.
+Condition 4 prevents the observed over-iteration pattern of v5 → v12,
+where each iteration oscillated within a ~0.5% accuracy window. Once the
+model has reached "good enough," continuing burns tokens without
+delivering real improvement. When accuracy has plateaued, proceed to the
+next phase (distillation / production).
 If correction volume *increases* between consecutive iterations, this is a regression signal. Pause the loop and diagnose before continuing — the last fix may be destabilizing the system.
 ### Expected Convergence

package/template/skills/en/meta-meta/rule-extraction/SKILL.md CHANGED Viewed

@@ -59,6 +59,80 @@ Rules will be distilled into workflows (see `skill-to-workflow`). Design with di
 ### Catalog Versioning
 When rules change (additions, modifications, deprecations), version the entire rule catalog as a unit. Individual rule versions track specific rules; the catalog version tracks the coherent set. Record the catalog version in `versions.json` alongside individual rule versions.
+## Granularity Calibration (read before extracting)
+A well-extracted rule catalog has **10-20 rules per typical regulation PDF**
+(a 30-80 page disclosure regulation). Over-extraction into 60-100 rules per
+regulation signals you're treating every clause as its own rule — downstream
+consumers (skill-authoring, workflow-run) can't distinguish meaningful
+checks from boilerplate.
+If your first pass produces more than ~25 rules for a single regulation:
+- **Merge rules that share evidence and fail together** (e.g., "must
+  disclose X" and "must disclose Y" where both come from the same
+  required-fields table → one rule: "must disclose the required-fields
+  list including X, Y").
+- **Drop procedural language** that isn't checkable against a report
+  (definitions, scope statements, references to other regs that just
+  transitively apply).
+- **Keep only checkable obligations, prohibitions, and thresholds** —
+  things where you can read a sample report and say pass or fail.
+### Sample "good" rule
+```json
+{
+  "id": "R014",
+  "source_ref": "Disclosure Reg §15.2",
+  "description": "Quarterly reports must be disclosed within 15 business days after quarter-end.",
+  "applicable_sections": ["public funds"],
+  "severity": "high",
+  "machine_checkable": true,
+  "falsifiability_statement": "If disclosure date is later than 15th business day after quarter-end, the rule fails.",
+  "test_case_stub": "Read the quarterly report's disclosure date + the quarter-end date, compute business-day difference."
+}
+```
+Note: one pass/fail outcome, a single `source_ref` to a specific clause,
+clear applicability scope. Skill-authoring can write `check_r014.py` from
+this alone.
+### Cross-regulation dedup (when working across multiple PDFs)
+If the developer user provides N regulations, rules from later regs often
+duplicate cross-cutting requirements already captured by earlier ones
+(e.g., a 2018 generic disclosure rule vs. a 2025 specific version).
+Before emitting a rule from reg N:
+1. **Check the existing catalog.** Use `rule_catalog` (operation: list)
+   to see what's already there. Skip if a rule with equivalent scope +
+   intent exists.
+2. **Prefer the newer / more specific source_ref** when rules overlap.
+3. **If you merged rules**, record the consolidated sources in
+   `source_ref`: e.g., `"New Reg §15.2 + Old Reg §24"`.
+### Delegation to sub-agents
+If you dispatch extraction to sub-agents (one per regulation), the
+sub-agent inherits ONLY its `task_description` — it cannot see your
+conversation or existing catalog. Therefore, when composing the brief:
+- **Specify the target count band** explicitly: "Extract 10-20 atomic
+  rules from this regulation."
+- **Include a sample rule** in the brief body (paste the JSON above
+  verbatim) so the sub-agent's calibration matches yours.
+- **Name every regulation the sub-agent should process.** If AGENT.md
+  lists 10 core regulations, the brief must list all 10 by name rather
+  than a collective shorthand like "the core regs" — LLMs composing long
+  structured briefs frequently drop items (observed in session
+  6304673afaa0, where reg 02 was silently omitted).
+- **State the dedup contract**: "Rules already in the parent's catalog
+  (R001–Rnnn) should NOT be re-extracted. If a requirement is already
+  covered, skip it." Then pass the current catalog's ID ranges.
+- **Prefer `rule_catalog` create operations over sandbox_exec writes to
+  catalog.json.** rule_catalog uses workspace file locking;
+  sandbox_exec bypasses it and races with other writers.
 ## Extraction Strategies
 ### Strategy 1: Structured Input (Developer User Provides Rules)

package/template/skills/zh/meta-meta/evolution-loop/SKILL.md CHANGED Viewed

@@ -241,12 +241,18 @@ description: Drive continuous improvement of skills and workflows through the di
 ### 停止条件
-当一轮迭代同时满足以下三个条件时，停止循环：
+当一轮迭代**同时**满足以下三个条件时，停止循环：
 1. 修正量 < 总测试案例的 5%。
 2. 新模式数 = 0。
 3. 回归数 = 0。
+**或者**满足单独的准确率收敛条件（D5，2026-04-23 新增）：
+4. 连续两轮迭代之间的整体准确率变化 < 1 个百分点，即 `|accuracy[N+1] - accuracy[N]| < 0.01`。
+条件 4 是为了防止观察到的过度迭代模式——从 v5 一直迭代到 v12，每轮都在 0.5% 的精度范围内来回波动。当模型已经达到"足够好"时，继续迭代只会消耗 token，不会带来实质改进。一旦准确率趋于稳定，应当进入下一阶段（蒸馏/生产）。
 如果修正量在连续两轮迭代之间**增加**，这是回归信号。暂停循环，先诊断原因再继续——上一轮的修复可能正在破坏系统的稳定性。
 ### 预期收敛速度