kc-beta 0.5.6 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cli/index.js CHANGED
@@ -15,6 +15,7 @@ import {
15
15
  HRule,
16
16
  InputPrompt,
17
17
  } from "./components.js";
18
+ import { MemeOverlay } from "./meme.js"; // F6
18
19
 
19
20
  const h = React.createElement;
20
21
 
@@ -30,6 +31,17 @@ const VISIBLE_WINDOW = 50;
30
31
  // Older ToolBlocks show header only. Both still persist full output to disk.
31
32
  const RECENT_TOOL_WINDOW = 10;
32
33
 
34
+ // B0.3: Hard cap on the React `messages` array. Without this, the array
35
+ // grows forever (setMessages((prev) => [...prev, msg]) via addMessage) —
36
+ // the VISIBLE_WINDOW virtualization hides old entries from render but
37
+ // they still sit in state. Over a 17 h session with 2-4 messages per
38
+ // turn, that's 1000s of entries holding tool-result digest strings and
39
+ // pipeline messages. /compact resets messages to a 1-item summary, so
40
+ // this cap is really a safety net between compacts. On cap hit, drop
41
+ // oldest non-system entries (system messages carry session-level
42
+ // context — pipeline transitions, errors — that users want retained).
43
+ const MAX_RETAINED_MESSAGES = 500;
44
+
33
45
  /**
34
46
  * Main KC Agent CLI App using Ink (React for terminals).
35
47
  */
@@ -43,6 +55,7 @@ function App({ engine, config }) {
43
55
  const [sessionId, setSessionId] = useState(engine.workspace.sessionId);
44
56
  const [phase, setPhase] = useState(engine.currentPhase);
45
57
  const [showWelcome, setShowWelcome] = useState(true);
58
+ const [showMeme, setShowMeme] = useState(false); // F6
46
59
  const [spinnerStatus, setSpinnerStatus] = useState(null);
47
60
  const [contextTokens, setContextTokens] = useState(0);
48
61
  const [contextLimit, setContextLimit] = useState(config.kcContextLimit || 200000);
@@ -63,7 +76,16 @@ function App({ engine, config }) {
63
76
  }, []);
64
77
 
65
78
  const addMessage = useCallback((msg) => {
66
- setMessages((prev) => [...prev, msg]);
79
+ setMessages((prev) => {
80
+ if (prev.length < MAX_RETAINED_MESSAGES) return [...prev, msg];
81
+ // Cap hit: drop the oldest non-system entry. If everything is system
82
+ // (unlikely but possible), fall back to dropping the very oldest.
83
+ const dropIdx = prev.findIndex((m) => m.role !== "system");
84
+ const next = dropIdx >= 0
85
+ ? [...prev.slice(0, dropIdx), ...prev.slice(dropIdx + 1), msg]
86
+ : [...prev.slice(1), msg];
87
+ return next;
88
+ });
67
89
  }, []);
68
90
 
69
91
  const runTurn = useCallback(async (text) => {
@@ -76,7 +98,9 @@ function App({ engine, config }) {
76
98
  let accumulated = "";
77
99
 
78
100
  try {
79
- for await (const event of engineRef.current.runTaskLoop(text)) {
101
+ for await (const event of engineRef.current.runTaskLoop(text, {
102
+ parallelism: config.effectiveParallelism?.() ?? 1,
103
+ })) {
80
104
  switch (event.type) {
81
105
  case "text_delta":
82
106
  accumulated += event.text ?? "";
@@ -117,6 +141,13 @@ function App({ engine, config }) {
117
141
  });
118
142
  setCurrentTool(null);
119
143
  setSpinnerStatus("Analyzing results...");
144
+ // H4: Refresh the CTX indicator after every tool_result. Without
145
+ // this, contextTokens only updates on turn_complete, which never
146
+ // fires in long tool-heavy sessions — we observed 908 events with
147
+ // zero turn_complete in session 6304673afaa0, CTX stuck at 0/131k
148
+ // for 30+ minutes. getContextStats() is a cheap pure calc over
149
+ // the history array; safe to call on every tool call.
150
+ updateContextStats();
120
151
  break;
121
152
 
122
153
  case "pipeline_event": {
@@ -153,7 +184,10 @@ function App({ engine, config }) {
153
184
  // Process queue
154
185
  if (queueRef.current.length > 0) {
155
186
  const next = queueRef.current.shift();
187
+ setQueueSize(queueRef.current.length); // F2
156
188
  runTurn(next);
189
+ } else {
190
+ setQueueSize(0); // F2
157
191
  }
158
192
  }, [addMessage, updateContextStats]);
159
193
 
@@ -173,6 +207,8 @@ function App({ engine, config }) {
173
207
  " /tasks Show task progress\n" +
174
208
  " /phase [sub] advance | status | <name> — manual phase override\n" +
175
209
  " /schedule Show scheduled ingestion jobs and recent log lines\n" +
210
+ " /tools List all registered tools and which phase gates them\n" +
211
+ " /parallelism [N] Show or set parallel ralph-loop worker count (1-8)\n" +
176
212
  " /clear Clear conversation history (keep workspace)\n" +
177
213
  " /compact Summarize older messages to reduce context\n" +
178
214
  " /sessions List all sessions\n" +
@@ -184,19 +220,90 @@ function App({ engine, config }) {
184
220
 
185
221
  case "/status": {
186
222
  const stats = engineRef.current.getContextStats();
223
+ const par = config.effectiveParallelism?.() ?? 1;
224
+ const parLine = par > 1
225
+ ? `${par} (verified)`
226
+ : `${config.parallelismRequested || 1} requested` +
227
+ (config.parallelismRequested > 1 && !config.parallelismVerified
228
+ ? ` — clamped to 1 (KC_PARALLELISM_VERIFIED not set; run heap baseline first)`
229
+ : "");
230
+ addMessage({
231
+ role: "system",
232
+ content:
233
+ `Session: ${engineRef.current.workspace.sessionId}\n` +
234
+ `Phase: ${engineRef.current.currentPhase.toUpperCase()}\n` +
235
+ `Model: ${config.kcModel}\n` +
236
+ `Provider: ${config.provider || "unknown"}\n` +
237
+ `LLM URL: ${config.llmBaseUrl}\n` +
238
+ `Project: ${engineRef.current.workspace.projectDir || "(none)"}\n` +
239
+ `Workspace: ${engineRef.current.workspace.cwd}\n` +
240
+ `Tools: ${engineRef.current.toolRegistry.size} registered\n` +
241
+ `History: ${engineRef.current.history.messages.length} messages\n` +
242
+ `Context: ~${stats.totalTokens} tokens (${stats.percentage}% of ${stats.limit})\n` +
243
+ `Parallelism: ${parLine}`,
244
+ });
245
+ return true;
246
+ }
247
+
248
+ case "/meme":
249
+ // F6: easter egg. Not in /help.
250
+ setShowMeme(true);
251
+ return true;
252
+
253
+ case "/tools": {
254
+ // F5: list all registered tools + which phase gates them. Reads
255
+ // from the live toolRegistry so what you see is what the agent
256
+ // currently has available. Also names the distill-only tools
257
+ // explicitly so users understand why some tools "come and go"
258
+ // as phases advance.
259
+ const reg = engineRef.current.toolRegistry;
260
+ const names = reg?.names?.() || [];
261
+ const core = engineRef.current._buildTools?.core?.map((t) => t?.name).filter(Boolean) || [];
262
+ const distill = engineRef.current._buildTools?.distill?.map((t) => t?.name).filter(Boolean) || [];
263
+ const phase = engineRef.current.currentPhase.toUpperCase();
264
+ const lines = [
265
+ `Tools registered for phase ${phase}: ${names.length}`,
266
+ "",
267
+ `Core (always available, ${core.length}):`,
268
+ ...core.map((n) => ` • ${n}${names.includes(n) ? "" : " [not currently registered]"}`),
269
+ ];
270
+ if (distill.length > 0) {
271
+ lines.push("", `Distill-only (DISTILLATION / PRODUCTION_QC / FINALIZATION, ${distill.length}):`);
272
+ for (const n of distill) {
273
+ lines.push(` • ${n}${names.includes(n) ? "" : " [gated out of this phase]"}`);
274
+ }
275
+ }
276
+ lines.push("", "Tools are not separately installable — they ship with the KC release. To see what each tool does, invoke it or ask the agent.");
277
+ addMessage({ role: "system", content: lines.join("\n") });
278
+ return true;
279
+ }
280
+
281
+ case "/parallelism": {
282
+ // B3: set parallelism at runtime. Respects the B0.6 guard —
283
+ // takes effect only if KC_PARALLELISM_VERIFIED is already set.
284
+ const n = parseInt(arg, 10);
285
+ if (!Number.isFinite(n) || n < 1) {
286
+ addMessage({
287
+ role: "system",
288
+ content:
289
+ `Usage: /parallelism <N> (1-8)\n` +
290
+ `Current: requested=${config.parallelismRequested || 1}, ` +
291
+ `effective=${config.effectiveParallelism?.() ?? 1}. ` +
292
+ (config.parallelismVerified
293
+ ? "Verified — new value takes effect next /run."
294
+ : "Unverified — clamped to 1. Set KC_PARALLELISM_VERIFIED=1 after a clean 2h heap-baseline run."),
295
+ });
296
+ return true;
297
+ }
298
+ const clamped = Math.min(Math.max(n, 1), 8);
299
+ config.parallelismRequested = clamped;
187
300
  addMessage({
188
301
  role: "system",
189
302
  content:
190
- `Session: ${engineRef.current.workspace.sessionId}\n` +
191
- `Phase: ${engineRef.current.currentPhase.toUpperCase()}\n` +
192
- `Model: ${config.kcModel}\n` +
193
- `Provider: ${config.provider || "unknown"}\n` +
194
- `LLM URL: ${config.llmBaseUrl}\n` +
195
- `Project: ${engineRef.current.workspace.projectDir || "(none)"}\n` +
196
- `Workspace: ${engineRef.current.workspace.cwd}\n` +
197
- `Tools: ${engineRef.current.toolRegistry.size} registered\n` +
198
- `History: ${engineRef.current.history.messages.length} messages\n` +
199
- `Context: ~${stats.totalTokens} tokens (${stats.percentage}% of ${stats.limit})`,
303
+ `Parallelism requested=${clamped}. ` +
304
+ (config.parallelismVerified
305
+ ? `Effective=${config.effectiveParallelism()} (verified).`
306
+ : `Effective=1 (verified flag not set — see /status).`),
200
307
  });
201
308
  return true;
202
309
  }
@@ -339,10 +446,25 @@ function App({ engine, config }) {
339
446
  } catch (err) {
340
447
  addMessage({ role: "system", content: `Compact failed: ${err.message}` });
341
448
  } finally {
449
+ // F8: Spinner-race fix. If a queued task is about to kick off
450
+ // via runTurn(next), DO NOT clear the streaming/spinner state
451
+ // here — runTurn's own entry sets streamingRef=true + spinner
452
+ // immediately, but there's a brief React-render window between
453
+ // our `setStreaming(false)` and its `setStreaming(true)` where
454
+ // the TUI paints "no spinner, no streaming" for 1-2 frames.
455
+ // Over long sessions that looked like a dead TUI when a user
456
+ // watched the moment /compact auto-chained to the next task.
457
+ // Order now: IF next task is queued, let runTurn(next) set all
458
+ // streaming state in one atomic render; we just reset the ref
459
+ // flags to avoid the input-is-locked issue. Otherwise do the
460
+ // full clear (idle-TUI case).
461
+ const hasQueuedWork = queueRef.current.length > 0;
342
462
  streamingRef.current = false;
343
- setStreaming(false);
344
- setSpinnerStatus(null);
345
- if (queueRef.current.length > 0) {
463
+ if (!hasQueuedWork) {
464
+ setStreaming(false);
465
+ setSpinnerStatus(null);
466
+ }
467
+ if (hasQueuedWork) {
346
468
  const next = queueRef.current.shift();
347
469
  runTurn(next);
348
470
  }
@@ -436,8 +558,9 @@ function App({ engine, config }) {
436
558
 
437
559
  case "/exit":
438
560
  case "/quit":
439
- // Save state before exit
561
+ // Save state + stop diagnostics before exit
440
562
  try { engineRef.current.saveState(); } catch { /* ignore */ }
563
+ try { engineRef.current.stop(); } catch { /* ignore */ }
441
564
  exit();
442
565
  return true;
443
566
 
@@ -446,6 +569,8 @@ function App({ engine, config }) {
446
569
  }
447
570
  }, [addMessage, config, exit, updateContextStats]);
448
571
 
572
+ const [queueSize, setQueueSize] = useState(0); // F2: count for TUI indicator
573
+
449
574
  const handleSubmit = useCallback((text) => {
450
575
  const trimmed = text.trim();
451
576
  setInputValue("");
@@ -460,6 +585,11 @@ function App({ engine, config }) {
460
585
 
461
586
  if (streamingRef.current) {
462
587
  queueRef.current.push(trimmed);
588
+ setQueueSize(queueRef.current.length); // F2
589
+ addMessage({
590
+ role: "system",
591
+ content: `⏳ Queued (${queueRef.current.length} waiting). Will be sent to KC on next turn boundary.`,
592
+ });
463
593
  } else {
464
594
  runTurn(trimmed);
465
595
  }
@@ -473,15 +603,23 @@ function App({ engine, config }) {
473
603
  addMessage({ role: "system", content: "[Queue cleared]" });
474
604
  } else {
475
605
  try { engineRef.current.saveState(); } catch { /* ignore */ }
606
+ try { engineRef.current.stop(); } catch { /* ignore */ }
476
607
  exit();
477
608
  }
478
609
  }
479
610
  if (key.ctrl && input === "d") {
480
611
  try { engineRef.current.saveState(); } catch { /* ignore */ }
612
+ try { engineRef.current.stop(); } catch { /* ignore */ }
481
613
  exit();
482
614
  }
483
615
  });
484
616
 
617
+ // F6: /meme overlay short-circuits the rest of the UI until dismissed.
618
+ // Its own useInput handler owns ESC / Enter while it's up.
619
+ if (showMeme) {
620
+ return h(MemeOverlay, { onDismiss: () => setShowMeme(false) });
621
+ }
622
+
485
623
  return h(Box, { flexDirection: "column" },
486
624
  // Welcome banner
487
625
  showWelcome ? h(WelcomeBanner, {
@@ -558,11 +696,16 @@ function App({ engine, config }) {
558
696
 
559
697
  // Separator + Input
560
698
  h(HRule),
699
+ // F2: Input stays active during streaming. Submissions while the
700
+ // agent is busy get queued (handleSubmit checks streamingRef) and
701
+ // flushed at the next natural turn boundary. Matches Claude Code's
702
+ // type-ahead behavior.
561
703
  h(InputPrompt, {
562
704
  value: inputValue,
563
705
  onChange: setInputValue,
564
706
  onSubmit: handleSubmit,
565
- isActive: !streaming,
707
+ isActive: true,
708
+ placeholderRight: queueSize > 0 ? `(${queueSize} queued)` : null,
566
709
  }),
567
710
  h(HRule),
568
711
  h(StatusBar, { sessionId, phase, contextTokens, contextLimit }),
@@ -611,8 +754,12 @@ export async function main({ languageOverride } = {}) {
611
754
 
612
755
  const engine = new AgentEngine({ client, config });
613
756
 
614
- // Save state on process exit
615
- const saveOnExit = () => { try { engine.saveState(); } catch { /* ignore */ } };
757
+ // Save state on process exit + stop background diagnostics (B0.1 heap
758
+ // sampler). saveState is idempotent; stop() is safe to call twice.
759
+ const saveOnExit = () => {
760
+ try { engine.saveState(); } catch { /* ignore */ }
761
+ try { engine.stop(); } catch { /* ignore */ }
762
+ };
616
763
  process.on("SIGINT", saveOnExit);
617
764
  process.on("SIGTERM", saveOnExit);
618
765
 
@@ -0,0 +1,58 @@
1
+ import React, { useState } from "react";
2
+ import { Box, Text, useInput } from "ink";
3
+
4
+ const h = React.createElement;
5
+
6
+ // F6: /meme easter egg. Intentionally not listed in /help — discovery
7
+ // is the point. Press ESC or Enter to dismiss. Content per the v0.6.0
8
+ // plan (item 15) — lyrics + team credit.
9
+
10
+ const LYRICS = [
11
+ "I'll wait and soon",
12
+ "We're stranded on the beach",
13
+ "In our dream",
14
+ "We part too soon",
15
+ "But in our lies",
16
+ "There's a truth to find",
17
+ "The end is new",
18
+ "A tomorrow we must reach for",
19
+ "To be heard",
20
+ ];
21
+
22
+ const TEAM = [
23
+ "@kitchen-engineer42", "@Xigua", "@Amelia", "@01Fish",
24
+ "@zyxthetroll", "@theon", "@DivisionDirectorXu",
25
+ "@AnselKocen", "@CarolineCRL", "@GraceGuo",
26
+ "@XY🌟", "@HalfM", "@GreenOrange",
27
+ "@LilyHuang", "@Qianlili", "@songmao",
28
+ "@zoezoe", "@yhhm",
29
+ ];
30
+
31
+ export function MemeOverlay({ onDismiss }) {
32
+ useInput((input, key) => {
33
+ if (key.escape || key.return) onDismiss();
34
+ });
35
+
36
+ return h(Box, { flexDirection: "column", borderStyle: "round", borderColor: "magenta", paddingLeft: 2, paddingRight: 2, paddingTop: 1, paddingBottom: 1, marginTop: 1, marginBottom: 1 },
37
+ // Lyrics block: one cyan italic Text element per LYRICS entry, stacked in a column.
38
+ h(Box, { flexDirection: "column" },
39
+ ...LYRICS.map((line, i) =>
40
+ h(Text, { key: `l-${i}`, color: "cyan", italic: true }, line),
41
+ ),
42
+ ),
43
+ h(Text, null, ""),
44
+ h(Text, { dimColor: true }, "─".repeat(60)),
45
+ h(Text, null, ""),
46
+ // Team credit: bold yellow heading, then the TEAM handles comma-separated in a wrapping row.
47
+ h(Text, { color: "yellow", bold: true },
48
+ "Here's to all the smart minds that are/were part of our team:"),
49
+ h(Text, null, ""),
50
+ h(Box, { flexWrap: "wrap" },
51
+ ...TEAM.map((handle, i) =>
52
+ h(Text, { key: `t-${i}`, color: "green" }, `${handle}${i < TEAM.length - 1 ? ", " : ""}`),
53
+ ),
54
+ ),
55
+ h(Text, null, ""),
56
+ h(Text, { dimColor: true }, "Press ESC or Enter to dismiss."),
57
+ );
58
+ }
package/src/config.js CHANGED
@@ -109,8 +109,17 @@ export function loadSettings(workspacePath) {
109
109
  // Web search
110
110
  tavilyApiKey: env.TAVILY_API_KEY || gc.tavily_api_key || "",
111
111
 
112
- // Context management
113
- kcContextLimit: parseInt(env.KC_CONTEXT_LIMIT || "200000", 10),
112
+ // Context management — A2: prefer per-provider cap from providers.js
113
+ // over the generic 200000 default. KC_CONTEXT_LIMIT env still wins.
114
+ // gc.kc_context_limit (global config) is next. Then provider.contextLimit.
115
+ // Then a safe 200000 fallback for unknown/custom providers.
116
+ kcContextLimit: parseInt(
117
+ env.KC_CONTEXT_LIMIT ||
118
+ gc.kc_context_limit?.toString() ||
119
+ providerDef?.contextLimit?.toString() ||
120
+ "200000",
121
+ 10,
122
+ ),
114
123
  toolOutputOffloadTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_TOKENS || gc.tool_output_offload_tokens?.toString() || "2000", 10),
115
124
  toolOutputOffloadErrorTokens: parseInt(env.TOOL_OUTPUT_OFFLOAD_ERROR_TOKENS || gc.tool_output_offload_error_tokens?.toString() || "500", 10),
116
125
  maxMessageTokens: parseInt(env.MAX_MESSAGE_TOKENS || gc.max_message_tokens?.toString() || "60000", 10),
@@ -123,8 +132,36 @@ export function loadSettings(workspacePath) {
123
132
 
124
133
  // Language
125
134
  language: env.LANGUAGE || gc.language || "en",
135
+
136
+ // B0.6: Parallel ralph-loop guard. Parallelism > 1 is a LOADED footgun
137
+ // until the heap-safety conformance gate (B0.7) passes. Unsetting the
138
+ // verified flag forces serial execution — KC_PARALLELISM_VERIFIED must
139
+ // be set explicitly after heap.jsonl shows a flat RSS trajectory over
140
+ // ≥ 2h. This prevents accidental $100+ runaway runs.
141
+ //
142
+ // Source priority (highest first): process.env (B3 CLI flag sets this)
143
+ // → workspace .env → global config. Parsed here; the actual effective
144
+ // value is computed by a helper below that downgrades to 1 if the
145
+ // verified flag isn't set.
146
+ parallelismVerified: (() => {
147
+ const raw = (process.env.KC_PARALLELISM_VERIFIED ||
148
+ env.KC_PARALLELISM_VERIFIED || gc.parallelism_verified || "").toString();
149
+ return raw === "1" || raw.toLowerCase() === "true";
150
+ })(),
151
+ parallelismRequested: (() => {
152
+ const raw = process.env.KC_PARALLELISM || env.KC_PARALLELISM || gc.parallelism;
153
+ const n = Number.parseInt(raw, 10);
154
+ if (!Number.isFinite(n) || n < 1) return 1;
155
+ return Math.min(n, 8); // max 8 per plan — prevents API-spend runaway
156
+ })(),
126
157
  };
127
158
 
159
+ // Effective parallelism is silently clamped to 1 unless KC_PARALLELISM_VERIFIED
160
+ // is set. Callers (engine.runTaskLoop, /parallelism slash command, CLI flag)
161
+ // should read this instead of parallelismRequested.
162
+ settings.effectiveParallelism = () =>
163
+ settings.parallelismVerified ? settings.parallelismRequested : 1;
164
+
128
165
  // Effective worker config (falls back to conductor config)
129
166
  settings.effectiveWorkerProvider = () => settings.workerProvider || settings.provider;
130
167
  settings.effectiveWorkerApiKey = () => settings.workerApiKey || settings.llmApiKey;
package/src/providers.js CHANGED
@@ -28,6 +28,16 @@ function getTierConfig(providerId) {
28
28
  return MODEL_TIERS[providerId] || { conductor: "", llm: {}, vlm: {} };
29
29
  }
30
30
 
31
+ // A2: Per-provider context-window caps. Without these, every provider
32
+ // inherited the generic 200000-token default from config.js, which caused
33
+ // silent empty-response failures on smaller-window models (xfyun
34
+ // astron-code-latest behaves like it has ~32K during E2E #3). The
35
+ // _maybeWindowAfterToolResult threshold only fires around 70% of budget, so
36
+ // with a 200K budget on a 32K-limit model windowing never fires in time.
37
+ // These numbers are conservative minimums — users can still override via
38
+ // KC_CONTEXT_LIMIT env or kc_context_limit in global config.
39
+ const DEFAULT_CONTEXT_LIMIT = 200000;
40
+
31
41
  const PROVIDERS = [
32
42
  {
33
43
  id: "siliconflow",
@@ -36,6 +46,7 @@ const PROVIDERS = [
36
46
  authType: "bearer",
37
47
  apiFormat: "openai",
38
48
  modelsEndpoint: "/models",
49
+ contextLimit: 200000, // GLM-5.1, Kimi-K2.5 — 200K native
39
50
  defaultModel: getTierConfig("siliconflow").conductor || "glm-5",
40
51
  defaultTiers: getTierConfig("siliconflow").llm,
41
52
  defaultVlm: getTierConfig("siliconflow").vlm,
@@ -54,6 +65,7 @@ const PROVIDERS = [
54
65
  apiFormat: "openai",
55
66
  modelsEndpoint: null, // Aliyun coding plan doesn't support /models
56
67
  supportsCodingPlanKey: true,
68
+ contextLimit: 131072, // Qwen3.x family — 128K on the coding plan
57
69
  defaultModel: getTierConfig("aliyun").conductor || "qwen3.6-plus",
58
70
  defaultTiers: getTierConfig("aliyun").llm,
59
71
  defaultVlm: getTierConfig("aliyun").vlm,
@@ -86,6 +98,7 @@ const PROVIDERS = [
86
98
  apiFormat: "openai",
87
99
  modelsEndpoint: null, // VolcanoCloud — use curated list
88
100
  supportsCodingPlanKey: true,
101
+ contextLimit: 200000, // H2: glm-5.1 on coding plan has 200K native
89
102
  defaultModel: getTierConfig("volcanocloud").conductor || "doubao-seed-2-0-pro-260215",
90
103
  defaultTiers: getTierConfig("volcanocloud").llm,
91
104
  defaultVlm: getTierConfig("volcanocloud").vlm,
@@ -114,6 +127,10 @@ const PROVIDERS = [
114
127
  authType: "bearer",
115
128
  apiFormat: "openai",
116
129
  modelsEndpoint: null,
130
+ // xfyun astron-code-latest — empirical ~32K-64K window per E2E #3. Set
131
+ // conservatively at 32K so windowing fires early and the provider never
132
+ // sees a request it will silently fail on.
133
+ contextLimit: 32768,
117
134
  defaultModel: getTierConfig("xfyun").conductor || "astron-code-latest",
118
135
  defaultTiers: getTierConfig("xfyun").llm,
119
136
  defaultVlm: getTierConfig("xfyun").vlm,
@@ -132,6 +149,7 @@ const PROVIDERS = [
132
149
  authType: "x-api-key",
133
150
  apiFormat: "anthropic",
134
151
  modelsEndpoint: null, // Use curated list
152
+ contextLimit: 400000, // Claude 4.x family — 400K on current long-context tier
135
153
  defaultModel: getTierConfig("anthropic").conductor || "claude-sonnet-4-20250514",
136
154
  defaultTiers: getTierConfig("anthropic").llm,
137
155
  defaultVlm: getTierConfig("anthropic").vlm,
@@ -152,6 +170,7 @@ const PROVIDERS = [
152
170
  authType: "bearer",
153
171
  apiFormat: "openai",
154
172
  modelsEndpoint: "/models",
173
+ contextLimit: 128000, // gpt-4o — 128K
155
174
  defaultModel: getTierConfig("openai").conductor || "gpt-4o",
156
175
  defaultTiers: getTierConfig("openai").llm,
157
176
  defaultVlm: getTierConfig("openai").vlm,
@@ -167,6 +186,7 @@ const PROVIDERS = [
167
186
  authType: "bearer",
168
187
  apiFormat: "openai",
169
188
  modelsEndpoint: "/models",
189
+ contextLimit: 200000, // GLM official (bigmodel.cn) — 200K on GLM-4.x/5.x tiers
170
190
  defaultModel: getTierConfig("zhipu").conductor || "glm-4-plus",
171
191
  defaultTiers: getTierConfig("zhipu").llm,
172
192
  defaultVlm: getTierConfig("zhipu").vlm,
@@ -182,6 +202,7 @@ const PROVIDERS = [
182
202
  authType: "bearer",
183
203
  apiFormat: "openai",
184
204
  modelsEndpoint: "/models",
205
+ contextLimit: 245760, // MiniMax-M2.5 — 240K
185
206
  defaultModel: getTierConfig("minimax").conductor || "MiniMax-M2.5",
186
207
  defaultTiers: getTierConfig("minimax").llm,
187
208
  defaultVlm: getTierConfig("minimax").vlm,
@@ -197,6 +218,10 @@ const PROVIDERS = [
197
218
  authType: "bearer",
198
219
  apiFormat: "openai",
199
220
  modelsEndpoint: "/models",
221
+ // OpenRouter proxies many models; defaulting to 200K matches the underlying
222
+ // frontier Anthropic/Google routes most users pick. Lower-context models
223
+ // behind OpenRouter will still work, just won't benefit from early windowing.
224
+ contextLimit: 200000,
200
225
  defaultModel: getTierConfig("openrouter").conductor || "anthropic/claude-sonnet-4-20250514",
201
226
  defaultTiers: getTierConfig("openrouter").llm,
202
227
  defaultVlm: getTierConfig("openrouter").vlm,
@@ -212,6 +237,7 @@ const PROVIDERS = [
212
237
  authType: "aws-sigv4",
213
238
  apiFormat: "anthropic",
214
239
  modelsEndpoint: null,
240
+ contextLimit: 200000, // Bedrock Anthropic routes mirror native Claude 200K
215
241
  defaultModel: getTierConfig("bedrock").conductor || "anthropic.claude-sonnet-4-20250514-v1:0",
216
242
  defaultTiers: getTierConfig("bedrock").llm,
217
243
  defaultVlm: getTierConfig("bedrock").vlm,
@@ -163,12 +163,24 @@ Track three metrics per iteration to know when to stop:
163
163
 
164
164
  ### Stopping Criteria
165
165
 
166
- Stop the loop when ALL three conditions hold for one iteration:
166
+ Stop the loop when **ALL** three conditions hold for one iteration:
167
167
 
168
168
  1. Correction volume < 5% of total test cases.
169
169
  2. New pattern count = 0.
170
170
  3. Regression count = 0.
171
171
 
172
+ **OR** when the standalone accuracy-convergence condition holds (D5, added
173
+ 2026-04-23):
174
+
175
+ 4. Overall accuracy changed by less than 1 percentage point between the last two
176
+ iterations — i.e. `|accuracy[N+1] − accuracy[N]| < 0.01`.
177
+
178
+ Condition 4 prevents the observed over-iteration pattern of v5 → v12,
179
+ where each iteration oscillated within a ~0.5% accuracy window. Once the
180
+ model has reached "good enough," continuing burns tokens without
181
+ delivering real improvement. When accuracy has plateaued, proceed to the
182
+ next phase (distillation / production).
183
+
172
184
  If correction volume *increases* between consecutive iterations, this is a regression signal. Pause the loop and diagnose before continuing — the last fix may be destabilizing the system.
173
185
 
174
186
  ### Expected Convergence
@@ -59,6 +59,80 @@ Rules will be distilled into workflows (see `skill-to-workflow`). Design with di
59
59
  ### Catalog Versioning
60
60
  When rules change (additions, modifications, deprecations), version the entire rule catalog as a unit. Individual rule versions track specific rules; the catalog version tracks the coherent set. Record the catalog version in `versions.json` alongside individual rule versions.
61
61
 
62
+ ## Granularity Calibration (read before extracting)
63
+
64
+ A well-extracted rule catalog has **10-20 rules per typical regulation PDF**
65
+ (a 30-80 page disclosure regulation). Over-extraction into 60-100 rules per
66
+ regulation signals you're treating every clause as its own rule — downstream
67
+ consumers (skill-authoring, workflow-run) can't distinguish meaningful
68
+ checks from boilerplate.
69
+
70
+ If your first pass produces more than ~25 rules for a single regulation:
71
+ - **Merge rules that share evidence and fail together** (e.g., "must
72
+ disclose X" and "must disclose Y" where both come from the same
73
+ required-fields table → one rule: "must disclose the required-fields
74
+ list including X, Y").
75
+ - **Drop procedural language** that isn't checkable against a report
76
+ (definitions, scope statements, references to other regs that just
77
+ transitively apply).
78
+ - **Keep only checkable obligations, prohibitions, and thresholds** —
79
+ things where you can read a sample report and say pass or fail.
80
+
81
+ ### Sample "good" rule
82
+
83
+ ```json
84
+ {
85
+ "id": "R014",
86
+ "source_ref": "Disclosure Reg §15.2",
87
+ "description": "Quarterly reports must be disclosed within 15 business days after quarter-end.",
88
+ "applicable_sections": ["public funds"],
89
+ "severity": "high",
90
+ "machine_checkable": true,
91
+ "falsifiability_statement": "If the disclosure date is later than the 15th business day after quarter-end, the rule fails.",
92
+ "test_case_stub": "Read the quarterly report's disclosure date + the quarter-end date, compute business-day difference."
93
+ }
94
+ ```
95
+
96
+ Note: one pass/fail outcome, a single `source_ref` to a specific clause,
97
+ clear applicability scope. Skill-authoring can write `check_r014.py` from
98
+ this alone.
99
+
100
+ ### Cross-regulation dedup (when working across multiple PDFs)
101
+
102
+ If the developer user provides N regulations, rules from later regs often
103
+ duplicate cross-cutting requirements already captured by earlier ones
104
+ (e.g., a 2018 generic disclosure rule vs. a 2025 specific version).
105
+ Before emitting a rule from reg N:
106
+
107
+ 1. **Check the existing catalog.** Use `rule_catalog` (operation: list)
108
+ to see what's already there. Skip if a rule with equivalent scope +
109
+ intent exists.
110
+ 2. **Prefer the newer / more specific source_ref** when rules overlap.
111
+ 3. **If you merged rules**, record the consolidated sources in
112
+ `source_ref`: e.g., `"New Reg §15.2 + Old Reg §24"`.
113
+
114
+ ### Delegation to sub-agents
115
+
116
+ If you dispatch extraction to sub-agents (one per regulation), the
117
+ sub-agent inherits ONLY its `task_description` — it cannot see your
118
+ conversation or existing catalog. Therefore, when composing the brief:
119
+
120
+ - **Specify the target count band** explicitly: "Extract 10-20 atomic
121
+ rules from this regulation."
122
+ - **Include a sample rule** in the brief body (paste the JSON above
123
+ verbatim) so the sub-agent's calibration matches yours.
124
+ - **Name every regulation the sub-agent should process.** If AGENT.md
125
+ lists 10 core regulations, the brief must list all 10 by name, not
126
+ "the core regs" as shorthand — LLMs composing long structured briefs
127
+ frequently drop items (observed in session 6304673afaa0 where reg 02
128
+ was silently omitted).
129
+ - **State the dedup contract**: "Rules already in the parent's catalog
130
+ (R001–Rnnn) should NOT be re-extracted. If a requirement is already
131
+ covered, skip it." Then pass the current catalog's ID ranges.
132
+ - **Prefer `rule_catalog` create operations over sandbox_exec writes to
133
+ catalog.json.** rule_catalog uses workspace file locking;
134
+ sandbox_exec bypasses it and races with other writers.
135
+
62
136
  ## Extraction Strategies
63
137
 
64
138
  ### Strategy 1: Structured Input (Developer User Provides Rules)
@@ -241,12 +241,18 @@ description: Drive continuous improvement of skills and workflows through the di
241
241
 
242
242
  ### 停止条件
243
243
 
244
- 当一轮迭代同时满足以下三个条件时,停止循环:
244
+ 当一轮迭代**同时**满足以下三个条件时,停止循环:
245
245
 
246
246
  1. 修正量 < 总测试案例的 5%。
247
247
  2. 新模式数 = 0。
248
248
  3. 回归数 = 0。
249
249
 
250
+ **或者**满足单独的准确率收敛条件(D5,2026-04-23 新增):
251
+
252
+ 4. 连续两轮迭代之间的整体准确率变化 < 1 个百分点。即 `|accuracy[N+1] - accuracy[N]| < 0.01`。
253
+
254
+ 条件 4 是为了防止观察到的过度迭代模式——从 v5 一直迭代到 v12,每轮都在 0.5% 的精度范围内来回波动。当模型已经达到"足够好"时,继续迭代只会消耗 token,不会带来实质改进。一旦准确率趋于稳定,应当进入下一阶段(蒸馏/生产)。
255
+
250
256
  如果修正量在连续两轮迭代之间**增加**,这是回归信号。暂停循环,先诊断原因再继续——上一轮的修复可能正在破坏系统的稳定性。
251
257
 
252
258
  ### 预期收敛速度