kc-beta 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kc-beta",
3
- "version": "0.7.0",
3
+ "version": "0.7.1",
4
4
  "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -423,7 +423,21 @@ export class AgentEngine {
423
423
  new ScheduleFetchTool(this.workspace),
424
424
  new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
425
425
  new PhaseAdvanceTool(
426
- (to, reason, opts) => this._advancePhase(to, reason, opts),
426
+ // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
427
+ // so the tool's refusal text can surface the engine telemetry
428
+ // that motivated the refusal. Internal callers of
429
+ // `_advancePhase` continue to use the bool return value
430
+ // directly; only this lambda wraps for the LLM-facing tool.
431
+ (to, reason, opts) => {
432
+ const advanced = this._advancePhase(to, reason, opts);
433
+ if (!advanced) {
434
+ let engineCounts = null;
435
+ try { engineCounts = this._buildEngineCountsBlock(this.currentPhase); }
436
+ catch { /* defensive */ }
437
+ return { advanced: false, engineCounts };
438
+ }
439
+ return { advanced: true };
440
+ },
427
441
  () => this.currentPhase, // H1: tool reads phase BEFORE its own call
428
442
  // v0.6.2 J1: surface running subagents so the tool can refuse
429
443
  // advance until the agent explicitly acknowledges them.
@@ -250,7 +250,8 @@ export function deriveSkillAuthoringMilestones(workspace) {
250
250
  export function deriveSkillTestingMilestones(workspace) {
251
251
  const cwd = cwdOf(workspace);
252
252
  const skillsDir = path.join(cwd, "rule_skills");
253
- const skillsTested = [];
253
+ // Use a Set so the v0.7.1 1a output/-side scan can add without duplicates.
254
+ const tested = new Set();
254
255
 
255
256
  if (dirExists(skillsDir)) {
256
257
  for (const e of listChildDirs(skillsDir)) {
@@ -266,14 +267,68 @@ export function deriveSkillTestingMilestones(workspace) {
266
267
  fileExists(path.join(skillPath, "assets", "test_cases.json")) ||
267
268
  listChildFiles(skillPath).some((f) =>
268
269
  /^(test|.*_test)_(output|result|log)/i.test(f.name) && f.name.endsWith(".json"));
269
- if (hasTestArtifact) skillsTested.push(e.name);
270
+ if (hasTestArtifact) tested.add(e.name);
270
271
  }
271
272
  }
272
273
 
274
+ // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
275
+ // Agents naturally write batch-test results to output/, not per-skill
276
+ // paths. v0.6.x's _loadTestResults already reads here on the canonical
277
+ // accuracy schema; this expands the helper-derived milestone to
278
+ // recognize the same shape (plus the GLM/DS-shape variants seen in
279
+ // E2E #6 v070). Without this, agents who run tests via sandbox_exec
280
+ // and persist to output/ saw skillsTested=0 and force-bypassed.
281
+ const collectFromJsonFile = (data) => {
282
+ if (!data) return;
283
+ if (data.rule_id) tested.add(data.rule_id);
284
+ if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
285
+ for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
286
+ }
287
+ if (data.results && typeof data.results === "object") {
288
+ for (const k of Object.keys(data.results)) tested.add(k);
289
+ }
290
+ };
291
+
292
+ const outputDir = path.join(cwd, "output");
293
+ if (dirExists(outputDir)) {
294
+ for (const f of listChildFiles(outputDir)) {
295
+ if (!f.name.endsWith(".json")) continue;
296
+ collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
297
+ }
298
+ // One level into output/results/, output/distillation/, and output/qc/ —
299
+ // results/ and distillation/ are the two most common batch-result locations across E2E #5 and v070 sessions.
300
+ for (const sub of ["results", "distillation", "qc"]) {
301
+ const subDir = path.join(outputDir, sub);
302
+ if (!dirExists(subDir)) continue;
303
+ for (const f of listChildFiles(subDir)) {
304
+ if (!f.name.endsWith(".json")) continue;
305
+ collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
306
+ }
307
+ // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
308
+ // — walk one more level for that pattern.
309
+ for (const child of listChildDirs(subDir)) {
310
+ for (const f of listChildFiles(path.join(subDir, child.name))) {
311
+ if (!f.name.endsWith(".json")) continue;
312
+ collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
313
+ }
314
+ }
315
+ }
316
+ }
317
+
318
+ // DS v070 wrote a top-level aggregate at either rules/test_results.json
319
+ // OR rule_skills/test_results.json. Both seen in the wild; check both, plus a workspace-root test_results.json.
320
+ for (const candidate of [
321
+ path.join(cwd, "rules", "test_results.json"),
322
+ path.join(cwd, "rule_skills", "test_results.json"),
323
+ path.join(cwd, "test_results.json"),
324
+ ]) {
325
+ if (fileExists(candidate)) collectFromJsonFile(readJsonSafe(candidate));
326
+ }
327
+
273
328
  // skillsPassing — per-skill accuracy threshold. Without a uniform
274
329
  // schema across agent outputs we report `tested` as the floor; the
275
330
  // pipeline's existing _loadTestResults() can layer accuracy on top.
276
- return { skillsTested };
331
+ return { skillsTested: [...tested] };
277
332
  }
278
333
 
279
334
  // ───────────────────────────────────────────────────────────────────
@@ -205,6 +205,33 @@ export class RuleExtractionPipeline extends Pipeline {
205
205
  return `workflow_run is SKILL_TESTING/PRODUCTION_QC-phase work, but engine is in RULE_EXTRACTION. ${exitText}`;
206
206
  }
207
207
 
208
+ // v0.7.1 2a/2b: when agent attempts phase_advance from rule_extraction,
209
+ // surface advisories for the two soft-but-load-bearing artifacts the
210
+ // gate criteria require (chunk_refs and coverage_audit). v0.7.0 GLM
211
+ // session forced through with both missing — gate refused for the
212
+ // right reason but the refusal text was generic. Name them inline.
213
+ if (toolName === "phase_advance" && toolInput?.to === "skill_authoring") {
214
+ const advisories = [];
215
+ if (this.rulesExtracted.length > 0 && this.rulesWithChunkRefs.length === 0) {
216
+ advisories.push(
217
+ `Advancing rule_extraction with rulesWithChunkRefs=0/${this.rulesExtracted.length}. ` +
218
+ `The skill_authoring phase's prompts use source_chunk_ids to ground ` +
219
+ `skill explanations against regulation text. Without them, skill authoring ` +
220
+ `runs blind. Either populate chunk refs via the rule_catalog tool, or ` +
221
+ `accept that skill_authoring's generated content won't cite source regulation.`,
222
+ );
223
+ }
224
+ if (this.rulesExtracted.length > 0 && !this.coverageAudited) {
225
+ advisories.push(
226
+ `Advancing rule_extraction without rules/coverage_audit.md (or .json). ` +
227
+ `Coverage audit identifies regulation articles you didn't extract a rule ` +
228
+ `for — without it, gaps go silent through to production. If your ` +
229
+ `extraction is genuinely complete, write a one-paragraph audit confirming so.`,
230
+ );
231
+ }
232
+ if (advisories.length > 0) return advisories.join("\n\n");
233
+ }
234
+
208
235
  return null;
209
236
  }
210
237
 
@@ -14,6 +14,11 @@ export class SkillTestingPipeline extends Pipeline {
14
14
  this.iterationCount = 0;
15
15
  this._accuracyThreshold = 0.9;
16
16
  this._maxIterations = 20;
17
+ // v0.7.1 1b: rate-limit phaseMisfitHint firing for ephemeral
18
+ // sandbox tests. Caps at ~3 nudges per phase entry so the agent
19
+ // sees the path expectation but doesn't get spammed during a
20
+ // batch run.
21
+ this._misfit_nudge_count = 0;
17
22
  this._scanWorkspace();
18
23
  }
19
24
 
@@ -132,6 +137,12 @@ export class SkillTestingPipeline extends Pipeline {
132
137
  * v0.6.3 (#74): SKILL_TESTING runs check scripts against test samples and
133
138
  * measures accuracy. Writing distillation outputs or production results
134
139
  * here means phase boundaries got skipped.
140
+ *
141
+ * v0.7.1 1b: also nudges agents who run check scripts via sandbox_exec
142
+ * but don't persist verdicts. E2E #6 v070 surfaced this — both
143
+ * conductors batched tests in one sandbox_exec call, read pass/fail
144
+ * from stdout, then declared "testing done" while engine saw
145
+ * skillsTested=0 because nothing landed in a recognized path.
135
146
  */
136
147
  phaseMisfitHint(toolName, toolInput, result) {
137
148
  if (result?.isError) return null;
@@ -148,6 +159,34 @@ export class SkillTestingPipeline extends Pipeline {
148
159
  return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_TESTING. ${exitText}`;
149
160
  }
150
161
  }
162
+
163
+ // v0.7.1 1b: sandbox_exec test-command nudge
164
+ if (toolName === "sandbox_exec") {
165
+ const cmd = String(toolInput?.command || "");
166
+ const looksLikeTest =
167
+ /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
168
+ /pytest|unittest|run_tests/i.test(cmd) ||
169
+ /python.*workflow.*\.py.*samples/i.test(cmd);
170
+ if (!looksLikeTest) return null;
171
+
172
+ const tested = Object.keys(this.skillsTested).length;
173
+ const total = this.skillsToTest.length;
174
+ // Already satisfied? Don't nudge.
175
+ if (total === 0 || tested >= total) return null;
176
+
177
+ // Rate-limit: ~3 per phase. Counter resets on phase entry
178
+ // (constructor) and on importState if available.
179
+ this._misfit_nudge_count = (this._misfit_nudge_count || 0) + 1;
180
+ if (this._misfit_nudge_count > 3) return null;
181
+
182
+ return (
183
+ `Engine derives skillsTested from rule_skills/<id>/test_results.json, ` +
184
+ `rule_skills/<id>/tests/, OR output/*.json with rule_id field. ` +
185
+ `Sandbox runs are ephemeral — record per-rule verdicts to one of ` +
186
+ `those paths before phase_advance. Currently engine sees ` +
187
+ `${tested}/${total} skills tested.`
188
+ );
189
+ }
151
190
  return null;
152
191
  }
153
192
 
@@ -15,7 +15,11 @@ const VALID_PHASES = new Set(Object.values(Phase));
15
15
  */
16
16
  export class PhaseAdvanceTool extends BaseTool {
17
17
  /**
18
- * @param {(to: string, reason: string, opts: {force?: boolean}) => boolean} advanceFn
18
+ * @param {(to: string, reason: string, opts: {force?: boolean}) => {advanced: boolean, engineCounts?: string|null}} advanceFn
19
+ * v0.7.1 2c: returns the rich object so the tool can surface engine
20
+ * telemetry in the refusal text. Internal engine callers of
21
+ * `_advancePhase` still get the bool; only this LLM-facing tool
22
+ * uses the wrapped form.
19
23
  * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
20
24
  * engine's phase BEFORE the call, so it can distinguish "already there"
21
25
  * (silent no-op, informational) from "non-adjacent refusal" (actionable).
@@ -91,7 +95,11 @@ export class PhaseAdvanceTool extends BaseTool {
91
95
  );
92
96
  }
93
97
 
94
- const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
98
+ // v0.7.1 2c: advanceFn returns {advanced, engineCounts?} so we can
99
+ // surface telemetry in the refusal text below. Internal callers of
100
+ // _advancePhase still get bool; only this LLM-facing tool unwraps.
101
+ const advanceResult = this._advance(to, input.reason || "agent request", { force: !!input.force });
102
+ const advanced = !!advanceResult?.advanced;
95
103
  if (advanced) {
96
104
  // Log the ack so post-mortems can find phase advances that proceeded
97
105
  // with live subagents
@@ -113,9 +121,18 @@ export class PhaseAdvanceTool extends BaseTool {
113
121
  // immediately (12/12 transitions). The escape valve remains in the input
114
122
  // schema (discoverable) but isn't hand-fed to the LLM here. Instead,
115
123
  // direct the agent at the missing milestones it can satisfy.
124
+ //
125
+ // v0.7.1 2c: include engineCounts when available so the agent sees
126
+ // exactly which milestones the gate is reading and can satisfy them.
127
+ // E2E #6 v070 showed the generic "check /status" hint wasn't concrete
128
+ // enough — agents forced through. Naming the gap inline reduces that.
129
+ const engineCountsLine = advanceResult?.engineCounts
130
+ ? `\nEngine telemetry: ${advanceResult.engineCounts}`
131
+ : "";
132
+
116
133
  return new ToolResult(
117
134
  `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
118
- `Likely cause: source-phase exit criteria not met. ` +
135
+ `Likely cause: source-phase exit criteria not met.${engineCountsLine}\n\n` +
119
136
  `Run /status (or read the phase describeState block in this turn's system reminder) ` +
120
137
  `to see which milestones are missing, then produce the disk artifacts that satisfy them — ` +
121
138
  `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
@@ -101,6 +101,52 @@ The v0.6.2 D2 anti-pattern wording captures the failure case clearly:
101
101
 
102
102
  That came from E2E #4 where one conductor wrote a 2,400-line `unified_qc.py` that ran all rules at once. It produced 1,150 ERROR verdicts (16.6%) because every rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
103
103
 
104
+ ### Anti-pattern: stub check.py + real workflow.py
105
+
106
+ Do NOT make `rule_skills/<id>/check.py` a stub that defers to
107
+ `workflows/<id>/workflow.py`. KC's intent: SKILL.md + check.py is the
108
+ **canonical** verification. workflow.py is the **distilled, cheaper**
109
+ form (regex baseline + LLM fallback). The relationship is
110
+ skill → workflow, not workflow → skill.
111
+
112
+ ❌ DON'T:
113
+ ```python
114
+ # rule_skills/R001/check.py — STUB, real logic elsewhere
115
+ def check(text):
116
+ rule_ids = re.findall(r"R\d{3}", load_skill())
117
+ return {rid: {"pass": None, "method": "stub",
118
+ "note": "to be implemented later"} for rid in rule_ids}
119
+ # real verification logic only in workflows/R001/workflow_v1.py
120
+ ```
121
+
122
+ ✅ DO:
123
+ ```python
124
+ # rule_skills/R001/check.py — canonical verification
125
+ def check(text):
126
+ matches = re.findall(r"...", text) # actual rule logic
127
+ return {"rule_id": "R001", "passed": bool(matches),
128
+ "evidence": matches[:3], "method": "regex"}
129
+
130
+ # workflows/R001/workflow_v1.py — distilled, cheaper form
131
+ def run(text, llm_fn=None):
132
+ result = check(text) # baseline from skill
133
+ if not result["passed"] and llm_fn:
134
+ result = llm_verify(text, llm_fn) # escalate on fail
135
+ return result
136
+ ```
137
+
138
+ Why it matters: distillation phase consumers (release tool, run.py
139
+ harness) load workflow.py. If check.py is a stub, the skill's
140
+ methodology (SKILL.md) becomes documentation-only and the
141
+ verification logic is scattered across N workflow files. Future
142
+ iterations of the skill (changes to regulation interpretation, edge
143
+ cases discovered in production) need a single canonical place to
144
+ update — the skill — not N workflows that have drifted independently.
145
+
146
+ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
147
+ all returned `{"pass": null, "method": "stub"}` deferring to
148
+ workflows/). v0.7.1 added this anti-pattern explicitly.
149
+
104
150
  ### Naming convention for grouped checks
105
151
 
106
152
  When you do bundle, name the file with the explicit range:
@@ -263,4 +309,18 @@ When entering skill_authoring with an empty TaskBoard:
263
309
  5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
264
310
  6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
265
311
 
312
+ ### Why PATTERNS.md FIRST, before any skill code
313
+
314
+ If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided Shannon-Huffman; first hard rule R028 will dictate verdict shape; sample corpus has bilingual table headings") sets the framework. You'll save 4× the time later by not re-deriving the same shapes for every rule.
315
+
316
+ ❌ "I'll write the skills first, then PATTERNS.md when I have insights."
317
+
318
+ By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier — each rule re-derived them from scratch. Refactoring then requires touching N files instead of one.
319
+
320
+ ✅ "Write PATTERNS.md, even tentatively, then re-read it before each new rule. Update it when discoveries change the framework."
321
+
322
+ PATTERNS.md is your project's index card. Build it before the work, update it during the work, harvest it after.
323
+
324
+ E2E #6 v070 surfaced this: DS only wrote PATTERNS.md after a rollback intervention; the per-skill design decisions before that point were already locked in and had to be re-touched. v0.7.1 reinforced this guidance.
325
+
266
326
  The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
@@ -101,6 +101,49 @@ v0.6.2 D2 的反模式说法已经把失败情形说得很清楚了:
101
101
 
102
102
  那段话来自 E2E #4:一个指挥模型写了 2,400 行 `unified_qc.py` 一次性跑所有规则。结果出现 1,150 条 ERROR verdict(16.6%),因为每条规则的失败都连带把所有其他规则的判定也带崩了。Per-rule skill 是 KC 的粒度单元,这是有原因的。
103
103
 
104
+ ### 反模式:check.py 是 stub + workflow.py 才是真逻辑
105
+
106
+ **不要**把 `rule_skills/<id>/check.py` 写成一个把真实逻辑推迟到
107
+ `workflows/<id>/workflow.py` 的占位文件。KC 的设计意图是:SKILL.md
108
+ + check.py 是**正典**核查;workflow.py 是**蒸馏后、更便宜**的形式
109
+ (regex 优先 + LLM 回退)。关系是 skill → workflow,不是反过来。
110
+
111
+ ❌ 不要这样:
112
+ ```python
113
+ # rule_skills/R001/check.py —— STUB,真逻辑在别处
114
+ def check(text):
115
+ rule_ids = re.findall(r"R\d{3}", load_skill())
116
+ return {rid: {"pass": None, "method": "stub",
117
+ "note": "待技能测试阶段实现"} for rid in rule_ids}
118
+ # 实际核查逻辑只在 workflows/R001/workflow_v1.py 里
119
+ ```
120
+
121
+ ✅ 应该这样:
122
+ ```python
123
+ # rule_skills/R001/check.py —— 正典核查
124
+ def check(text):
125
+ matches = re.findall(r"...", text) # 真实规则逻辑
126
+ return {"rule_id": "R001", "passed": bool(matches),
127
+ "evidence": matches[:3], "method": "regex"}
128
+
129
+ # workflows/R001/workflow_v1.py —— 蒸馏后的便宜形式
130
+ def run(text, llm_fn=None):
131
+ result = check(text) # skill 提供基线
132
+ if not result["passed"] and llm_fn:
133
+ result = llm_verify(text, llm_fn) # FAIL 时升级到 LLM
134
+ return result
135
+ ```
136
+
137
+ 为什么重要:蒸馏阶段下游消费者(release 工具、run.py 运行器)加载
138
+ 的是 workflow.py。如果 check.py 是 stub,skill 的方法论(SKILL.md)
139
+ 就只剩文档作用,而核查逻辑被分散到 N 个 workflow 文件里。后续对
140
+ skill 的迭代(法规解释变化、生产中发现的边缘情形)需要一个**正典
141
+ 位置**来更新——也就是 skill——而不是 N 个已经各自漂移的 workflow。
142
+
143
+ E2E #6 v070 暴露了这个反模式(DS 把所有 bundled skill 的 check.py
144
+ 都写成 `{"pass": null, "method": "stub"}` 推给 workflows/)。
145
+ v0.7.1 把这个反模式显式写进 skill。
146
+
104
147
  ### 合并 check 的命名约定
105
148
 
106
149
  确实需要合并时,文件名要把范围写明:
@@ -261,4 +304,18 @@ PATTERNS.md 全文控制在约 5 KB 之内。超过时,剪掉最不可执行
261
304
  5. **挑第一个任务**。做到完整(skill + check + 至少一次本地测试)。把学到的写进 PATTERNS.md。换下一个任务。
262
305
  6. **任务做到第 5 个、第 10 个时**:停下来重读 PATTERNS.md。如果新积累的 pattern 暗示要重构早期工作,**现在做**(便宜)而不是更晚(昂贵)。
263
306
 
307
+ ### 为什么 PATTERNS.md 要先写、写在 skill 代码之前
308
+
309
+ 如果你在 PATTERNS.md 还不存在的时候就开始写 skill 代码(rule_skills/<id>/check.py),**停**。哪怕只是 200 字节的初始 PATTERNS.md("决定走 Shannon-Huffman;第一条难规则 R028 决定 verdict 形状;样本语料表头中英双语")也能搭起框架。后续每条规则少重新推导一次同样的形状,整体能省 4 倍时间。
310
+
311
+ ❌ "我先把 skill 写完,等有洞察再写 PATTERNS.md。"
312
+
313
+ 到你写完 N 个 skill 时,你已经做了 N 个隐式决定(verdict 形状、chunker 边界、worker tier)——每条规则都是从零推导。重构需要碰 N 个文件,而不是一个。
314
+
315
+ ✅ "先写 PATTERNS.md(哪怕是初步的),写每条新规则之前先重读,发现新东西就回头更新。"
316
+
317
+ PATTERNS.md 是项目的索引卡片。工作之前搭好它、工作中更新它、工作之后从中收割。
318
+
319
+ E2E #6 v070 暴露了这个:DS 在用户介入回退之后才写 PATTERNS.md,而那之前每条 skill 的设计决定都已经各自固化、之后还要再碰一遍。v0.7.1 把这个引导写得更明确。
320
+
264
321
  引擎从文件系统推导里程碑(v0.7.0 Group A)会按磁盘事实核验覆盖率,无论你怎么切分工作。TaskBoard 是你的草稿;磁盘才是契约。