kc-beta 0.7.1 → 0.7.2

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "kc-beta",
-  "version": "0.7.1",
+  "version": "0.7.2",
   "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
   "type": "module",
   "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
 import { ArchiveFileTool } from "./tools/archive-file.js";
 import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
 import { ReleaseTool } from "./tools/release.js";
+import { readKcVersion } from "../util/kc-version.js";
 import { PhaseAdvanceTool } from "./tools/phase-advance.js";
 import { DocumentParseTool } from "./tools/document-parse.js";
 import { DocumentSearchTool } from "./tools/document-search.js";
@@ -421,7 +422,7 @@ export class AgentEngine {
       new SnapshotTool(this.workspace),
       new ArchiveFileTool(this.workspace),
       new ScheduleFetchTool(this.workspace),
-      new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
+      new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
       new PhaseAdvanceTool(
         // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
         // so the tool's refusal text can surface the engine telemetry
@@ -1679,7 +1680,24 @@ export class AgentEngine {
         parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
         break;
       }
-      // bootstrap / finalization: no specific counters, fall through
+      case "bootstrap": {
+        // v0.7.2 1e: previously fell through to empty string. Both
+        // v0.7.1 audit runs had bootstrap → rule_extraction refusals
+        // with engineCounts: "" — agent saw the refusal but had no
+        // engine telemetry to react to. The InitializerPipeline tracks
+        // boolean checklist flags rather than numeric counters; we
+        // surface those flags as "yes/no" so the agent can see which
+        // bootstrap criterion is missing.
+        if (typeof pipeline.describeBootstrapChecklist === "function") {
+          const cl = pipeline.describeBootstrapChecklist();
+          parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
+          parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
+          parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
+          parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
+        }
+        break;
+      }
+      // finalization: no specific counters, fall through
     }
   } catch { /* never let summary build break phase advance */ }
   return parts.join(", ");
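With the checklist wired through, a refused bootstrap → rule_extraction advance now carries telemetry of the form `workspaceCreated: yes, configReady: yes, hasRegulations: no, hasSamples: no` (illustrative flag values) instead of the empty `engineCounts: ""` string both v0.7.1 audit runs saw.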
@@ -57,16 +57,21 @@ function listChildFiles(p) {
 // Walk a directory recursively, yielding every file path. Skips hidden
 // dirs/files and __pycache__. Used by derive functions that need to
 // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
-function* walkFiles(root) {
+//
+// v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
+// direct children; depth=1 is one level down. Default unbounded
+// (existing callers).
+function* walkFiles(root, { maxDepth } = {}) {
   if (!dirExists(root)) return;
-  const stack = [root];
+  const stack = [{ dir: root, depth: 0 }];
   while (stack.length) {
-    const dir = stack.pop();
+    const { dir, depth } = stack.pop();
     for (const e of readDirSafe(dir)) {
       if (e.name.startsWith(".") || e.name === "__pycache__") continue;
       const p = path.join(dir, e.name);
-      if (e.isDirectory()) stack.push(p);
-      else if (e.isFile()) yield p;
+      if (e.isDirectory()) {
+        if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
+      } else if (e.isFile()) yield p;
     }
   }
 }
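Call sites that want the cap pass it explicitly, as the milestone-derivation code below now does with `walkFiles(outputDir, { maxDepth: 6 })`; existing callers that omit the option keep the old unbounded walk.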
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
     }
   }
 
-  // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
-  // Agents naturally write batch-test results to output/, not per-skill
-  // paths. v0.6.x's _loadTestResults already reads here on the canonical
-  // accuracy schema; this expands the helper-derived milestone to
-  // recognize the same shape (plus the GLM/DS-shape variants seen in
-  // E2E #6 v070). Without this, agents who run tests via sandbox_exec
-  // and persist to output/ saw skillsTested=0 and force-bypassed.
-  const collectFromJsonFile = (data) => {
-    if (!data) return;
-    if (data.rule_id) tested.add(data.rule_id);
-    if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
-      for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
+  // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
+  // under output/*.json. Agents persist batch-test results in
+  // conductor-specific shapes (this is the recurring drift point:
+  // engine derivation has to match disk reality, not the other way
+  // around). Shapes seen across E2E #5/6/7:
+  //
+  // - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
+  //   entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
+  //   skill_test_阳光资产.json with {doc, results: {R019a: ...}}
+  // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
+  //   full_test_results_v[1-6].json as {sample_id: {path, meta,
+  //   results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
+  //   v0.7.1's shallow walk missed them)
+  //
+  // The collector recurses (depth-limited) and uses two heuristics to
+  // separate rule_ids from sample_ids / doc_names:
+  //   1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
+  //      (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
+  //   2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
+  //      keys signal that the parent dict's keys are rule_ids
+  const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+  const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+  const looksLikeVerdict = (v) =>
+    v && typeof v === "object" && !Array.isArray(v) && (
+      v.verdict !== undefined ||
+      v.passed !== undefined ||
+      v.pass !== undefined ||
+      typeof v.PASS === "number" ||
+      typeof v.FAIL === "number"
+    );
+  const collectFromJsonFile = (data, depth = 0) => {
+    if (!data || depth > 4) return;
+    if (typeof data !== "object") return;
+    if (Array.isArray(data)) {
+      for (const r of data) collectFromJsonFile(r, depth + 1);
+      return;
+    }
+    // {rule_id: "X"} or {id: "R001"} on a rule entry
+    if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
+    if (isRuleIdShape(data.id)) tested.add(data.id);
+    // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
+    for (const [k, v] of Object.entries(data)) {
+      if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
     }
-    if (data.results && typeof data.results === "object") {
-      for (const k of Object.keys(data.results)) tested.add(k);
+    // {results: {<rule_id>: ...}} (keys must look rule-id-shaped)
+    if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
+      for (const k of Object.keys(data.results)) {
+        if (isRuleIdShape(k)) tested.add(k);
+      }
+    }
+    // Recurse into nested objects (handles {sample_id: {results: {...}}})
+    for (const v of Object.values(data)) {
+      if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
     }
   };
 
   const outputDir = path.join(cwd, "output");
-  if (dirExists(outputDir)) {
-    for (const f of listChildFiles(outputDir)) {
-      if (!f.name.endsWith(".json")) continue;
-      collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
-    }
-    // One level into output/results/, output/distillation/ — the two
-    // most common batch-result locations across E2E #5 and v070 sessions.
-    for (const sub of ["results", "distillation", "qc"]) {
-      const subDir = path.join(outputDir, sub);
-      if (!dirExists(subDir)) continue;
-      for (const f of listChildFiles(subDir)) {
-        if (!f.name.endsWith(".json")) continue;
-        collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
-      }
-      // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
-      // — walk one more level for that pattern.
-      for (const child of listChildDirs(subDir)) {
-        for (const f of listChildFiles(path.join(subDir, child.name))) {
-          if (!f.name.endsWith(".json")) continue;
-          collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
-        }
-      }
-    }
+  for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
+    if (!p.endsWith(".json")) continue;
+    collectFromJsonFile(readJsonSafe(p));
   }
 
   // DS v070 wrote a top-level aggregate at either rules/test_results.json
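To make the collector's two heuristics concrete, here is a minimal Python rendering of the same shape tests; the artifact literal is illustrative, not from a real run:

```python
import re

RULE_ID = re.compile(r"^[A-Za-z][A-Za-z0-9_-]{0,29}$")

def is_rule_id_shape(s):
    # Heuristic 1: letter-led, at most 30 chars, and must contain a digit.
    return isinstance(s, str) and bool(RULE_ID.match(s)) and any(c.isdigit() for c in s)

def looks_like_verdict(v):
    # Heuristic 2: verdict-ish keys on a value mark the parent dict's
    # keys as rule_ids.
    return isinstance(v, dict) and (
        "verdict" in v or "passed" in v or "pass" in v
        or isinstance(v.get("PASS"), int) or isinstance(v.get("FAIL"), int)
    )

# Illustrative artifact: a sample_id wrapping per-rule verdicts two levels down.
doc = {"sample_001": {"results": {"D01-01": {"verdict": "PASS"}}}}
assert is_rule_id_shape("D01-01") and is_rule_id_shape("R001")
assert not is_rule_id_shape("06f2ed1488")        # digit-led: rejected
assert not is_rule_id_shape("quarterly_report")  # no digits: rejected
assert looks_like_verdict(doc["sample_001"]["results"]["D01-01"])
```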
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
 import { Pipeline } from "./base.js";
 import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
 import { deriveFinalizationMilestones } from "./_milestone-derive.js";
+import { readKcVersion } from "../../util/kc-version.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
   }
 
   _readKcVersion() {
-    try {
-      const pkg = JSON.parse(fs.readFileSync(
-        path.resolve(__dirname, "../../../package.json"), "utf-8",
-      ));
-      return pkg.version || "unknown";
-    } catch { return "unknown"; }
+    return readKcVersion();
   }
 
   /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
     return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
   }
 
+  // v0.7.2 1e: surface the checklist as engine telemetry so
+  // `_buildEngineCountsBlock("bootstrap")` has something to report when
+  // bootstrap → rule_extraction is refused. Agent sees the missing
+  // criteria directly in the refusal text.
+  describeBootstrapChecklist() {
+    return {
+      workspaceCreated: !!this.workspaceCreated,
+      configReady: !!this.configReady,
+      hasRegulations: !!this.hasRegulations,
+      hasSamples: !!this.hasSamples,
+    };
+  }
+
   /**
    * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
    * phase. Bootstrap is setup — reading rules/samples, configuring keys,
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
       path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
     this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
       path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
-    this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
+    // v0.7.2 1c: auto-aggregate from output/ if no calibration file at
+    // workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
+    // with empty `historical_accuracy: {}` despite having per-rule QC
+    // data on disk under output/ — the release tool just passed the
+    // file through and emitted a stub on miss. We try to populate from
+    // known QC artifact shapes here; if nothing matches, fall through
+    // to the existing stub fallback.
+    const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
+    if (!fs.existsSync(calibSrc)) {
+      const aggregated = this._aggregateAccuracyFromOutput();
+      if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
+        fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
+      }
+    }
+    this._copyIfExists(calibSrc,
       path.join(bundleAbs, "confidence_calibration.json"),
       { fallback: '{"historical_accuracy":{}}\n' });
 
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
       .replace(/\{RULES_LIST\}/g, rulesList);
     fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
 
+    // v0.7.2 1d: clean up the template scaffold dir if a customized
+    // release was just written alongside it. Both v0.7.1 audit runs
+    // shipped with `output/releases/v1/` (template-derived, .tmpl
+    // files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
+    // — the customized release. The pre-scaffold is meant as a hint;
+    // once the agent calls `release(label="v1-0")` and we've written
+    // the real bundle, the unedited scaffold is just clutter.
+    //
+    // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
+    // didn't just write to v1/ ourselves, AND (b) it still contains
+    // .tmpl files (signature of unedited template). If the agent
+    // intentionally edited v1/ in place (removing .tmpl), our cleanup
+    // leaves it alone.
+    if (slug !== "v1") {
+      const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
+      if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
+        let hasTmpl = false;
+        try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
+        if (hasTmpl) {
+          try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
+        }
+      }
+    }
+
     // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
     const lines = [
       `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
     return null;
   }
 
+  // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
+  // accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
+  //
+  //   rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
+  //     (GLM produced 4 versions; pick the highest)
+  //   full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
+  //     (GLM; accumulate verdicts per rule across samples)
+  //   skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
+  //
+  // Returns null if no recognized artifact, or an object with
+  // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
+  // suitable for confidence_calibration.json.
+  _aggregateAccuracyFromOutput() {
+    const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+    const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+    const tally = new Map(); // rule_id -> {pass, fail, na, n}
+    const sourceFiles = [];
+    const bump = (rid, kind) => {
+      if (!isRuleId(rid)) return;
+      const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+      t[kind] += 1;
+      t.n += 1;
+      tally.set(rid, t);
+    };
+    const outputDir = path.join(this._workspace.cwd, "output");
+    if (!fs.existsSync(outputDir)) return null;
+
+    // Collect all .json files under output/ (depth limited)
+    const files = [];
+    const stack = [{ dir: outputDir, depth: 0 }];
+    while (stack.length) {
+      const { dir, depth } = stack.pop();
+      let entries;
+      try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
+      for (const e of entries) {
+        if (e.name.startsWith(".") || e.name === "__pycache__") continue;
+        const p = path.join(dir, e.name);
+        if (e.isDirectory()) {
+          if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
+        } else if (e.isFile() && e.name.endsWith(".json")) {
+          files.push({ path: p, name: e.name });
+        }
+      }
+    }
+
+    // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
+    const ruleStatsFiles = files
+      .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
+      .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+      .sort((a, b) => b.ver - a.ver);
+    if (ruleStatsFiles.length > 0) {
+      const top = ruleStatsFiles[0];
+      try {
+        const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
+        for (const [rid, stats] of Object.entries(d)) {
+          if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
+          const pass = stats.PASS | 0, fail = stats.FAIL | 0;
+          const na = stats.NOT_APPLICABLE | stats.NA | 0;
+          const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+          t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
+          tally.set(rid, t);
+        }
+        sourceFiles.push(path.relative(this._workspace.cwd, top.path));
+      } catch { /* fall through to other shapes */ }
+    }
+
+    // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
+    if (tally.size === 0) {
+      const ftrFiles = files
+        .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
+        .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+        .sort((a, b) => b.ver - a.ver);
+      for (const f of ftrFiles.slice(0, 1)) {
+        try {
+          const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
+          for (const sample of Object.values(d)) {
+            if (!sample || typeof sample !== "object") continue;
+            const results = sample.results;
+            if (!results || typeof results !== "object") continue;
+            for (const [rid, r] of Object.entries(results)) {
+              if (!isRuleId(rid) || !r || typeof r !== "object") continue;
+              const verdict = (r.verdict || "").toString().toUpperCase();
+              if (verdict === "PASS") bump(rid, "pass");
+              else if (verdict === "FAIL") bump(rid, "fail");
+              else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
+            }
+          }
+          sourceFiles.push(path.relative(this._workspace.cwd, f.path));
+        } catch { /* try next shape */ }
+      }
+    }
+
+    if (tally.size === 0) return null;
+
+    const historical_accuracy = {};
+    for (const [rid, t] of tally.entries()) {
+      const fired = t.pass + t.fail;
+      historical_accuracy[rid] = {
+        pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
+        n_passed: t.pass,
+        n_failed: t.fail,
+        n_not_applicable: t.na,
+        n_samples: t.n,
+      };
+    }
+    return {
+      historical_accuracy,
+      computed_at: new Date().toISOString(),
+      source_files: sourceFiles,
+    };
+  }
+
   _readWorkerTiers() {
     const envPath = path.join(this._workspace.cwd, ".env");
     const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
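For reference, a calibration file produced by this aggregation would be shaped roughly as follows (rule id, counts, and timestamp are illustrative):

```json
{
  "historical_accuracy": {
    "D01-01": {
      "pass_rate": 0.9222,
      "n_passed": 83,
      "n_failed": 7,
      "n_not_applicable": 0,
      "n_samples": 90
    }
  },
  "computed_at": "2026-01-15T00:00:00.000Z",
  "source_files": ["output/qc/rule_stats_v4.json"]
}
```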
@@ -0,0 +1,27 @@
+// Single source of truth for the live KC CLI version string.
+//
+// Reads package.json once. Used by engine.js (passed to ReleaseTool so
+// release manifests stamp the correct version) and by
+// pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
+//
+// Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"` which leaked
+// into every release manifest's `kc_beta_version` field regardless of
+// the actual package version. Both v0.7.1 audit runs (DS + GLM)
+// surfaced this. Reading package.json closes the gap.
+
+import fs from "node:fs";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+export function readKcVersion() {
+  try {
+    const pkgPath = path.resolve(__dirname, "../../package.json");
+    const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
+    return pkg.version || "unknown";
+  } catch {
+    return "unknown";
+  }
+}
@@ -223,6 +223,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
 Do not skip ambiguous rules. They are often the most important ones.
 
+## Sanity-check applicability against the sample corpus
+
+After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+The check:
+1. Walk `samples/` and classify each sample by product type / report type / document format
+2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+If many rules are 0-sample:
+- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just the body, relax the scope filter
+- **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+- **Update the test corpus** to include matching samples (work with the developer user)
+
+Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
 ## When Rules Change
 
 Regulations evolve. When the developer user adds new or updated regulation documents:
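A minimal sketch of the projection step described above, in Python. The catalog layout, the `applicability`/`product_types` field names, and the trivial classifier are assumptions for illustration; the skill deliberately leaves the catalog shape up to the project:

```python
import json
from pathlib import Path

def classify(sample_path: Path) -> str:
    # Assumed stand-in: a real classifier would read the document and
    # tag product type / report type / format.
    return "bank_wealth" if "bank" in sample_path.name else "trust"

def project_applicability(catalog_path: str, samples_dir: str) -> dict:
    # Assumes the catalog is a list of {id, applicability: {product_types: [...]}}.
    rules = json.loads(Path(catalog_path).read_text(encoding="utf-8"))
    tags = [classify(p) for p in Path(samples_dir).rglob("*") if p.is_file()]
    counts = {}
    for rule in rules:
        wanted = set(rule.get("applicability", {}).get("product_types", []))
        counts[rule["id"]] = sum(1 for t in tags if not wanted or t in wanted)
    return counts

# Rules with a 0 count are either genuinely out of corpus scope or over-constrained:
# zero = [rid for rid, n in project_applicability("rules/catalog.json", "samples").items() if n == 0]
```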
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
 ### The hybrid approach (most common)
 Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+### When regex alone isn't enough — decision rubric
+
+Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+- **Semantic** ("is this a positive guarantee or a disclaimer?")
+- **Contextual** ("interpret this in light of the document's product type")
+- **Counterfactual** ("what should this value be, given the other fields?")
+- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+regex alone rarely suffices. Three acceptable forms:
+
+1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
+2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+### Worker LLM cost-aware tier choice
+
+If you do escalate to LLM:
+- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+- **tier2-3**: bulk extraction with simple semantic checks
+- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on them. If you see empty responses, bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
+
+Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
 ## Workflow Structure
 
 A workflow is a Python file (or small set of files) in `workflows/`:
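A compact sketch of form 2, the hybrid shape, from the rubric above. The trigger regex and the guarantee rule are invented for the example, and `ask_worker_llm` is a hypothetical stand-in for however a project wires `worker_llm_call`; it is not a real API:

```python
import re

GUARANTEE_HINT = re.compile(r"保本|保证收益|guaranteed return")

def ask_worker_llm(tier: str, prompt: str) -> str:
    # Hypothetical placeholder for the project's worker_llm_call wiring;
    # a real implementation would dispatch to the configured tier model.
    return "disclaimer"

def check(doc_text: str, meta: dict = None) -> dict:
    """Hybrid form: cheap regex baseline first, LLM only for the ambiguous middle."""
    hits = GUARANTEE_HINT.findall(doc_text)
    if not hits:
        # No syntactic trigger at all: cheap PASS, zero LLM spend.
        return {"verdict": "PASS", "method": "regex"}
    # Triggered text may be a disclaimer ("不保本") rather than a positive
    # guarantee: a semantic call, so escalate to the worker tier.
    answer = ask_worker_llm(
        tier="tier2",
        prompt="Is this a positive guarantee or a disclaimer?\n" + doc_text[:2000],
    )
    verdict = "FAIL" if "guarantee" in answer.lower() else "PASS"
    return {"verdict": verdict, "method": "regex+llm", "matches": hits}
```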
@@ -147,6 +147,41 @@ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
 all returned `{"pass": null, "method": "stub"}` deferring to
 workflows/). v0.7.1 added this anti-pattern explicitly.
 
+E2E #7 v071 showed the teaching prevented the stub anti-pattern in
+both conductors (no `{"pass": null}` patterns in either run), but
+**DS still inverted the canonical-vs-distilled relationship**: DS's
+6 thematic skill folders had SKILL.md only (no check.py), with the
+real verification code living in `workflows/<skill>/check.py`. The
+absence of stubs is good; the inversion is not — editing a rule then
+requires touching both SKILL.md (the doc) and the workflow check.py
+(the code). Single source of truth is lost.
+
+GLM v071 by contrast landed the canonical pattern: 97/97 skills had
+both SKILL.md AND a real `check.py` (median 143 LOC of regex +
+applicability logic), and `workflows/<id>/workflow_v1.py` was a
+50-line thin wrapper that imported and called it:
+
+```python
+# workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
+import importlib.util, json
+from pathlib import Path
+
+def run(doc_text: str, meta: dict = None) -> dict:
+    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+    spec = importlib.util.spec_from_file_location("check", check_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    result = mod.check(doc_text, meta)
+    result["_workflow"] = "D01-01_v1"
+    return result
+```
+
+This is the v0.7.2+ canonical pattern: the workflow is a shim that
+points at the skill's check.py. To iterate on a rule's verification,
+edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2
+clarifies the teaching: avoid stubs AND keep the canonical
+relationship (the skill is canonical, the workflow is a distilled wrapper).
+
 ### Naming convention for grouped checks
 
 When you do bundle, name the file with the explicit range:
@@ -309,18 +344,26 @@ When entering skill_authoring with an empty TaskBoard:
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
 
-### Why PATTERNS.md FIRST, before any skill code
+### Persisted methodology — PATTERNS.md OR phase logs OR AGENT.md decisions
+
+The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
+
+Three formats, each defensible. Pick one and stick with it:
+
+- **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with a clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
 
-If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided Shannon-Huffman; first hard rule R028 will dictate verdict shape; sample corpus has bilingual table headings") sets the framework. You'll save the time later not re-deriving the same shapes per rule.
+- **`logs/phase_<name>_complete.md` per phase** — incremental; captures what each phase produced, the decisions made, and what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
 
-"I'll write the skills first, then PATTERNS.md when I have insights."
+- **`AGENT.md` decisions section + domain notes** — narrative-style, a living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
 
-By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier — each rule re-derives from scratch. Refactoring requires touching N files instead of one.
+What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives them from scratch. Refactoring requires touching N files instead of one.
 
-"Write PATTERNS.md, even tentatively, then re-read it before each new rule. Update it when discoveries change the framework."
+"I'll capture insights when I have time."
 
-PATTERNS.md is your project's index card. Build it before the work, update it during the work, harvest it after.
+"Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
 
-E2E #6 v070 surfaced this: DS only wrote PATTERNS.md after a rollback intervention; the per-skill design decisions before that point were already locked in and had to be re-touched. v0.7.1 reinforced this guidance.
+E2E history:
+- E2E #6 v070: DS wrote PATTERNS.md only after a rollback. Per-skill decisions before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
+- E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance, format flexible.
 
-The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
@@ -222,6 +222,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
 Do not skip ambiguous rules. They are often the most important ones.
 
+## Sanity-check applicability against the sample corpus
+
+After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+The check:
+1. Walk `samples/` and classify each sample by product type / report type / document format
+2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+If many rules are 0-sample:
+- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just the body, relax the scope filter
+- **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+- **Update the test corpus** to include matching samples (work with the developer user)
+
+Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
 ## When Rules Change
 
 Regulations evolve. When the developer user adds new or updated regulation documents:
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
 ### The hybrid approach (most common)
 Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+### When regex alone isn't enough — decision rubric
+
+Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+- **Semantic** ("is this a positive guarantee or a disclaimer?")
+- **Contextual** ("interpret this in light of the document's product type")
+- **Counterfactual** ("what should this value be, given the other fields?")
+- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+regex alone rarely suffices. Three acceptable forms:
+
+1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
+2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+### Worker LLM cost-aware tier choice
+
+If you do escalate to LLM:
+- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+- **tier2-3**: bulk extraction with simple semantic checks
+- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on them. If you see empty responses, bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
+
+Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
 ## Workflow Structure
 
 A workflow is a Python file (or small set of files) in `workflows/`:
@@ -144,6 +144,39 @@ E2E #6 v070 exposed this anti-pattern (DS wrote every bundled skill's check.py
 as `{"pass": null, "method": "stub"}`, deferring to workflows/).
 v0.7.1 wrote this anti-pattern into the skill explicitly.
 
+E2E #7 v071 showed the anti-stub guidance held on both conductors (neither
+run contained the `{"pass": null}` stub pattern), but **DS still inverted
+the canonical-vs-distilled relationship**: DS wrote 6 thematically grouped
+skill folders, each with only a SKILL.md (no check.py), while the real
+verification code lived in `workflows/<skill>/check.py`. No stubs is good;
+the inversion is not: changing one rule's logic means touching both
+SKILL.md (the doc) and the workflow check.py (the code), and the single
+source of truth is lost.
+
+GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both
+SKILL.md and a real `check.py` (regex + applicability logic, median 143
+lines), while `workflows/<id>/workflow_v1.py` was a 50-line thin shell that
+just imported and called the skill's check.py:
+
+```python
+# workflows/D01-01/workflow_v1.py — thin shell, 52 lines
+import importlib.util, json
+from pathlib import Path
+
+def run(doc_text: str, meta: dict = None) -> dict:
+    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+    spec = importlib.util.spec_from_file_location("check", check_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    result = mod.check(doc_text, meta)
+    result["_workflow"] = "D01-01_v1"
+    return result
+```
+
+This is the v0.7.2+ canonical pattern: the workflow is a shell pointing at
+the skill's check.py. To iterate on a rule's verification logic, edit
+`rule_skills/<id>/check.py`; the workflow stays untouched. v0.7.2 states
+the guidance more plainly: no stubs, AND keep the canonical relationship
+(the skill is canonical, the workflow is the distilled thin shell).
+
 ### Naming convention for merged checks
 
 When you do need to merge, spell out the range in the file name:
@@ -304,18 +337,26 @@ Keep the whole of PATTERNS.md under about 5 KB. When it grows past that, cut the least actionable
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Write what you learned into PATTERNS.md. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If the accumulated patterns suggest refactoring earlier work, **do it now** (cheap) rather than later (expensive).
 
-### Why PATTERNS.md comes first, before any skill code
+### Persist the methodology — PATTERNS.md, phase logs, or AGENT.md decisions
+
+The principle: before each phase advance, write framework-level decisions to disk. The conversation will be compacted, the agent will restart, and the next phase will lose its context. Whichever format you pick, **write to disk** — don't depend on conversation context that disappears.
+
+Three formats, all defensible; pick one and stick with it:
+
+- **`rules/PATTERNS.md`** — concise, framework-level content only, updated as the project progresses. Suits greenfield projects with a clear up-front hypothesis and structure. Cap it at ~5 KB; entries are transferable shapes / project-level constraints / anti-patterns with the reasons attached (see the "what to write" section above).
 
-If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided on Shannon-Huffman; the first hard rule R028 dictates the verdict shape; sample corpus has bilingual Chinese/English table headers") sets up the framework. Each later rule skips one re-derivation of the same shapes, roughly a 4x overall time saving.
+- **`logs/phase_<name>_complete.md` per phase** — incremental; records what each phase produced, which decisions were made, and what the next phase inherits. Suits iterative discover-as-you-go work. E2E #7 GLM used this pattern: 6 phase docs plus an `evolution_summary_v1.2.md`; the methodology was captured all the same, just without a PATTERNS.md.
 
-"I'll finish the skills first and write PATTERNS.md once I have insights."
+- **`AGENT.md` decisions section + domain notes** — narrative style, a living document of "what we know" and "why". Suits projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md held regulation effective dates, product type taxonomies, threshold values, and sample format counts — entirely fine; it's a different idiom for the same goal.
 
-By the time you finish N skills, you've already made N implicit decisions (verdict shape, chunker boundaries, worker tier) — each rule derived from scratch. Refactoring means touching N files instead of one.
+What not to do: skip persistence and live on conversation context alone. By the time you reach skill N with no methodology on disk, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier — each rule derived from scratch, and refactoring touches N files instead of one.
 
-"Write PATTERNS.md first (even a rough draft), re-read it before each new rule, and update it whenever a discovery changes the framework."
+"I'll get around to recording these insights when I have time."
 
-PATTERNS.md is the project's index card. Build it before the work, update it during the work, harvest from it after.
+"Before each phase advance, write what this phase taught me into whichever persistence file fits this project's idiom — even if it's only a draft."
 
-E2E #6 v070 exposed this: DS wrote PATTERNS.md only after the user stepped in and rolled back, and the per-skill design decisions made before that point had each already solidified and had to be touched again. v0.7.1 made this guidance more explicit.
+E2E history:
+- E2E #6 v070: DS wrote PATTERNS.md only after the user stepped in and rolled back. The per-skill design decisions before that point had each solidified and had to be touched again. v0.7.1 added the "PATTERNS.md FIRST" guidance.
+- E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 phase-completion logs and a detailed AGENT.md — the methodology *was* captured, just in different files. v0.7.2 writes the broader principle into the skill: persist before advancing; the format is flexible.
 
-The engine derives milestones from the filesystem (v0.7.0 Group A) and verifies coverage against what is on disk, however you slice the work. The TaskBoard is your scratchpad; the disk is the contract.
+The engine derives milestones from the filesystem (v0.7.0 Group A) and verifies coverage against what is on disk, however you slice the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is the project's memory.