kc-beta 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/engine.js +20 -2
- package/src/agent/pipelines/_milestone-derive.js +65 -42
- package/src/agent/pipelines/finalization.js +2 -6
- package/src/agent/pipelines/initializer.js +13 -0
- package/src/agent/tools/release.js +151 -1
- package/src/util/kc-version.js +27 -0
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +18 -0
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +51 -8
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +18 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +49 -8
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "kc-beta",
|
|
3
|
-
"version": "0.7.
|
|
3
|
+
"version": "0.7.2",
|
|
4
4
|
"description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
package/src/agent/engine.js
CHANGED
|
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
|
|
|
21
21
|
import { ArchiveFileTool } from "./tools/archive-file.js";
|
|
22
22
|
import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
|
|
23
23
|
import { ReleaseTool } from "./tools/release.js";
|
|
24
|
+
import { readKcVersion } from "../util/kc-version.js";
|
|
24
25
|
import { PhaseAdvanceTool } from "./tools/phase-advance.js";
|
|
25
26
|
import { DocumentParseTool } from "./tools/document-parse.js";
|
|
26
27
|
import { DocumentSearchTool } from "./tools/document-search.js";
|
|
@@ -421,7 +422,7 @@ export class AgentEngine {
|
|
|
421
422
|
new SnapshotTool(this.workspace),
|
|
422
423
|
new ArchiveFileTool(this.workspace),
|
|
423
424
|
new ScheduleFetchTool(this.workspace),
|
|
424
|
-
new ReleaseTool(this.workspace, { kcVersion:
|
|
425
|
+
new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
|
|
425
426
|
new PhaseAdvanceTool(
|
|
426
427
|
// v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
|
|
427
428
|
// so the tool's refusal text can surface the engine telemetry
|
|
@@ -1679,7 +1680,24 @@ export class AgentEngine {
|
|
|
1679
1680
|
parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
|
|
1680
1681
|
break;
|
|
1681
1682
|
}
|
|
1682
|
-
|
|
1683
|
+
case "bootstrap": {
|
|
1684
|
+
// v0.7.2 1e: previously fell through to empty string. Both
|
|
1685
|
+
// v0.7.1 audit runs had bootstrap → rule_extraction refusals
|
|
1686
|
+
// with engineCounts: "" — agent saw the refusal but had no
|
|
1687
|
+
// engine telemetry to react to. The InitializerPipeline tracks
|
|
1688
|
+
// boolean checklist flags rather than numeric counters; we
|
|
1689
|
+
// surface those flags as "yes/no" so the agent can see which
|
|
1690
|
+
// bootstrap criterion is missing.
|
|
1691
|
+
if (typeof pipeline.describeBootstrapChecklist === "function") {
|
|
1692
|
+
const cl = pipeline.describeBootstrapChecklist();
|
|
1693
|
+
parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
|
|
1694
|
+
parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
|
|
1695
|
+
parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
|
|
1696
|
+
parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
|
|
1697
|
+
}
|
|
1698
|
+
break;
|
|
1699
|
+
}
|
|
1700
|
+
// finalization: no specific counters, fall through
|
|
1683
1701
|
}
|
|
1684
1702
|
} catch { /* never let summary build break phase advance */ }
|
|
1685
1703
|
return parts.join(", ");
|
|
@@ -57,16 +57,21 @@ function listChildFiles(p) {
|
|
|
57
57
|
// Walk a directory recursively, yielding every file path. Skips hidden
|
|
58
58
|
// dirs/files and __pycache__. Used by derive functions that need to
|
|
59
59
|
// match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
|
|
60
|
-
|
|
60
|
+
//
|
|
61
|
+
// v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
|
|
62
|
+
// direct children; depth=1 is one level down. Default unbounded
|
|
63
|
+
// (existing callers).
|
|
64
|
+
function* walkFiles(root, { maxDepth } = {}) {
  // Nothing to walk unless the dirExists helper accepts the root.
  if (!dirExists(root)) return;

  // A null/undefined maxDepth means "no depth cap".
  const unbounded = maxDepth == null;

  // Explicit LIFO work list instead of recursion; the root is depth 0.
  const pending = [{ dir: root, depth: 0 }];
  while (pending.length > 0) {
    const current = pending.pop();
    for (const entry of readDirSafe(current.dir)) {
      const { name } = entry;
      // Hidden entries and Python bytecode caches are ignored entirely.
      if (name === "__pycache__" || name.startsWith(".")) continue;

      const full = path.join(current.dir, name);
      if (!entry.isDirectory()) {
        // Only regular files are yielded; other entry kinds are dropped.
        if (entry.isFile()) yield full;
        continue;
      }
      // Queue the subdirectory only while its parent is above the cap.
      if (unbounded || current.depth < maxDepth) {
        pending.push({ dir: full, depth: current.depth + 1 });
      }
    }
  }
}
|
|
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
|
|
|
271
276
|
}
|
|
272
277
|
}
|
|
273
278
|
|
|
274
|
-
// v0.7.1 1a:
|
|
275
|
-
// Agents
|
|
276
|
-
//
|
|
277
|
-
//
|
|
278
|
-
//
|
|
279
|
-
//
|
|
280
|
-
//
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
279
|
+
// v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
|
|
280
|
+
// under output/*.json. Agents persist batch-test results in
|
|
281
|
+
// conductor-specific shapes (this is the recurring drift point —
|
|
282
|
+
// engine derivation has to match disk reality, not the other way
|
|
283
|
+
// around). Shapes seen across E2E #5/6/7:
|
|
284
|
+
//
|
|
285
|
+
// - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
|
|
286
|
+
// entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
|
|
287
|
+
// skill_test_阳光资产.json with {doc, results: {R019a: ...}}
|
|
288
|
+
// - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
|
|
289
|
+
// full_test_results_v[1-6].json as {sample_id: {path, meta,
|
|
290
|
+
// results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
|
|
291
|
+
// v0.7.1's shallow walk missed them)
|
|
292
|
+
//
|
|
293
|
+
// The collector recurses (depth-limited) and uses two heuristics to
|
|
294
|
+
// separate rule_ids from sample_ids / doc_names:
|
|
295
|
+
// 1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
|
|
296
|
+
// (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
|
|
297
|
+
// 2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
|
|
298
|
+
// keys signal that the parent dict's keys are rule_ids
|
|
299
|
+
// Rule ids: start with a letter, at most 30 chars of [A-Za-z0-9_-], and
// contain at least one digit (R001, D01-01, T02-31). Hash-like sample ids
// that start with a digit and path-like doc names are rejected.
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);

// A value "looks like a verdict" when it is a plain object carrying one of
// the verdict-ish fields, or numeric PASS / FAIL counters.
const looksLikeVerdict = (v) =>
  v && typeof v === "object" && !Array.isArray(v) && (
    v.verdict !== undefined ||
    v.passed !== undefined ||
    v.pass !== undefined ||
    typeof v.PASS === "number" ||
    typeof v.FAIL === "number"
  );

// True for a non-empty entry list where EVERY key is rule-id-shaped and EVERY
// value is a boolean, i.e. the per-document map {R019a: true, R019b: false}.
// Requiring the whole object to match keeps stray flags in unrelated objects
// (e.g. {utf8: true, name: "..."}) from being credited as rules.
const isBooleanVerdictMap = (entries) =>
  entries.length > 0 &&
  entries.every(([k, v]) => isRuleIdShape(k) && typeof v === "boolean");

// Walk one parsed JSON document and add every rule id that has evidence of
// having been tested to the enclosing `tested` set. Nesting is followed up
// to depth 4 (arrays count as a level); deeper data is ignored.
const collectFromJsonFile = (data, depth = 0) => {
  if (!data || typeof data !== "object" || depth > 4) return;

  if (Array.isArray(data)) {
    for (const item of data) collectFromJsonFile(item, depth + 1);
    return;
  }

  // {rule_id: "X"} or {id: "R001"} on a rule entry
  if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
  if (isRuleIdShape(data.id)) tested.add(data.id);

  // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results),
  // plus the all-boolean per-document map {<rule_id>: bool, ...}.
  const entries = Object.entries(data);
  const allBooleanVerdicts = isBooleanVerdictMap(entries);
  for (const [key, value] of entries) {
    if (isRuleIdShape(key) && (allBooleanVerdicts || looksLikeVerdict(value))) {
      tested.add(key);
    }
  }

  // {results: {<rule_id>: ...}} — keys must look rule-id-shaped
  const { results } = data;
  if (results && typeof results === "object" && !Array.isArray(results)) {
    for (const key of Object.keys(results)) {
      if (isRuleIdShape(key)) tested.add(key);
    }
  }

  // Recurse into nested objects (handles {sample_id: {results: {...}}})
  for (const [, value] of entries) {
    if (value && typeof value === "object") collectFromJsonFile(value, depth + 1);
  }
};
|
|
291
334
|
|
|
292
335
|
const outputDir = path.join(cwd, "output");
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
|
|
297
|
-
}
|
|
298
|
-
// One level into output/results/, output/distillation/ — the two
|
|
299
|
-
// most common batch-result locations across E2E #5 and v070 sessions.
|
|
300
|
-
for (const sub of ["results", "distillation", "qc"]) {
|
|
301
|
-
const subDir = path.join(outputDir, sub);
|
|
302
|
-
if (!dirExists(subDir)) continue;
|
|
303
|
-
for (const f of listChildFiles(subDir)) {
|
|
304
|
-
if (!f.name.endsWith(".json")) continue;
|
|
305
|
-
collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
|
|
306
|
-
}
|
|
307
|
-
// GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
|
|
308
|
-
// — walk one more level for that pattern.
|
|
309
|
-
for (const child of listChildDirs(subDir)) {
|
|
310
|
-
for (const f of listChildFiles(path.join(subDir, child.name))) {
|
|
311
|
-
if (!f.name.endsWith(".json")) continue;
|
|
312
|
-
collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
|
|
313
|
-
}
|
|
314
|
-
}
|
|
315
|
-
}
|
|
336
|
+
for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
|
|
337
|
+
if (!p.endsWith(".json")) continue;
|
|
338
|
+
collectFromJsonFile(readJsonSafe(p));
|
|
316
339
|
}
|
|
317
340
|
|
|
318
341
|
// DS v070 wrote a top-level aggregate at either rules/test_results.json
|
|
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
|
|
|
5
5
|
import { Pipeline } from "./base.js";
|
|
6
6
|
import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
|
|
7
7
|
import { deriveFinalizationMilestones } from "./_milestone-derive.js";
|
|
8
|
+
import { readKcVersion } from "../../util/kc-version.js";
|
|
8
9
|
|
|
9
10
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
11
|
// v0.7.0 N: ship template/release/v1/ from the npm package; copy into
|
|
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
|
|
|
310
311
|
}
|
|
311
312
|
|
|
312
313
|
_readKcVersion() {
|
|
313
|
-
|
|
314
|
-
const pkg = JSON.parse(fs.readFileSync(
|
|
315
|
-
path.resolve(__dirname, "../../../package.json"), "utf-8",
|
|
316
|
-
));
|
|
317
|
-
return pkg.version || "unknown";
|
|
318
|
-
} catch { return "unknown"; }
|
|
314
|
+
return readKcVersion();
|
|
319
315
|
}
|
|
320
316
|
|
|
321
317
|
/**
|
|
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
|
|
|
240
240
|
return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
|
|
241
241
|
}
|
|
242
242
|
|
|
243
|
+
// v0.7.2 1e: surface the checklist as engine telemetry so
|
|
244
|
+
// `_buildEngineCountsBlock("bootstrap")` has something to report when
|
|
245
|
+
// bootstrap → rule_extraction is refused. Agent sees the missing
|
|
246
|
+
// criteria directly in the refusal text.
|
|
247
|
+
describeBootstrapChecklist() {
|
|
248
|
+
return {
|
|
249
|
+
workspaceCreated: !!this.workspaceCreated,
|
|
250
|
+
configReady: !!this.configReady,
|
|
251
|
+
hasRegulations: !!this.hasRegulations,
|
|
252
|
+
hasSamples: !!this.hasSamples,
|
|
253
|
+
};
|
|
254
|
+
}
|
|
255
|
+
|
|
243
256
|
/**
|
|
244
257
|
* v0.6.3 (#74): nudge the agent when it does work that belongs to a later
|
|
245
258
|
* phase. Bootstrap is setup — reading rules/samples, configuring keys,
|
|
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
|
|
|
178
178
|
path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
|
|
179
179
|
this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
|
|
180
180
|
path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
|
|
181
|
-
|
|
181
|
+
// v0.7.2 1c: auto-aggregate from output/ if no calibration file at
|
|
182
|
+
// workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
|
|
183
|
+
// with empty `historical_accuracy: {}` despite having per-rule QC
|
|
184
|
+
// data on disk under output/ — the release tool just passed the
|
|
185
|
+
// file through and emitted a stub on miss. We try to populate from
|
|
186
|
+
// known QC artifact shapes here; if nothing matches, fall through
|
|
187
|
+
// to the existing stub fallback.
|
|
188
|
+
const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
|
|
189
|
+
if (!fs.existsSync(calibSrc)) {
|
|
190
|
+
const aggregated = this._aggregateAccuracyFromOutput();
|
|
191
|
+
if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
|
|
192
|
+
fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
this._copyIfExists(calibSrc,
|
|
182
196
|
path.join(bundleAbs, "confidence_calibration.json"),
|
|
183
197
|
{ fallback: '{"historical_accuracy":{}}\n' });
|
|
184
198
|
|
|
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
|
|
|
233
247
|
.replace(/\{RULES_LIST\}/g, rulesList);
|
|
234
248
|
fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
|
|
235
249
|
|
|
250
|
+
// v0.7.2 1d: clean up the template scaffold dir if a customized
|
|
251
|
+
// release was just written alongside it. Both v0.7.1 audit runs
|
|
252
|
+
// shipped with `output/releases/v1/` (template-derived, .tmpl
|
|
253
|
+
// files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
|
|
254
|
+
// — the customized release. The pre-scaffold is meant as a hint;
|
|
255
|
+
// once the agent calls `release(label="v1-0")` and we've written
|
|
256
|
+
// the real bundle, the unedited scaffold is just clutter.
|
|
257
|
+
//
|
|
258
|
+
// Conservative gate: only delete a sibling `v1/` if BOTH (a) we
|
|
259
|
+
// didn't just write to v1/ ourselves, AND (b) it still contains
|
|
260
|
+
// .tmpl files (signature of unedited template). If the agent
|
|
261
|
+
// intentionally edited v1/ in place (removing .tmpl), our cleanup
|
|
262
|
+
// leaves it alone.
|
|
263
|
+
if (slug !== "v1") {
|
|
264
|
+
const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
|
|
265
|
+
if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
|
|
266
|
+
let hasTmpl = false;
|
|
267
|
+
try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
|
|
268
|
+
if (hasTmpl) {
|
|
269
|
+
try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
236
274
|
// Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
|
|
237
275
|
const lines = [
|
|
238
276
|
`Release '${label}' bundled at ${bundleRel}`,
|
|
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
|
|
|
319
357
|
return null;
|
|
320
358
|
}
|
|
321
359
|
|
|
360
|
+
// v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
|
|
361
|
+
// accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
|
|
362
|
+
//
|
|
363
|
+
// rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
|
|
364
|
+
// (GLM produced 4 versions; pick the highest)
|
|
365
|
+
// full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
|
|
366
|
+
// (GLM; accumulate verdicts per rule across samples)
|
|
367
|
+
// skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
|
|
368
|
+
//
|
|
369
|
+
// Returns null if no recognized artifact, or an object with
|
|
370
|
+
// { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
|
|
371
|
+
// suitable for confidence_calibration.json.
|
|
372
|
+
_aggregateAccuracyFromOutput() {
|
|
373
|
+
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
|
|
374
|
+
const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
|
|
375
|
+
const tally = new Map(); // rule_id -> {pass, fail, na, n}
|
|
376
|
+
const sourceFiles = [];
|
|
377
|
+
const bump = (rid, kind) => {
|
|
378
|
+
if (!isRuleId(rid)) return;
|
|
379
|
+
const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
|
|
380
|
+
t[kind] += 1;
|
|
381
|
+
t.n += 1;
|
|
382
|
+
tally.set(rid, t);
|
|
383
|
+
};
|
|
384
|
+
const outputDir = path.join(this._workspace.cwd, "output");
|
|
385
|
+
if (!fs.existsSync(outputDir)) return null;
|
|
386
|
+
|
|
387
|
+
// Collect all .json files under output/ (depth limited)
|
|
388
|
+
const files = [];
|
|
389
|
+
const stack = [{ dir: outputDir, depth: 0 }];
|
|
390
|
+
while (stack.length) {
|
|
391
|
+
const { dir, depth } = stack.pop();
|
|
392
|
+
let entries;
|
|
393
|
+
try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
|
|
394
|
+
for (const e of entries) {
|
|
395
|
+
if (e.name.startsWith(".") || e.name === "__pycache__") continue;
|
|
396
|
+
const p = path.join(dir, e.name);
|
|
397
|
+
if (e.isDirectory()) {
|
|
398
|
+
if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
|
|
399
|
+
} else if (e.isFile() && e.name.endsWith(".json")) {
|
|
400
|
+
files.push({ path: p, name: e.name });
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
// 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
|
|
406
|
+
const ruleStatsFiles = files
|
|
407
|
+
.filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
|
|
408
|
+
.map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
|
|
409
|
+
.sort((a, b) => b.ver - a.ver);
|
|
410
|
+
if (ruleStatsFiles.length > 0) {
|
|
411
|
+
const top = ruleStatsFiles[0];
|
|
412
|
+
try {
|
|
413
|
+
const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
|
|
414
|
+
for (const [rid, stats] of Object.entries(d)) {
|
|
415
|
+
if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
|
|
416
|
+
const pass = stats.PASS | 0, fail = stats.FAIL | 0;
|
|
417
|
+
const na = stats.NOT_APPLICABLE | stats.NA | 0;
|
|
418
|
+
const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
|
|
419
|
+
t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
|
|
420
|
+
tally.set(rid, t);
|
|
421
|
+
}
|
|
422
|
+
sourceFiles.push(path.relative(this._workspace.cwd, top.path));
|
|
423
|
+
} catch { /* fall through to other shapes */ }
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
// 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
|
|
427
|
+
if (tally.size === 0) {
|
|
428
|
+
const ftrFiles = files
|
|
429
|
+
.filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
|
|
430
|
+
.map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
|
|
431
|
+
.sort((a, b) => b.ver - a.ver);
|
|
432
|
+
for (const f of ftrFiles.slice(0, 1)) {
|
|
433
|
+
try {
|
|
434
|
+
const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
|
|
435
|
+
for (const sample of Object.values(d)) {
|
|
436
|
+
if (!sample || typeof sample !== "object") continue;
|
|
437
|
+
const results = sample.results;
|
|
438
|
+
if (!results || typeof results !== "object") continue;
|
|
439
|
+
for (const [rid, r] of Object.entries(results)) {
|
|
440
|
+
if (!isRuleId(rid) || !r || typeof r !== "object") continue;
|
|
441
|
+
const verdict = (r.verdict || "").toString().toUpperCase();
|
|
442
|
+
if (verdict === "PASS") bump(rid, "pass");
|
|
443
|
+
else if (verdict === "FAIL") bump(rid, "fail");
|
|
444
|
+
else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
sourceFiles.push(path.relative(this._workspace.cwd, f.path));
|
|
448
|
+
} catch { /* try next shape */ }
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
if (tally.size === 0) return null;
|
|
453
|
+
|
|
454
|
+
const historical_accuracy = {};
|
|
455
|
+
for (const [rid, t] of tally.entries()) {
|
|
456
|
+
const fired = t.pass + t.fail;
|
|
457
|
+
historical_accuracy[rid] = {
|
|
458
|
+
pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
|
|
459
|
+
n_passed: t.pass,
|
|
460
|
+
n_failed: t.fail,
|
|
461
|
+
n_not_applicable: t.na,
|
|
462
|
+
n_samples: t.n,
|
|
463
|
+
};
|
|
464
|
+
}
|
|
465
|
+
return {
|
|
466
|
+
historical_accuracy,
|
|
467
|
+
computed_at: new Date().toISOString(),
|
|
468
|
+
source_files: sourceFiles,
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
|
|
322
472
|
_readWorkerTiers() {
|
|
323
473
|
const envPath = path.join(this._workspace.cwd, ".env");
|
|
324
474
|
const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// Single source of truth for the live KC CLI version string.
|
|
2
|
+
//
|
|
3
|
+
// Reads package.json once. Used by engine.js (passed to ReleaseTool so
|
|
4
|
+
// release manifests stamp the correct version) and by
|
|
5
|
+
// pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
|
|
6
|
+
//
|
|
7
|
+
// Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"` which leaked
|
|
8
|
+
// into every release manifest's `kc_beta_version` field regardless of
|
|
9
|
+
// the actual package version. Both v0.7.1 audit runs (DS + GLM)
|
|
10
|
+
// surfaced this. Reading package.json closes the gap.
|
|
11
|
+
|
|
12
|
+
import fs from "node:fs";
|
|
13
|
+
import path from "node:path";
|
|
14
|
+
import { fileURLToPath } from "node:url";
|
|
15
|
+
|
|
16
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
17
|
+
const __dirname = path.dirname(__filename);
|
|
18
|
+
|
|
19
|
+
// Memoized result of the first successful package.json read. Failures are
// not cached, so a transient read error does not pin the version to "unknown".
let cachedKcVersion = null;

/**
 * Return the `version` field of the package.json two directories above this
 * module, reading and parsing the file at most once per process on success.
 *
 * @returns {string} The version value, or "unknown" when the file cannot be
 *   read or parsed or has no truthy `version` field.
 */
export function readKcVersion() {
  if (cachedKcVersion !== null) return cachedKcVersion;
  try {
    const pkgPath = path.resolve(__dirname, "../../package.json");
    const { version } = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    if (!version) return "unknown";
    cachedKcVersion = version;
    return version;
  } catch {
    // Never let a missing or malformed package.json break the caller.
    return "unknown";
  }
}
|
|
@@ -223,6 +223,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
|
|
|
223
223
|
|
|
224
224
|
Do not skip ambiguous rules. They are often the most important ones.
|
|
225
225
|
|
|
226
|
+
## Sanity-check applicability against the sample corpus
|
|
227
|
+
|
|
228
|
+
After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
|
|
229
|
+
|
|
230
|
+
Steps:
|
|
231
|
+
1. Walk `samples/`, classify each by product type / report type / document format
|
|
232
|
+
2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
|
|
233
|
+
3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (bug)
|
|
234
|
+
|
|
235
|
+
E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in corpus), but 36 inactive of 97 was high enough to suggest scope-too-narrow drift.
|
|
236
|
+
|
|
237
|
+
If many rules are 0-sample, either:
|
|
238
|
+
- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just body, relax the scope filter
|
|
239
|
+
- **Document them as "future scope"** and remove from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
|
|
240
|
+
- **Update the test corpus** to include matching samples (work with the developer user)
|
|
241
|
+
|
|
242
|
+
Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
|
|
243
|
+
|
|
226
244
|
## When Rules Change
|
|
227
245
|
|
|
228
246
|
Regulations evolve. When the developer user adds new or updated regulation documents:
|
|
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
|
|
|
45
45
|
### The hybrid approach (most common)
|
|
46
46
|
Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
|
|
47
47
|
|
|
48
|
+
### When regex alone isn't enough — decision rubric
|
|
49
|
+
|
|
50
|
+
Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or equivalent fields in your catalog). For rules where the required verification is one of:
|
|
51
|
+
|
|
52
|
+
- **Semantic** ("is this a positive guarantee or a disclaimer?")
|
|
53
|
+
- **Contextual** ("interpret this in light of the document's product type")
|
|
54
|
+
- **Counterfactual** ("what should this value be, given the other fields?")
|
|
55
|
+
- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
|
|
56
|
+
|
|
57
|
+
regex alone rarely suffices. Three acceptable forms:
|
|
58
|
+
|
|
59
|
+
1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
|
|
60
|
+
2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
|
|
61
|
+
3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
|
|
62
|
+
|
|
63
|
+
Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient and that bug will hide for months.
|
|
64
|
+
|
|
65
|
+
### Worker LLM cost-aware tier choice
|
|
66
|
+
|
|
67
|
+
If you do escalate to LLM:
|
|
68
|
+
- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
|
|
69
|
+
- **tier2-3**: bulk extraction with simple semantic checks
|
|
70
|
+
- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, either bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
|
|
71
|
+
|
|
72
|
+
Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, you should reach for `worker_llm_call` yourself — don't wait to be asked.
|
|
73
|
+
|
|
48
74
|
## Workflow Structure
|
|
49
75
|
|
|
50
76
|
A workflow is a Python file (or small set of files) in `workflows/`:
|
|
@@ -147,6 +147,41 @@ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
|
|
|
147
147
|
all returned `{"pass": null, "method": "stub"}` deferring to
|
|
148
148
|
workflows/). v0.7.1 added this anti-pattern explicitly.
|
|
149
149
|
|
|
150
|
+
E2E #7 v071 showed the teaching prevented the stub anti-pattern in
|
|
151
|
+
both conductors (no `{"pass": null}` patterns in either run), but
|
|
152
|
+
**DS still inverted the canonical-vs-distilled relationship**: DS's
|
|
153
|
+
6 thematic skill folders had SKILL.md only (no check.py), with the
|
|
154
|
+
real verification code living in `workflows/<skill>/check.py`. The
|
|
155
|
+
absence of stubs is good; the inversion is not — editing a rule then
|
|
156
|
+
requires touching both SKILL.md (the doc) and the workflow check.py
|
|
157
|
+
(the code). Single source of truth is lost.
|
|
158
|
+
|
|
159
|
+
GLM v071 by contrast landed the canonical pattern: 97/97 skills had
|
|
160
|
+
both SKILL.md AND a real `check.py` (median 143 LOC of regex +
|
|
161
|
+
applicability logic), and `workflows/<id>/workflow_v1.py` was a
|
|
162
|
+
50-line thin wrapper that imported and called it:
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
# workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
|
|
166
|
+
import importlib.util, json
|
|
167
|
+
from pathlib import Path
|
|
168
|
+
|
|
169
|
+
def run(doc_text: str, meta: dict = None) -> dict:
|
|
170
|
+
check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
|
|
171
|
+
spec = importlib.util.spec_from_file_location("check", check_path)
|
|
172
|
+
mod = importlib.util.module_from_spec(spec)
|
|
173
|
+
spec.loader.exec_module(mod)
|
|
174
|
+
result = mod.check(doc_text, meta)
|
|
175
|
+
result["_workflow"] = "D01-01_v1"
|
|
176
|
+
return result
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
This is the v0.7.2+ canonical pattern: workflow is a shim that
|
|
180
|
+
points at the skill's check.py. To iterate on a rule's verification,
|
|
181
|
+
edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2
|
|
182
|
+
clarifies the teaching: avoid stubs AND keep the canonical
|
|
183
|
+
relationship (skill is canonical, workflow is distilled wrapper).
|
|
184
|
+
|
|
150
185
|
### Naming convention for grouped checks
|
|
151
186
|
|
|
152
187
|
When you do bundle, name the file with the explicit range:
|
|
@@ -309,18 +344,26 @@ When entering skill_authoring with an empty TaskBoard:
|
|
|
309
344
|
5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
|
|
310
345
|
6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
|
|
311
346
|
|
|
312
|
-
###
|
|
347
|
+
### Persisted methodology — PATTERNS.md OR phase logs OR AGENT.md decisions
|
|
348
|
+
|
|
349
|
+
The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
|
|
350
|
+
|
|
351
|
+
Three formats, each defensible. Pick one and stick with it:
|
|
352
|
+
|
|
353
|
+
- **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
|
|
313
354
|
|
|
314
|
-
|
|
355
|
+
- **`logs/phase_<name>_complete.md` per phase** — incremental, captures what each phase produced + decisions made + what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
|
|
315
356
|
|
|
316
|
-
|
|
357
|
+
- **`AGENT.md` decisions section + domain notes** — narrative-style, living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
|
|
317
358
|
|
|
318
|
-
By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier
|
|
359
|
+
What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives from scratch. Refactoring requires touching N files instead of one.
|
|
319
360
|
|
|
320
|
-
|
|
361
|
+
❌ "I'll capture insights when I have time."
|
|
321
362
|
|
|
322
|
-
|
|
363
|
+
✅ "Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
|
|
323
364
|
|
|
324
|
-
E2E
|
|
365
|
+
E2E history:
|
|
366
|
+
- E2E #6 v070 DS wrote PATTERNS.md only after a rollback. Per-skill decisions before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
|
|
367
|
+
- E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance, format flexible.
|
|
325
368
|
|
|
326
|
-
The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
|
|
369
|
+
The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
|
|
@@ -222,6 +222,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
|
|
|
222
222
|
|
|
223
223
|
Do not skip ambiguous rules. They are often the most important ones.
|
|
224
224
|
|
|
225
|
+
## Sanity-check applicability against the sample corpus
|
|
226
|
+
|
|
227
|
+
After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
|
|
228
|
+
|
|
229
|
+
Steps:
|
|
230
|
+
1. Walk `samples/`, classify each by product type / report type / document format
|
|
231
|
+
2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
|
|
232
|
+
3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (bug)
|
|
233
|
+
|
|
234
|
+
E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legitimate (rules for cash-management products, with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest that applicability scopes had drifted too narrow.
|
|
235
|
+
|
|
236
|
+
If many rules are 0-sample, either:
|
|
237
|
+
- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just body, relax the scope filter
|
|
238
|
+
- **Document them as "future scope"** and remove from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
|
|
239
|
+
- **Update the test corpus** to include matching samples (work with the developer user)
|
|
240
|
+
|
|
241
|
+
Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
|
|
242
|
+
|
|
225
243
|
## When Rules Change
|
|
226
244
|
|
|
227
245
|
Regulations evolve. When the developer user adds new or updated regulation documents:
|
|
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
|
|
|
45
45
|
### The hybrid approach (most common)
|
|
46
46
|
Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
|
|
47
47
|
|
|
48
|
+
### When regex alone isn't enough — decision rubric
|
|
49
|
+
|
|
50
|
+
Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or equivalent fields in your catalog). For rules where the required verification is one of:
|
|
51
|
+
|
|
52
|
+
- **Semantic** ("is this a positive guarantee or a disclaimer?")
|
|
53
|
+
- **Contextual** ("interpret this in light of the document's product type")
|
|
54
|
+
- **Counterfactual** ("what should this value be, given the other fields?")
|
|
55
|
+
- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
|
|
56
|
+
|
|
57
|
+
regex alone rarely suffices. Three acceptable forms:
|
|
58
|
+
|
|
59
|
+
1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
|
|
60
|
+
2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
|
|
61
|
+
3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
|
|
62
|
+
|
|
63
|
+
Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient and that bug will hide for months.
|
|
64
|
+
|
|
65
|
+
### Worker LLM cost-aware tier choice
|
|
66
|
+
|
|
67
|
+
If you do escalate to LLM:
|
|
68
|
+
- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
|
|
69
|
+
- **tier2-3**: bulk extraction with simple semantic checks
|
|
70
|
+
- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
|
|
71
|
+
|
|
72
|
+
Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, you should reach for `worker_llm_call` yourself — don't wait to be asked.
|
|
73
|
+
|
|
48
74
|
## Workflow Structure
|
|
49
75
|
|
|
50
76
|
A workflow is a Python file (or small set of files) in `workflows/`:
|
|
@@ -144,6 +144,39 @@ E2E #6 v070 暴露了这个反模式(DS 把所有 bundled skill 的 check.py
|
|
|
144
144
|
都写成 `{"pass": null, "method": "stub"}` 推给 workflows/)。
|
|
145
145
|
v0.7.1 把这个反模式显式写进 skill。
|
|
146
146
|
|
|
147
|
+
E2E #7 v071 显示这个反 stub 的引导在两个 conductor 上都生效(两条 run
|
|
148
|
+
里都没有 `{"pass": null}` 这种 stub 模式),但是 **DS 仍然把"正典 vs
|
|
149
|
+
蒸馏"的关系搞反了**:DS 写了 6 个主题分组的 skill 文件夹,每个只有
|
|
150
|
+
SKILL.md(没有 check.py),真正的验证代码却在
|
|
151
|
+
`workflows/<skill>/check.py` 里。没有 stub 是好事;关系搞反不是 ——
|
|
152
|
+
要修改一条规则的逻辑就得同时改 SKILL.md(文档)和 workflow check.py
|
|
153
|
+
(代码),单一信息源就丢了。
|
|
154
|
+
|
|
155
|
+
GLM v071 反而把正典模式落地了:97/97 个 skill 都同时有 SKILL.md 和
|
|
156
|
+
真正的 `check.py`(regex + 适用性判断的代码,中位 143 行),而
|
|
157
|
+
`workflows/<id>/workflow_v1.py` 是一个 50 行的薄壳,只是 import 并
|
|
158
|
+
调用 skill 的 check.py:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
# workflows/D01-01/workflow_v1.py — 薄壳,52 行
|
|
162
|
+
import importlib.util, json
|
|
163
|
+
from pathlib import Path
|
|
164
|
+
|
|
165
|
+
def run(doc_text: str, meta: dict = None) -> dict:
|
|
166
|
+
    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
|
|
167
|
+
    spec = importlib.util.spec_from_file_location("check", check_path)
|
|
168
|
+
    mod = importlib.util.module_from_spec(spec)
|
|
169
|
+
    spec.loader.exec_module(mod)
|
|
170
|
+
    result = mod.check(doc_text, meta)
|
|
171
|
+
    result["_workflow"] = "D01-01_v1"
|
|
172
|
+
    return result
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
这是 v0.7.2+ 的正典模式:workflow 是个壳,指向 skill 的 check.py。
|
|
176
|
+
迭代规则验证逻辑时,编辑 `rule_skills/<id>/check.py`,workflow 不用动。
|
|
177
|
+
v0.7.2 把引导说得更清楚:既不要 stub,也要保留正典关系(skill 是
|
|
178
|
+
正典,workflow 是蒸馏过的薄壳)。
|
|
179
|
+
|
|
147
180
|
### 合并 check 的命名约定
|
|
148
181
|
|
|
149
182
|
确实需要合并时,文件名要把范围写明:
|
|
@@ -304,18 +337,26 @@ PATTERNS.md 全文控制在约 5 KB 之内。超过时,剪掉最不可执行
|
|
|
304
337
|
5. **挑第一个任务**。做到完整(skill + check + 至少一次本地测试)。把学到的写进 PATTERNS.md。换下一个任务。
|
|
305
338
|
6. **任务做到第 5 个、第 10 个时**:停下来重读 PATTERNS.md。如果新积累的 pattern 暗示要重构早期工作,**现在做**(便宜)而不是更晚(昂贵)。
|
|
306
339
|
|
|
307
|
-
###
|
|
340
|
+
### 持久化方法论 —— PATTERNS.md 或 phase 日志 或 AGENT.md decisions
|
|
341
|
+
|
|
342
|
+
原则:在每次 phase 推进之前,把框架级的决定写到磁盘。对话会被 compact、agent 会重启、下一个 phase 会失去上下文。不管你选哪种格式,**写到磁盘** —— 不要依赖会消失的对话上下文。
|
|
343
|
+
|
|
344
|
+
三种格式都站得住,挑一种坚持下去:
|
|
345
|
+
|
|
346
|
+
- **`rules/PATTERNS.md`** —— 简洁,只装框架级内容,随项目推进而更新。适合假设可以前置、结构清晰的全新项目。上限 ~5 KB;条目是可迁移的形状 / 项目级约束 / 反模式加原因(参考上面"该写什么"一节)。
|
|
308
347
|
|
|
309
|
-
|
|
348
|
+
- **每阶段写 `logs/phase_<name>_complete.md`** —— 增量式,记录每个 phase 产出了什么、做了哪些决定、下个 phase 继承什么。适合"边发现边定型"的迭代式工作。E2E #7 GLM 用了这个模式:6 篇 phase 文档 + `evolution_summary_v1.2.md`,方法论照样捕获了,只是没写 PATTERNS.md。
|
|
310
349
|
|
|
311
|
-
|
|
350
|
+
- **`AGENT.md` decisions 段 + 领域笔记** —— 叙事风格,是关于"我们知道什么"和"为什么"的活文档。适合需要捕获丰富领域上下文的项目(法规、边缘案例、阈值、样本格式分布)。E2E #7 GLM 的 AGENT.md 里有法规生效日期、产品类型分类、阈值数值、样本格式数量 —— 完全 OK,是相同目标的不同惯用法。
|
|
312
351
|
|
|
313
|
-
|
|
352
|
+
不该做的事:跳过持久化、只靠对话上下文活着。等你写到第 N 条 skill 还没把方法论写到磁盘时,你已经做了 N 个关于 verdict 形状、chunker 边界、worker tier 的隐式决定 —— 每条规则都从零推导,重构要碰 N 个文件而不是一个。
|
|
314
353
|
|
|
315
|
-
|
|
354
|
+
❌ "等我有空再来记录这些洞察。"
|
|
316
355
|
|
|
317
|
-
|
|
356
|
+
✅ "每次 phase 推进之前,把这一阶段学到的东西写到适合本项目惯用法的那个持久化文件里 —— 哪怕只是初稿。"
|
|
318
357
|
|
|
319
|
-
E2E
|
|
358
|
+
E2E 历史:
|
|
359
|
+
- E2E #6 v070 DS 在用户介入回退之后才写 PATTERNS.md。那之前每条 skill 的设计决定都各自固化,之后还要再碰一遍。v0.7.1 加了"PATTERNS.md FIRST"的引导。
|
|
360
|
+
- E2E #7 v071 DS 和 GLM 都没写 PATTERNS.md,但 GLM 写了 6 篇 phase 完成日志和一份内容详尽的 AGENT.md —— 方法论 *捕获了*,只是放在了不同文件里。v0.7.2 把更宽的原则写进 skill:推进之前先持久化,格式灵活。
|
|
320
361
|
|
|
321
|
-
引擎从文件系统推导里程碑(v0.7.0 Group A)会按磁盘事实核验覆盖率,无论你怎么切分工作。TaskBoard
|
|
362
|
+
引擎从文件系统推导里程碑(v0.7.0 Group A)会按磁盘事实核验覆盖率,无论你怎么切分工作。TaskBoard 是你的草稿;磁盘才是契约;持久化文件是项目的记忆。
|