kc-beta 0.7.5 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/README.md +47 -0
  2. package/package.json +3 -2
  3. package/src/agent/context.js +17 -1
  4. package/src/agent/engine.js +467 -100
  5. package/src/agent/llm-client.js +24 -1
  6. package/src/agent/pipelines/_advance-hints.js +92 -0
  7. package/src/agent/pipelines/_milestone-derive.js +325 -20
  8. package/src/agent/pipelines/skill-authoring.js +49 -3
  9. package/src/agent/tools/agent-tool.js +2 -2
  10. package/src/agent/tools/consult-skill.js +15 -0
  11. package/src/agent/tools/dashboard-render.js +48 -1
  12. package/src/agent/tools/document-parse.js +31 -2
  13. package/src/agent/tools/phase-advance.js +17 -13
  14. package/src/agent/tools/release.js +343 -7
  15. package/src/agent/tools/sandbox-exec.js +65 -8
  16. package/src/agent/tools/worker-llm-call.js +95 -15
  17. package/src/agent/workspace.js +25 -4
  18. package/src/cli/components.js +4 -1
  19. package/src/cli/index.js +125 -8
  20. package/src/config.js +19 -2
  21. package/src/marathon/driver.js +217 -0
  22. package/src/marathon/prompts.js +93 -0
  23. package/template/.env.template +17 -1
  24. package/template/AGENT.md +2 -2
  25. package/template/skills/en/auto-model-selection/SKILL.md +55 -35
  26. package/template/skills/en/bootstrap-workspace/SKILL.md +27 -0
  27. package/template/skills/en/compliance-judgment/SKILL.md +14 -0
  28. package/template/skills/en/confidence-system/SKILL.md +30 -8
  29. package/template/skills/en/corner-case-management/SKILL.md +53 -33
  30. package/template/skills/en/cross-document-verification/SKILL.md +88 -83
  31. package/template/skills/en/dashboard-reporting/SKILL.md +91 -66
  32. package/template/skills/en/dashboard-reporting/scripts/generate_dashboard.py +1 -1
  33. package/template/skills/en/data-sensibility/SKILL.md +19 -12
  34. package/template/skills/en/document-chunking/SKILL.md +99 -15
  35. package/template/skills/en/entity-extraction/SKILL.md +14 -4
  36. package/template/skills/en/quality-control/SKILL.md +23 -0
  37. package/template/skills/en/rule-extraction/SKILL.md +92 -94
  38. package/template/skills/en/rule-extraction/references/chunking-strategies.md +7 -78
  39. package/template/skills/en/skill-authoring/SKILL.md +85 -2
  40. package/template/skills/en/skill-creator/SKILL.md +25 -3
  41. package/template/skills/en/skill-to-workflow/SKILL.md +73 -1
  42. package/template/skills/en/task-decomposition/SKILL.md +1 -1
  43. package/template/skills/en/tree-processing/SKILL.md +1 -1
  44. package/template/skills/en/version-control/SKILL.md +15 -0
  45. package/template/skills/en/work-decomposition/SKILL.md +52 -32
  46. package/template/skills/phase_skills.yaml +5 -0
  47. package/template/skills/zh/auto-model-selection/SKILL.md +54 -33
  48. package/template/skills/zh/bootstrap-workspace/SKILL.md +27 -0
  49. package/template/skills/zh/compliance-judgment/SKILL.md +51 -37
  50. package/template/skills/zh/compliance-judgment/references/output-format.md +62 -62
  51. package/template/skills/zh/confidence-system/SKILL.md +34 -9
  52. package/template/skills/zh/corner-case-management/SKILL.md +71 -104
  53. package/template/skills/zh/cross-document-verification/SKILL.md +90 -195
  54. package/template/skills/zh/cross-document-verification/references/contradiction-taxonomy.md +36 -36
  55. package/template/skills/zh/dashboard-reporting/SKILL.md +82 -232
  56. package/template/skills/zh/dashboard-reporting/scripts/generate_dashboard.py +1 -1
  57. package/template/skills/zh/data-sensibility/SKILL.md +13 -0
  58. package/template/skills/zh/document-chunking/SKILL.md +101 -18
  59. package/template/skills/zh/document-parsing/SKILL.md +65 -65
  60. package/template/skills/zh/document-parsing/references/parser-catalog.md +26 -26
  61. package/template/skills/zh/entity-extraction/SKILL.md +78 -68
  62. package/template/skills/zh/evolution-loop/references/convergence-guide.md +38 -38
  63. package/template/skills/zh/quality-control/SKILL.md +23 -0
  64. package/template/skills/zh/quality-control/references/qa-layers.md +65 -65
  65. package/template/skills/zh/quality-control/references/sampling-strategies.md +49 -49
  66. package/template/skills/zh/rule-extraction/SKILL.md +199 -188
  67. package/template/skills/zh/rule-extraction/references/chunking-strategies.md +5 -78
  68. package/template/skills/zh/skill-authoring/SKILL.md +136 -58
  69. package/template/skills/zh/skill-authoring/references/skill-format-spec.md +39 -39
  70. package/template/skills/zh/skill-creator/SKILL.md +215 -201
  71. package/template/skills/zh/skill-creator/references/schemas.md +60 -60
  72. package/template/skills/zh/skill-to-workflow/SKILL.md +73 -1
  73. package/template/skills/zh/skill-to-workflow/references/worker-llm-catalog.md +24 -24
  74. package/template/skills/zh/task-decomposition/SKILL.md +1 -1
  75. package/template/skills/zh/task-decomposition/references/decision-matrix.md +54 -54
  76. package/template/skills/zh/tree-processing/SKILL.md +67 -63
  77. package/template/skills/zh/version-control/SKILL.md +15 -0
  78. package/template/skills/zh/version-control/references/trace-id-spec.md +34 -34
  79. package/template/skills/zh/work-decomposition/SKILL.md +52 -30
  80. package/template/workflows/common/llm_client.py +168 -0
  81. package/template/workflows/common/utils.py +132 -0
@@ -32,6 +32,16 @@ export class LLMClient {
32
32
  this.baseUrl = baseUrl.replace(/\/+$/, "");
33
33
  this.authType = authType;
34
34
  this.apiFormat = apiFormat;
35
+ // v0.8.2 P14-A: request-level timeout for fetch. SiliconFlow GLM-5.1
36
+ // streams hung 8h+ overnight in E2E #12 with no HTTP-level cutoff.
37
+ // 10 min ceiling (configurable via KC_LLM_REQUEST_TIMEOUT_MS) lets the
38
+ // marathon driver's `error: terminated` → recovery path kick in within
39
+ // minutes instead of hours when the upstream stalls a request without
40
+ // closing the TCP connection.
41
+ const envTimeout = parseInt(process.env.KC_LLM_REQUEST_TIMEOUT_MS || "0", 10);
42
+ this.requestTimeoutMs = Number.isFinite(envTimeout) && envTimeout > 0
43
+ ? envTimeout
44
+ : 10 * 60 * 1000;
35
45
  }
36
46
 
37
47
  /**
@@ -196,10 +206,15 @@ export class LLMClient {
196
206
  let resp;
197
207
  try {
198
208
  resp = await withRetry(async () => {
209
+ // v0.8.2 P14-A: AbortSignal.timeout sets a single overall deadline for the
210
+ // request (it does not reset on per-chunk progress). Hung streams (SiliconFlow
211
+ // GLM-5.1 overnight, E2E #12) abort within requestTimeoutMs and surface as an
212
+ // error event the marathon driver can recover from.
199
213
  const r = await fetch(this._getEndpoint(), {
200
214
  method: "POST",
201
215
  headers: this._buildHeaders(),
202
216
  body: JSON.stringify(body),
217
+ signal: AbortSignal.timeout(this.requestTimeoutMs),
203
218
  });
204
219
  if (!r.ok) {
205
220
  const text = await r.text();
@@ -215,7 +230,13 @@ export class LLMClient {
215
230
  // A8: Any pre-stream failure (network, auth, 4xx/5xx after retry) is
216
231
  // tagged and re-thrown. Engine's outer catch sees exactly one tagged
217
232
  // error event.
218
- if (!err.streamTermination) err.streamTermination = "connect_error";
233
+ // v0.8.2 P14-A: a TimeoutError (what AbortSignal.timeout raises) or an AbortError
234
+ // is tagged request_timeout so audits can count these vs. generic connect errors.
235
+ if (err.name === "TimeoutError" || err.name === "AbortError") {
236
+ err.streamTermination = "request_timeout";
237
+ } else if (!err.streamTermination) {
238
+ err.streamTermination = "connect_error";
239
+ }
219
240
  throw err;
220
241
  }
221
242
 
@@ -256,10 +277,12 @@ export class LLMClient {
256
277
  const body = this._buildNonStreamBody({ model, messages, maxTokens });
257
278
 
258
279
  const resp = await withRetry(async () => {
280
+ // v0.8.2 P14-A: same request-level timeout as streamChat for symmetry.
259
281
  const r = await fetch(this._getEndpoint(), {
260
282
  method: "POST",
261
283
  headers: this._buildHeaders(),
262
284
  body: JSON.stringify(body),
285
+ signal: AbortSignal.timeout(this.requestTimeoutMs),
263
286
  });
264
287
  if (!r.ok) {
265
288
  const text = await r.text();
@@ -0,0 +1,92 @@
1
+ // v0.8 P0-E: prescriptive refusal hints for phase_advance gate failures.
2
+ //
3
+ // 资管 + 贷款 v0.7.5 audits both observed the force-bypass pattern:
4
+ // engine refuses phase_advance with `engineCounts: workflowsTested: 0/14`,
5
+ // agent does ~3 min of cleanup, then forces past anyway. Cleanup happens
6
+ // (signal IS being consumed) but force always wins because the descriptive
7
+ // "exit criteria not met" hint doesn't tell the agent WHAT to write.
8
+ //
9
+ // v0.8 P0-E replaces the descriptive hint with a prescriptive one. The
10
+ // hint text below derives from the same artifact paths + filename patterns
11
+ // that _milestone-derive.js walks, so the agent's instructions match what
12
+ // the engine will check next turn.
13
+ //
14
+ // Design contract (matches v0.8 design doc Q20 user lean):
15
+ // - Single shared helper here; engine.js + phase-advance.js both call it.
16
+ // - Each hint is one or two concrete sentences naming a path, a filename
17
+ // pattern, and a script to run (where applicable).
18
+ // - Hint output is plain text, suitable to drop into a tool result.
19
+ //
20
+ // To extend: edit the per-phase hint generators below. Keep the artifact
21
+ // paths in sync with the corresponding derive function in _milestone-derive.js.
22
+
23
/**
 * Build a prescriptive refusal hint for a phase_advance gate failure.
 *
 * The hint is looked up in HINTS_BY_PHASE by own property only, so phase names
 * that collide with Object.prototype members ("constructor", "toString",
 * "__proto__", ...) get the generic fallback instead of a stringified
 * function/object.
 *
 * @param {string} fromPhase — the phase the agent is trying to leave
 * @param {object} engineCounts — raw engine counts (or null); accepted for
 *   call-site compatibility, not read by this function
 * @param {string} [engineCountsLine] — formatted summary string from _buildEngineCountsBlock
 * @returns {string} a multi-line hint suitable for the LLM tool result
 */
export function getPrescriptiveHint(fromPhase, engineCounts, engineCountsLine = "") {
  // Optional telemetry header; omitted entirely when no summary line is given.
  const header = engineCountsLine
    ? `Engine telemetry: ${engineCountsLine}\n\n`
    : "";

  // Own-property + type check: a plain-object lookup would otherwise also
  // resolve inherited keys such as "constructor".
  const hint =
    typeof fromPhase === "string" && Object.hasOwn(HINTS_BY_PHASE, fromPhase)
      ? HINTS_BY_PHASE[fromPhase]
      : undefined;

  if (typeof hint !== "string" || hint.length === 0) {
    return header + "Check the system prompt's phase state block for missing milestones. The engine derives milestones from filesystem facts.";
  }
  return header + hint;
}

// Per-phase hint text, keyed by the phase being left. Each entry names the
// artifacts the agent should produce before retrying phase_advance.
const HINTS_BY_PHASE = {
  bootstrap:
    "To advance to rule_extraction:\n" +
    " • Verify <workspace>/source_docs/ contains the regulation file(s) you're extracting rules from.\n" +
    " • Verify <workspace>/samples/ contains at least one sample document for testing.\n" +
    " • Ensure AGENT.md exists at workspace root with project context filled in.\n" +
    "Engine reads filesystem facts; no need to call any 'mark bootstrap complete' tool — just produce the artifacts.",

  // NOTE(review): the chunk-traceability bullet points at `applicable_sections`;
  // confirm that matches the field(s) the rule_extraction derive function counts.
  rule_extraction:
    "To advance to skill_authoring:\n" +
    " • For each rule in the source regulation, write an entry to rules/catalog.json with {id, source_ref, falsifiability_statement, applicable_sections}.\n" +
    " • Use rule_catalog tool (operation: 'write') for catalog entries; engine derives `rulesExtracted` from this file.\n" +
    " • For chunk traceability: each catalog entry should reference its source chunk via applicable_sections.\n" +
    " • Write rule_skills/coverage_report.md or rules/coverage_report.md to mark coverageAudited=true (a per-rule × per-section table).",

  skill_authoring:
    "To advance to skill_testing:\n" +
    " • For each rule_id in rules/catalog.json, create rule_skills/<rule_id>/SKILL.md (uppercase! engine path-match is case-sensitive on Linux).\n" +
    " • Each SKILL.md needs frontmatter (id, name, description) + a body describing verification logic.\n" +
    " • Pair each SKILL.md with rule_skills/<rule_id>/check.py — substantive logic, NOT a 'return NOT_APPLICABLE' stub. If logic lives in workflows/, check.py must import + call the workflow.\n" +
    " • For grouped skills covering multiple rules, frontmatter MUST include `source_rules: [R001, R005, ...]` so engine credits each rule_id.\n" +
    " • Engine counts `rulesCovered` from rule_skills/ walk; aim for catalog.json's full rule list.",

  skill_testing:
    "To advance to distillation:\n" +
    " • For each rule_id, write test results to output/results/skill_test_round<N>.json or output/results/<rule_id>_<sample>.json.\n" +
    " • Each test result needs `verdict` (PASS/FAIL/NOT_APPLICABLE) plus per-rule accuracy.\n" +
    " • Engine counts `skillsTested` from these files. Aim for ≥1 result per rule, with ≥90% accuracy on labeled samples.\n" +
    " • If a rule consistently fails, iterate the SKILL.md + check.py before advancing (this is the evolution-loop pattern).",

  distillation:
    "To advance to production_qc:\n" +
    " • For each rule_id, write workflows/<rule_id>/workflow_v1.py (regex-only or hybrid regex+worker_llm).\n" +
    " • Each workflow.py needs a `verify(document_text, config)` function returning {verdict, evidence, confidence, ...}.\n" +
    " • Engine counts `workflowsCreated` from workflows/<rule_id>/workflow_v*.py walk.\n" +
    " • Run scripts/v1_regression.py (or equivalent) to populate output/results/v1_regression.json — engine counts `workflowsTested` from this.\n" +
    " • For grouped workflows (one workflow covering multiple rules), declare `source_rules: [...]` in workflow's docstring or sidecar config.",

  // NOTE(review): the second bullet says the engine "sums across files" for
  // `documents_reviewed`; confirm against the production_qc derive function.
  production_qc:
    "To advance to finalization:\n" +
    " • Write output/results/production_qc_results.json (preferred shape: {results: {<rule_id>: {<doc_id>: {verdict, evidence, confidence}}}}).\n" +
    " • OR write output/qc/review_<batch>.json with `documents_reviewed: N` for each batch — engine sums across files.\n" +
    " • Engine counts `batchesProcessed` and `documentsReviewed`. Each batch should cover the full doc set OR a meaningful sample.\n" +
    " • If accuracy is below threshold, run evolution-loop on the failing rules before advancing.",

  finalization:
    "(Finalization is the terminal phase — no forward advance.)",
};
91
+
92
+ export default getPrescriptiveHint;
@@ -80,6 +80,22 @@ function readJsonSafe(p) {
80
80
  try { return JSON.parse(fs.readFileSync(p, "utf-8")); } catch { return null; }
81
81
  }
82
82
 
83
+ // v0.8 P1-A: find the first existing file from a list of candidate relative
84
+ // paths. Returns the absolute path of the first match, or null. Used for
85
+ // "agent-might-have-written-it-anywhere" lookups where conventions vary.
86
+ //
87
+ // 资管 v0.7.5 wrote rule_skills/coverage_report.md; 贷款 v0.7.5 wrote
88
+ // output/coverage_report.md or similar. Each derive function previously
89
+ // hardcoded its own short list — extracting this helper keeps additions
90
+ // centralized.
91
/**
 * Return the absolute path of the first candidate that exists as a file.
 *
 * @param {string} workspaceCwd — base directory the candidates are joined onto
 * @param {Iterable<string>} relPaths — candidate relative paths, tried in order
 * @returns {string|null} first existing path, or null when none exist
 */
function findFileAcrossKnownPaths(workspaceCwd, relPaths) {
  for (const candidate of relPaths) {
    const resolved = path.join(workspaceCwd, candidate);
    // Order matters: the earliest listed location wins.
    if (!fileExists(resolved)) continue;
    return resolved;
  }
  return null;
}
98
+
83
99
/**
 * Read a file as UTF-8 text, best-effort.
 *
 * @param {string} p — path of the file to read
 * @returns {string} the file contents, or "" if the read throws for any reason
 */
function readFileSafe(p) {
  let text = "";
  try {
    text = fs.readFileSync(p, "utf-8");
  } catch {
    // Deliberate best-effort: an unreadable/missing file reads as empty.
  }
  return text;
}
@@ -140,13 +156,33 @@ function sha256OfFile(p) {
140
156
  } catch { return null; }
141
157
  }
142
158
 
143
- // Normalize a rule id like "R14" / "r014" / "R0014" to canonical "R014".
159
+ // Normalize a rule id to a canonical form for dedup + comparison.
160
+ // Accepts two shapes:
161
+ // Bare-numeric: "R14" / "r014" / "R0014" → "R014"
162
+ // Compound: "R01-01" / "R01_01" / "R001-001" → "R001-01"
163
+ // (zero-pads the major part to 3 digits and the minor part
164
+ // to 2 digits; uses dash separator canonically)
144
165
  // Returns null for non-matching strings (e.g., thematic skill names like
145
- // "account_identity" — those stay as-is via the second branch).
146
- function canonicalRuleId(s) {
166
+ // "account_identity" — those stay as-is and don't get credited via this
167
+ // path; their credit comes from frontmatter `source_rules:` instead).
168
+ //
169
+ // v0.8.3 P20-B2: compound form added. E2E #13 资管 used `R01-01`..`R07-01`
170
+ // naturally following the regulation's subsection numbering; v0.8.2's
171
+ // bare-only regex returned null for all 15 dirs → `rulesCovered: 0/15`
172
+ // → engine refused natural skill_testing advance.
173
/**
 * Canonicalize a rule id for dedup + comparison.
 *
 * Input is trimmed and matched case-insensitively against two shapes:
 *   - bare:     "R" + digits                     → "R" + major padded to 3 digits
 *   - compound: "R" + digits + "-" or "_" + digits → "R" + major (3 digits) + "-" + minor (2 digits)
 *
 * @param {*} s — candidate rule id
 * @returns {string|null} canonical id, or null for non-strings and non-matching strings
 */
export function canonicalRuleId(s) {
  if (typeof s !== "string") return null;

  const text = s.trim();
  // Parse the digit run numerically (drops leading zeros), then left-pad.
  const pad = (digits, width) => String(parseInt(digits, 10)).padStart(width, "0");

  // Compound is tried first; the bare pattern cannot match a string containing a separator.
  const compoundMatch = /^R0*(\d+)[-_](\d+)$/i.exec(text);
  if (compoundMatch !== null) {
    const [, majorDigits, minorDigits] = compoundMatch;
    return `R${pad(majorDigits, 3)}-${pad(minorDigits, 2)}`;
  }

  const bareMatch = /^R0*(\d+)$/i.exec(text);
  return bareMatch === null ? null : `R${pad(bareMatch[1], 3)}`;
}
152
188
 
@@ -177,9 +213,16 @@ export function deriveRuleExtractionMilestones(workspace) {
177
213
 
178
214
  // rulesExtracted: every rule object across every JSON file in rules/
179
215
  // that has a non-empty `id` field. catalog.json is canonical but agents
180
- // sometimes fan out to per-rule files (E2E #5 DS).
216
+ // sometimes fan out to per-rule files (E2E #5 DS) — or write SIBLING
217
+ // files with the same IDs plus additional metadata (E2E #13 资管's
218
+ // `rules/difficulty.json` added judgment-type classifications and
219
+ // doubled the count from 15 → 30 because the engine pushed IDs without
220
+ // dedup). v0.8.3 P20-B1: dedup by ID across all rules/*.json files.
221
+ // First-seen wins for chunk-ref counting (catalog.json is read first
222
+ // by alphabetical / fs order in most cases).
181
223
  const rulesExtracted = [];
182
224
  const rulesWithChunkRefs = [];
225
+ const seenIds = new Set();
183
226
  if (dirExists(rulesDir)) {
184
227
  for (const e of listChildFiles(rulesDir)) {
185
228
  if (!e.name.endsWith(".json")) continue;
@@ -188,8 +231,21 @@ export function deriveRuleExtractionMilestones(workspace) {
188
231
  const items = Array.isArray(data) ? data : (data.rules || []);
189
232
  for (const r of items) {
190
233
  if (r && typeof r.id === "string" && r.id.length) {
234
+ if (seenIds.has(r.id)) continue; // v0.8.3 P20-B1 dedup
235
+ seenIds.add(r.id);
191
236
  rulesExtracted.push(r.id);
192
- if (Array.isArray(r.source_chunk_ids) && r.source_chunk_ids.length > 0) {
237
+ // v0.8.2 P13-C: accept any of three field names for chunk
238
+ // references. Engine historically looked only for
239
+ // `source_chunk_ids`, but 贷款 v0.8.1 + 资管 v0.8.1 catalogs
240
+ // wrote `chunk_ids` (the shorter form agents naturally pick
241
+ // from the rule-extraction skill examples). `chunk_refs` is
242
+ // a legacy alias from older audit docs. Any non-empty match
243
+ // counts.
244
+ const chunks = (Array.isArray(r.source_chunk_ids) && r.source_chunk_ids)
245
+ || (Array.isArray(r.chunk_ids) && r.chunk_ids)
246
+ || (Array.isArray(r.chunk_refs) && r.chunk_refs)
247
+ || null;
248
+ if (chunks && chunks.length > 0) {
193
249
  rulesWithChunkRefs.push(r.id);
194
250
  }
195
251
  }
@@ -197,15 +253,18 @@ export function deriveRuleExtractionMilestones(workspace) {
197
253
  }
198
254
  }
199
255
 
200
- // coverageAudited: presence of rules/coverage_audit.{md,json} OR a
201
- // rules/coverage_report.md / output/coverage_report.md. Loose criterion
202
- // because agents pick different conventions; the spirit is "did the
256
+ // coverageAudited: presence of any coverage audit/report doc. Loose
257
+ // criterion because agents pick different conventions; the spirit is "did the
203
258
  // agent produce a coverage doc" not "did they put it in this exact file".
204
- const coverageAudited =
205
- fileExists(path.join(rulesDir, "coverage_audit.md")) ||
206
- fileExists(path.join(rulesDir, "coverage_audit.json")) ||
207
- fileExists(path.join(rulesDir, "coverage_report.md")) ||
208
- fileExists(path.join(cwd, "output", "coverage_report.md"));
259
+ // v0.8 P1-A: use the same findFileAcrossKnownPaths helper as finalization.
260
+ const coverageAudited = !!findFileAcrossKnownPaths(cwd, [
261
+ path.join("rules", "coverage_audit.md"),
262
+ path.join("rules", "coverage_audit.json"),
263
+ path.join("rules", "coverage_report.md"),
264
+ path.join("output", "coverage_report.md"),
265
+ path.join("rule_skills", "coverage_report.md"), // v0.8 P1-A
266
+ path.join("output", "qc", "coverage_report.md"),
267
+ ]);
209
268
 
210
269
  return {
211
270
  rulesExtracted,
@@ -312,15 +371,121 @@ export function deriveSkillAuthoringMilestones(workspace) {
312
371
  }
313
372
  } catch { /* best-effort */ }
314
373
  }
374
+
375
+ // v0.8.2 P13-D: also credit rule_ids declared in rule_mapping.json.
376
+ // 资管 v0.8.1 wrote 6 thematic-overlay dirs (R01_periodic_report,
377
+ // R02_custodian_core, etc.) each containing a rule_mapping.json that
378
+ // maps rule_ids to engine-level check function names. The dirs have
379
+ // no own check.py because the actual implementation lives in
380
+ // workspace-root verify_v*.py. Without recognizing rule_mapping.json,
381
+ // the engine treats them as orphan dirs.
382
+ //
383
+ // Rule-id formats in the wild include both bare-numeric (R01, R027)
384
+ // and compound (R01-05, R02-08). canonicalRuleId() canonicalizes both
385
+ // forms; the raw-key branch below is a fallback for keys it rejects
386
+ // but that still look like a rule id (R\d+ followed by a bare `-` or
387
+ // `_` with no minor digits), which are kept as written (trimmed).
388
+ try {
389
+ const mappingPath = path.join(skillPath, "rule_mapping.json");
390
+ if (fileExists(mappingPath)) {
391
+ const mapping = readJsonSafe(mappingPath);
392
+ if (mapping && typeof mapping === "object" && !Array.isArray(mapping)) {
393
+ for (const key of Object.keys(mapping)) {
394
+ const canon = canonicalRuleId(key);
395
+ if (canon) {
396
+ ruleIdsCovered.add(canon);
397
+ } else if (/^R0*\d+[-_]?\d*$/i.test(key.trim())) {
398
+ // Rule-id-shaped key that canonicalRuleId() rejected (e.g. "R01-") — preserve as-is
399
+ ruleIdsCovered.add(key.trim());
400
+ }
401
+ }
402
+ }
403
+ }
404
+ } catch { /* best-effort */ }
315
405
  }
316
406
 
407
+ // v0.8 P2-F (item 22): count stub-shaped check.py files. Pairs with
408
+ // v0.8 P2-A teaching about the inverse-stub anti-pattern. Surfaces
409
+ // a ratio that downstream code (skill-authoring exitCriteriaMet)
410
+ // can choose to enforce via env flag.
411
+ const checkPyAudit = _auditCheckPyShapes(skillsDir);
412
+
317
413
  return {
318
414
  skillsAuthored,
319
415
  skillsWithScripts,
320
416
  ruleIdsCovered: [...ruleIdsCovered],
417
+ checkPyTotal: checkPyAudit.total,
418
+ checkPyStubCount: checkPyAudit.stubFiles.length,
419
+ checkPyStubFiles: checkPyAudit.stubFiles,
420
+ checkPyStubRatio: checkPyAudit.total > 0
421
+ ? +(checkPyAudit.stubFiles.length / checkPyAudit.total).toFixed(3)
422
+ : 0,
321
423
  };
322
424
  }
323
425
 
426
+ // v0.8 P2-F: walk rule_skills/<id>/ for check_*.py and check each for
427
+ // stub-shape patterns. Returns {total, stubFiles}. Patterns recognized
428
+ // as stubs (per v0.7.x audit findings):
429
+ // - returns literal `"verdict": "NOT_APPLICABLE"` (资管 v0.7.5 variant)
430
+ // - returns literal `"pass": null` (v0.7.0 legacy)
431
+ // - returns literal `"method": "stub"`
432
+ // - AND none of: workflow import, >20 non-comment lines.
433
+ // Substantive signals override the stub-return signal (a check.py that
434
+ // imports + delegates to a workflow but happens to return NOT_APPLICABLE
435
+ // for some sub-path is not a stub).
436
/**
 * Count check scripts under each child directory of `skillsDir` and collect
 * the ones that look like stubs.
 *
 * Child directories whose name starts with "__" are skipped. Scripts are
 * whatever `findCheckScripts` returns for a skill directory; stub detection
 * is delegated to `_isCheckPyStubShaped`.
 *
 * @param {string} skillsDir — directory holding one sub-directory per skill
 * @returns {{total: number, stubFiles: string[]}} script count and stub paths
 *   relative to `skillsDir`; `{total: 0, stubFiles: []}` when the directory is absent
 */
function _auditCheckPyShapes(skillsDir) {
  const stubFiles = [];
  let total = 0;

  if (dirExists(skillsDir)) {
    for (const { name } of listChildDirs(skillsDir)) {
      if (name.startsWith("__")) continue;

      for (const scriptPath of findCheckScripts(path.join(skillsDir, name))) {
        total += 1;
        if (!_isCheckPyStubShaped(scriptPath)) continue;
        stubFiles.push(path.relative(skillsDir, scriptPath));
      }
    }
  }

  return { total, stubFiles };
}
454
+
455
/**
 * Decide whether a Python check script is "stub-shaped".
 *
 * A script is a stub when it contains a stub-style return
 * (`"verdict": "NOT_APPLICABLE"`, `"pass": None` or `"method": "stub"` inside
 * a returned dict literal) and shows no substantive signal. Substantive
 * signals, any of which makes the result `false`:
 *   - an import of the `workflows` package (`from workflows... import ...`,
 *     including plain `from workflows import x`, or `import workflows[...]`);
 *   - another verdict literal (PASS / FAIL / WARNING);
 *   - a reference to `make_result`.
 *
 * @param {string} scriptPath — path of the check script to inspect
 * @returns {boolean} true if stub-shaped; false otherwise, including when the
 *   file cannot be read
 */
function _isCheckPyStubShaped(scriptPath) {
  let content;
  try {
    content = fs.readFileSync(scriptPath, "utf-8");
  } catch {
    // Unreadable file: don't flag it (best-effort audit).
    return false;
  }

  // Substantive signal 1: delegates to a workflow. `[.\w]*` (not `+`) so that
  // `from workflows import x` counts too; the second alternative also accepts
  // a bare or indented `import workflows`.
  if (/from\s+workflows[.\w]*\s+import|^[ \t]*import\s+workflows\b/m.test(content)) {
    return false;
  }

  // Stub return patterns: a dict literal returned with one of the known
  // placeholder fields.
  const stubReturnPatterns = [
    /return\s+\{[^}]*["']verdict["']\s*:\s*["']NOT_APPLICABLE["']/m,
    /return\s+\{[^}]*["']pass["']\s*:\s*None/m,
    /return\s+\{[^}]*["']method["']\s*:\s*["']stub["']/m,
  ];
  if (!stubReturnPatterns.some((re) => re.test(content))) return false;

  // Any other verdict (or the make_result helper) means the file branches on
  // real outcomes even if one path returns a placeholder — not a stub.
  const substantivePatterns = [
    /["']verdict["']\s*:\s*["']PASS["']/m,
    /["']verdict["']\s*:\s*["']FAIL["']/m,
    /["']verdict["']\s*:\s*["']WARNING["']/m,
    /\bmake_result\b/,
  ];
  return !substantivePatterns.some((re) => re.test(content));
}
488
+
324
489
  // ───────────────────────────────────────────────────────────────────
325
490
  // skill_testing
326
491
  // ───────────────────────────────────────────────────────────────────
@@ -613,10 +778,45 @@ export function deriveProductionQcMilestones(workspace) {
613
778
  }
614
779
  }
615
780
 
781
+ // v0.8 P1-A: per-doc QC review files at output/qc/reviews/doc_*.json
782
+ // (贷款 v0.7.5 shape). Each file is a single review object with
783
+ // {review_id, document, verdict}. Engine previously skipped these
784
+ // because they don't match the batch heuristic, causing
785
+ // `documents_reviewed: 0` despite 16 docs on disk.
786
+ const perDocReviewsDir = path.join(outputDir, "qc", "reviews");
787
+ if (dirExists(perDocReviewsDir)) {
788
+ for (const e of listChildFiles(perDocReviewsDir)) {
789
+ if (!e.name.endsWith(".json")) continue;
790
+ const data = readJsonSafe(path.join(perDocReviewsDir, e.name));
791
+ if (!data || typeof data !== "object" || !data.verdict) continue;
792
+ // Document identifier: prefer explicit fields, fall back to filename
793
+ const docKey = data.document || data.doc || data.file || data.path || e.name.replace(/\.json$/, "");
794
+ documentsReviewedSet.add(String(docKey));
795
+ }
796
+ }
797
+
798
+ // v0.8 P1-A: also read numeric `documents_reviewed: N` from any
799
+ // top-level batch file (贷款 review_001.json declares 16 directly).
800
+ // We use this only when the doc set is smaller than the claim — agents
801
+ // sometimes write summary batches without enumerating individual docs.
802
+ let declaredDocCount = 0;
803
+ for (const dir of candidateDirs) {
804
+ if (!dirExists(dir)) continue;
805
+ for (const e of listChildFiles(dir)) {
806
+ if (!e.name.endsWith(".json")) continue;
807
+ const data = readJsonSafe(path.join(dir, e.name));
808
+ if (!data || typeof data !== "object") continue;
809
+ const n = Number(data.documents_reviewed);
810
+ if (Number.isFinite(n) && n > declaredDocCount) declaredDocCount = n;
811
+ }
812
+ }
813
+ const documentsReviewed = Math.max(documentsReviewedSet.size, declaredDocCount);
814
+
616
815
  return {
617
816
  batchesProcessed,
618
- documentsReviewed: documentsReviewedSet.size,
817
+ documentsReviewed,
619
818
  documentsReviewedKeys: [...documentsReviewedSet], // for describeState detail
819
+ documentsReviewedDeclared: declaredDocCount > documentsReviewedSet.size ? declaredDocCount : 0,
620
820
  };
621
821
  }
622
822
 
@@ -658,10 +858,18 @@ export function deriveFinalizationMilestones(workspace) {
658
858
  }
659
859
  }
660
860
 
661
- // coverageReportWritten: rules/coverage_report.md OR output/coverage_report.md.
662
- const coverageReportWritten =
663
- fileExists(path.join(cwd, "rules", "coverage_report.md")) ||
664
- fileExists(path.join(cwd, "output", "coverage_report.md"));
861
+ // coverageReportWritten: accept multiple known agent-write locations.
862
+ // v0.8 P1-A: added rule_skills/coverage_report.md (资管 v0.7.5 wrote here)
863
+ // and coverage_audit.md variants (贷款 v0.7.5 wrote rules/coverage_audit.md).
864
+ // The "coverage doc" concept covers both report-style + audit-style files.
865
+ const coverageReportWritten = !!findFileAcrossKnownPaths(cwd, [
866
+ path.join("rules", "coverage_report.md"),
867
+ path.join("rules", "coverage_audit.md"), // 贷款 v0.7.5
868
+ path.join("rules", "coverage_audit.json"),
869
+ path.join("output", "coverage_report.md"),
870
+ path.join("rule_skills", "coverage_report.md"), // 资管 v0.7.5
871
+ path.join("output", "qc", "coverage_report.md"), // future-proofing
872
+ ]);
665
873
 
666
874
  // finalDashboardWritten: at least one dashboards/*.html that is NOT a
667
875
  // duplicate of any other. DS + GLM both shipped byte-identical
@@ -694,11 +902,108 @@ export function deriveFinalizationMilestones(workspace) {
694
902
  }
695
903
  }
696
904
 
905
+ // v0.8 P0-D: stale-release detection. SOFT gate — surfaces a warning,
906
+ // doesn't refuse phase advance. 资管 audit § 9.1 finding 11 found both
907
+ // release bundles snapped BEFORE the user's "更激进 worker LLM" prompt
908
+ // drove 14 hybrid workflow_v2.py builds, but neither was re-released.
909
+ // We detect by comparing the most-recent release manifest's created_at
910
+ // against the mtimes of workflows/ and rule_skills/.
911
+ const staleRelease = _detectStaleRelease(cwd);
912
+
697
913
  return {
698
914
  readmeWritten,
699
915
  coverageReportWritten,
700
916
  finalDashboardWritten,
701
917
  dashboardDuplicatesDetected,
918
+ releaseIsStale: staleRelease.isStale,
919
+ staleReleaseDetail: staleRelease.detail,
920
+ };
921
+ }
922
+
923
+ // v0.8 P0-D: detect whether workflows/ or rule_skills/ contain files
924
+ // modified after the most-recent release manifest was written. Returns
925
+ // {isStale: bool, detail: null | {acceptedAt} | {releasePath, releaseSlug, releaseCreatedAt, newerFiles: [...], totalNewerCount, hint}}.
926
+ // SOFT semantics — the milestone is informational; phase advance still
927
+ // works. The agent + downstream tooling (e2e-audit) decides what to do.
928
+ function _detectStaleRelease(cwd) {
929
+ const releasesRoot = path.join(cwd, "output", "releases");
930
+ if (!dirExists(releasesRoot)) return { isStale: false, detail: null };
931
+
932
+ // Find most-recent release manifest (by created_at OR fs mtime as fallback).
933
+ let latestRelease = null; // {path, createdAt: Date}
934
+ for (const e of listChildDirs(releasesRoot)) {
935
+ const manifestPath = path.join(releasesRoot, e.name, "manifest.json");
936
+ try {
937
+ const stat = fs.statSync(manifestPath);
938
+ if (!stat.isFile()) continue;
939
+ let createdAt = stat.mtime;
940
+ try {
941
+ const m = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
942
+ if (m?.created_at) {
943
+ const parsed = new Date(m.created_at);
944
+ if (!Number.isNaN(parsed.getTime())) createdAt = parsed;
945
+ }
946
+ } catch { /* fall back to mtime */ }
947
+ if (!latestRelease || createdAt > latestRelease.createdAt) {
948
+ latestRelease = { path: manifestPath, createdAt, slug: e.name };
949
+ }
950
+ } catch { /* skip */ }
951
+ }
952
+
953
+ if (!latestRelease) return { isStale: false, detail: null };
954
+
955
+ // Walk workflows/ and rule_skills/ for files newer than latestRelease.createdAt.
956
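+ // Cap to the first 10 newer-than-release matches to bound report size; because of this cap, totalNewerCount in the result is at most 10 (a lower bound, not a full count).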
957
+ const newerFiles = [];
958
+ const cutoff = latestRelease.createdAt.getTime();
959
+ const SCAN_DIRS = ["workflows", "rule_skills"];
960
+ for (const sub of SCAN_DIRS) {
961
+ const root = path.join(cwd, sub);
962
+ if (!dirExists(root)) continue;
963
+ const stack = [root];
964
+ while (stack.length && newerFiles.length < 10) {
965
+ const d = stack.pop();
966
+ let entries;
967
+ try { entries = fs.readdirSync(d, { withFileTypes: true }); } catch { continue; }
968
+ for (const ent of entries) {
969
+ if (ent.name.startsWith(".") || ent.name === "__pycache__" || ent.name === "node_modules") continue;
970
+ const p = path.join(d, ent.name);
971
+ if (ent.isDirectory()) { stack.push(p); continue; }
972
+ if (!ent.isFile()) continue;
973
+ // Care about workflow_v*.py + check.py + SKILL.md/skill.md only —
974
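+ // not __pycache__, not test artifacts, not .json. NOTE: the regex below is anchored only at the end of the name, so files whose names merely end in these strings (e.g. my_check.py) also match.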
975
+ if (!/(workflow_v\d+\.py|check\.py|SKILL\.md|skill\.md)$/.test(ent.name)) continue;
976
+ try {
977
+ const st = fs.statSync(p);
978
+ if (st.mtimeMs > cutoff) {
979
+ newerFiles.push({
980
+ path: path.relative(cwd, p),
981
+ mtime: new Date(st.mtimeMs).toISOString(),
982
+ });
983
+ if (newerFiles.length >= 10) break;
984
+ }
985
+ } catch { /* skip */ }
986
+ }
987
+ }
988
+ }
989
+
990
+ if (newerFiles.length === 0) return { isStale: false, detail: null };
991
+
992
+ // SOFT: accept_stale_release marker bypasses the warning. Agents that
993
+ // intentionally accept the older release write this file.
994
+ const acceptPath = path.join(cwd, "output", "releases", latestRelease.slug, ".accept_stale_release");
995
+ if (fileExists(acceptPath)) return { isStale: false, detail: { acceptedAt: latestRelease.slug } };
996
+
997
+ return {
998
+ isStale: true,
999
+ detail: {
1000
+ releasePath: path.relative(cwd, latestRelease.path),
1001
+ releaseSlug: latestRelease.slug,
1002
+ releaseCreatedAt: latestRelease.createdAt.toISOString(),
1003
+ newerFiles,
1004
+ totalNewerCount: newerFiles.length,
1005
+ hint: "Workspace artifacts modified after release was built. Either re-run the release tool or write .accept_stale_release into the release dir to acknowledge.",
1006
+ },
702
1007
  };
703
1008
  }
704
1009
 
@@ -3,7 +3,7 @@ import path from "node:path";
3
3
  import { Phase, PipelineEvent } from "./index.js";
4
4
  import { Pipeline } from "./base.js";
5
5
  import { SkillValidator } from "../skill-validator.js";
6
- import { deriveSkillAuthoringMilestones } from "./_milestone-derive.js";
6
+ import { deriveSkillAuthoringMilestones, canonicalRuleId } from "./_milestone-derive.js";
7
7
 
8
8
  export class SkillAuthoringPipeline extends Pipeline {
9
9
  /**
@@ -37,14 +37,31 @@ export class SkillAuthoringPipeline extends Pipeline {
37
37
  }
38
38
 
39
39
  _loadRules() {
40
+ // v0.8.3 P20-B1+B2: dedup rule IDs across all rules/*.json files AND
41
+ // canonicalize them so the rulesCovered comparison against
42
+ // ruleIdsCovered (which now goes through canonicalRuleId) works for
43
+ // BOTH bare-numeric (R14) AND compound (R01-01, R02-03) forms.
44
+ // E2E #13 资管 used compound IDs + wrote a sibling difficulty.json;
45
+ // the raw-string + no-dedup pre-v0.8.3 path produced rulesCovered:
46
+ // 0/30 (compound IDs unmatched + double-counted).
40
47
  this.totalRules = [];
48
+ const seen = new Set();
41
49
  const rulesDir = path.join(this._workspace.cwd, "rules");
42
50
  if (!fs.existsSync(rulesDir)) return;
43
51
  for (const f of fs.readdirSync(rulesDir).filter((f) => f.endsWith(".json"))) {
44
52
  try {
45
53
  const data = JSON.parse(fs.readFileSync(path.join(rulesDir, f), "utf-8"));
46
54
  const rules = Array.isArray(data) ? data : (data.rules || []);
47
- for (const r of rules) { if (r.id) this.totalRules.push(r.id); }
55
+ for (const r of rules) {
56
+ if (!r || !r.id) continue;
57
+ // Canonicalize to match ruleIdsCovered which is built from
58
+ // canonicalRuleId() output. If canonicalRuleId returns null
59
+ // (non-rule-shaped string), preserve the raw trimmed string.
60
+ const canon = canonicalRuleId(r.id) || String(r.id).trim();
61
+ if (seen.has(canon)) continue;
62
+ seen.add(canon);
63
+ this.totalRules.push(canon);
64
+ }
48
65
  } catch { /* skip */ }
49
66
  }
50
67
  }
@@ -59,6 +76,10 @@ export class SkillAuthoringPipeline extends Pipeline {
59
76
  this.skillsAuthored = [...m.skillsAuthored];
60
77
  this.skillsWithScripts = [...m.skillsWithScripts];
61
78
  this.ruleIdsCovered = new Set(m.ruleIdsCovered);
79
+ // v0.8 P2-F (item 22): stub-shape audit for check.py files.
80
+ this._checkPyStubRatio = m.checkPyStubRatio || 0;
81
+ this._checkPyStubFiles = m.checkPyStubFiles || [];
82
+ this._checkPyTotal = m.checkPyTotal || 0;
62
83
  }
63
84
 
64
85
  // v0.7.0 A1: ruleId extraction moved to _milestone-derive.js
@@ -228,7 +249,32 @@ export class SkillAuthoringPipeline extends Pipeline {
228
249
  this._validationFailures = v.failures;
229
250
  this._validationSkipped = v.skipped;
230
251
  if (!v.ok) return false;
231
- return this.skillsWithScripts.length >= Math.max(1, this.skillsAuthored.length * 0.5);
252
+ if (this.skillsWithScripts.length < Math.max(1, this.skillsAuthored.length * 0.5)) {
253
+ return false;
254
+ }
255
+
256
+ // v0.8 P2-F (item 22): optional enforcement of check.py substantiveness.
257
+ // SOFT-by-default — the stub ratio is always computed (visible in
258
+ // describeState / events) but only blocks phase advance if
259
+ // KC_ENFORCE_CHECK_PY_SUBSTANTIVE=1 is set. Default-off because
260
+ // the heuristic may over-fire on legitimate scaffolds; v0.8 ships
261
+ // the detection + reporting, v0.8.x revisits enforcement after audit
262
+ // data shows whether the signal is reliable.
263
+ const enforce = process.env.KC_ENFORCE_CHECK_PY_SUBSTANTIVE === "1";
264
+ if (enforce && this._checkPyTotal > 0 && this._checkPyStubRatio > 0.5) {
265
+ this._validationFailures = this._validationFailures || [];
266
+ this._validationFailures.push({
267
+ file: "<check_py_substantiveness>",
268
+ reason:
269
+ `${this._checkPyStubCount || this._checkPyStubFiles.length}/${this._checkPyTotal} check.py files are stub-shaped ` +
270
+ `(return NOT_APPLICABLE / pass:null with no workflow import + ≤20 lines). ` +
271
+ `Examples: ${this._checkPyStubFiles.slice(0, 3).join(", ")}${this._checkPyStubFiles.length > 3 ? "..." : ""}. ` +
272
+ `See skill-authoring SKILL.md anti-pattern section. ` +
273
+ `Set KC_ENFORCE_CHECK_PY_SUBSTANTIVE=0 to bypass this gate if intentional.`,
274
+ });
275
+ return false;
276
+ }
277
+ return true;
232
278
  }
233
279
 
234
280
  /**