kc-beta 0.7.5 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. package/README.md +47 -0
  2. package/package.json +3 -2
  3. package/src/agent/context.js +17 -1
  4. package/src/agent/engine.js +467 -100
  5. package/src/agent/llm-client.js +24 -1
  6. package/src/agent/pipelines/_advance-hints.js +92 -0
  7. package/src/agent/pipelines/_milestone-derive.js +325 -20
  8. package/src/agent/pipelines/skill-authoring.js +49 -3
  9. package/src/agent/tools/agent-tool.js +2 -2
  10. package/src/agent/tools/consult-skill.js +15 -0
  11. package/src/agent/tools/dashboard-render.js +48 -1
  12. package/src/agent/tools/document-parse.js +31 -2
  13. package/src/agent/tools/phase-advance.js +17 -13
  14. package/src/agent/tools/release.js +343 -7
  15. package/src/agent/tools/sandbox-exec.js +65 -8
  16. package/src/agent/tools/worker-llm-call.js +95 -15
  17. package/src/agent/workspace.js +25 -4
  18. package/src/cli/components.js +4 -1
  19. package/src/cli/index.js +125 -8
  20. package/src/config.js +19 -2
  21. package/src/marathon/driver.js +217 -0
  22. package/src/marathon/prompts.js +93 -0
  23. package/template/.env.template +17 -1
  24. package/template/AGENT.md +2 -2
  25. package/template/skills/en/auto-model-selection/SKILL.md +55 -35
  26. package/template/skills/en/bootstrap-workspace/SKILL.md +27 -0
  27. package/template/skills/en/compliance-judgment/SKILL.md +14 -0
  28. package/template/skills/en/confidence-system/SKILL.md +30 -8
  29. package/template/skills/en/corner-case-management/SKILL.md +53 -33
  30. package/template/skills/en/cross-document-verification/SKILL.md +88 -83
  31. package/template/skills/en/dashboard-reporting/SKILL.md +91 -66
  32. package/template/skills/en/dashboard-reporting/scripts/generate_dashboard.py +1 -1
  33. package/template/skills/en/data-sensibility/SKILL.md +19 -12
  34. package/template/skills/en/document-chunking/SKILL.md +99 -15
  35. package/template/skills/en/entity-extraction/SKILL.md +14 -4
  36. package/template/skills/en/quality-control/SKILL.md +23 -0
  37. package/template/skills/en/rule-extraction/SKILL.md +92 -94
  38. package/template/skills/en/rule-extraction/references/chunking-strategies.md +7 -78
  39. package/template/skills/en/skill-authoring/SKILL.md +85 -2
  40. package/template/skills/en/skill-creator/SKILL.md +25 -3
  41. package/template/skills/en/skill-to-workflow/SKILL.md +73 -1
  42. package/template/skills/en/task-decomposition/SKILL.md +1 -1
  43. package/template/skills/en/tree-processing/SKILL.md +1 -1
  44. package/template/skills/en/version-control/SKILL.md +15 -0
  45. package/template/skills/en/work-decomposition/SKILL.md +52 -32
  46. package/template/skills/phase_skills.yaml +5 -0
  47. package/template/skills/zh/auto-model-selection/SKILL.md +54 -33
  48. package/template/skills/zh/bootstrap-workspace/SKILL.md +27 -0
  49. package/template/skills/zh/compliance-judgment/SKILL.md +51 -37
  50. package/template/skills/zh/compliance-judgment/references/output-format.md +62 -62
  51. package/template/skills/zh/confidence-system/SKILL.md +34 -9
  52. package/template/skills/zh/corner-case-management/SKILL.md +71 -104
  53. package/template/skills/zh/cross-document-verification/SKILL.md +90 -195
  54. package/template/skills/zh/cross-document-verification/references/contradiction-taxonomy.md +36 -36
  55. package/template/skills/zh/dashboard-reporting/SKILL.md +82 -232
  56. package/template/skills/zh/dashboard-reporting/scripts/generate_dashboard.py +1 -1
  57. package/template/skills/zh/data-sensibility/SKILL.md +13 -0
  58. package/template/skills/zh/document-chunking/SKILL.md +101 -18
  59. package/template/skills/zh/document-parsing/SKILL.md +65 -65
  60. package/template/skills/zh/document-parsing/references/parser-catalog.md +26 -26
  61. package/template/skills/zh/entity-extraction/SKILL.md +78 -68
  62. package/template/skills/zh/evolution-loop/references/convergence-guide.md +38 -38
  63. package/template/skills/zh/quality-control/SKILL.md +23 -0
  64. package/template/skills/zh/quality-control/references/qa-layers.md +65 -65
  65. package/template/skills/zh/quality-control/references/sampling-strategies.md +49 -49
  66. package/template/skills/zh/rule-extraction/SKILL.md +199 -188
  67. package/template/skills/zh/rule-extraction/references/chunking-strategies.md +5 -78
  68. package/template/skills/zh/skill-authoring/SKILL.md +136 -58
  69. package/template/skills/zh/skill-authoring/references/skill-format-spec.md +39 -39
  70. package/template/skills/zh/skill-creator/SKILL.md +215 -201
  71. package/template/skills/zh/skill-creator/references/schemas.md +60 -60
  72. package/template/skills/zh/skill-to-workflow/SKILL.md +73 -1
  73. package/template/skills/zh/skill-to-workflow/references/worker-llm-catalog.md +24 -24
  74. package/template/skills/zh/task-decomposition/SKILL.md +1 -1
  75. package/template/skills/zh/task-decomposition/references/decision-matrix.md +54 -54
  76. package/template/skills/zh/tree-processing/SKILL.md +67 -63
  77. package/template/skills/zh/version-control/SKILL.md +15 -0
  78. package/template/skills/zh/version-control/references/trace-id-spec.md +34 -34
  79. package/template/skills/zh/work-decomposition/SKILL.md +52 -30
  80. package/template/workflows/common/llm_client.py +168 -0
  81. package/template/workflows/common/utils.py +132 -0
@@ -382,8 +382,8 @@ export class AgentTool extends BaseTool {
382
382
  * B8: List currently-running sub-agents. Called by engine's phase-advance
383
383
  * path to emit a `stale_subagents` pipeline event — the main agent's next
384
384
  * turn sees the list and decides whether to kill each. Soft signal, not
385
- * an automated kill, because phase_advance can fire from _maybeAutoAdvance
386
- * unexpectedly and coupling the lifecycle would amplify blast radius.
385
+ * an automated kill: coupling the subagent lifecycle to phase advance
386
+ * would amplify blast radius if a transition happened unexpectedly.
387
387
  */
388
388
  getRunningTaskIds() {
389
389
  return Array.from(this._runningTasks.keys());
@@ -63,6 +63,21 @@ export class ConsultSkillTool extends BaseTool {
63
63
  const name = (input?.name || "").trim();
64
64
  if (!name) return new ToolResult("name required (e.g. consult_skill({name: 'work-decomposition'}))", true);
65
65
 
66
+ // v0.8 P0-A: defensive null-check. v0.7.5 shipped with an init-order bug
67
+ // where ConsultSkillTool received undefined skillLoader and threw
68
+ // "Cannot read properties of undefined (reading 'getPhaseSkillSet')"
69
+ // on every invocation (资管 audit § 9.1, 5/5 failure rate). The init-order
70
+ // fix is in engine.js:238; this guard prevents an uncaught exception if
71
+ // the bug recurs from any future constructor reorder.
72
+ if (!this._skillLoader || typeof this._skillLoader.getPhaseSkillSet !== "function") {
73
+ return new ToolResult(
74
+ "consult_skill is misconfigured: skillLoader unavailable. This is an engine-side bug — " +
75
+ "surface to the developer user. The agent should fall back to reading skill bodies " +
76
+ "directly from <workspace>/skills/<name>/SKILL.md or the system prompt's always-loaded section.",
77
+ true,
78
+ );
79
+ }
80
+
66
81
  const phase = this._getCurrentPhase ? this._getCurrentPhase() : null;
67
82
  const { alwaysLoaded, available } = this._skillLoader.getPhaseSkillSet(phase);
68
83
 
@@ -81,11 +81,57 @@ export class DashboardRenderTool extends BaseTool {
81
81
  metrics.evolution_iterations = fs.readdirSync(evoDir).filter((f) => f.endsWith(".json")).length;
82
82
  }
83
83
 
84
+ // v0.8 P1-G: QC counter now reads from multiple known agent-write
85
+ // locations + counts per-doc reviews. Pre-v0.8 read only output/qc/*.json
86
+ // top-level; 资管 v0.7.5 wrote output/results/production_qc_results.json
87
+ // so the dashboard showed `QC Batches: 0` despite 126 pairs of data.
88
+ let qcBatches = 0;
89
+ let qcDocsReviewed = 0;
90
+
91
+ // (a) Top-level batch files in output/qc/ (贷款 v0.7.5 shape)
84
92
  const qcDir = path.join(ws, "output", "qc");
85
93
  if (fs.existsSync(qcDir)) {
86
- metrics.qc_batches = fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).length;
94
+ for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json"))) {
95
+ qcBatches++;
96
+ try {
97
+ const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
98
+ const n = Number(data?.documents_reviewed);
99
+ if (Number.isFinite(n) && n > qcDocsReviewed) qcDocsReviewed = n;
100
+ } catch { /* skip malformed */ }
101
+ }
87
102
  }
88
103
 
104
+ // (b) Per-doc reviews at output/qc/reviews/ (贷款 detail shape)
105
+ const reviewsDir = path.join(ws, "output", "qc", "reviews");
106
+ if (fs.existsSync(reviewsDir)) {
107
+ const reviewFiles = fs.readdirSync(reviewsDir).filter((f) => f.endsWith(".json"));
108
+ qcDocsReviewed = Math.max(qcDocsReviewed, reviewFiles.length);
109
+ }
110
+
111
+ // (c) production_qc_results.json shape (资管 v0.7.5)
112
+ const productionQc = path.join(ws, "output", "results", "production_qc_results.json");
113
+ if (fs.existsSync(productionQc)) {
114
+ qcBatches++;
115
+ try {
116
+ const data = JSON.parse(fs.readFileSync(productionQc, "utf-8"));
117
+ const totalDocs = Number(data?.total_docs);
118
+ if (Number.isFinite(totalDocs)) qcDocsReviewed = Math.max(qcDocsReviewed, totalDocs);
119
+ // Otherwise, dedup doc keys from nested results
120
+ if (!Number.isFinite(totalDocs) && data?.results && typeof data.results === "object") {
121
+ const docSet = new Set();
122
+ for (const docs of Object.values(data.results)) {
123
+ if (docs && typeof docs === "object") {
124
+ for (const k of Object.keys(docs)) docSet.add(k);
125
+ }
126
+ }
127
+ if (docSet.size > 0) qcDocsReviewed = Math.max(qcDocsReviewed, docSet.size);
128
+ }
129
+ } catch { /* skip */ }
130
+ }
131
+
132
+ metrics.qc_batches = qcBatches;
133
+ metrics.qc_docs_reviewed = qcDocsReviewed;
134
+
89
135
  return metrics;
90
136
  }
91
137
 
@@ -126,6 +172,7 @@ th { color: #737373; font-size: 0.85em; }
126
172
  <div class="metric"><span class="value">${total}</span><br><span class="label">Results</span></div>
127
173
  <div class="metric"><span class="value">${metrics.evolution_iterations}</span><br><span class="label">Evolution Cycles</span></div>
128
174
  <div class="metric"><span class="value">${metrics.qc_batches}</span><br><span class="label">QC Batches</span></div>
175
+ <div class="metric"><span class="value">${metrics.qc_docs_reviewed || 0}</span><br><span class="label">Docs Reviewed</span></div>
129
176
  </div>
130
177
  <h2>Confidence Distribution</h2>
131
178
  <div class="card">
@@ -12,14 +12,43 @@ const MIN_CHARS_PER_PAGE = 50;
12
12
  * Level 3: OCR models via SiliconFlow — fallback via vision models
13
13
  */
14
14
  export class DocumentParseTool extends BaseTool {
15
- constructor(workspace, { mineruApiUrl, mineruApiKey, llmApiKey, llmBaseUrl, ocrModel } = {}) {
15
+ /**
16
+ * @param {object} workspace
17
+ * @param {object} opts
18
+ * @param {string} [opts.mineruApiUrl]
19
+ * @param {string} [opts.mineruApiKey]
20
+ * @param {string} [opts.llmApiKey]
21
+ * @param {string} [opts.llmBaseUrl]
22
+ * @param {string} [opts.ocrModel] — static fallback (legacy)
23
+ * @param {() => string} [opts.getOcrModel] — v0.8.1 P9-B: live-read
24
+ * callback. If provided, takes precedence over `ocrModel`. The
25
+ * constructor used to capture vlmTier1 once at engine startup, but
26
+ * workspace_env_overlay (P1-B) fires AFTER tool construction in
27
+ * some flows (e.g. agent edits .env mid-run, OR overlay applies on
28
+ * a subagent's engine but parent already cached the gc default).
29
+ * E2E #11 资管 v0.8 audit found document_parse errors quoting
30
+ * Qwen3-VL-235B-A22B-Instruct (gc default) even though .env set
31
+ * OCR_MODEL_TIER1=zai-org/GLM-4.6V — the overlay applied 5 min
32
+ * after first failed call. Live-read fixes the race.
33
+ */
34
+ constructor(workspace, { mineruApiUrl, mineruApiKey, llmApiKey, llmBaseUrl, ocrModel, getOcrModel } = {}) {
16
35
  super();
17
36
  this._workspace = workspace;
18
37
  this._mineruApiUrl = mineruApiUrl || "";
19
38
  this._mineruApiKey = mineruApiKey || "";
20
39
  this._vlmApiKey = llmApiKey || "";
21
40
  this._vlmBaseUrl = (llmBaseUrl || "").replace(/\/+$/, "");
22
- this._ocrModel = ocrModel || "";
41
+ this._ocrModelStatic = ocrModel || "";
42
+ this._getOcrModel = typeof getOcrModel === "function" ? getOcrModel : null;
43
+ }
44
+
45
+ /** Read ocrModel live (P9-B) or fall back to the static value captured at construction. */
46
+ get _ocrModel() {
47
+ if (this._getOcrModel) {
48
+ try { return this._getOcrModel() || this._ocrModelStatic; }
49
+ catch { return this._ocrModelStatic; }
50
+ }
51
+ return this._ocrModelStatic;
23
52
  }
24
53
 
25
54
  get name() { return "document_parse"; }
@@ -1,5 +1,6 @@
1
1
  import { BaseTool, ToolResult } from "./base.js";
2
2
  import { Phase } from "../pipelines/index.js";
3
+ import { getPrescriptiveHint } from "../pipelines/_advance-hints.js";
3
4
 
4
5
  const VALID_PHASES = new Set(Object.values(Phase));
5
6
 
@@ -72,12 +73,12 @@ export class PhaseAdvanceTool extends BaseTool {
72
73
 
73
74
  const beforePhase = this._getCurrentPhase();
74
75
  // H1: short-circuit the "already in target" case with an informational
75
- // message — the agent was trying to advance correctly, engine just
76
- // auto-advanced ahead of it (common when _maybeAutoAdvance fires on a
77
- // criteria flip). Treat as success, not refusal.
76
+ // message — agent was trying to advance correctly, engine was already
77
+ // there (from a prior pipeline_event-driven advance or an earlier
78
+ // explicit call). Treat as success, not refusal.
78
79
  if (beforePhase && beforePhase === to) {
79
80
  return new ToolResult(
80
- `Already in phase ${to} (engine auto-advanced earlier via criteria flip or prior explicit call). Proceed with phase-appropriate work.`,
81
+ `Already in phase ${to} (engine was already there from a prior advance). Proceed with phase-appropriate work.`,
81
82
  );
82
83
  }
83
84
 
@@ -126,18 +127,21 @@ export class PhaseAdvanceTool extends BaseTool {
126
127
  // exactly which milestones the gate is reading and can satisfy them.
127
128
  // E2E #6 v070 showed the generic "check /status" hint wasn't concrete
128
129
  // enough — agents forced through. Naming the gap inline reduces that.
129
- const engineCountsLine = advanceResult?.engineCounts
130
- ? `\nEngine telemetry: ${advanceResult.engineCounts}`
131
- : "";
130
+ // v0.8 P0-E: prescriptive refusal hint — name the artifacts the agent
131
+ // needs to produce, derived from the same paths _milestone-derive.js
132
+ // walks. Replaces the v0.7.x descriptive "check /status" message that
133
+ // 资管 + 贷款 v0.7.5 audits showed agents force-bypassing.
134
+ const prescriptive = getPrescriptiveHint(
135
+ beforePhase,
136
+ advanceResult?.engineCounts,
137
+ advanceResult?.engineCounts || "",
138
+ );
132
139
 
133
140
  return new ToolResult(
134
141
  `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
135
- `Likely cause: source-phase exit criteria not met.${engineCountsLine}\n\n` +
136
- `Run /status (or read the phase describeState block in this turn's system reminder) ` +
137
- `to see which milestones are missing, then produce the disk artifacts that satisfy them ` +
138
- `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
139
- `workflows/<id>/*.py, output/results/*.json, etc.). ` +
140
- `If the transition is non-adjacent or this phase truly is done despite the gate, ` +
142
+ `Likely cause: source-phase exit criteria not met.\n\n` +
143
+ prescriptive +
144
+ `\n\nIf the transition is non-adjacent or this phase truly is done despite the gate, ` +
141
145
  `re-call with the documented schema flag. The engine logged the precise reason in ` +
142
146
  `events.jsonl as 'phase_advance_refused'.`,
143
147
  false,
@@ -85,13 +85,19 @@ export class ReleaseTool extends BaseTool {
85
85
  return new ToolResult(`release template missing at ${TEMPLATE_DIR}`, true);
86
86
  }
87
87
 
88
- // 1. Snapshot first locks in commit + tag, regardless of whether bundle build succeeds
89
- const snapResult = await this._snapshot.execute({
90
- label: `release-${slug}`,
91
- notes: `Release ${label} bundle source`,
92
- });
93
- if (snapResult.isError) return new ToolResult(`snapshot failed: ${snapResult.content}`, true);
94
- const { tag: snapshotTag, commit: snapshotCommit } = this._readSnapshotMeta(`release-${slug}`);
88
+ // v0.8.1 P9-C: defer the snapshot (git tag) until AFTER the bundle
89
+ // is written + verified. v0.8.0 ordered snapshot-first to "lock in
90
+ // commit + tag regardless of bundle outcome," but E2E #11 资管 v0.8
91
+ // audit found `release-v1` tags with no corresponding bundle dir —
92
+ // tag without bundle confuses downstream consumers. New order:
93
+ // 1. Build bundle (catalog read, copy template, write fixtures, manifest, README)
94
+ // 2. Verify bundle (manifest.json + README.md exist + non-empty)
95
+ // 3. ONLY THEN snapshot (creates the git tag) + back-fill manifest
96
+ // with snapshot tag/commit
97
+ // If verification fails, a `.failed_release` marker is written into
98
+ // the bundle dir and NO tag is created.
99
+ let snapshotTag = null;
100
+ let snapshotCommit = null;
95
101
 
96
102
  // 2. Read catalog and filter
97
103
  const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
@@ -294,6 +300,77 @@ export class ReleaseTool extends BaseTool {
294
300
  }
295
301
  }
296
302
 
303
+ // v0.8.1 P9-C: bundle verification + transactional snapshot.
304
+ // The manifest + README were written above. Verify they exist with
305
+ // substance (≥200 bytes README, valid JSON manifest with `slug` field).
306
+ // If verification fails, write `.failed_release` marker and skip
307
+ // the git-tag step — no tag-without-bundle.
308
+ const manifestPath = path.join(bundleAbs, "manifest.json");
309
+ const readmePath = path.join(bundleAbs, "README.md");
310
+ let verifyError = null;
311
+ try {
312
+ const mStat = fs.statSync(manifestPath);
313
+ const rStat = fs.statSync(readmePath);
314
+ if (!mStat.isFile() || mStat.size < 50) verifyError = "manifest.json missing or too small";
315
+ else if (!rStat.isFile() || rStat.size < 200) verifyError = "README.md missing or too small";
316
+ else {
317
+ const m = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
318
+ if (m.slug !== slug) verifyError = `manifest.slug=${m.slug} doesn't match expected ${slug}`;
319
+ }
320
+ } catch (e) {
321
+ verifyError = `bundle verification threw: ${e.message}`;
322
+ }
323
+
324
+ if (verifyError) {
325
+ try {
326
+ fs.writeFileSync(
327
+ path.join(bundleAbs, ".failed_release"),
328
+ JSON.stringify({
329
+ failed_at: new Date().toISOString(),
330
+ reason: verifyError,
331
+ label,
332
+ slug,
333
+ }, null, 2),
334
+ );
335
+ } catch { /* best-effort */ }
336
+ return new ToolResult(
337
+ `Release bundle verification failed (${verifyError}). NO git tag created. ` +
338
+ `See .failed_release marker in ${bundleRel}/ for details. Fix the bundle issue and re-run.`,
339
+ true,
340
+ );
341
+ }
342
+
343
+ // Bundle verified. NOW snapshot — creates the durable git tag.
344
+ const snapResult = await this._snapshot.execute({
345
+ label: `release-${slug}`,
346
+ notes: `Release ${label} bundle source`,
347
+ });
348
+ if (snapResult.isError) {
349
+ // Bundle exists but tagging failed. Surface but don't roll back —
350
+ // the bundle is still usable; the user can manually tag later.
351
+ return new ToolResult(
352
+ `Release '${label}' bundled at ${bundleRel} but snapshot tag FAILED: ${snapResult.content}. ` +
353
+ `Bundle is valid; create the snapshot tag manually if needed.`,
354
+ );
355
+ }
356
+ const meta = this._readSnapshotMeta(`release-${slug}`);
357
+ snapshotTag = meta.tag;
358
+ snapshotCommit = meta.commit;
359
+
360
+ // Back-fill the manifest with the now-known snapshot tag/commit.
361
+ try {
362
+ const m = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
363
+ m.snapshot_tag = snapshotTag;
364
+ m.snapshot_commit = snapshotCommit;
365
+ fs.writeFileSync(manifestPath, JSON.stringify(m, null, 2) + "\n");
366
+ // Also back-fill the README's snapshot placeholders if still placeholder.
367
+ const readme = fs.readFileSync(readmePath, "utf-8");
368
+ const updated = readme
369
+ .replace(/\(no tag — git unavailable\)/g, snapshotTag || "")
370
+ .replace(/\(unknown\)/g, snapshotCommit || "(unknown)");
371
+ if (updated !== readme) fs.writeFileSync(readmePath, updated);
372
+ } catch { /* best-effort back-fill */ }
373
+
297
374
  // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
298
375
  const lines = [
299
376
  `Release '${label}' bundled at ${bundleRel}`,
@@ -576,10 +653,268 @@ export class ReleaseTool extends BaseTool {
576
653
  }
577
654
  }
578
655
 
656
+ // 3) v0.8 P0-C: production_qc_results.json + qc_results_v*.json shapes
657
+ // (资管 + 贷款 v0.7.5 audits both shipped empty historical_accuracy
658
+ // because the v0.7.2 aggregator only recognized rule_stats / full_test_results).
659
+ if (tally.size === 0) {
660
+ const qcFiles = files
661
+ .filter((f) =>
662
+ /^production_qc(?:_results)?(?:_v\d+)?\.json$/i.test(f.name) ||
663
+ /^qc_results(?:_v\d+)?\.json$/i.test(f.name)
664
+ )
665
+ .sort((a, b) => a.name.localeCompare(b.name));
666
+ for (const f of qcFiles.slice(0, 5)) {
667
+ try {
668
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
669
+ const results = d.results;
670
+ if (!results) continue;
671
+
672
+ // Shape 3a (资管): nested rule-keyed map
673
+ // {results: {<rid>: {<doc_id>: {verdict, ...}}}}
674
+ if (typeof results === "object" && !Array.isArray(results)) {
675
+ for (const [rid, docs] of Object.entries(results)) {
676
+ if (!isRuleId(rid) || !docs || typeof docs !== "object") continue;
677
+ for (const r of Object.values(docs)) {
678
+ if (!r || typeof r !== "object") continue;
679
+ const verdict = (r.verdict || "").toString().toUpperCase();
680
+ if (verdict === "PASS") bump(rid, "pass");
681
+ else if (verdict === "FAIL") bump(rid, "fail");
682
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA" || verdict === "WARNING") bump(rid, "na");
683
+ }
684
+ }
685
+ if (tally.size > 0) sourceFiles.push(path.relative(this._workspace.cwd, f.path));
686
+ }
687
+ // Shape 3b (贷款): per-doc rollup list with failed_rules
688
+ // {results: [{filename, actual, correct, failed_rules: [...]}], total_tested: N}
689
+ // For each rule: failures counted from failed_rules union; passes
690
+ // inferred as (total_tested - failures) for rules that appear in the catalog.
691
+ else if (Array.isArray(results)) {
692
+ const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
693
+ let catalogRules = [];
694
+ try {
695
+ const cat = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
696
+ const list = Array.isArray(cat) ? cat : Array.isArray(cat?.rules) ? cat.rules : [];
697
+ catalogRules = list.map((r) => r?.id || r?.rule_id).filter((x) => isRuleId(x));
698
+ } catch { /* catalog optional */ }
699
+
700
+ const failCountByRule = new Map();
701
+ let docCount = 0;
702
+ for (const row of results) {
703
+ if (!row || typeof row !== "object") continue;
704
+ docCount += 1;
705
+ const failed = Array.isArray(row.failed_rules) ? row.failed_rules : [];
706
+ for (const rid of failed) {
707
+ if (!isRuleId(rid)) continue;
708
+ failCountByRule.set(rid, (failCountByRule.get(rid) || 0) + 1);
709
+ }
710
+ }
711
+ if (docCount > 0) {
712
+ const ruleSet = new Set([...catalogRules, ...failCountByRule.keys()]);
713
+ for (const rid of ruleSet) {
714
+ const fails = failCountByRule.get(rid) || 0;
715
+ const passes = Math.max(0, docCount - fails);
716
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
717
+ t.pass += passes; t.fail += fails; t.n += docCount;
718
+ tally.set(rid, t);
719
+ }
720
+ if (tally.size > 0) sourceFiles.push(path.relative(this._workspace.cwd, f.path));
721
+ }
722
+ }
723
+ } catch { /* try next file */ }
724
+ if (tally.size > 0) break;
725
+ }
726
+ }
727
+
728
+ // 4) v0.8.1 P9-A: top-level fail_by_rule + pass_by_rule maps (贷款
729
+ // v0.8 production_qc_report.json shape). Direct per-rule counts —
730
+ // no per-doc rollup, no verdict literals to scan.
731
+ // {accuracy, total_checks, fail_by_rule: {<rid>: N}, pass_by_rule: {<rid>: N}}
732
+ if (tally.size === 0) {
733
+ for (const f of files) {
734
+ if (!/qc|prod|report|result/i.test(f.name)) continue;
735
+ try {
736
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
737
+ const failMap = d?.fail_by_rule;
738
+ const passMap = d?.pass_by_rule;
739
+ if (
740
+ failMap && typeof failMap === "object" && !Array.isArray(failMap) &&
741
+ passMap && typeof passMap === "object" && !Array.isArray(passMap)
742
+ ) {
743
+ const allRules = new Set([...Object.keys(failMap), ...Object.keys(passMap)]);
744
+ let matched = false;
745
+ for (const rid of allRules) {
746
+ if (!isRuleId(rid)) continue;
747
+ const fails = Number(failMap[rid]) || 0;
748
+ const passes = Number(passMap[rid]) || 0;
749
+ if (fails + passes === 0) continue;
750
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
751
+ t.pass += passes;
752
+ t.fail += fails;
753
+ t.n += passes + fails;
754
+ tally.set(rid, t);
755
+ matched = true;
756
+ }
757
+ if (matched) {
758
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path));
759
+ break;
760
+ }
761
+ }
762
+ } catch { /* skip non-JSON */ }
763
+ }
764
+ }
765
+
766
+ // 5) v0.8.2 P13-A: doc-keyed → rules-keyed nested shape.
767
+ // 贷款 v0.8.1 wrote skill_test_v*_results.json + v2_hybrid_results.json
768
+ // + run_all_checks.json all with this shape:
769
+ // {
770
+ // "<doc_filename>": {
771
+ // "channel": "...", "expected": "PASS"|"FAIL",
772
+ // "rules": {
773
+ // "R01": {"rule_id": "R01", "verdict": "PASS", "confidence": 0.95, "method": "regex"},
774
+ // "R02": {...}
775
+ // }
776
+ // },
777
+ // ...
778
+ // }
779
+ // The optional outer "results" wrapper from v2_full_regression.json
780
+ // (which nests this further) is unwrapped via d.results || d.
781
+ if (tally.size === 0) {
782
+ for (const f of files) {
783
+ if (!/qc|verdict|result|test/i.test(f.name)) continue;
784
+ try {
785
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
786
+ const root = d?.results || d;
787
+ if (!root || typeof root !== "object" || Array.isArray(root)) continue;
788
+ let matched = false;
789
+ for (const docKey of Object.keys(root)) {
790
+ const docEntry = root[docKey];
791
+ if (!docEntry || typeof docEntry !== "object") continue;
792
+ const rulesMap = docEntry.rules;
793
+ if (!rulesMap || typeof rulesMap !== "object" || Array.isArray(rulesMap)) continue;
794
+ for (const rid of Object.keys(rulesMap)) {
795
+ if (!isRuleId(rid)) continue;
796
+ const r = rulesMap[rid];
797
+ if (!r || typeof r !== "object") continue;
798
+ const verdict = (r.verdict || r.result_type || r.status || "").toString().toUpperCase();
799
+ if (verdict === "PASS") { bump(rid, "pass"); matched = true; }
800
+ else if (verdict === "FAIL") { bump(rid, "fail"); matched = true; }
801
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA") { bump(rid, "na"); matched = true; }
802
+ }
803
+ }
804
+ if (matched) {
805
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path));
806
+ break;
807
+ }
808
+ } catch { /* skip non-JSON */ }
809
+ }
810
+ }
811
+
812
+ // 6) v0.8.3 P22-B6: top-level array of {doc_id, results: [{rule_id, status}]}.
813
+ // 资管 v0.8.2 wrote `output/skill_test_v*.json` + `workflow_v*_results.json`
814
+ // + `evolution_round*.json` all with this shape:
815
+ // [
816
+ // {
817
+ // "doc_id": "<doc-filename>",
818
+ // "results": [
819
+ // {"rule_id": "R01-01", "status": "WARNING", "found_fields": {...}},
820
+ // {"rule_id": "R01-02", "status": "PASS", ...},
821
+ // ...
822
+ // ]
823
+ // },
824
+ // ...
825
+ // ]
826
+ // Distinct from Shape 5: top-level is an ARRAY (not object), and the
827
+ // per-rule data lives in `results: [...]` (an array of rule outcomes)
828
+ // rather than `rules: {<rule>: ...}` (object keyed by rule).
829
+ if (tally.size === 0) {
830
+ for (const f of files) {
831
+ if (!/qc|verdict|result|test|evolution|workflow/i.test(f.name)) continue;
832
+ try {
833
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
834
+ if (!Array.isArray(d)) continue;
835
+ let matched = false;
836
+ for (const docEntry of d) {
837
+ if (!docEntry || typeof docEntry !== "object") continue;
838
+ const results = docEntry.results;
839
+ if (!Array.isArray(results)) continue;
840
+ for (const r of results) {
841
+ if (!r || typeof r !== "object") continue;
842
+ const rid = r.rule_id || r.ruleId || r.id;
843
+ if (!isRuleId(rid)) continue;
844
+ const verdict = (r.status || r.verdict || r.result_type || "").toString().toUpperCase();
845
+ if (verdict === "PASS") { bump(rid, "pass"); matched = true; }
846
+ else if (verdict === "FAIL") { bump(rid, "fail"); matched = true; }
847
+ else if (verdict === "WARNING") { bump(rid, "pass"); matched = true; } // WARNING counts as pass (per existing shape conventions)
848
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA") { bump(rid, "na"); matched = true; }
849
+ }
850
+ }
851
+ if (matched) {
852
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path));
853
+ break;
854
+ }
855
+ } catch { /* skip non-JSON */ }
856
+ }
857
+ }
858
+
859
+ // 7) Fallback (belt-and-suspenders per v0.8 plan Risk #7):
860
+ // walk any output/*.json with a top-level rule_id-keyed shape that has
861
+ // verdict-like leaf objects. Catches future schema drift before the
862
+ // next audit cycle.
863
+ if (tally.size === 0) {
864
+ for (const f of files) {
865
+ if (!/qc|verdict|result/i.test(f.name)) continue;
866
+ try {
867
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
868
+ const root = d?.results || d;
869
+ if (!root || typeof root !== "object" || Array.isArray(root)) continue;
870
+ let matched = false;
871
+ for (const [rid, val] of Object.entries(root)) {
872
+ if (!isRuleId(rid) || !val || typeof val !== "object") continue;
873
+ // val might be {verdict, ...} OR {<doc>: {verdict, ...}}
874
+ const probe = val.verdict ? [val] : Object.values(val);
875
+ for (const r of probe) {
876
+ if (!r || typeof r !== "object") continue;
877
+ const verdict = (r.verdict || "").toString().toUpperCase();
878
+ if (verdict === "PASS") { bump(rid, "pass"); matched = true; }
879
+ else if (verdict === "FAIL") { bump(rid, "fail"); matched = true; }
880
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA") { bump(rid, "na"); matched = true; }
881
+ }
882
+ }
883
+ if (matched) {
884
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path) + " (fallback shape)");
885
+ break;
886
+ }
887
+ } catch { /* skip non-JSON */ }
888
+ }
889
+ }
890
+
579
891
  if (tally.size === 0) return null;
580
892
 
893
+ // v0.8.1 P9-D: filter tally to rule_ids in the current catalog.
894
+ // E2E #11 资管 v0.8 audit: confidence_calibration aggregated from
895
+ // an abandoned 39-rule pipeline included only 2 of 4 final samples.
896
+ // Filtering to catalog.json keeps the calibration scoped to the
897
+ // rules that actually ship in the release.
898
+ let catalogRuleIds = null;
899
+ try {
900
+ const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
901
+ if (fs.existsSync(catalogPath)) {
902
+ const cat = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
903
+ const list = Array.isArray(cat) ? cat : Array.isArray(cat?.rules) ? cat.rules : [];
904
+ catalogRuleIds = new Set(
905
+ list.map((r) => r?.id || r?.rule_id).filter((x) => isRuleId(x))
906
+ );
907
+ if (catalogRuleIds.size === 0) catalogRuleIds = null;
908
+ }
909
+ } catch { /* skip filter if catalog missing/malformed */ }
910
+
581
911
  const historical_accuracy = {};
912
+ const droppedRules = [];
582
913
  for (const [rid, t] of tally.entries()) {
914
+ if (catalogRuleIds && !catalogRuleIds.has(rid)) {
915
+ droppedRules.push(rid);
916
+ continue;
917
+ }
583
918
  const fired = t.pass + t.fail;
584
919
  historical_accuracy[rid] = {
585
920
  pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
@@ -593,6 +928,7 @@ export class ReleaseTool extends BaseTool {
593
928
  historical_accuracy,
594
929
  computed_at: new Date().toISOString(),
595
930
  source_files: sourceFiles,
931
+ ...(droppedRules.length > 0 ? { dropped_off_catalog: droppedRules } : {}),
596
932
  };
597
933
  }
598
934