kc-beta 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -216,28 +216,35 @@ Quality Thresholds, Language.
216
216
 
217
217
  ## Status
218
218
 
219
- **v0.6.0 — first architectural beta.** This release lands:
219
+ **v0.7.3 — codex review patch release.** Latest line in the v0.7.x
220
+ hardening track. Architectural payload from v0.6.0+ is still in place:
220
221
 
221
222
  - Parallel ralph-loop (up to 8 concurrent workers) with a heap-safety
222
223
  conformance gate
223
224
  - Native chunker + RAG (onion-peeler + CJK bigram keyword index +
224
225
  one-shot LLM bundle classifier, ported from the AMC verification app)
225
- - Source-context auto-attach on skill_authoring tasks (rule NL + evidence
226
- chunks + sibling rules injected into the prompt, no manual search needed)
226
+ - Agent-owned task board: the agent reads the rule list from
227
+ `describeState`, decides decomposition (per-rule / grouped / range),
228
+ and calls `TaskCreate` / `TaskUpdate` / `TaskComplete` to drive the
229
+ Ralph loop. Source-context auto-attach pulls rule NL + evidence chunks
230
+ + sibling rules into the prompt of each task as it runs.
227
231
  - Workspace file locking for shared coordination files (`rules/catalog.json`,
228
- `rules/manifest.json`, `tasks.json`, etc.)
232
+ `rules/manifest.json`, `refs/manifest.json`, `tasks.json`,
233
+ `session-state.json`) — every writer goes through `withFileLock`.
229
234
  - `agent_tool` gets `wait` / `poll` / `list` / `kill` operations +
230
235
  `stale_subagents` phase-advance signal
231
- - New FINALIZATION phase packages the session into a shippable deliverable
236
+ - FINALIZATION phase packages the session into a shippable deliverable
232
237
  (canonical `rule_skills/` layout + README + coverage report + final
233
238
  dashboard)
239
+ - Filesystem-derived phase milestones (v0.7.0+): the engine reads disk
240
+ artifacts for advance criteria, never trusts tool-call assertions
234
241
  - Input stays active during streaming (type-ahead queue), arrow keys +
235
242
  history recall, CTX smoothing + peak, per-provider context-limit caps,
236
243
  `/tools`, `/parallelism`, and more
237
244
 
238
- See [DEV_LOG.md](./DEV_LOG.md) for the full v0.6.0 change breakdown and
239
- [docs/update_design_v5.md](./docs/update_design_v5.md) for the plan that
240
- drove it.
245
+ See [DEV_LOG.md](./DEV_LOG.md) for the per-release change breakdowns and
246
+ [docs/update_design_v7.md](./docs/update_design_v7.md) for the v0.7.x
247
+ plan and patch notes.
241
248
 
242
249
  Bug reports and PRs welcome at <https://github.com/kitchen-engineer42/kc-cli>.
243
250
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kc-beta",
3
- "version": "0.7.1",
3
+ "version": "0.7.3",
4
4
  "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
21
21
  import { ArchiveFileTool } from "./tools/archive-file.js";
22
22
  import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
23
23
  import { ReleaseTool } from "./tools/release.js";
24
+ import { readKcVersion } from "../util/kc-version.js";
24
25
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
25
26
  import { DocumentParseTool } from "./tools/document-parse.js";
26
27
  import { DocumentSearchTool } from "./tools/document-search.js";
@@ -36,6 +37,7 @@ import { EvolutionCycleTool } from "./tools/evolution-cycle.js";
36
37
  import { TierDowngradeTool } from "./tools/tier-downgrade.js";
37
38
  import { AgentTool } from "./tools/agent-tool.js";
38
39
  import { WebSearchTool } from "./tools/web-search.js";
40
+ import { TaskCreateTool, TaskUpdateTool, TaskCompleteTool } from "./tools/task-board.js";
39
41
  import { SkillLoader } from "./skill-loader.js";
40
42
  import { TaskManager } from "./task-manager.js";
41
43
  import { Scheduler } from "./scheduler.js";
@@ -421,7 +423,7 @@ export class AgentEngine {
421
423
  new SnapshotTool(this.workspace),
422
424
  new ArchiveFileTool(this.workspace),
423
425
  new ScheduleFetchTool(this.workspace),
424
- new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
426
+ new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
425
427
  new PhaseAdvanceTool(
426
428
  // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
427
429
  // so the tool's refusal text can surface the engine telemetry
@@ -474,6 +476,16 @@ export class AgentEngine {
474
476
  () => this.currentPhase,
475
477
  ),
476
478
  new WebSearchTool(this.config.tavilyApiKey),
479
+ // v0.7.3: completes the v0.7.0 "agent owns TaskBoard" design.
480
+ // Skills already reference TaskCreate by name; these tools make
481
+ // that contract truthful. See task-board.js + work-decomposition
482
+ // SKILL.md. Skipped for subagents — they don't own a task board
483
+ // (taskManager is null in subagent scope, line 216).
484
+ ...(this.taskManager ? [
485
+ new TaskCreateTool(this.workspace, this.taskManager),
486
+ new TaskUpdateTool(this.workspace, this.taskManager),
487
+ new TaskCompleteTool(this.workspace, this.taskManager),
488
+ ] : []),
477
489
  ],
478
490
  // Distillation+ only (DISTILL mode)
479
491
  distill: [
@@ -1307,6 +1319,7 @@ export class AgentEngine {
1307
1319
  yield new AgentEvent({
1308
1320
  type: "tool_result",
1309
1321
  name: tc.name,
1322
+ input: inputData,
1310
1323
  output: historyContent,
1311
1324
  isError: result.isError,
1312
1325
  });
@@ -1679,7 +1692,24 @@ export class AgentEngine {
1679
1692
  parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
1680
1693
  break;
1681
1694
  }
1682
- // bootstrap / finalization: no specific counters, fall through
1695
+ case "bootstrap": {
1696
+ // v0.7.2 1e: previously fell through to empty string. Both
1697
+ // v0.7.1 audit runs had bootstrap → rule_extraction refusals
1698
+ // with engineCounts: "" — agent saw the refusal but had no
1699
+ // engine telemetry to react to. The InitializerPipeline tracks
1700
+ // boolean checklist flags rather than numeric counters; we
1701
+ // surface those flags as "yes/no" so the agent can see which
1702
+ // bootstrap criterion is missing.
1703
+ if (typeof pipeline.describeBootstrapChecklist === "function") {
1704
+ const cl = pipeline.describeBootstrapChecklist();
1705
+ parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
1706
+ parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
1707
+ parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
1708
+ parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
1709
+ }
1710
+ break;
1711
+ }
1712
+ // finalization: no specific counters, fall through
1683
1713
  }
1684
1714
  } catch { /* never let summary build break phase advance */ }
1685
1715
  return parts.join(", ");
@@ -57,16 +57,21 @@ function listChildFiles(p) {
57
57
  // Walk a directory recursively, yielding every file path. Skips hidden
58
58
  // dirs/files and __pycache__. Used by derive functions that need to
59
59
  // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
60
- function* walkFiles(root) {
60
+ //
61
+ // v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
62
+ // direct children; depth=1 is one level down. Default unbounded
63
+ // (existing callers).
64
+ function* walkFiles(root, { maxDepth } = {}) {
61
65
  if (!dirExists(root)) return;
62
- const stack = [root];
66
+ const stack = [{ dir: root, depth: 0 }];
63
67
  while (stack.length) {
64
- const dir = stack.pop();
68
+ const { dir, depth } = stack.pop();
65
69
  for (const e of readDirSafe(dir)) {
66
70
  if (e.name.startsWith(".") || e.name === "__pycache__") continue;
67
71
  const p = path.join(dir, e.name);
68
- if (e.isDirectory()) stack.push(p);
69
- else if (e.isFile()) yield p;
72
+ if (e.isDirectory()) {
73
+ if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
74
+ } else if (e.isFile()) yield p;
70
75
  }
71
76
  }
72
77
  }
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
271
276
  }
272
277
  }
273
278
 
274
- // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
275
- // Agents naturally write batch-test results to output/, not per-skill
276
- // paths. v0.6.x's _loadTestResults already reads here on the canonical
277
- // accuracy schema; this expands the helper-derived milestone to
278
- // recognize the same shape (plus the GLM/DS-shape variants seen in
279
- // E2E #6 v070). Without this, agents who run tests via sandbox_exec
280
- // and persist to output/ saw skillsTested=0 and force-bypassed.
281
- const collectFromJsonFile = (data) => {
282
- if (!data) return;
283
- if (data.rule_id) tested.add(data.rule_id);
284
- if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
285
- for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
279
+ // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
280
+ // under output/*.json. Agents persist batch-test results in
281
+ // conductor-specific shapes (this is the recurring drift point —
282
+ // engine derivation has to match disk reality, not the other way
283
+ // around). Shapes seen across E2E #5/6/7:
284
+ //
285
+ // - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
286
+ // entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
287
+ // skill_test_阳光资产.json with {doc, results: {R019a: ...}}
288
+ // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
289
+ // full_test_results_v[1-6].json as {sample_id: {path, meta,
290
+ // results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
291
+ // v0.7.1's shallow walk missed them)
292
+ //
293
+ // The collector recurses (depth-limited) and uses two heuristics to
294
+ // separate rule_ids from sample_ids / doc_names:
295
+ // 1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
296
+ // (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
297
+ // 2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
298
+ // keys signal that the parent dict's keys are rule_ids
299
+ const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
300
+ const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
301
+ const looksLikeVerdict = (v) =>
302
+ v && typeof v === "object" && !Array.isArray(v) && (
303
+ v.verdict !== undefined ||
304
+ v.passed !== undefined ||
305
+ v.pass !== undefined ||
306
+ typeof v.PASS === "number" ||
307
+ typeof v.FAIL === "number"
308
+ );
309
+ const collectFromJsonFile = (data, depth = 0) => {
310
+ if (!data || depth > 4) return;
311
+ if (typeof data !== "object") return;
312
+ if (Array.isArray(data)) {
313
+ for (const r of data) collectFromJsonFile(r, depth + 1);
314
+ return;
315
+ }
316
+ // {rule_id: "X"} or {id: "R001"} on a rule entry
317
+ if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
318
+ if (isRuleIdShape(data.id)) tested.add(data.id);
319
+ // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
320
+ for (const [k, v] of Object.entries(data)) {
321
+ if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
286
322
  }
287
- if (data.results && typeof data.results === "object") {
288
- for (const k of Object.keys(data.results)) tested.add(k);
323
+ // {results: {<rule_id>: ...}} keys must look rule-id-shaped
324
+ if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
325
+ for (const k of Object.keys(data.results)) {
326
+ if (isRuleIdShape(k)) tested.add(k);
327
+ }
328
+ }
329
+ // Recurse into nested objects (handles {sample_id: {results: {...}}})
330
+ for (const v of Object.values(data)) {
331
+ if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
289
332
  }
290
333
  };
291
334
 
292
335
  const outputDir = path.join(cwd, "output");
293
- if (dirExists(outputDir)) {
294
- for (const f of listChildFiles(outputDir)) {
295
- if (!f.name.endsWith(".json")) continue;
296
- collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
297
- }
298
- // One level into output/results/, output/distillation/ — the two
299
- // most common batch-result locations across E2E #5 and v070 sessions.
300
- for (const sub of ["results", "distillation", "qc"]) {
301
- const subDir = path.join(outputDir, sub);
302
- if (!dirExists(subDir)) continue;
303
- for (const f of listChildFiles(subDir)) {
304
- if (!f.name.endsWith(".json")) continue;
305
- collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
306
- }
307
- // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
308
- // — walk one more level for that pattern.
309
- for (const child of listChildDirs(subDir)) {
310
- for (const f of listChildFiles(path.join(subDir, child.name))) {
311
- if (!f.name.endsWith(".json")) continue;
312
- collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
313
- }
314
- }
315
- }
336
+ for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
337
+ if (!p.endsWith(".json")) continue;
338
+ collectFromJsonFile(readJsonSafe(p));
316
339
  }
317
340
 
318
341
  // DS v070 wrote a top-level aggregate at either rules/test_results.json
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
5
5
  import { Pipeline } from "./base.js";
6
6
  import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
7
7
  import { deriveFinalizationMilestones } from "./_milestone-derive.js";
8
+ import { readKcVersion } from "../../util/kc-version.js";
8
9
 
9
10
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
11
  // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
310
311
  }
311
312
 
312
313
  _readKcVersion() {
313
- try {
314
- const pkg = JSON.parse(fs.readFileSync(
315
- path.resolve(__dirname, "../../../package.json"), "utf-8",
316
- ));
317
- return pkg.version || "unknown";
318
- } catch { return "unknown"; }
314
+ return readKcVersion();
319
315
  }
320
316
 
321
317
  /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
240
240
  return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
241
241
  }
242
242
 
243
+ // v0.7.2 1e: surface the checklist as engine telemetry so
244
+ // `_buildEngineCountsBlock("bootstrap")` has something to report when
245
+ // bootstrap → rule_extraction is refused. Agent sees the missing
246
+ // criteria directly in the refusal text.
247
+ describeBootstrapChecklist() {
248
+ return {
249
+ workspaceCreated: !!this.workspaceCreated,
250
+ configReady: !!this.configReady,
251
+ hasRegulations: !!this.hasRegulations,
252
+ hasSamples: !!this.hasSamples,
253
+ };
254
+ }
255
+
243
256
  /**
244
257
  * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
245
258
  * phase. Bootstrap is setup — reading rules/samples, configuring keys,
@@ -94,7 +94,7 @@ export class CopyToWorkspaceTool extends BaseTool {
94
94
  this._appendGitignore(`refs/${targetName}`);
95
95
  }
96
96
 
97
- this._appendManifest({
97
+ await this._appendManifest({
98
98
  target: targetRel,
99
99
  source: sourcePath,
100
100
  size: stat.size,
@@ -113,17 +113,22 @@ export class CopyToWorkspaceTool extends BaseTool {
113
113
  );
114
114
  }
115
115
 
116
- _appendManifest(entry) {
117
- const manifestAbs = this._workspace.resolvePath(MANIFEST_REL);
118
- fs.mkdirSync(path.dirname(manifestAbs), { recursive: true });
119
- let entries = [];
120
- if (fs.existsSync(manifestAbs)) {
121
- try { entries = JSON.parse(fs.readFileSync(manifestAbs, "utf-8")); }
122
- catch { entries = []; }
123
- }
124
- if (!Array.isArray(entries)) entries = [];
125
- entries.push(entry);
126
- fs.writeFileSync(manifestAbs, JSON.stringify(entries, null, 2), "utf-8");
116
+ async _appendManifest(entry) {
117
+ // v0.7.3: refs/manifest.json is a shared coordination path — wrap the
118
+ // whole read-modify-write under the workspace lock so two parallel
119
+ // copy_to_workspace calls (main agent + subagent) don't lose entries.
120
+ return await this._workspace.withSharedLockIfApplicable(MANIFEST_REL, () => {
121
+ const manifestAbs = this._workspace.resolvePath(MANIFEST_REL);
122
+ fs.mkdirSync(path.dirname(manifestAbs), { recursive: true });
123
+ let entries = [];
124
+ if (fs.existsSync(manifestAbs)) {
125
+ try { entries = JSON.parse(fs.readFileSync(manifestAbs, "utf-8")); }
126
+ catch { entries = []; }
127
+ }
128
+ if (!Array.isArray(entries)) entries = [];
129
+ entries.push(entry);
130
+ fs.writeFileSync(manifestAbs, JSON.stringify(entries, null, 2), "utf-8");
131
+ });
127
132
  }
128
133
 
129
134
  _appendGitignore(line) {
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
178
178
  path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
179
179
  this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
180
180
  path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
181
- this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
181
+ // v0.7.2 1c: auto-aggregate from output/ if no calibration file at
182
+ // workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
183
+ // with empty `historical_accuracy: {}` despite having per-rule QC
184
+ // data on disk under output/ — the release tool just passed the
185
+ // file through and emitted a stub on miss. We try to populate from
186
+ // known QC artifact shapes here; if nothing matches, fall through
187
+ // to the existing stub fallback.
188
+ const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
189
+ if (!fs.existsSync(calibSrc)) {
190
+ const aggregated = this._aggregateAccuracyFromOutput();
191
+ if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
192
+ fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
193
+ }
194
+ }
195
+ this._copyIfExists(calibSrc,
182
196
  path.join(bundleAbs, "confidence_calibration.json"),
183
197
  { fallback: '{"historical_accuracy":{}}\n' });
184
198
 
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
233
247
  .replace(/\{RULES_LIST\}/g, rulesList);
234
248
  fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
235
249
 
250
+ // v0.7.2 1d: clean up the template scaffold dir if a customized
251
+ // release was just written alongside it. Both v0.7.1 audit runs
252
+ // shipped with `output/releases/v1/` (template-derived, .tmpl
253
+ // files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
254
+ // — the customized release. The pre-scaffold is meant as a hint;
255
+ // once the agent calls `release(label="v1-0")` and we've written
256
+ // the real bundle, the unedited scaffold is just clutter.
257
+ //
258
+ // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
259
+ // didn't just write to v1/ ourselves, AND (b) it still contains
260
+ // .tmpl files (signature of unedited template). If the agent
261
+ // intentionally edited v1/ in place (removing .tmpl), our cleanup
262
+ // leaves it alone.
263
+ if (slug !== "v1") {
264
+ const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
265
+ if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
266
+ let hasTmpl = false;
267
+ try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
268
+ if (hasTmpl) {
269
+ try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
270
+ }
271
+ }
272
+ }
273
+
236
274
  // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
237
275
  const lines = [
238
276
  `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
319
357
  return null;
320
358
  }
321
359
 
360
+ // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
361
+ // accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
362
+ //
363
+ // rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
364
+ // (GLM produced 4 versions; pick the highest)
365
+ // full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
366
+ // (GLM; accumulate verdicts per rule across samples)
367
+ // skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
368
+ //
369
+ // Returns null if no recognized artifact, or an object with
370
+ // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
371
+ // suitable for confidence_calibration.json.
372
+ _aggregateAccuracyFromOutput() {
373
+ const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
374
+ const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
375
+ const tally = new Map(); // rule_id -> {pass, fail, na, n}
376
+ const sourceFiles = [];
377
+ const bump = (rid, kind) => {
378
+ if (!isRuleId(rid)) return;
379
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
380
+ t[kind] += 1;
381
+ t.n += 1;
382
+ tally.set(rid, t);
383
+ };
384
+ const outputDir = path.join(this._workspace.cwd, "output");
385
+ if (!fs.existsSync(outputDir)) return null;
386
+
387
+ // Collect all .json files under output/ (depth limited)
388
+ const files = [];
389
+ const stack = [{ dir: outputDir, depth: 0 }];
390
+ while (stack.length) {
391
+ const { dir, depth } = stack.pop();
392
+ let entries;
393
+ try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
394
+ for (const e of entries) {
395
+ if (e.name.startsWith(".") || e.name === "__pycache__") continue;
396
+ const p = path.join(dir, e.name);
397
+ if (e.isDirectory()) {
398
+ if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
399
+ } else if (e.isFile() && e.name.endsWith(".json")) {
400
+ files.push({ path: p, name: e.name });
401
+ }
402
+ }
403
+ }
404
+
405
+ // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
406
+ const ruleStatsFiles = files
407
+ .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
408
+ .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
409
+ .sort((a, b) => b.ver - a.ver);
410
+ if (ruleStatsFiles.length > 0) {
411
+ const top = ruleStatsFiles[0];
412
+ try {
413
+ const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
414
+ for (const [rid, stats] of Object.entries(d)) {
415
+ if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
416
+ const pass = stats.PASS | 0, fail = stats.FAIL | 0;
417
+ const na = stats.NOT_APPLICABLE | stats.NA | 0;
418
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
419
+ t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
420
+ tally.set(rid, t);
421
+ }
422
+ sourceFiles.push(path.relative(this._workspace.cwd, top.path));
423
+ } catch { /* fall through to other shapes */ }
424
+ }
425
+
426
+ // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
427
+ if (tally.size === 0) {
428
+ const ftrFiles = files
429
+ .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
430
+ .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
431
+ .sort((a, b) => b.ver - a.ver);
432
+ for (const f of ftrFiles.slice(0, 1)) {
433
+ try {
434
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
435
+ for (const sample of Object.values(d)) {
436
+ if (!sample || typeof sample !== "object") continue;
437
+ const results = sample.results;
438
+ if (!results || typeof results !== "object") continue;
439
+ for (const [rid, r] of Object.entries(results)) {
440
+ if (!isRuleId(rid) || !r || typeof r !== "object") continue;
441
+ const verdict = (r.verdict || "").toString().toUpperCase();
442
+ if (verdict === "PASS") bump(rid, "pass");
443
+ else if (verdict === "FAIL") bump(rid, "fail");
444
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
445
+ }
446
+ }
447
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path));
448
+ } catch { /* try next shape */ }
449
+ }
450
+ }
451
+
452
+ if (tally.size === 0) return null;
453
+
454
+ const historical_accuracy = {};
455
+ for (const [rid, t] of tally.entries()) {
456
+ const fired = t.pass + t.fail;
457
+ historical_accuracy[rid] = {
458
+ pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
459
+ n_passed: t.pass,
460
+ n_failed: t.fail,
461
+ n_not_applicable: t.na,
462
+ n_samples: t.n,
463
+ };
464
+ }
465
+ return {
466
+ historical_accuracy,
467
+ computed_at: new Date().toISOString(),
468
+ source_files: sourceFiles,
469
+ };
470
+ }
471
+
322
472
  _readWorkerTiers() {
323
473
  const envPath = path.join(this._workspace.cwd, ".env");
324
474
  const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
@@ -44,7 +44,10 @@ export class SandboxExecTool extends BaseTool {
44
44
  "Execute a shell command. " +
45
45
  "cwd='workspace' (default) runs in KC's workspace. " +
46
46
  "cwd='project' runs in the user's project directory. " +
47
- "Pipes, redirects, and chained commands (&&) are supported."
47
+ "Pipes, redirects, and chained commands (&&) are supported. " +
48
+ "stdout + stderr combined are capped at 10,000 chars; longer output is truncated. " +
49
+ "For reading individual files larger than ~10 KB (e.g. regulation documents), " +
50
+ "prefer workspace_file (operation=read) which has a larger 50 KB cap."
48
51
  );
49
52
  }
50
53