kc-beta 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -216,28 +216,35 @@ Quality Thresholds, Language.
216
216
 
217
217
  ## Status
218
218
 
219
- **v0.6.0 — first architectural beta.** This release lands:
219
+ **v0.7.3 — codex review patch release.** Latest line in the v0.7.x
220
+ hardening track. Architectural payload from v0.6.0+ is still in place:
220
221
 
221
222
  - Parallel ralph-loop (up to 8 concurrent workers) with a heap-safety
222
223
  conformance gate
223
224
  - Native chunker + RAG (onion-peeler + CJK bigram keyword index +
224
225
  one-shot LLM bundle classifier, ported from the AMC verification app)
225
- - Source-context auto-attach on skill_authoring tasks (rule NL + evidence
226
- chunks + sibling rules injected into the prompt, no manual search needed)
226
+ - Agent-owned task board: the agent reads the rule list from
227
+ `describeState`, decides decomposition (per-rule / grouped / range),
228
+ and calls `TaskCreate` / `TaskUpdate` / `TaskComplete` to drive the
229
+ Ralph loop. Source-context auto-attach pulls rule NL + evidence chunks
230
+ + sibling rules into the prompt of each task as it runs.
227
231
  - Workspace file locking for shared coordination files (`rules/catalog.json`,
228
- `rules/manifest.json`, `tasks.json`, etc.)
232
+ `rules/manifest.json`, `refs/manifest.json`, `tasks.json`,
233
+ `session-state.json`) — every writer goes through `withFileLock`.
229
234
  - `agent_tool` gets `wait` / `poll` / `list` / `kill` operations +
230
235
  `stale_subagents` phase-advance signal
231
- - New FINALIZATION phase packages the session into a shippable deliverable
236
+ - FINALIZATION phase packages the session into a shippable deliverable
232
237
  (canonical `rule_skills/` layout + README + coverage report + final
233
238
  dashboard)
239
+ - Filesystem-derived phase milestones (v0.7.0+): the engine reads disk
240
+ artifacts for advance criteria, never trusts tool-call assertions
234
241
  - Input stays active during streaming (type-ahead queue), arrow keys +
235
242
  history recall, CTX smoothing + peak, per-provider context-limit caps,
236
243
  `/tools`, `/parallelism`, and more
237
244
 
238
- See [DEV_LOG.md](./DEV_LOG.md) for the full v0.6.0 change breakdown and
239
- [docs/update_design_v5.md](./docs/update_design_v5.md) for the plan that
240
- drove it.
245
+ See [DEV_LOG.md](./DEV_LOG.md) for the per-release change breakdowns and
246
+ [docs/update_design_v7.md](./docs/update_design_v7.md) for the v0.7.x
247
+ plan and patch notes.
241
248
 
242
249
  Bug reports and PRs welcome at <https://github.com/kitchen-engineer42/kc-cli>.
243
250
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kc-beta",
3
- "version": "0.7.1",
3
+ "version": "0.7.3",
4
4
  "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
21
21
  import { ArchiveFileTool } from "./tools/archive-file.js";
22
22
  import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
23
23
  import { ReleaseTool } from "./tools/release.js";
24
+ import { readKcVersion } from "../util/kc-version.js";
24
25
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
25
26
  import { DocumentParseTool } from "./tools/document-parse.js";
26
27
  import { DocumentSearchTool } from "./tools/document-search.js";
@@ -36,6 +37,7 @@ import { EvolutionCycleTool } from "./tools/evolution-cycle.js";
36
37
  import { TierDowngradeTool } from "./tools/tier-downgrade.js";
37
38
  import { AgentTool } from "./tools/agent-tool.js";
38
39
  import { WebSearchTool } from "./tools/web-search.js";
40
+ import { TaskCreateTool, TaskUpdateTool, TaskCompleteTool } from "./tools/task-board.js";
39
41
  import { SkillLoader } from "./skill-loader.js";
40
42
  import { TaskManager } from "./task-manager.js";
41
43
  import { Scheduler } from "./scheduler.js";
@@ -421,7 +423,7 @@ export class AgentEngine {
421
423
  new SnapshotTool(this.workspace),
422
424
  new ArchiveFileTool(this.workspace),
423
425
  new ScheduleFetchTool(this.workspace),
424
- new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
426
+ new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
425
427
  new PhaseAdvanceTool(
426
428
  // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
427
429
  // so the tool's refusal text can surface the engine telemetry
@@ -474,6 +476,16 @@ export class AgentEngine {
474
476
  () => this.currentPhase,
475
477
  ),
476
478
  new WebSearchTool(this.config.tavilyApiKey),
479
+ // v0.7.3: completes the v0.7.0 "agent owns TaskBoard" design.
480
+ // Skills already reference TaskCreate by name; these tools make
481
+ // that contract truthful. See task-board.js + work-decomposition
482
+ // SKILL.md. Skipped for subagents — they don't own a task board
483
+ // (taskManager is null in subagent scope, line 216).
484
+ ...(this.taskManager ? [
485
+ new TaskCreateTool(this.workspace, this.taskManager),
486
+ new TaskUpdateTool(this.workspace, this.taskManager),
487
+ new TaskCompleteTool(this.workspace, this.taskManager),
488
+ ] : []),
477
489
  ],
478
490
  // Distillation+ only (DISTILL mode)
479
491
  distill: [
@@ -1307,6 +1319,7 @@ export class AgentEngine {
1307
1319
  yield new AgentEvent({
1308
1320
  type: "tool_result",
1309
1321
  name: tc.name,
1322
+ input: inputData,
1310
1323
  output: historyContent,
1311
1324
  isError: result.isError,
1312
1325
  });
@@ -1679,7 +1692,24 @@ export class AgentEngine {
1679
1692
  parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
1680
1693
  break;
1681
1694
  }
1682
- // bootstrap / finalization: no specific counters, fall through
1695
+ case "bootstrap": {
1696
+ // v0.7.2 1e: previously fell through to empty string. Both
1697
+ // v0.7.1 audit runs had bootstrap → rule_extraction refusals
1698
+ // with engineCounts: "" — agent saw the refusal but had no
1699
+ // engine telemetry to react to. The InitializerPipeline tracks
1700
+ // boolean checklist flags rather than numeric counters; we
1701
+ // surface those flags as "yes/no" so the agent can see which
1702
+ // bootstrap criterion is missing.
1703
+ if (typeof pipeline.describeBootstrapChecklist === "function") {
1704
+ const cl = pipeline.describeBootstrapChecklist();
1705
+ parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
1706
+ parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
1707
+ parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
1708
+ parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
1709
+ }
1710
+ break;
1711
+ }
1712
+ // finalization: no specific counters, fall through
1683
1713
  }
1684
1714
  } catch { /* never let summary build break phase advance */ }
1685
1715
  return parts.join(", ");
@@ -57,16 +57,21 @@ function listChildFiles(p) {
57
57
  // Walk a directory recursively, yielding every file path. Skips hidden
58
58
  // dirs/files and __pycache__. Used by derive functions that need to
59
59
  // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
60
- function* walkFiles(root) {
60
+ //
61
+ // v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
62
+ // direct children; depth=1 is one level down. Default unbounded
63
+ // (existing callers).
64
+ function* walkFiles(root, { maxDepth } = {}) {
61
65
  if (!dirExists(root)) return;
62
- const stack = [root];
66
+ const stack = [{ dir: root, depth: 0 }];
63
67
  while (stack.length) {
64
- const dir = stack.pop();
68
+ const { dir, depth } = stack.pop();
65
69
  for (const e of readDirSafe(dir)) {
66
70
  if (e.name.startsWith(".") || e.name === "__pycache__") continue;
67
71
  const p = path.join(dir, e.name);
68
- if (e.isDirectory()) stack.push(p);
69
- else if (e.isFile()) yield p;
72
+ if (e.isDirectory()) {
73
+ if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
74
+ } else if (e.isFile()) yield p;
70
75
  }
71
76
  }
72
77
  }
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
271
276
  }
272
277
  }
273
278
 
274
- // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
275
- // Agents naturally write batch-test results to output/, not per-skill
276
- // paths. v0.6.x's _loadTestResults already reads here on the canonical
277
- // accuracy schema; this expands the helper-derived milestone to
278
- // recognize the same shape (plus the GLM/DS-shape variants seen in
279
- // E2E #6 v070). Without this, agents who run tests via sandbox_exec
280
- // and persist to output/ saw skillsTested=0 and force-bypassed.
281
- const collectFromJsonFile = (data) => {
282
- if (!data) return;
283
- if (data.rule_id) tested.add(data.rule_id);
284
- if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
285
- for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
279
+ // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
280
+ // under output/*.json. Agents persist batch-test results in
281
+ // conductor-specific shapes (this is the recurring drift point —
282
+ // engine derivation has to match disk reality, not the other way
283
+ // around). Shapes seen across E2E #5/6/7:
284
+ //
285
+ // - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
286
+ // entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
287
+ // skill_test_阳光资产.json with {doc, results: {R019a: ...}}
288
+ // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
289
+ // full_test_results_v[1-6].json as {sample_id: {path, meta,
290
+ // results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
291
+ // v0.7.1's shallow walk missed them)
292
+ //
293
+ // The collector recurses (depth-limited) and uses two heuristics to
294
+ // separate rule_ids from sample_ids / doc_names:
295
+ // 1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
296
+ // (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
297
+ // 2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
298
+ // keys signal that the parent dict's keys are rule_ids
299
+ const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
300
+ const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
301
+ const looksLikeVerdict = (v) =>
302
+ v && typeof v === "object" && !Array.isArray(v) && (
303
+ v.verdict !== undefined ||
304
+ v.passed !== undefined ||
305
+ v.pass !== undefined ||
306
+ typeof v.PASS === "number" ||
307
+ typeof v.FAIL === "number"
308
+ );
309
+ const collectFromJsonFile = (data, depth = 0) => {
310
+ if (!data || depth > 4) return;
311
+ if (typeof data !== "object") return;
312
+ if (Array.isArray(data)) {
313
+ for (const r of data) collectFromJsonFile(r, depth + 1);
314
+ return;
315
+ }
316
+ // {rule_id: "X"} or {id: "R001"} on a rule entry
317
+ if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
318
+ if (isRuleIdShape(data.id)) tested.add(data.id);
319
+ // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
320
+ for (const [k, v] of Object.entries(data)) {
321
+ if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
286
322
  }
287
- if (data.results && typeof data.results === "object") {
288
- for (const k of Object.keys(data.results)) tested.add(k);
323
+ // {results: {<rule_id>: ...}} keys must look rule-id-shaped
324
+ if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
325
+ for (const k of Object.keys(data.results)) {
326
+ if (isRuleIdShape(k)) tested.add(k);
327
+ }
328
+ }
329
+ // Recurse into nested objects (handles {sample_id: {results: {...}}})
330
+ for (const v of Object.values(data)) {
331
+ if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
289
332
  }
290
333
  };
291
334
 
292
335
  const outputDir = path.join(cwd, "output");
293
- if (dirExists(outputDir)) {
294
- for (const f of listChildFiles(outputDir)) {
295
- if (!f.name.endsWith(".json")) continue;
296
- collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
297
- }
298
- // One level into output/results/, output/distillation/ — the two
299
- // most common batch-result locations across E2E #5 and v070 sessions.
300
- for (const sub of ["results", "distillation", "qc"]) {
301
- const subDir = path.join(outputDir, sub);
302
- if (!dirExists(subDir)) continue;
303
- for (const f of listChildFiles(subDir)) {
304
- if (!f.name.endsWith(".json")) continue;
305
- collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
306
- }
307
- // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
308
- // — walk one more level for that pattern.
309
- for (const child of listChildDirs(subDir)) {
310
- for (const f of listChildFiles(path.join(subDir, child.name))) {
311
- if (!f.name.endsWith(".json")) continue;
312
- collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
313
- }
314
- }
315
- }
336
+ for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
337
+ if (!p.endsWith(".json")) continue;
338
+ collectFromJsonFile(readJsonSafe(p));
316
339
  }
317
340
 
318
341
  // DS v070 wrote a top-level aggregate at either rules/test_results.json
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
5
5
  import { Pipeline } from "./base.js";
6
6
  import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
7
7
  import { deriveFinalizationMilestones } from "./_milestone-derive.js";
8
+ import { readKcVersion } from "../../util/kc-version.js";
8
9
 
9
10
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
11
  // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
310
311
  }
311
312
 
312
313
  _readKcVersion() {
313
- try {
314
- const pkg = JSON.parse(fs.readFileSync(
315
- path.resolve(__dirname, "../../../package.json"), "utf-8",
316
- ));
317
- return pkg.version || "unknown";
318
- } catch { return "unknown"; }
314
+ return readKcVersion();
319
315
  }
320
316
 
321
317
  /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
240
240
  return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
241
241
  }
242
242
 
243
+ // v0.7.2 1e: surface the checklist as engine telemetry so
244
+ // `_buildEngineCountsBlock("bootstrap")` has something to report when
245
+ // bootstrap → rule_extraction is refused. Agent sees the missing
246
+ // criteria directly in the refusal text.
247
+ describeBootstrapChecklist() {
248
+ return {
249
+ workspaceCreated: !!this.workspaceCreated,
250
+ configReady: !!this.configReady,
251
+ hasRegulations: !!this.hasRegulations,
252
+ hasSamples: !!this.hasSamples,
253
+ };
254
+ }
255
+
243
256
  /**
244
257
  * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
245
258
  * phase. Bootstrap is setup — reading rules/samples, configuring keys,
@@ -94,7 +94,7 @@ export class CopyToWorkspaceTool extends BaseTool {
94
94
  this._appendGitignore(`refs/${targetName}`);
95
95
  }
96
96
 
97
- this._appendManifest({
97
+ await this._appendManifest({
98
98
  target: targetRel,
99
99
  source: sourcePath,
100
100
  size: stat.size,
@@ -113,17 +113,22 @@ export class CopyToWorkspaceTool extends BaseTool {
113
113
  );
114
114
  }
115
115
 
116
- _appendManifest(entry) {
117
- const manifestAbs = this._workspace.resolvePath(MANIFEST_REL);
118
- fs.mkdirSync(path.dirname(manifestAbs), { recursive: true });
119
- let entries = [];
120
- if (fs.existsSync(manifestAbs)) {
121
- try { entries = JSON.parse(fs.readFileSync(manifestAbs, "utf-8")); }
122
- catch { entries = []; }
123
- }
124
- if (!Array.isArray(entries)) entries = [];
125
- entries.push(entry);
126
- fs.writeFileSync(manifestAbs, JSON.stringify(entries, null, 2), "utf-8");
116
+ async _appendManifest(entry) {
117
+ // v0.7.3: refs/manifest.json is a shared coordination path — wrap the
118
+ // whole read-modify-write under the workspace lock so two parallel
119
+ // copy_to_workspace calls (main agent + subagent) don't lose entries.
120
+ return await this._workspace.withSharedLockIfApplicable(MANIFEST_REL, () => {
121
+ const manifestAbs = this._workspace.resolvePath(MANIFEST_REL);
122
+ fs.mkdirSync(path.dirname(manifestAbs), { recursive: true });
123
+ let entries = [];
124
+ if (fs.existsSync(manifestAbs)) {
125
+ try { entries = JSON.parse(fs.readFileSync(manifestAbs, "utf-8")); }
126
+ catch { entries = []; }
127
+ }
128
+ if (!Array.isArray(entries)) entries = [];
129
+ entries.push(entry);
130
+ fs.writeFileSync(manifestAbs, JSON.stringify(entries, null, 2), "utf-8");
131
+ });
127
132
  }
128
133
 
129
134
  _appendGitignore(line) {
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
178
178
  path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
179
179
  this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
180
180
  path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
181
- this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
181
+ // v0.7.2 1c: auto-aggregate from output/ if no calibration file at
182
+ // workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
183
+ // with empty `historical_accuracy: {}` despite having per-rule QC
184
+ // data on disk under output/ — the release tool just passed the
185
+ // file through and emitted a stub on miss. We try to populate from
186
+ // known QC artifact shapes here; if nothing matches, fall through
187
+ // to the existing stub fallback.
188
+ const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
189
+ if (!fs.existsSync(calibSrc)) {
190
+ const aggregated = this._aggregateAccuracyFromOutput();
191
+ if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
192
+ fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
193
+ }
194
+ }
195
+ this._copyIfExists(calibSrc,
182
196
  path.join(bundleAbs, "confidence_calibration.json"),
183
197
  { fallback: '{"historical_accuracy":{}}\n' });
184
198
 
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
233
247
  .replace(/\{RULES_LIST\}/g, rulesList);
234
248
  fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
235
249
 
250
+ // v0.7.2 1d: clean up the template scaffold dir if a customized
251
+ // release was just written alongside it. Both v0.7.1 audit runs
252
+ // shipped with `output/releases/v1/` (template-derived, .tmpl
253
+ // files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
254
+ // — the customized release. The pre-scaffold is meant as a hint;
255
+ // once the agent calls `release(label="v1-0")` and we've written
256
+ // the real bundle, the unedited scaffold is just clutter.
257
+ //
258
+ // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
259
+ // didn't just write to v1/ ourselves, AND (b) it still contains
260
+ // .tmpl files (signature of unedited template). If the agent
261
+ // intentionally edited v1/ in place (removing .tmpl), our cleanup
262
+ // leaves it alone.
263
+ if (slug !== "v1") {
264
+ const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
265
+ if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
266
+ let hasTmpl = false;
267
+ try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
268
+ if (hasTmpl) {
269
+ try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
270
+ }
271
+ }
272
+ }
273
+
236
274
  // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
237
275
  const lines = [
238
276
  `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
319
357
  return null;
320
358
  }
321
359
 
360
+ // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
361
+ // accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
362
+ //
363
+ // rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
364
+ // (GLM produced 4 versions; pick the highest)
365
+ // full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
366
+ // (GLM; accumulate verdicts per rule across samples)
367
+ // skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
368
+ //
369
+ // Returns null if no recognized artifact, or an object with
370
+ // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
371
+ // suitable for confidence_calibration.json.
372
+ _aggregateAccuracyFromOutput() {
373
+ const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
374
+ const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
375
+ const tally = new Map(); // rule_id -> {pass, fail, na, n}
376
+ const sourceFiles = [];
377
+ const bump = (rid, kind) => {
378
+ if (!isRuleId(rid)) return;
379
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
380
+ t[kind] += 1;
381
+ t.n += 1;
382
+ tally.set(rid, t);
383
+ };
384
+ const outputDir = path.join(this._workspace.cwd, "output");
385
+ if (!fs.existsSync(outputDir)) return null;
386
+
387
+ // Collect all .json files under output/ (depth limited)
388
+ const files = [];
389
+ const stack = [{ dir: outputDir, depth: 0 }];
390
+ while (stack.length) {
391
+ const { dir, depth } = stack.pop();
392
+ let entries;
393
+ try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
394
+ for (const e of entries) {
395
+ if (e.name.startsWith(".") || e.name === "__pycache__") continue;
396
+ const p = path.join(dir, e.name);
397
+ if (e.isDirectory()) {
398
+ if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
399
+ } else if (e.isFile() && e.name.endsWith(".json")) {
400
+ files.push({ path: p, name: e.name });
401
+ }
402
+ }
403
+ }
404
+
405
+ // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
406
+ const ruleStatsFiles = files
407
+ .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
408
+ .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
409
+ .sort((a, b) => b.ver - a.ver);
410
+ if (ruleStatsFiles.length > 0) {
411
+ const top = ruleStatsFiles[0];
412
+ try {
413
+ const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
414
+ for (const [rid, stats] of Object.entries(d)) {
415
+ if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
416
+ const pass = stats.PASS | 0, fail = stats.FAIL | 0;
417
+ const na = stats.NOT_APPLICABLE | stats.NA | 0;
418
+ const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
419
+ t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
420
+ tally.set(rid, t);
421
+ }
422
+ sourceFiles.push(path.relative(this._workspace.cwd, top.path));
423
+ } catch { /* fall through to other shapes */ }
424
+ }
425
+
426
+ // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
427
+ if (tally.size === 0) {
428
+ const ftrFiles = files
429
+ .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
430
+ .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
431
+ .sort((a, b) => b.ver - a.ver);
432
+ for (const f of ftrFiles.slice(0, 1)) {
433
+ try {
434
+ const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
435
+ for (const sample of Object.values(d)) {
436
+ if (!sample || typeof sample !== "object") continue;
437
+ const results = sample.results;
438
+ if (!results || typeof results !== "object") continue;
439
+ for (const [rid, r] of Object.entries(results)) {
440
+ if (!isRuleId(rid) || !r || typeof r !== "object") continue;
441
+ const verdict = (r.verdict || "").toString().toUpperCase();
442
+ if (verdict === "PASS") bump(rid, "pass");
443
+ else if (verdict === "FAIL") bump(rid, "fail");
444
+ else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
445
+ }
446
+ }
447
+ sourceFiles.push(path.relative(this._workspace.cwd, f.path));
448
+ } catch { /* try next shape */ }
449
+ }
450
+ }
451
+
452
+ if (tally.size === 0) return null;
453
+
454
+ const historical_accuracy = {};
455
+ for (const [rid, t] of tally.entries()) {
456
+ const fired = t.pass + t.fail;
457
+ historical_accuracy[rid] = {
458
+ pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
459
+ n_passed: t.pass,
460
+ n_failed: t.fail,
461
+ n_not_applicable: t.na,
462
+ n_samples: t.n,
463
+ };
464
+ }
465
+ return {
466
+ historical_accuracy,
467
+ computed_at: new Date().toISOString(),
468
+ source_files: sourceFiles,
469
+ };
470
+ }
471
+
322
472
  _readWorkerTiers() {
323
473
  const envPath = path.join(this._workspace.cwd, ".env");
324
474
  const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
@@ -44,7 +44,10 @@ export class SandboxExecTool extends BaseTool {
44
44
  "Execute a shell command. " +
45
45
  "cwd='workspace' (default) runs in KC's workspace. " +
46
46
  "cwd='project' runs in the user's project directory. " +
47
- "Pipes, redirects, and chained commands (&&) are supported."
47
+ "Pipes, redirects, and chained commands (&&) are supported. " +
48
+ "stdout + stderr combined are capped at 10,000 chars; longer output is truncated. " +
49
+ "For reading individual files larger than ~10 KB (e.g. regulation documents), " +
50
+ "prefer workspace_file (operation=read) which has a larger 50 KB cap."
48
51
  );
49
52
  }
50
53