kc-beta 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -8
- package/package.json +1 -1
- package/src/agent/engine.js +32 -2
- package/src/agent/pipelines/_milestone-derive.js +65 -42
- package/src/agent/pipelines/finalization.js +2 -6
- package/src/agent/pipelines/initializer.js +13 -0
- package/src/agent/tools/copy-to-workspace.js +17 -12
- package/src/agent/tools/release.js +151 -1
- package/src/agent/tools/sandbox-exec.js +4 -1
- package/src/agent/tools/task-board.js +194 -0
- package/src/agent/tools/workspace-file.js +58 -44
- package/src/config.js +6 -4
- package/src/util/kc-version.js +27 -0
- package/template/CLAUDE.md +13 -0
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +77 -0
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +76 -9
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +65 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +74 -9
package/README.md
CHANGED
|
@@ -216,28 +216,35 @@ Quality Thresholds, Language.
|
|
|
216
216
|
|
|
217
217
|
## Status
|
|
218
218
|
|
|
219
|
-
**v0.
|
|
219
|
+
**v0.7.3 — codex review patch release.** Latest line in the v0.7.x
|
|
220
|
+
hardening track. Architectural payload from v0.6.0+ is still in place:
|
|
220
221
|
|
|
221
222
|
- Parallel ralph-loop (up to 8 concurrent workers) with a heap-safety
|
|
222
223
|
conformance gate
|
|
223
224
|
- Native chunker + RAG (onion-peeler + CJK bigram keyword index +
|
|
224
225
|
one-shot LLM bundle classifier, ported from the AMC verification app)
|
|
225
|
-
-
|
|
226
|
-
|
|
226
|
+
- Agent-owned task board: the agent reads the rule list from
|
|
227
|
+
`describeState`, decides decomposition (per-rule / grouped / range),
|
|
228
|
+
and calls `TaskCreate` / `TaskUpdate` / `TaskComplete` to drive the
|
|
229
|
+
Ralph loop. Source-context auto-attach pulls rule NL (natural-language text) + evidence chunks
|
|
230
|
+
+ sibling rules into the prompt of each task as it runs.
|
|
227
231
|
- Workspace file locking for shared coordination files (`rules/catalog.json`,
|
|
228
|
-
`rules/manifest.json`, `
|
|
232
|
+
`rules/manifest.json`, `refs/manifest.json`, `tasks.json`,
|
|
233
|
+
`session-state.json`) — every writer goes through `withFileLock`.
|
|
229
234
|
- `agent_tool` gets `wait` / `poll` / `list` / `kill` operations +
|
|
230
235
|
`stale_subagents` phase-advance signal
|
|
231
|
-
-
|
|
236
|
+
- FINALIZATION phase packages the session into a shippable deliverable
|
|
232
237
|
(canonical `rule_skills/` layout + README + coverage report + final
|
|
233
238
|
dashboard)
|
|
239
|
+
- Filesystem-derived phase milestones (v0.7.0+): the engine reads disk
|
|
240
|
+
artifacts for advance criteria, never trusts tool-call assertions
|
|
234
241
|
- Input stays active during streaming (type-ahead queue), arrow keys +
|
|
235
242
|
history recall, CTX smoothing + peak, per-provider context-limit caps,
|
|
236
243
|
`/tools`, `/parallelism`, and more
|
|
237
244
|
|
|
238
|
-
See [DEV_LOG.md](./DEV_LOG.md) for the
|
|
239
|
-
[docs/
|
|
240
|
-
|
|
245
|
+
See [DEV_LOG.md](./DEV_LOG.md) for the per-release change breakdowns and
|
|
246
|
+
[docs/update_design_v7.md](./docs/update_design_v7.md) for the v0.7.x
|
|
247
|
+
plan and patch notes.
|
|
241
248
|
|
|
242
249
|
Bug reports and PRs welcome at <https://github.com/kitchen-engineer42/kc-cli>.
|
|
243
250
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "kc-beta",
|
|
3
|
-
"version": "0.7.
|
|
3
|
+
"version": "0.7.3",
|
|
4
4
|
"description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
package/src/agent/engine.js
CHANGED
|
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
|
|
|
21
21
|
import { ArchiveFileTool } from "./tools/archive-file.js";
|
|
22
22
|
import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
|
|
23
23
|
import { ReleaseTool } from "./tools/release.js";
|
|
24
|
+
import { readKcVersion } from "../util/kc-version.js";
|
|
24
25
|
import { PhaseAdvanceTool } from "./tools/phase-advance.js";
|
|
25
26
|
import { DocumentParseTool } from "./tools/document-parse.js";
|
|
26
27
|
import { DocumentSearchTool } from "./tools/document-search.js";
|
|
@@ -36,6 +37,7 @@ import { EvolutionCycleTool } from "./tools/evolution-cycle.js";
|
|
|
36
37
|
import { TierDowngradeTool } from "./tools/tier-downgrade.js";
|
|
37
38
|
import { AgentTool } from "./tools/agent-tool.js";
|
|
38
39
|
import { WebSearchTool } from "./tools/web-search.js";
|
|
40
|
+
import { TaskCreateTool, TaskUpdateTool, TaskCompleteTool } from "./tools/task-board.js";
|
|
39
41
|
import { SkillLoader } from "./skill-loader.js";
|
|
40
42
|
import { TaskManager } from "./task-manager.js";
|
|
41
43
|
import { Scheduler } from "./scheduler.js";
|
|
@@ -421,7 +423,7 @@ export class AgentEngine {
|
|
|
421
423
|
new SnapshotTool(this.workspace),
|
|
422
424
|
new ArchiveFileTool(this.workspace),
|
|
423
425
|
new ScheduleFetchTool(this.workspace),
|
|
424
|
-
new ReleaseTool(this.workspace, { kcVersion:
|
|
426
|
+
new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
|
|
425
427
|
new PhaseAdvanceTool(
|
|
426
428
|
// v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
|
|
427
429
|
// so the tool's refusal text can surface the engine telemetry
|
|
@@ -474,6 +476,16 @@ export class AgentEngine {
|
|
|
474
476
|
() => this.currentPhase,
|
|
475
477
|
),
|
|
476
478
|
new WebSearchTool(this.config.tavilyApiKey),
|
|
479
|
+
// v0.7.3: completes the v0.7.0 "agent owns TaskBoard" design.
|
|
480
|
+
// Skills already reference TaskCreate by name; these tools make
|
|
481
|
+
// that contract truthful. See task-board.js + work-decomposition
|
|
482
|
+
// SKILL.md. Skipped for subagents — they don't own a task board
|
|
483
|
+
// (taskManager is null in subagent scope, line 216).
|
|
484
|
+
...(this.taskManager ? [
|
|
485
|
+
new TaskCreateTool(this.workspace, this.taskManager),
|
|
486
|
+
new TaskUpdateTool(this.workspace, this.taskManager),
|
|
487
|
+
new TaskCompleteTool(this.workspace, this.taskManager),
|
|
488
|
+
] : []),
|
|
477
489
|
],
|
|
478
490
|
// Distillation+ only (DISTILL mode)
|
|
479
491
|
distill: [
|
|
@@ -1307,6 +1319,7 @@ export class AgentEngine {
|
|
|
1307
1319
|
yield new AgentEvent({
|
|
1308
1320
|
type: "tool_result",
|
|
1309
1321
|
name: tc.name,
|
|
1322
|
+
input: inputData,
|
|
1310
1323
|
output: historyContent,
|
|
1311
1324
|
isError: result.isError,
|
|
1312
1325
|
});
|
|
@@ -1679,7 +1692,24 @@ export class AgentEngine {
|
|
|
1679
1692
|
parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
|
|
1680
1693
|
break;
|
|
1681
1694
|
}
|
|
1682
|
-
|
|
1695
|
+
case "bootstrap": {
|
|
1696
|
+
// v0.7.2 1e: previously fell through to empty string. Both
|
|
1697
|
+
// v0.7.1 audit runs had bootstrap → rule_extraction refusals
|
|
1698
|
+
// with engineCounts: "" — agent saw the refusal but had no
|
|
1699
|
+
// engine telemetry to react to. The InitializerPipeline tracks
|
|
1700
|
+
// boolean checklist flags rather than numeric counters; we
|
|
1701
|
+
// surface those flags as "yes/no" so the agent can see which
|
|
1702
|
+
// bootstrap criterion is missing.
|
|
1703
|
+
if (typeof pipeline.describeBootstrapChecklist === "function") {
|
|
1704
|
+
const cl = pipeline.describeBootstrapChecklist();
|
|
1705
|
+
parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
|
|
1706
|
+
parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
|
|
1707
|
+
parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
|
|
1708
|
+
parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
|
|
1709
|
+
}
|
|
1710
|
+
break;
|
|
1711
|
+
}
|
|
1712
|
+
// finalization: no specific counters, fall through
|
|
1683
1713
|
}
|
|
1684
1714
|
} catch { /* never let summary build break phase advance */ }
|
|
1685
1715
|
return parts.join(", ");
|
|
@@ -57,16 +57,21 @@ function listChildFiles(p) {
|
|
|
57
57
|
// Walk a directory recursively, yielding every file path. Skips hidden
|
|
58
58
|
// dirs/files and __pycache__. Used by derive functions that need to
|
|
59
59
|
// match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
|
|
60
|
-
|
|
60
|
+
//
|
|
61
|
+
// v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
|
|
62
|
+
// direct children; depth=1 is one level down. Default unbounded
|
|
63
|
+
// (existing callers).
|
|
64
|
+
/**
 * Lazily yield the path of every regular file beneath `root`.
 *
 * Entries whose name starts with "." and entries named "__pycache__" are
 * skipped (files and directories alike). Traversal is iterative, driven by
 * an explicit stack, so the yield order follows stack pops rather than a
 * sorted or breadth-first order.
 *
 * @param {string} root - Directory to walk; nothing is yielded when
 *   `dirExists(root)` is false.
 * @param {{maxDepth?: number}} [options] - `maxDepth` caps descent: the
 *   root is depth 0 and a sub-directory is entered only while the current
 *   depth is below `maxDepth`. Null/undefined means unbounded.
 * @yields {string} Path of each file found (root joined with the relative
 *   segments).
 */
function* walkFiles(root, { maxDepth } = {}) {
  if (!dirExists(root)) return;

  const unbounded = maxDepth == null;
  const pending = [{ dir: root, depth: 0 }];

  while (pending.length > 0) {
    const { dir, depth } = pending.pop();

    for (const entry of readDirSafe(dir)) {
      const skipped = entry.name.startsWith(".") || entry.name === "__pycache__";
      if (skipped) continue;

      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        // Directories at the depth cap are seen but never entered.
        if (unbounded || depth < maxDepth) {
          pending.push({ dir: fullPath, depth: depth + 1 });
        }
        continue;
      }
      if (entry.isFile()) yield fullPath;
    }
  }
}
|
|
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
|
|
|
271
276
|
}
|
|
272
277
|
}
|
|
273
278
|
|
|
274
|
-
// v0.7.1 1a:
|
|
275
|
-
// Agents
|
|
276
|
-
//
|
|
277
|
-
//
|
|
278
|
-
//
|
|
279
|
-
//
|
|
280
|
-
//
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
279
|
+
// v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
|
|
280
|
+
// under output/*.json. Agents persist batch-test results in
|
|
281
|
+
// conductor-specific shapes (this is the recurring drift point —
|
|
282
|
+
// engine derivation has to match disk reality, not the other way
|
|
283
|
+
// around). Shapes seen across E2E #5/6/7:
|
|
284
|
+
//
|
|
285
|
+
// - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
|
|
286
|
+
// entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
|
|
287
|
+
// skill_test_阳光资产.json with {doc, results: {R019a: ...}}
|
|
288
|
+
// - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
|
|
289
|
+
// full_test_results_v[1-6].json as {sample_id: {path, meta,
|
|
290
|
+
// results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
|
|
291
|
+
// v0.7.1's shallow walk missed them)
|
|
292
|
+
//
|
|
293
|
+
// The collector recurses (depth-limited) and uses two heuristics to
|
|
294
|
+
// separate rule_ids from sample_ids / doc_names:
|
|
295
|
+
// 1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
|
|
296
|
+
// (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
|
|
297
|
+
// 2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
|
|
298
|
+
// keys signal that the parent dict's keys are rule_ids
|
|
299
|
+
// Rule-id heuristic: a letter followed by up to 29 characters drawn from
// letters, digits, "_" and "-" (so 1-30 chars overall).
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;

// True only for strings that match the rule-id shape AND contain at least
// one digit (accepts "R001", "D01-01"; rejects "06f2ed1488", "doc_name").
const isRuleIdShape = (s) => {
  if (typeof s !== "string") return false;
  return ruleIdShape.test(s) && /\d/.test(s);
};

// Truthy when `v` is a non-array object carrying a verdict-style field:
// any defined `verdict` / `passed` / `pass`, or a numeric `PASS` / `FAIL`
// counter. A falsy `v` is handed back unchanged (callers only test
// truthiness).
const looksLikeVerdict = (v) => {
  if (!v) return v;
  if (typeof v !== "object" || Array.isArray(v)) return false;
  const hasVerdictField = ["verdict", "passed", "pass"].some((key) => v[key] !== undefined);
  return hasVerdictField || typeof v.PASS === "number" || typeof v.FAIL === "number";
};
|
|
309
|
+
// Recursively scan one parsed JSON value and add every rule id it can
// recognise to the enclosing `tested` set. Nodes deeper than depth 4 are
// ignored. On each non-array object, in this order:
//   1. a rule-id-shaped `rule_id` and/or `id` field is credited;
//   2. every rule-id-shaped key whose value looks like a verdict is credited;
//   3. every rule-id-shaped key of a plain-object `results` field is credited;
//   4. every object-valued property is scanned one level deeper.
// NOTE(review): rule-id keys mapped straight to booleans (e.g.
// {doc: {R019a: true}}) match none of the steps above — confirm whether
// that shape is meant to be credited.
const collectFromJsonFile = (data, depth = 0) => {
  if (!data || depth > 4 || typeof data !== "object") return;

  if (Array.isArray(data)) {
    for (let i = 0; i < data.length; i += 1) {
      collectFromJsonFile(data[i], depth + 1);
    }
    return;
  }

  // {rule_id: "X"} or {id: "R001"} on a rule entry
  for (const candidate of [data.rule_id, data.id]) {
    if (isRuleIdShape(candidate)) tested.add(candidate);
  }

  // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
  const properties = Object.entries(data);
  for (const [key, value] of properties) {
    if (isRuleIdShape(key) && looksLikeVerdict(value)) tested.add(key);
  }

  // {results: {<rule_id>: ...}} — keys must look rule-id-shaped
  const { results } = data;
  if (results && typeof results === "object" && !Array.isArray(results)) {
    Object.keys(results)
      .filter((key) => isRuleIdShape(key))
      .forEach((key) => tested.add(key));
  }

  // Descend into nested containers (handles {sample_id: {results: {...}}})
  for (const [, value] of properties) {
    if (value && typeof value === "object") collectFromJsonFile(value, depth + 1);
  }
};
|
|
291
334
|
|
|
292
335
|
const outputDir = path.join(cwd, "output");
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
|
|
297
|
-
}
|
|
298
|
-
// One level into output/results/, output/distillation/ — the two
|
|
299
|
-
// most common batch-result locations across E2E #5 and v070 sessions.
|
|
300
|
-
for (const sub of ["results", "distillation", "qc"]) {
|
|
301
|
-
const subDir = path.join(outputDir, sub);
|
|
302
|
-
if (!dirExists(subDir)) continue;
|
|
303
|
-
for (const f of listChildFiles(subDir)) {
|
|
304
|
-
if (!f.name.endsWith(".json")) continue;
|
|
305
|
-
collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
|
|
306
|
-
}
|
|
307
|
-
// GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
|
|
308
|
-
// — walk one more level for that pattern.
|
|
309
|
-
for (const child of listChildDirs(subDir)) {
|
|
310
|
-
for (const f of listChildFiles(path.join(subDir, child.name))) {
|
|
311
|
-
if (!f.name.endsWith(".json")) continue;
|
|
312
|
-
collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
|
|
313
|
-
}
|
|
314
|
-
}
|
|
315
|
-
}
|
|
336
|
+
for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
|
|
337
|
+
if (!p.endsWith(".json")) continue;
|
|
338
|
+
collectFromJsonFile(readJsonSafe(p));
|
|
316
339
|
}
|
|
317
340
|
|
|
318
341
|
// DS v070 wrote a top-level aggregate at either rules/test_results.json
|
|
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
|
|
|
5
5
|
import { Pipeline } from "./base.js";
|
|
6
6
|
import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
|
|
7
7
|
import { deriveFinalizationMilestones } from "./_milestone-derive.js";
|
|
8
|
+
import { readKcVersion } from "../../util/kc-version.js";
|
|
8
9
|
|
|
9
10
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
11
|
// v0.7.0 N: ship template/release/v1/ from the npm package; copy into
|
|
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
|
|
|
310
311
|
}
|
|
311
312
|
|
|
312
313
|
_readKcVersion() {
|
|
313
|
-
|
|
314
|
-
const pkg = JSON.parse(fs.readFileSync(
|
|
315
|
-
path.resolve(__dirname, "../../../package.json"), "utf-8",
|
|
316
|
-
));
|
|
317
|
-
return pkg.version || "unknown";
|
|
318
|
-
} catch { return "unknown"; }
|
|
314
|
+
return readKcVersion();
|
|
319
315
|
}
|
|
320
316
|
|
|
321
317
|
/**
|
|
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
|
|
|
240
240
|
return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
|
|
241
241
|
}
|
|
242
242
|
|
|
243
|
+
// v0.7.2 1e: surface the checklist as engine telemetry so
|
|
244
|
+
// `_buildEngineCountsBlock("bootstrap")` has something to report when
|
|
245
|
+
// bootstrap → rule_extraction is refused. Agent sees the missing
|
|
246
|
+
// criteria directly in the refusal text.
|
|
247
|
+
describeBootstrapChecklist() {
|
|
248
|
+
return {
|
|
249
|
+
workspaceCreated: !!this.workspaceCreated,
|
|
250
|
+
configReady: !!this.configReady,
|
|
251
|
+
hasRegulations: !!this.hasRegulations,
|
|
252
|
+
hasSamples: !!this.hasSamples,
|
|
253
|
+
};
|
|
254
|
+
}
|
|
255
|
+
|
|
243
256
|
/**
|
|
244
257
|
* v0.6.3 (#74): nudge the agent when it does work that belongs to a later
|
|
245
258
|
* phase. Bootstrap is setup — reading rules/samples, configuring keys,
|
|
@@ -94,7 +94,7 @@ export class CopyToWorkspaceTool extends BaseTool {
|
|
|
94
94
|
this._appendGitignore(`refs/${targetName}`);
|
|
95
95
|
}
|
|
96
96
|
|
|
97
|
-
this._appendManifest({
|
|
97
|
+
await this._appendManifest({
|
|
98
98
|
target: targetRel,
|
|
99
99
|
source: sourcePath,
|
|
100
100
|
size: stat.size,
|
|
@@ -113,17 +113,22 @@ export class CopyToWorkspaceTool extends BaseTool {
|
|
|
113
113
|
);
|
|
114
114
|
}
|
|
115
115
|
|
|
116
|
-
_appendManifest(entry) {
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
116
|
+
async _appendManifest(entry) {
|
|
117
|
+
// v0.7.3: refs/manifest.json is a shared coordination path — wrap the
|
|
118
|
+
// whole read-modify-write under the workspace lock so two parallel
|
|
119
|
+
// copy_to_workspace calls (main agent + subagent) don't lose entries.
|
|
120
|
+
return await this._workspace.withSharedLockIfApplicable(MANIFEST_REL, () => {
|
|
121
|
+
const manifestAbs = this._workspace.resolvePath(MANIFEST_REL);
|
|
122
|
+
fs.mkdirSync(path.dirname(manifestAbs), { recursive: true });
|
|
123
|
+
let entries = [];
|
|
124
|
+
if (fs.existsSync(manifestAbs)) {
|
|
125
|
+
try { entries = JSON.parse(fs.readFileSync(manifestAbs, "utf-8")); }
|
|
126
|
+
catch { entries = []; }
|
|
127
|
+
}
|
|
128
|
+
if (!Array.isArray(entries)) entries = [];
|
|
129
|
+
entries.push(entry);
|
|
130
|
+
fs.writeFileSync(manifestAbs, JSON.stringify(entries, null, 2), "utf-8");
|
|
131
|
+
});
|
|
127
132
|
}
|
|
128
133
|
|
|
129
134
|
_appendGitignore(line) {
|
|
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
|
|
|
178
178
|
path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
|
|
179
179
|
this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
|
|
180
180
|
path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
|
|
181
|
-
|
|
181
|
+
// v0.7.2 1c: auto-aggregate from output/ if no calibration file at
|
|
182
|
+
// workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
|
|
183
|
+
// with empty `historical_accuracy: {}` despite having per-rule QC
|
|
184
|
+
// data on disk under output/ — the release tool just passed the
|
|
185
|
+
// file through and emitted a stub on miss. We try to populate from
|
|
186
|
+
// known QC artifact shapes here; if nothing matches, fall through
|
|
187
|
+
// to the existing stub fallback.
|
|
188
|
+
const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
|
|
189
|
+
if (!fs.existsSync(calibSrc)) {
|
|
190
|
+
const aggregated = this._aggregateAccuracyFromOutput();
|
|
191
|
+
if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
|
|
192
|
+
fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
this._copyIfExists(calibSrc,
|
|
182
196
|
path.join(bundleAbs, "confidence_calibration.json"),
|
|
183
197
|
{ fallback: '{"historical_accuracy":{}}\n' });
|
|
184
198
|
|
|
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
|
|
|
233
247
|
.replace(/\{RULES_LIST\}/g, rulesList);
|
|
234
248
|
fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
|
|
235
249
|
|
|
250
|
+
// v0.7.2 1d: clean up the template scaffold dir if a customized
|
|
251
|
+
// release was just written alongside it. Both v0.7.1 audit runs
|
|
252
|
+
// shipped with `output/releases/v1/` (template-derived, .tmpl
|
|
253
|
+
// files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
|
|
254
|
+
// — the customized release. The pre-scaffold is meant as a hint;
|
|
255
|
+
// once the agent calls `release(label="v1-0")` and we've written
|
|
256
|
+
// the real bundle, the unedited scaffold is just clutter.
|
|
257
|
+
//
|
|
258
|
+
// Conservative gate: only delete a sibling `v1/` if BOTH (a) we
|
|
259
|
+
// didn't just write to v1/ ourselves, AND (b) it still contains
|
|
260
|
+
// .tmpl files (signature of unedited template). If the agent
|
|
261
|
+
// intentionally edited v1/ in place (removing .tmpl), our cleanup
|
|
262
|
+
// leaves it alone.
|
|
263
|
+
if (slug !== "v1") {
|
|
264
|
+
const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
|
|
265
|
+
if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
|
|
266
|
+
let hasTmpl = false;
|
|
267
|
+
try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
|
|
268
|
+
if (hasTmpl) {
|
|
269
|
+
try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
236
274
|
// Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
|
|
237
275
|
const lines = [
|
|
238
276
|
`Release '${label}' bundled at ${bundleRel}`,
|
|
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
|
|
|
319
357
|
return null;
|
|
320
358
|
}
|
|
321
359
|
|
|
360
|
+
// v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
|
|
361
|
+
// accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
|
|
362
|
+
//
|
|
363
|
+
// rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
|
|
364
|
+
// (GLM produced 4 versions; pick the highest)
|
|
365
|
+
// full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
|
|
366
|
+
// (GLM; accumulate verdicts per rule across samples)
|
|
367
|
+
// skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
|
|
368
|
+
//
|
|
369
|
+
// Returns null if no recognized artifact, or an object with
|
|
370
|
+
// { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
|
|
371
|
+
// suitable for confidence_calibration.json.
|
|
372
|
+
_aggregateAccuracyFromOutput() {
|
|
373
|
+
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
|
|
374
|
+
const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
|
|
375
|
+
const tally = new Map(); // rule_id -> {pass, fail, na, n}
|
|
376
|
+
const sourceFiles = [];
|
|
377
|
+
const bump = (rid, kind) => {
|
|
378
|
+
if (!isRuleId(rid)) return;
|
|
379
|
+
const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
|
|
380
|
+
t[kind] += 1;
|
|
381
|
+
t.n += 1;
|
|
382
|
+
tally.set(rid, t);
|
|
383
|
+
};
|
|
384
|
+
const outputDir = path.join(this._workspace.cwd, "output");
|
|
385
|
+
if (!fs.existsSync(outputDir)) return null;
|
|
386
|
+
|
|
387
|
+
// Collect all .json files under output/ (depth limited)
|
|
388
|
+
const files = [];
|
|
389
|
+
const stack = [{ dir: outputDir, depth: 0 }];
|
|
390
|
+
while (stack.length) {
|
|
391
|
+
const { dir, depth } = stack.pop();
|
|
392
|
+
let entries;
|
|
393
|
+
try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
|
|
394
|
+
for (const e of entries) {
|
|
395
|
+
if (e.name.startsWith(".") || e.name === "__pycache__") continue;
|
|
396
|
+
const p = path.join(dir, e.name);
|
|
397
|
+
if (e.isDirectory()) {
|
|
398
|
+
if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
|
|
399
|
+
} else if (e.isFile() && e.name.endsWith(".json")) {
|
|
400
|
+
files.push({ path: p, name: e.name });
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
// 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
|
|
406
|
+
const ruleStatsFiles = files
|
|
407
|
+
.filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
|
|
408
|
+
.map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
|
|
409
|
+
.sort((a, b) => b.ver - a.ver);
|
|
410
|
+
if (ruleStatsFiles.length > 0) {
|
|
411
|
+
const top = ruleStatsFiles[0];
|
|
412
|
+
try {
|
|
413
|
+
const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
|
|
414
|
+
for (const [rid, stats] of Object.entries(d)) {
|
|
415
|
+
if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
|
|
416
|
+
const pass = stats.PASS | 0, fail = stats.FAIL | 0;
|
|
417
|
+
const na = stats.NOT_APPLICABLE | stats.NA | 0;
|
|
418
|
+
const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
|
|
419
|
+
t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
|
|
420
|
+
tally.set(rid, t);
|
|
421
|
+
}
|
|
422
|
+
sourceFiles.push(path.relative(this._workspace.cwd, top.path));
|
|
423
|
+
} catch { /* fall through to other shapes */ }
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
// 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
|
|
427
|
+
if (tally.size === 0) {
|
|
428
|
+
const ftrFiles = files
|
|
429
|
+
.filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
|
|
430
|
+
.map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
|
|
431
|
+
.sort((a, b) => b.ver - a.ver);
|
|
432
|
+
for (const f of ftrFiles.slice(0, 1)) {
|
|
433
|
+
try {
|
|
434
|
+
const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
|
|
435
|
+
for (const sample of Object.values(d)) {
|
|
436
|
+
if (!sample || typeof sample !== "object") continue;
|
|
437
|
+
const results = sample.results;
|
|
438
|
+
if (!results || typeof results !== "object") continue;
|
|
439
|
+
for (const [rid, r] of Object.entries(results)) {
|
|
440
|
+
if (!isRuleId(rid) || !r || typeof r !== "object") continue;
|
|
441
|
+
const verdict = (r.verdict || "").toString().toUpperCase();
|
|
442
|
+
if (verdict === "PASS") bump(rid, "pass");
|
|
443
|
+
else if (verdict === "FAIL") bump(rid, "fail");
|
|
444
|
+
else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
sourceFiles.push(path.relative(this._workspace.cwd, f.path));
|
|
448
|
+
} catch { /* try next shape */ }
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
if (tally.size === 0) return null;
|
|
453
|
+
|
|
454
|
+
const historical_accuracy = {};
|
|
455
|
+
for (const [rid, t] of tally.entries()) {
|
|
456
|
+
const fired = t.pass + t.fail;
|
|
457
|
+
historical_accuracy[rid] = {
|
|
458
|
+
pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
|
|
459
|
+
n_passed: t.pass,
|
|
460
|
+
n_failed: t.fail,
|
|
461
|
+
n_not_applicable: t.na,
|
|
462
|
+
n_samples: t.n,
|
|
463
|
+
};
|
|
464
|
+
}
|
|
465
|
+
return {
|
|
466
|
+
historical_accuracy,
|
|
467
|
+
computed_at: new Date().toISOString(),
|
|
468
|
+
source_files: sourceFiles,
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
|
|
322
472
|
_readWorkerTiers() {
|
|
323
473
|
const envPath = path.join(this._workspace.cwd, ".env");
|
|
324
474
|
const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
|
|
@@ -44,7 +44,10 @@ export class SandboxExecTool extends BaseTool {
|
|
|
44
44
|
"Execute a shell command. " +
|
|
45
45
|
"cwd='workspace' (default) runs in KC's workspace. " +
|
|
46
46
|
"cwd='project' runs in the user's project directory. " +
|
|
47
|
-
"Pipes, redirects, and chained commands (&&) are supported."
|
|
47
|
+
"Pipes, redirects, and chained commands (&&) are supported. " +
|
|
48
|
+
"stdout + stderr combined are capped at 10,000 chars; longer output is truncated. " +
|
|
49
|
+
"For reading individual files larger than ~10 KB (e.g. regulation documents), " +
|
|
50
|
+
"prefer workspace_file (operation=read) which has a larger 50 KB cap."
|
|
48
51
|
);
|
|
49
52
|
}
|
|
50
53
|
|