kc-beta 0.5.6 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/QUICKSTART.md +17 -4
- package/README.md +58 -11
- package/bin/kc-beta.js +35 -1
- package/package.json +1 -1
- package/src/agent/bundle-tree.js +553 -0
- package/src/agent/context.js +40 -1
- package/src/agent/engine.js +828 -31
- package/src/agent/llm-client.js +67 -18
- package/src/agent/pipelines/distillation.js +15 -0
- package/src/agent/pipelines/extraction.js +60 -3
- package/src/agent/pipelines/finalization.js +186 -0
- package/src/agent/pipelines/index.js +8 -0
- package/src/agent/pipelines/initializer.js +40 -0
- package/src/agent/pipelines/production-qc.js +63 -13
- package/src/agent/pipelines/skill-authoring.js +136 -7
- package/src/agent/skill-loader.js +54 -4
- package/src/agent/task-manager.js +81 -3
- package/src/agent/tools/agent-tool.js +283 -35
- package/src/agent/tools/bundle-search.js +146 -0
- package/src/agent/tools/document-chunk.js +246 -0
- package/src/agent/tools/document-classify.js +311 -0
- package/src/agent/tools/document-parse.js +8 -1
- package/src/agent/tools/phase-advance.js +30 -7
- package/src/agent/tools/registry.js +10 -0
- package/src/agent/tools/rule-catalog.js +17 -3
- package/src/agent/tools/sandbox-exec.js +30 -0
- package/src/agent/tools/workflow-run.js +34 -1
- package/src/agent/workspace.js +168 -14
- package/src/cli/components.js +165 -17
- package/src/cli/index.js +166 -19
- package/src/cli/meme.js +58 -0
- package/src/config.js +39 -2
- package/src/providers.js +26 -0
- package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
- package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0
package/src/agent/llm-client.js
CHANGED
|
@@ -1,5 +1,20 @@
|
|
|
1
1
|
import { withRetry } from "./retry.js";
|
|
2
2
|
|
|
3
|
+
// A5: SSE accumulator safety cap. If a provider ever sends an abnormally
|
|
4
|
+
// large `data: ...` line without a newline terminator, the parser's
|
|
5
|
+
// `buffer += decoder.decode(chunk)` + `buffer.split("\n")` would grow
|
|
6
|
+
// unbounded and trigger O(n²) splitting once it gets into the hundreds
|
|
7
|
+
// of MB. 8 MB is multiple orders of magnitude above any legitimate single
|
|
8
|
+
// SSE frame (largest seen in the wild: ~80 KB for multi-tool-call deltas).
|
|
9
|
+
// NOTE: despite the _BYTES suffix, the SSE parsers compare this cap against
// `buffer.length` of the decoded text, i.e. a count of UTF-16 code units
// rather than raw bytes received on the wire.
const SSE_BUFFER_CAP_BYTES = 8 * 1024 * 1024;
|
|
10
|
+
|
|
11
|
+
/**
 * Raised by the SSE parsers when the accumulated text buffer exceeds the
 * safety cap.
 *
 * Carries a stable `code` ("SSE_BUFFER_OVERFLOW") for programmatic checks and
 * a descriptive `name` so logs and stack traces do not show a bare "Error".
 */
class SseOverflowError extends Error {
  /**
   * @param {number} bytes - Size of the buffer at the moment the cap was hit
   *   (the callers pass `buffer.length`).
   */
  constructor(bytes) {
    super(`SSE buffer overflow (${bytes} bytes without newline) — aborting stream`);
    this.name = "SseOverflowError";
    this.code = "SSE_BUFFER_OVERFLOW";
  }
}
|
|
17
|
+
|
|
3
18
|
/**
|
|
4
19
|
* Multi-protocol LLM client using native fetch + SSE parsing.
|
|
5
20
|
* Supports OpenAI-compatible APIs and Anthropic Messages API.
|
|
@@ -144,26 +159,56 @@ export class LLMClient {
|
|
|
144
159
|
async *streamChat({ model, messages, tools, maxTokens }) {
|
|
145
160
|
const body = this._buildStreamBody({ model, messages, tools, maxTokens });
|
|
146
161
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
162
|
+
let resp;
|
|
163
|
+
try {
|
|
164
|
+
resp = await withRetry(async () => {
|
|
165
|
+
const r = await fetch(this._getEndpoint(), {
|
|
166
|
+
method: "POST",
|
|
167
|
+
headers: this._buildHeaders(),
|
|
168
|
+
body: JSON.stringify(body),
|
|
169
|
+
});
|
|
170
|
+
if (!r.ok) {
|
|
171
|
+
const text = await r.text();
|
|
172
|
+
const err = new Error(`LLM API error ${r.status}: ${text}`);
|
|
173
|
+
err.status = r.status;
|
|
174
|
+
err.retryAfter = r.headers.get("retry-after");
|
|
175
|
+
err.streamTermination = "http_error";
|
|
176
|
+
throw err;
|
|
177
|
+
}
|
|
178
|
+
return r;
|
|
152
179
|
});
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
return r;
|
|
161
|
-
});
|
|
180
|
+
} catch (err) {
|
|
181
|
+
// A8: Any pre-stream failure (network, auth, 4xx/5xx after retry) is
|
|
182
|
+
// tagged and re-thrown. Engine's outer catch sees exactly one tagged
|
|
183
|
+
// error event.
|
|
184
|
+
if (!err.streamTermination) err.streamTermination = "connect_error";
|
|
185
|
+
throw err;
|
|
186
|
+
}
|
|
162
187
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
188
|
+
// A8: Wrap the SSE consumption so ALL termination paths — clean EOS,
|
|
189
|
+
// mid-token abort, SSE overflow, provider disconnect — surface as a
|
|
190
|
+
// single tagged error the engine can report consistently. The inner
|
|
191
|
+
// parsers throw for overflow (A5) and return silently on clean EOS;
|
|
192
|
+
// mid-stream socket errors (undici "terminated") raise here.
|
|
193
|
+
try {
|
|
194
|
+
if (this.apiFormat === "anthropic") {
|
|
195
|
+
yield* this._parseAnthropicSSE(resp.body);
|
|
196
|
+
} else {
|
|
197
|
+
yield* this._parseOpenaiSSE(resp.body);
|
|
198
|
+
}
|
|
199
|
+
} catch (err) {
|
|
200
|
+
if (!err.streamTermination) {
|
|
201
|
+
if (err.code === "SSE_BUFFER_OVERFLOW") err.streamTermination = "sse_overflow";
|
|
202
|
+
else if (err.name === "AbortError") err.streamTermination = "aborted";
|
|
203
|
+
else if (/terminated|reset|ECONNRESET|UND_ERR_ABORTED/i.test(err.message || err.code || ""))
|
|
204
|
+
err.streamTermination = "stream_terminated";
|
|
205
|
+
else err.streamTermination = "stream_error";
|
|
206
|
+
}
|
|
207
|
+
throw err;
|
|
208
|
+
} finally {
|
|
209
|
+
// Best-effort: cancel the body so the underlying socket returns to the
|
|
210
|
+
// connection pool even if the consumer bailed mid-stream.
|
|
211
|
+
try { await resp.body?.cancel?.(); } catch { /* ignore */ }
|
|
167
212
|
}
|
|
168
213
|
}
|
|
169
214
|
|
|
@@ -261,6 +306,8 @@ export class LLMClient {
|
|
|
261
306
|
|
|
262
307
|
for await (const chunk of body) {
|
|
263
308
|
buffer += decoder.decode(chunk, { stream: true });
|
|
309
|
+
// A5: bail out before O(n²) split explodes on pathological input.
|
|
310
|
+
if (buffer.length > SSE_BUFFER_CAP_BYTES) throw new SseOverflowError(buffer.length);
|
|
264
311
|
const lines = buffer.split("\n");
|
|
265
312
|
buffer = lines.pop();
|
|
266
313
|
|
|
@@ -313,6 +360,8 @@ export class LLMClient {
|
|
|
313
360
|
|
|
314
361
|
for await (const rawChunk of body) {
|
|
315
362
|
buffer += decoder.decode(rawChunk, { stream: true });
|
|
363
|
+
// A5: cap applies to both SSE parsers.
|
|
364
|
+
if (buffer.length > SSE_BUFFER_CAP_BYTES) throw new SseOverflowError(buffer.length);
|
|
316
365
|
const lines = buffer.split("\n");
|
|
317
366
|
buffer = lines.pop();
|
|
318
367
|
|
|
@@ -40,6 +40,13 @@ export class DistillationEngine extends Pipeline {
|
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
_scanWorkflows() {
|
|
43
|
+
// v0.6.1 A6: preserve engine-emitted entries across filesystem rescans.
|
|
44
|
+
// workflow_run hook bumps workflowsTested[ruleId] and adds to
|
|
45
|
+
// workflowsPassing on success — without this preservation, those entries
|
|
46
|
+
// get clobbered on the next describeState() / onToolResult() rescan.
|
|
47
|
+
const engineWfTested = { ...this.workflowsTested };
|
|
48
|
+
const engineWfPassing = [...this.workflowsPassing];
|
|
49
|
+
|
|
43
50
|
this.workflowsCreated = {};
|
|
44
51
|
this.workflowsTested = {};
|
|
45
52
|
this.workflowsPassing = [];
|
|
@@ -68,6 +75,14 @@ export class DistillationEngine extends Pipeline {
|
|
|
68
75
|
this.workflowsCreated[path.parse(e.name).name] = 1;
|
|
69
76
|
}
|
|
70
77
|
}
|
|
78
|
+
|
|
79
|
+
// Re-merge engine-emitted entries on top of filesystem-derived state
|
|
80
|
+
for (const [k, v] of Object.entries(engineWfTested)) {
|
|
81
|
+
if (!(k in this.workflowsTested)) this.workflowsTested[k] = v;
|
|
82
|
+
}
|
|
83
|
+
for (const id of engineWfPassing) {
|
|
84
|
+
if (!this.workflowsPassing.includes(id)) this.workflowsPassing.push(id);
|
|
85
|
+
}
|
|
71
86
|
}
|
|
72
87
|
|
|
73
88
|
describeState() {
|
|
@@ -11,6 +11,11 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
11
11
|
this.rulesExtracted = [];
|
|
12
12
|
this.rulesWithTests = [];
|
|
13
13
|
this.coverageAudited = false;
|
|
14
|
+
// v0.6.1 A1: track which rules in catalog.json have non-empty
|
|
15
|
+
// source_chunk_ids — D1 grounded skill_authoring prompts on these but
|
|
16
|
+
// exit didn't require them, so a sloppy extraction could leave rules
|
|
17
|
+
// unmoored.
|
|
18
|
+
this.rulesWithChunkRefs = [];
|
|
14
19
|
this._scanWorkspace();
|
|
15
20
|
}
|
|
16
21
|
|
|
@@ -28,11 +33,21 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
28
33
|
|
|
29
34
|
_scanRules() {
|
|
30
35
|
this.rulesExtracted = [];
|
|
36
|
+
this.rulesWithChunkRefs = [];
|
|
31
37
|
const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
|
|
32
38
|
if (fs.existsSync(catalogPath)) {
|
|
33
39
|
try {
|
|
34
40
|
const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
|
|
35
|
-
if (Array.isArray(data))
|
|
41
|
+
if (Array.isArray(data)) {
|
|
42
|
+
this.rulesExtracted = data.map((r, i) => r.id || `rule_${i}`);
|
|
43
|
+
// A1: collect ids whose entry has non-empty source_chunk_ids
|
|
44
|
+
for (const r of data) {
|
|
45
|
+
const ids = r?.source_chunk_ids;
|
|
46
|
+
if (Array.isArray(ids) && ids.length > 0 && r?.id) {
|
|
47
|
+
this.rulesWithChunkRefs.push(r.id);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
36
51
|
} catch { /* skip */ }
|
|
37
52
|
}
|
|
38
53
|
const skillsDir = path.join(this._workspace.cwd, "rule_skills");
|
|
@@ -67,10 +82,43 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
67
82
|
parts.push("### Exit\nExtraction complete. Proceed to SKILL_AUTHORING.");
|
|
68
83
|
}
|
|
69
84
|
|
|
70
|
-
|
|
85
|
+
const chunkRefsOk = this._chunkRefsCriterionMet();
|
|
86
|
+
parts.push(
|
|
87
|
+
`### Exit criteria\n` +
|
|
88
|
+
`- [${this.regulationsScanned ? "x" : " "}] All regulations read\n` +
|
|
89
|
+
`- [${this.rulesExtracted.length > 0 ? "x" : " "}] Rules decomposed into atomic units\n` +
|
|
90
|
+
`- [${this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) ? "x" : " "}] >=80% of rules have test stubs\n` +
|
|
91
|
+
`- [${this.coverageAudited ? "x" : " "}] Coverage audit completed\n` +
|
|
92
|
+
`- [${chunkRefsOk ? "x" : " "}] Every rule has source_chunk_ids in catalog.json (${this.rulesWithChunkRefs.length}/${this._catalogRuleCount()})`,
|
|
93
|
+
);
|
|
71
94
|
return parts.join("\n\n");
|
|
72
95
|
}
|
|
73
96
|
|
|
97
|
+
/**
|
|
98
|
+
* v0.6.1 A1: number of rules currently in catalog.json (not the union with
|
|
99
|
+
* rule_skills/ dirs that rulesExtracted carries). Used by the chunk-refs
|
|
100
|
+
* gate so we compare apples to apples.
|
|
101
|
+
*/
|
|
102
|
+
_catalogRuleCount() {
|
|
103
|
+
const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
|
|
104
|
+
if (!fs.existsSync(catalogPath)) return 0;
|
|
105
|
+
try {
|
|
106
|
+
const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
|
|
107
|
+
return Array.isArray(data) ? data.length : 0;
|
|
108
|
+
} catch { return 0; }
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* v0.6.1 A1: pass when every rule in catalog.json has a non-empty
|
|
113
|
+
* source_chunk_ids array. Empty catalog (legacy / pre-D1 sessions) passes
|
|
114
|
+
* trivially so resume of v0.6.0 sessions doesn't get trapped.
|
|
115
|
+
*/
|
|
116
|
+
_chunkRefsCriterionMet() {
|
|
117
|
+
const total = this._catalogRuleCount();
|
|
118
|
+
if (total === 0) return true; // backwards-compat for sessions pre-D1
|
|
119
|
+
return this.rulesWithChunkRefs.length >= total;
|
|
120
|
+
}
|
|
121
|
+
|
|
74
122
|
onToolResult(toolName, toolInput, result) {
|
|
75
123
|
if (result.isError) return null;
|
|
76
124
|
const wasReady = this.exitCriteriaMet();
|
|
@@ -85,7 +133,12 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
85
133
|
|
|
86
134
|
exitCriteriaMet() {
|
|
87
135
|
return this.regulationsScanned && this.rulesExtracted.length > 0 &&
|
|
88
|
-
this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) &&
|
|
136
|
+
this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) &&
|
|
137
|
+
this.coverageAudited &&
|
|
138
|
+
// v0.6.1 A1: hard tracking — D1 source-context auto-attach requires
|
|
139
|
+
// catalog.json entries to carry source_chunk_ids. Without them the
|
|
140
|
+
// skill_authoring prompts are blind.
|
|
141
|
+
this._chunkRefsCriterionMet();
|
|
89
142
|
}
|
|
90
143
|
|
|
91
144
|
exportState() {
|
|
@@ -93,6 +146,7 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
93
146
|
regulationsScanned: this.regulationsScanned,
|
|
94
147
|
rulesExtracted: this.rulesExtracted,
|
|
95
148
|
rulesWithTests: this.rulesWithTests,
|
|
149
|
+
rulesWithChunkRefs: this.rulesWithChunkRefs,
|
|
96
150
|
coverageAudited: this.coverageAudited,
|
|
97
151
|
};
|
|
98
152
|
}
|
|
@@ -107,5 +161,8 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
107
161
|
if (Array.isArray(data.rulesWithTests) && data.rulesWithTests.length > this.rulesWithTests.length) {
|
|
108
162
|
this.rulesWithTests = data.rulesWithTests;
|
|
109
163
|
}
|
|
164
|
+
if (Array.isArray(data.rulesWithChunkRefs) && data.rulesWithChunkRefs.length > this.rulesWithChunkRefs.length) {
|
|
165
|
+
this.rulesWithChunkRefs = data.rulesWithChunkRefs;
|
|
166
|
+
}
|
|
110
167
|
}
|
|
111
168
|
}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { PipelineEvent } from "./index.js";
|
|
4
|
+
import { Pipeline } from "./base.js";
|
|
5
|
+
import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
|
|
6
|
+
|
|
7
|
+
/**
 * E1: FINALIZATION — the 7th phase. Runs after PRODUCTION_QC has shown
 * the system working. Goal: turn the working system into a shippable
 * deliverable.
 *
 * This pipeline only observes the workspace; the agent does the actual work.
 * It tracks four things:
 *   1. rule_skills/README.md — inventory + how-to-run section.
 *   2. rule_skills/coverage_report.md — rule-id → skill-file mapping.
 *   3. output/final_dashboard.html — snapshot of the final metrics.
 *   4. Canonical rule_skills/ layout — every catalog rule id has a directory
 *      of the same name, or falls inside a range directory such as
 *      R078_R128.
 *
 * Exit criteria: all four flags are true. The agent is free to produce more
 * artifacts; these are the minimum the pipeline requires before marking the
 * release ready.
 *
 * No successor phase — this is the terminal state, so the readiness event is
 * emitted with `nextPhase: null`.
 */
export class FinalizationPipeline extends Pipeline {
  /**
   * @param {{ cwd: string }} workspace - Workspace whose `cwd` is the session
   *   root containing rules/, rule_skills/ and output/.
   */
  constructor(workspace) {
    super();
    this._workspace = workspace;
    this.readmeWritten = false;
    this.coverageReportWritten = false;
    this.finalDashboardWritten = false;
    this.canonicalLayoutDone = false;
    this._scanWorkspace();
  }

  /** Refresh all four flags from the filesystem. */
  _scanWorkspace() {
    const cwd = this._workspace.cwd;
    this.readmeWritten = fs.existsSync(path.join(cwd, "rule_skills", "README.md"));
    this.coverageReportWritten = fs.existsSync(path.join(cwd, "rule_skills", "coverage_report.md"));
    this.finalDashboardWritten = fs.existsSync(path.join(cwd, "output", "final_dashboard.html"));
    this.canonicalLayoutDone = this._checkCanonicalLayout();
  }

  /**
   * Approximate "canonical layout": does every catalog rule id have a
   * matching directory under rule_skills/ (exact name, or a numeric range
   * directory that contains it)?
   *
   * Entries without an id are ignored. Never throws: an unreadable or
   * malformed catalog, an unreadable skills directory, or an empty catalog
   * all yield false.
   *
   * @returns {boolean}
   */
  _checkCanonicalLayout() {
    const cwd = this._workspace.cwd;
    const catalogPath = path.join(cwd, "rules", "catalog.json");
    const skillsDir = path.join(cwd, "rule_skills");
    if (!fs.existsSync(catalogPath) || !fs.existsSync(skillsDir)) return false;

    let rules;
    try {
      rules = normalizeRuleCatalog(JSON.parse(fs.readFileSync(catalogPath, "utf-8")));
    } catch { return false; }
    // Guard the shape before touching `.length` so an unexpected normalizer
    // result cannot throw out of the constructor.
    if (!Array.isArray(rules) || rules.length === 0) return false;

    let existingDirs;
    try {
      existingDirs = new Set(
        fs.readdirSync(skillsDir, { withFileTypes: true })
          .filter((e) => e.isDirectory())
          .map((e) => e.name),
      );
    } catch { return false; }

    // Range directories (R078_R128 or R078-R128). Bounds are normalized so a
    // reversed name such as R128_R078 still covers the same ids.
    const rangeDirs = [];
    for (const name of existingDirs) {
      const m = name.match(/^R0*(\d+)[_-]R0*(\d+)$/i);
      if (!m) continue;
      const a = parseInt(m[1], 10);
      const b = parseInt(m[2], 10);
      rangeDirs.push({ name, lo: Math.min(a, b), hi: Math.max(a, b) });
    }

    for (const r of rules) {
      if (!r?.id) continue;
      // Ids are coerced to string: a numeric id in the catalog must not crash
      // the scan with "match is not a function".
      const id = String(r.id);
      if (existingDirs.has(id)) continue;
      const m = id.match(/^R0*(\d+)$/i);
      if (m) {
        const n = parseInt(m[1], 10);
        if (rangeDirs.some((rd) => rd.lo <= n && n <= rd.hi)) continue;
      }
      return false;
    }
    return true;
  }

  /**
   * Rescan the workspace and render the phase description shown to the agent,
   * including a four-item checklist of the tracked deliverables.
   *
   * @returns {string} Markdown text.
   */
  describeState() {
    this._scanWorkspace();
    const checklist = [
      `- ${this.readmeWritten ? "✅" : "⏳"} rule_skills/README.md`,
      `- ${this.coverageReportWritten ? "✅" : "⏳"} rule_skills/coverage_report.md`,
      `- ${this.finalDashboardWritten ? "✅" : "⏳"} output/final_dashboard.html`,
      `- ${this.canonicalLayoutDone ? "✅" : "⏳"} rule_skills/ canonical per-rule layout`,
    ];
    const parts = [
      "## Phase: FINALIZATION\n" +
      "Turn the working verification system into a shippable deliverable. The " +
      "pipeline has completed end-to-end; now package it for handoff. This is " +
      "the terminal phase — no successor. You can continue producing artifacts " +
      "here on request.\n\n" +
      "**Tasks to complete** (the pipeline considers the phase done when all " +
      "four checkmarks are green):\n\n" +
      checklist.join("\n") + "\n\n" +
      "### What each artifact should contain\n\n" +
      "- **README.md**: top of `rule_skills/` with file inventory, how to run " +
      " `run_all_checks.py` (if present), input format, expected output format, " +
      " dependencies, and a short 'what this does' for a reader who hasn't " +
      " seen the project.\n" +
      "- **coverage_report.md**: one row per rule_id in catalog.json. Columns: " +
      " rule_id, source_ref, skill file (`check_r014.py` or `check_r002_r007.py`), " +
      " tested (Y/N), latest accuracy, retries, applicable-to-this-bundle " +
      " (Y/N from D6 classification). Rules marked not_applicable should be " +
      " grouped at the bottom with a note explaining which bundle-type " +
      " filtered them out.\n" +
      "- **final_dashboard.html**: single-page snapshot. Reuse the " +
      " `dashboard_render` tool — it knows the metrics shape. This is the " +
      " hand-off artifact the developer user opens to see the final state.\n" +
      "- **canonical layout**: the simplest check is `ls rule_skills/ | " +
      " wc -l` ≈ number of rules in the catalog. When grouped files exist " +
      " (`check_r002_r007.py`), create stub `rule_skills/R002/` through " +
      " `rule_skills/R007/` each containing a one-line SKILL.md that points " +
      " at the grouped file. This keeps downstream per-rule lookups simple.",
    ];
    return parts.join("\n\n");
  }

  /**
   * React to a finished tool call.
   *
   * The workspace is rescanned after every successful tool result, whatever
   * the tool input looks like. Deliverables are often written by a script
   * whose command line never mentions the target paths; matching on the input
   * string would miss those writes, and a later describeState() rescan would
   * flip the flags silently so the readiness event would never be emitted.
   *
   * @param {string} toolName - Unused; kept for the pipeline hook signature.
   * @param {object} toolInput - Unused; kept for the pipeline hook signature.
   * @param {{ isError?: boolean }} result
   * @returns {PipelineEvent|null} A "phase_ready" event the first time all
   *   exit criteria become true, otherwise null.
   */
  onToolResult(toolName, toolInput, result) {
    if (result?.isError) return null;
    const wasReady = this.exitCriteriaMet();
    this._scanWorkspace();
    if (!wasReady && this.exitCriteriaMet()) {
      // Terminal phase — no nextPhase. Pipeline event signals "done."
      return new PipelineEvent({
        type: "phase_ready",
        message: "Finalization artifacts complete. Session deliverable is ready.",
        nextPhase: null,
      });
    }
    return null;
  }

  /** @returns {boolean} true when all four tracked flags are set. */
  exitCriteriaMet() {
    return this.readmeWritten &&
      this.coverageReportWritten &&
      this.finalDashboardWritten &&
      this.canonicalLayoutDone;
  }

  /** @returns {object} Plain snapshot of the four flags for persistence. */
  exportState() {
    return {
      readmeWritten: this.readmeWritten,
      coverageReportWritten: this.coverageReportWritten,
      finalDashboardWritten: this.finalDashboardWritten,
      canonicalLayoutDone: this.canonicalLayoutDone,
    };
  }

  /**
   * Restore flags from a persisted snapshot. Only boolean fields are applied;
   * anything else is ignored. The next workspace scan overrides these values
   * with what is actually on disk.
   *
   * @param {object} [data]
   */
  importState(data) {
    if (typeof data?.readmeWritten === "boolean") this.readmeWritten = data.readmeWritten;
    if (typeof data?.coverageReportWritten === "boolean") this.coverageReportWritten = data.coverageReportWritten;
    if (typeof data?.finalDashboardWritten === "boolean") this.finalDashboardWritten = data.finalDashboardWritten;
    if (typeof data?.canonicalLayoutDone === "boolean") this.canonicalLayoutDone = data.canonicalLayoutDone;
  }
}
|
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Pipeline phases — sequential workflow of the KC Agent methodology.
|
|
3
|
+
*
|
|
4
|
+
* v0.6.0 E1: FINALIZATION added as the 7th phase. It's a cleanup /
|
|
5
|
+
* deliverable-packaging phase that runs after PRODUCTION_QC has
|
|
6
|
+
* established the system is operating correctly: reorganize
|
|
7
|
+
* rule_skills/ into a canonical layout, write README + coverage
|
|
8
|
+
* report, snapshot a final dashboard, archive stale retry outputs.
|
|
9
|
+
* Short phase, a handful of tasks, driven by a finalization skill.
|
|
3
10
|
*/
|
|
4
11
|
export const Phase = Object.freeze({
|
|
5
12
|
BOOTSTRAP: "bootstrap",
|
|
@@ -8,6 +15,7 @@ export const Phase = Object.freeze({
|
|
|
8
15
|
SKILL_TESTING: "skill_testing",
|
|
9
16
|
DISTILLATION: "distillation",
|
|
10
17
|
PRODUCTION_QC: "production_qc",
|
|
18
|
+
FINALIZATION: "finalization",
|
|
11
19
|
});
|
|
12
20
|
|
|
13
21
|
/**
|
|
@@ -138,6 +138,31 @@ export class ProjectInitializer extends Pipeline {
|
|
|
138
138
|
this.configReady = !!gc.api_key;
|
|
139
139
|
}
|
|
140
140
|
|
|
141
|
+
/**
|
|
142
|
+
* F1b: Worker LLM health snapshot. Static check only — inspect whether
|
|
143
|
+
* TIER1-4 and OCR_MODEL_TIER1 are populated in .env. Does NOT make
|
|
144
|
+
* network calls — a live ping would be invasive for bootstrap (slow,
|
|
145
|
+
* charges money, and the worker LLM isn't actually used until
|
|
146
|
+
* DISTILLATION). Surfacing the config state is enough for bootstrap.
|
|
147
|
+
* The agent can then decide to validate via worker_llm_call later if
|
|
148
|
+
* warranted. Returns null when no .env exists yet.
|
|
149
|
+
*/
|
|
150
|
+
_workerConfigSnapshot() {
|
|
151
|
+
const envPath = path.join(this._workspace.cwd, ".env");
|
|
152
|
+
if (!fs.existsSync(envPath)) return null;
|
|
153
|
+
const tiers = { TIER1: "", TIER2: "", TIER3: "", TIER4: "", OCR_MODEL_TIER1: "" };
|
|
154
|
+
try {
|
|
155
|
+
for (const line of fs.readFileSync(envPath, "utf-8").split("\n")) {
|
|
156
|
+
for (const k of Object.keys(tiers)) {
|
|
157
|
+
if (line.startsWith(`${k}=`)) {
|
|
158
|
+
tiers[k] = line.slice(k.length + 1).trim();
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
} catch { return null; }
|
|
163
|
+
return tiers;
|
|
164
|
+
}
|
|
165
|
+
|
|
141
166
|
_loadGlobalConfig() {
|
|
142
167
|
const p = path.join(os.homedir(), ".kc_agent", "config.json");
|
|
143
168
|
if (fs.existsSync(p)) { try { return JSON.parse(fs.readFileSync(p, "utf-8")); } catch { /* skip */ } }
|
|
@@ -158,6 +183,21 @@ export class ProjectInitializer extends Pipeline {
|
|
|
158
183
|
if (completed.length) parts.push("### Done\n" + completed.map((c) => `- [x] ${c}`).join("\n"));
|
|
159
184
|
if (pending.length) parts.push("### Needed\n" + pending.map((p) => `- [ ] ${p}`).join("\n"));
|
|
160
185
|
|
|
186
|
+
// F1b: surface worker-LLM tier status as part of bootstrap state so
|
|
187
|
+
// the agent can flag missing tiers to the developer user upfront,
|
|
188
|
+
// rather than hitting "worker LLM unreachable" hours later during
|
|
189
|
+
// DISTILLATION. Static inspection only — no network calls.
|
|
190
|
+
const workerConfig = this._workerConfigSnapshot();
|
|
191
|
+
if (workerConfig) {
|
|
192
|
+
const tierLines = [];
|
|
193
|
+
for (const [k, v] of Object.entries(workerConfig)) {
|
|
194
|
+
if (v) tierLines.push(`- ${k}: ${v}`);
|
|
195
|
+
else tierLines.push(`- ${k}: ⚠️ (empty — set before DISTILLATION, or worker_llm_call tools will fail)`);
|
|
196
|
+
}
|
|
197
|
+
parts.push("### Worker LLM tiers (.env snapshot)\n" + tierLines.join("\n") +
|
|
198
|
+
"\n\nThese drive `worker_llm_call`, `workflow_run`, `document_parse` OCR, etc. Empty tiers don't block bootstrap — but DISTILLATION requires at least TIER1 to be live. Discuss with the developer user if any are missing.");
|
|
199
|
+
}
|
|
200
|
+
|
|
161
201
|
if (this.exitCriteriaMet()) {
|
|
162
202
|
parts.push("### Exit\nBootstrap requirements met. Proceed to EXTRACTION.");
|
|
163
203
|
}
|
|
@@ -36,6 +36,11 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
_scanQcResults() {
|
|
39
|
+
// v0.6.1 A5/A6: don't reset documentsReviewed if engine emission has
|
|
40
|
+
// bumped it since last scan — workflow_run hooks call _recordMilestone
|
|
41
|
+
// and the increment lives in this same field. Other counters (batches,
|
|
42
|
+
// accuracy, issues) come solely from filesystem scan and reset cleanly.
|
|
43
|
+
const engineDocsReviewed = this.documentsReviewed;
|
|
39
44
|
this.batchesProcessed = 0;
|
|
40
45
|
this.totalDocuments = 0;
|
|
41
46
|
this.documentsReviewed = 0;
|
|
@@ -43,23 +48,57 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
43
48
|
this.confidenceDistribution = { low: 0, medium: 0, high: 0 };
|
|
44
49
|
this.issuesFound = [];
|
|
45
50
|
|
|
51
|
+
// Existing canonical path: output/qc/*.json (formal QC batch reports)
|
|
46
52
|
const qcDir = path.join(this._workspace.cwd, "output", "qc");
|
|
47
|
-
if (
|
|
53
|
+
if (fs.existsSync(qcDir)) {
|
|
54
|
+
for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).sort()) {
|
|
55
|
+
try {
|
|
56
|
+
const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
|
|
57
|
+
this.batchesProcessed++;
|
|
58
|
+
this.totalDocuments += typeof data.documents === "number" ? data.documents : (data.total || 0);
|
|
59
|
+
this.documentsReviewed += data.reviewed || 0;
|
|
60
|
+
if (data.accuracy_by_rule) Object.assign(this.accuracyByRule, data.accuracy_by_rule);
|
|
61
|
+
if (data.confidence) {
|
|
62
|
+
for (const band of ["low", "medium", "high"]) this.confidenceDistribution[band] += data.confidence[band] || 0;
|
|
63
|
+
}
|
|
64
|
+
if (Array.isArray(data.issues)) this.issuesFound.push(...data.issues);
|
|
65
|
+
} catch { /* skip */ }
|
|
66
|
+
}
|
|
67
|
+
}
|
|
48
68
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
69
|
+
// v0.6.1 A5: also pick up batch-style results in output/results/. E2E #4
|
|
70
|
+
// showed agents writing batch QC outputs to output/results/qc_*.json
|
|
71
|
+
// (e.g. unified_qc.py) instead of output/qc/, so the formal scanner
|
|
72
|
+
// missed them. Heuristic match: filename starts with "qc_" or contains
|
|
73
|
+
// "_batch_". Each match counts as one batch; total_checks → totalDocuments.
|
|
74
|
+
const resultsDir = path.join(this._workspace.cwd, "output", "results");
|
|
75
|
+
if (fs.existsSync(resultsDir)) {
|
|
76
|
+
const seen = new Set();
|
|
77
|
+
for (const f of fs.readdirSync(resultsDir).filter((f) => f.endsWith(".json"))) {
|
|
78
|
+
const lower = f.toLowerCase();
|
|
79
|
+
if (!(lower.startsWith("qc_") || lower.includes("_batch_"))) continue;
|
|
80
|
+
// Dedupe near-duplicate filenames that differ only by timestamp
|
|
81
|
+
// suffix (qc_full_batch_20260424_141642.json vs _141921.json
|
|
82
|
+
// — both are real batches, keep both. But qc_pt_x.json and
|
|
83
|
+
// qc_pt_x_<ts>.json are usually the same batch saved twice; key
|
|
84
|
+
// on the prefix before any 8-digit date.)
|
|
85
|
+
const key = f.replace(/_\d{8}_\d{6}/g, "").replace(/\.json$/, "");
|
|
86
|
+
if (seen.has(key)) continue;
|
|
87
|
+
seen.add(key);
|
|
52
88
|
this.batchesProcessed++;
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
}
|
|
89
|
+
try {
|
|
90
|
+
const data = JSON.parse(fs.readFileSync(path.join(resultsDir, f), "utf-8"));
|
|
91
|
+
// Best-effort metric extraction; tolerate missing keys
|
|
92
|
+
this.totalDocuments += typeof data.sample_count === "number" ? data.sample_count
|
|
93
|
+
: typeof data.documents === "number" ? data.documents
|
|
94
|
+
: typeof data.total === "number" ? data.total : 0;
|
|
95
|
+
} catch { /* skip */ }
|
|
96
|
+
}
|
|
61
97
|
}
|
|
62
98
|
|
|
99
|
+
// Restore engine-emitted documentsReviewed if filesystem reported less
|
|
100
|
+
if (engineDocsReviewed > this.documentsReviewed) this.documentsReviewed = engineDocsReviewed;
|
|
101
|
+
|
|
63
102
|
// Determine monitoring phase
|
|
64
103
|
if (this.batchesProcessed < 3) this.monitoringPhase = "initial";
|
|
65
104
|
else if (this.issuesFound.length > 0) this.monitoringPhase = "active";
|
|
@@ -93,7 +132,18 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
93
132
|
return null;
|
|
94
133
|
}
|
|
95
134
|
|
|
96
|
-
|
|
135
|
+
/**
|
|
136
|
+
* v0.6.1 A5: gate requires at least one batch processed (real telemetry)
|
|
137
|
+
* AND the legacy stable-monitoring criterion. Without the batch floor, the
|
|
138
|
+
* agent could declare PRODUCTION_QC done from a clean session-state file
|
|
139
|
+
* (E2E #4: phase advanced into PRODUCTION_QC, agent ran 6,930 checks via
|
|
140
|
+
* sandbox_exec to non-canonical paths, batchesProcessed stayed 0, exit
|
|
141
|
+
* fired anyway because monitoringPhase defaults can flip to "stable" with
|
|
142
|
+
* empty accuracyByRule + zero issues).
|
|
143
|
+
*/
|
|
144
|
+
exitCriteriaMet() {
|
|
145
|
+
return this.batchesProcessed > 0 && this.monitoringPhase === "stable";
|
|
146
|
+
}
|
|
97
147
|
|
|
98
148
|
exportState() {
|
|
99
149
|
return {
|