kc-beta 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +81 -0
- package/LICENSE-COMMERCIAL.md +125 -0
- package/README.md +21 -3
- package/package.json +14 -5
- package/src/agent/context-window.js +9 -12
- package/src/agent/context.js +14 -1
- package/src/agent/document-parser.js +169 -0
- package/src/agent/engine.js +367 -18
- package/src/agent/history/event-history.js +222 -0
- package/src/agent/llm-client.js +55 -0
- package/src/agent/message-utils.js +63 -0
- package/src/agent/pipelines/_milestone-derive.js +511 -0
- package/src/agent/pipelines/base.js +21 -0
- package/src/agent/pipelines/distillation.js +28 -15
- package/src/agent/pipelines/extraction.js +103 -36
- package/src/agent/pipelines/finalization.js +178 -11
- package/src/agent/pipelines/index.js +6 -1
- package/src/agent/pipelines/initializer.js +74 -8
- package/src/agent/pipelines/production-qc.js +31 -44
- package/src/agent/pipelines/skill-authoring.js +97 -80
- package/src/agent/pipelines/skill-testing.js +67 -23
- package/src/agent/retry.js +10 -2
- package/src/agent/scheduler.js +14 -2
- package/src/agent/session-state.js +18 -1
- package/src/agent/skill-loader.js +13 -7
- package/src/agent/skill-validator.js +19 -5
- package/src/agent/task-manager.js +61 -5
- package/src/agent/tools/document-chunk.js +21 -9
- package/src/agent/tools/phase-advance.js +18 -3
- package/src/agent/tools/release.js +51 -9
- package/src/agent/tools/rule-catalog.js +11 -1
- package/src/agent/tools/workspace-file.js +32 -0
- package/src/agent/workspace.js +39 -1
- package/src/cli/components.js +64 -14
- package/src/cli/index.js +62 -3
- package/src/cli/meme.js +26 -25
- package/src/config.js +65 -22
- package/src/model-tiers.json +24 -8
- package/src/providers.js +42 -0
- package/template/release/v1/README.md.tmpl +108 -0
- package/template/release/v1/catalog.json.tmpl +4 -0
- package/template/release/v1/kc_runtime/__init__.py +11 -0
- package/template/release/v1/kc_runtime/confidence.py +63 -0
- package/template/release/v1/kc_runtime/doc_parser.py +127 -0
- package/template/release/v1/manifest.json.tmpl +11 -0
- package/template/release/v1/render_dashboard.py +117 -0
- package/template/release/v1/run.py +212 -0
- package/template/release/v1/serve.sh +17 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
- package/template/skills/en/skill-creator/SKILL.md +1 -1
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
- package/template/skills/zh/skill-creator/SKILL.md +1 -1
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
// v0.7.0 Group A1: filesystem-derived pipeline milestones.
|
|
2
|
+
//
|
|
3
|
+
// E2E #5 finding (DS + GLM audits): every phase gate got force-bypassed
|
|
4
|
+
// because the engine's pipelineMilestones were tracking *which tools the
|
|
5
|
+
// agent called*, not *what artifacts ended up on disk*. Both contestants
|
|
6
|
+
// produced real work (70 skill scripts, 28 workflows, 1951 verdicts) via
|
|
7
|
+
// Write/Bash/sandbox_exec, so the milestone-recording tool wrappers
|
|
8
|
+
// (workflow-run.js → engine._recordMilestone) never fired and the gate
|
|
9
|
+
// stayed empty.
|
|
10
|
+
//
|
|
11
|
+
// This module is the new canonical source. Each derive function reads
|
|
12
|
+
// the workspace filesystem and returns the milestone fields for that
|
|
13
|
+
// phase. Pipelines call these instead of (or in addition to) their
|
|
14
|
+
// previous tool-instrumented counters.
|
|
15
|
+
//
|
|
16
|
+
// Design: simple + correct over fast + complex. Each derive is bounded
|
|
17
|
+
// (~10-50 stat calls per phase, all on warm OS cache → microseconds).
|
|
18
|
+
// No cache layer in v0.7.0 — if profiling later shows it's hot, add it
|
|
19
|
+
// then. The functions are pure: same disk state in, same milestones out.
|
|
20
|
+
//
|
|
21
|
+
// Workspace param is a Workspace instance with a .cwd string. Functions
|
|
22
|
+
// also accept a plain workspaceCwd string for tests / one-off audits
|
|
23
|
+
// (e.g., re-deriving E2E #5 session-state from saved workspaces).
|
|
24
|
+
|
|
25
|
+
import fs from "node:fs";
|
|
26
|
+
import path from "node:path";
|
|
27
|
+
import crypto from "node:crypto";
|
|
28
|
+
|
|
29
|
+
// Resolve a workspace-like value to its working-directory string.
// Accepts either a plain path string or a Workspace-ish object exposing
// `.cwd` (preferred) or `.path`; anything else resolves to "".
function cwdOf(ws) {
  if (typeof ws === "string") return ws;
  return ws?.cwd || ws?.path || "";
}
|
|
32
|
+
|
|
33
|
+
// True iff `p` exists and is a directory; stat errors (missing path,
// permission) are treated as "not a directory".
function dirExists(p) {
  try {
    return fs.statSync(p).isDirectory();
  } catch {
    return false;
  }
}
|
|
36
|
+
|
|
37
|
+
// True iff `p` exists and is a regular file; stat errors are treated as
// "not a file".
function fileExists(p) {
  try {
    return fs.statSync(p).isFile();
  } catch {
    return false;
  }
}
|
|
40
|
+
|
|
41
|
+
// readdir that never throws: returns Dirent entries for `p`, or an empty
// array when the path is missing/unreadable.
function readDirSafe(p) {
  try {
    return fs.readdirSync(p, { withFileTypes: true });
  } catch {
    return [];
  }
}
|
|
44
|
+
|
|
45
|
+
// Non-hidden direct children of `p` as Dirent entries ([] if unreadable).
function listChildren(p) {
  const entries = readDirSafe(p);
  return entries.filter((entry) => !entry.name.startsWith("."));
}
|
|
48
|
+
|
|
49
|
+
// Non-hidden direct subdirectories of `p` as Dirent entries.
function listChildDirs(p) {
  return listChildren(p).filter((entry) => entry.isDirectory());
}
|
|
52
|
+
|
|
53
|
+
// Non-hidden direct regular files of `p` as Dirent entries.
function listChildFiles(p) {
  return listChildren(p).filter((entry) => entry.isFile());
}
|
|
56
|
+
|
|
57
|
+
// Walk a directory recursively, yielding every file path. Skips hidden
// dirs/files and __pycache__. Used by derive functions that need to
// match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
// Traversal is an explicit stack (LIFO) — kept that way deliberately so
// yield order matches the original implementation.
function* walkFiles(root) {
  if (!dirExists(root)) return;
  const pending = [root];
  while (pending.length > 0) {
    const current = pending.pop();
    for (const entry of readDirSafe(current)) {
      if (entry.name.startsWith(".") || entry.name === "__pycache__") continue;
      const full = path.join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(full);
      } else if (entry.isFile()) {
        yield full;
      }
    }
  }
}
|
|
73
|
+
|
|
74
|
+
// Read + parse a JSON file; null on any read or parse failure.
function readJsonSafe(p) {
  try {
    const text = fs.readFileSync(p, "utf-8");
    return JSON.parse(text);
  } catch {
    return null;
  }
}
|
|
77
|
+
|
|
78
|
+
// Hex SHA-256 digest of a file's bytes, or null if the file can't be read.
function sha256OfFile(p) {
  let buf;
  try {
    buf = fs.readFileSync(p);
  } catch {
    return null;
  }
  return crypto.createHash("sha256").update(buf).digest("hex");
}
|
|
84
|
+
|
|
85
|
+
// Normalize a rule id like "R14" / "r014" / "R0014" to canonical "R014".
// Returns null for anything that isn't an R-number — including thematic
// skill names like "account_identity". (The previous comment claimed
// non-matching strings "stay as-is via the second branch"; the code has
// always returned null for them, so the comment was wrong — callers must
// handle the null themselves.)
function canonicalRuleId(s) {
  if (typeof s !== "string") return null;
  const m = s.match(/^R0*(\d+)$/i);
  if (m) return `R${String(parseInt(m[1], 10)).padStart(3, "0")}`;
  return null;
}
|
|
94
|
+
|
|
95
|
+
// ───────────────────────────────────────────────────────────────────
|
|
96
|
+
// bootstrap
|
|
97
|
+
// ───────────────────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
 * Derive bootstrap-phase milestones from the workspace filesystem.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{hasSamples: boolean, sampleCount: number}}
 */
export function deriveBootstrapMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const samplesDir = path.join(cwd, "samples");
  let sampleCount = 0;
  if (dirExists(samplesDir)) {
    // Count any non-hidden file at any depth — agents may organize
    // samples in subdirs (E2E #5 GLM had samples/samples/ recursion).
    for (const _f of walkFiles(samplesDir)) sampleCount++;
  }
  // Fix: hasSamples was set by a redundant `if (sampleCount > 0)` check
  // executed on every loop iteration; it is simply sampleCount > 0.
  return { hasSamples: sampleCount > 0, sampleCount };
}
|
|
111
|
+
|
|
112
|
+
// ───────────────────────────────────────────────────────────────────
|
|
113
|
+
// rule_extraction
|
|
114
|
+
// ───────────────────────────────────────────────────────────────────
|
|
115
|
+
|
|
116
|
+
/**
 * Derive rule_extraction-phase milestones from rules/ on disk.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{rulesExtracted: string[], rulesWithChunkRefs: string[], coverageAudited: boolean}}
 */
export function deriveRuleExtractionMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const rulesDir = path.join(cwd, "rules");

  // rulesExtracted: every rule object across every JSON file in rules/
  // that has a non-empty `id` field. catalog.json is canonical but agents
  // sometimes fan out to per-rule files (E2E #5 DS).
  const rulesExtracted = [];
  const rulesWithChunkRefs = [];
  if (dirExists(rulesDir)) {
    for (const entry of listChildFiles(rulesDir)) {
      if (!entry.name.endsWith(".json")) continue;
      const parsed = readJsonSafe(path.join(rulesDir, entry.name));
      if (!parsed) continue;
      const ruleObjects = Array.isArray(parsed) ? parsed : parsed.rules || [];
      for (const rule of ruleObjects) {
        if (!rule || typeof rule.id !== "string" || rule.id.length === 0) continue;
        rulesExtracted.push(rule.id);
        const refs = rule.source_chunk_ids;
        if (Array.isArray(refs) && refs.length > 0) rulesWithChunkRefs.push(rule.id);
      }
    }
  }

  // coverageAudited: presence of rules/coverage_audit.{md,json} OR a
  // rules/coverage_report.md / output/coverage_report.md. Loose criterion
  // because agents pick different conventions; the spirit is "did the
  // agent produce a coverage doc" not "did they put it in this exact file".
  const auditCandidates = [
    path.join(rulesDir, "coverage_audit.md"),
    path.join(rulesDir, "coverage_audit.json"),
    path.join(rulesDir, "coverage_report.md"),
    path.join(cwd, "output", "coverage_report.md"),
  ];
  const coverageAudited = auditCandidates.some(fileExists);

  return { rulesExtracted, rulesWithChunkRefs, coverageAudited };
}
|
|
158
|
+
|
|
159
|
+
// ───────────────────────────────────────────────────────────────────
|
|
160
|
+
// skill_authoring
|
|
161
|
+
// ───────────────────────────────────────────────────────────────────
|
|
162
|
+
|
|
163
|
+
// Recognized check-script paths inside a skill dir, per A6 spec:
//   <skillDir>/check_r###.py          (DS + most agents)
//   <skillDir>/check.py               (canonical meta-meta spec)
//   <skillDir>/scripts/check_r###.py  (XM)
//   <skillDir>/scripts/check.py
function findCheckScripts(skillDir) {
  const found = [];
  for (const f of walkFiles(skillDir)) {
    const base = path.basename(f);
    const rel = path.relative(skillDir, f);
    // Only count scripts at depth ≤ 2 (skillDir/check.py or skillDir/scripts/check.py)
    const depth = rel.split(path.sep).length;
    if (depth > 2) continue;
    // The optional `(_r[\d_-]+)?` group already matches both bare
    // check.py and check_r###.py, so the former second alternative
    // (/^check_r[\d_-]+\.py$/i) was dead code — removed.
    if (/^check(_r[\d_-]+)?\.py$/i.test(base)) {
      found.push(f);
    }
  }
  return found;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Derive skill_authoring-phase milestones from rule_skills/ on disk.
 * A skill counts as authored when its directory has a SKILL.md (any case)
 * or at least one recognized check script; skillsWithScripts is the subset
 * with scripts. ruleIdsCovered aggregates ids from dir names, range dirs
 * (R078_R128), and single/grouped check-script filenames.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{skillsAuthored: string[], skillsWithScripts: string[], ruleIdsCovered: string[]}}
 */
export function deriveSkillAuthoringMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const skillsDir = path.join(cwd, "rule_skills");
  const skillsAuthored = [];
  const skillsWithScripts = [];
  const ruleIdsCovered = new Set();

  if (!dirExists(skillsDir)) {
    return { skillsAuthored, skillsWithScripts, ruleIdsCovered: [] };
  }

  for (const e of listChildDirs(skillsDir)) {
    if (e.name.startsWith("__")) continue;
    const skillPath = path.join(skillsDir, e.name);

    // SKILL.md OR skill.md (case-insensitive — macOS/Windows users
    // produce both, see v0.7.0 F1 task).
    const hasSkillMd = listChildFiles(skillPath).some(
      (f) => f.name.toLowerCase() === "skill.md",
    );
    const checkScripts = findCheckScripts(skillPath);
    // Fix: the old `hasAnyPy` AND-ed a walkFiles() "any file exists"
    // probe with `checkScripts.length > 0`; a non-empty checkScripts
    // already implies a file exists, so the probe was a redundant scan
    // with identical truth value (and a misleading name).
    const hasCheckScript = checkScripts.length > 0;

    if (hasSkillMd || hasCheckScript) skillsAuthored.push(e.name);
    if (hasCheckScript) skillsWithScripts.push(e.name);

    // Collect ruleIds covered by directory name, single check_r###.py
    // names, grouped check_r###_r###.py names, and range dirs R078_R128.
    const dirCanon = canonicalRuleId(e.name);
    if (dirCanon) ruleIdsCovered.add(dirCanon);
    const rangeDir = e.name.match(/^R0*(\d+)[_-]R0*(\d+)$/i);
    if (rangeDir) {
      const lo = parseInt(rangeDir[1], 10);
      const hi = parseInt(rangeDir[2], 10);
      for (let n = lo; n <= hi; n++) {
        ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
      }
    }
    for (const scriptPath of checkScripts) {
      const base = path.basename(scriptPath);
      const single = base.match(/^check_r0*(\d+)\.py$/i);
      if (single) {
        ruleIdsCovered.add(`R${String(parseInt(single[1], 10)).padStart(3, "0")}`);
      }
      const grouped = base.match(/^check_r0*(\d+)[_-]+r0*(\d+)\.py$/i);
      if (grouped) {
        const lo = parseInt(grouped[1], 10);
        const hi = parseInt(grouped[2], 10);
        for (let n = lo; n <= hi; n++) {
          ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
        }
      }
    }
  }

  return {
    skillsAuthored,
    skillsWithScripts,
    ruleIdsCovered: [...ruleIdsCovered],
  };
}
|
|
245
|
+
|
|
246
|
+
// ───────────────────────────────────────────────────────────────────
|
|
247
|
+
// skill_testing
|
|
248
|
+
// ───────────────────────────────────────────────────────────────────
|
|
249
|
+
|
|
250
|
+
/**
 * Derive skill_testing-phase milestones from rule_skills/ on disk.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{skillsTested: string[]}}
 */
export function deriveSkillTestingMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const skillsDir = path.join(cwd, "rule_skills");
  const skillsTested = [];

  if (dirExists(skillsDir)) {
    for (const dirent of listChildDirs(skillsDir)) {
      if (dirent.name.startsWith("__")) continue;
      const skillPath = path.join(skillsDir, dirent.name);
      // Tested ⇔ has any of: tests/ dir, test_results.json, test_results/,
      // assets/test_cases.json, OR a successful test artifact like
      // *_test_output.json. Loose because agents use different conventions.
      const looksLikeTestOutput = (name) =>
        /^(test|.*_test)_(output|result|log)/i.test(name) && name.endsWith(".json");
      const tested =
        dirExists(path.join(skillPath, "tests")) ||
        fileExists(path.join(skillPath, "test_results.json")) ||
        dirExists(path.join(skillPath, "test_results")) ||
        fileExists(path.join(skillPath, "assets", "test_cases.json")) ||
        listChildFiles(skillPath).some((f) => looksLikeTestOutput(f.name));
      if (tested) skillsTested.push(dirent.name);
    }
  }

  // skillsPassing — per-skill accuracy threshold. Without a uniform
  // schema across agent outputs we report `tested` as the floor; the
  // pipeline's existing _loadTestResults() can layer accuracy on top.
  return { skillsTested };
}
|
|
278
|
+
|
|
279
|
+
// ───────────────────────────────────────────────────────────────────
|
|
280
|
+
// distillation
|
|
281
|
+
// ───────────────────────────────────────────────────────────────────
|
|
282
|
+
|
|
283
|
+
/**
 * Derive distillation-phase milestones from workflows/ on disk.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{workflowsCreated: string[], workflowsTested: string[]}}
 */
export function deriveDistillationMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const wfRoot = path.join(cwd, "workflows");
  const workflowsCreated = [];

  if (dirExists(wfRoot)) {
    // Three layouts seen in E2E #5:
    //   workflows/<id>/workflow_v#.py (canonical, what release.js expects)
    //   workflows/<id>_workflow.py    (DS + GLM flat layout)
    //   workflows/<id>.json           (DS regex_skill manifest)
    // Accept all three; downstream release tool's auto-relocator (Group C)
    // can normalize.
    for (const entry of listChildren(wfRoot)) {
      if (entry.isDirectory()) {
        const subDir = path.join(wfRoot, entry.name);
        const hasWorkflowPy = listChildFiles(subDir).some(
          (f) => /workflow.*\.py$/i.test(f.name) || /^check.*\.py$/i.test(f.name),
        );
        if (hasWorkflowPy) workflowsCreated.push(entry.name);
      } else if (entry.isFile()) {
        const flatPy = entry.name.match(/^(.+)_workflow\.py$/i);
        if (flatPy) {
          workflowsCreated.push(flatPy[1]);
        } else {
          const jsonName = entry.name.match(/^(.+)\.json$/i);
          if (jsonName) {
            const manifest = readJsonSafe(path.join(wfRoot, entry.name));
            if (manifest && (manifest.rule_id || manifest.entry || manifest.type)) {
              workflowsCreated.push(jsonName[1]);
            }
          }
        }
      }
    }
  }

  // workflowsTested — look for per-workflow test artifacts. Same loose
  // contract as skill_testing: any test_results.json / test_results/ /
  // baseline_*.json present means the workflow has been exercised.
  const workflowsTested = [];
  if (dirExists(wfRoot)) {
    for (const entry of listChildDirs(wfRoot)) {
      const subDir = path.join(wfRoot, entry.name);
      const exercised =
        fileExists(path.join(subDir, "test_results.json")) ||
        dirExists(path.join(subDir, "test_results")) ||
        listChildFiles(subDir).some((f) => /^(baseline|test|result)_.*\.json$/i.test(f.name));
      if (exercised) workflowsTested.push(entry.name);
    }
  }

  return { workflowsCreated, workflowsTested };
}
|
|
335
|
+
|
|
336
|
+
// ───────────────────────────────────────────────────────────────────
|
|
337
|
+
// production_qc
|
|
338
|
+
// ───────────────────────────────────────────────────────────────────
|
|
339
|
+
|
|
340
|
+
/**
 * Derive production_qc-phase milestones by sniffing batch-result JSONs
 * under output/{results,qc,distillation}.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{batchesProcessed: number, documentsReviewed: number, documentsReviewedKeys: string[]}}
 */
export function deriveProductionQcMilestones(workspace) {
  const cwd = cwdOf(workspace);
  const outputDir = path.join(cwd, "output");
  let batchesProcessed = 0;
  const documentsReviewedSet = new Set();
  // Only these three subdirs are scanned; nested subdirs are not walked.
  const candidateDirs = [
    path.join(outputDir, "results"),
    path.join(outputDir, "qc"),
    path.join(outputDir, "distillation"),
  ];

  for (const dir of candidateDirs) {
    if (!dirExists(dir)) continue;
    for (const e of listChildFiles(dir)) {
      if (!e.name.endsWith(".json")) continue;
      const data = readJsonSafe(path.join(dir, e.name));
      // readJsonSafe only returns null on failure; the undefined check is
      // belt-and-braces.
      if (data === null || data === undefined) continue;

      // Heuristic, two shapes seen in E2E #5:
      // (a) DS — object with results/verdicts/n_skills/batch_id keys
      // (b) GLM — array of per-document verdict objects (each has
      //     .verdict + .file/.path)
      let isBatch = false;
      if (Array.isArray(data) && data.length > 0) {
        const first = data[0];
        if (first && typeof first === "object" && "verdict" in first) isBatch = true;
      } else if (data && typeof data === "object") {
        isBatch = !!(
          data.batch_id ||
          data.n_skills ||
          data.results ||
          data.verdicts ||
          data.verdict_stats ||
          data.accuracyByRule
        );
      }
      if (!isBatch) continue;
      batchesProcessed++;

      // Documents reviewed: deduped doc paths from whatever shape we got.
      if (Array.isArray(data)) {
        for (const r of data) {
          if (r && typeof r === "object") {
            const key = r.path || r.file || r.doc || r.document;
            if (key) documentsReviewedSet.add(String(key));
          }
        }
      } else if (data.results && typeof data.results === "object") {
        // NOTE(review): assumes data.results maps rule → {docKey: verdict};
        // the inner keys are treated as document identifiers — confirm
        // against the DS batch schema.
        for (const r of Object.values(data.results)) {
          if (r && typeof r === "object") {
            for (const docKey of Object.keys(r)) documentsReviewedSet.add(docKey);
          }
        }
      }
      // Array-shaped batches can't carry .documents (arrays have no such
      // own property here), so this only fires for object batches.
      if (Array.isArray(data.documents)) {
        for (const d of data.documents) {
          documentsReviewedSet.add(typeof d === "string" ? d : (d?.path || JSON.stringify(d)));
        }
      }
    }
  }

  return {
    batchesProcessed,
    documentsReviewed: documentsReviewedSet.size,
    documentsReviewedKeys: [...documentsReviewedSet], // for describeState detail
  };
}
|
|
408
|
+
|
|
409
|
+
// ───────────────────────────────────────────────────────────────────
|
|
410
|
+
// finalization
|
|
411
|
+
// ───────────────────────────────────────────────────────────────────
|
|
412
|
+
|
|
413
|
+
/**
 * Derive finalization-phase milestones (README, coverage report, dashboard)
 * from the workspace filesystem.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {{readmeWritten: boolean, coverageReportWritten: boolean,
 *            finalDashboardWritten: boolean, dashboardDuplicatesDetected: boolean}}
 */
export function deriveFinalizationMilestones(workspace) {
  const cwd = cwdOf(workspace);

  // readmeWritten: at least one populated README.md under output/releases/*/
  // (≥500 bytes — sub-template-stub size). Catches DS + GLM E2E #5
  // failure where run.py was shipped without a real README.
  let readmeWritten = false;
  const releasesRoot = path.join(cwd, "output", "releases");
  if (dirExists(releasesRoot)) {
    outer: for (const e of listChildDirs(releasesRoot)) {
      const readme = path.join(releasesRoot, e.name, "README.md");
      try {
        const stat = fs.statSync(readme);
        if (stat.isFile() && stat.size >= 500) { readmeWritten = true; break outer; }
      } catch { /* skip */ }
    }
  }
  // Also accept (in priority order):
  //   - rule_skills/README.md (the v0.6.0 finalization pipeline target)
  //   - workspace-root README.md (GLM E2E #5 wrote here)
  // Avoids false-negatives when the agent picks a different shipping
  // location than the canonical release/v1/ directory.
  if (!readmeWritten) {
    for (const candidate of [
      path.join(cwd, "rule_skills", "README.md"),
      path.join(cwd, "README.md"),
    ]) {
      try {
        const stat = fs.statSync(candidate);
        if (stat.isFile() && stat.size >= 500) { readmeWritten = true; break; }
      } catch { /* skip */ }
    }
  }

  // coverageReportWritten: rules/coverage_report.md OR output/coverage_report.md.
  const coverageReportWritten =
    fileExists(path.join(cwd, "rules", "coverage_report.md")) ||
    fileExists(path.join(cwd, "output", "coverage_report.md"));

  // finalDashboardWritten: at least one dashboards/*.html that is NOT a
  // duplicate of any other. DS + GLM both shipped byte-identical
  // dashboards under different filenames; sha256-distinct guards against
  // it. Single-file case is OK (one dashboard, no comparison needed).
  // Multi-file case requires hashes.size >= 2 OR htmls.length === 1.
  //
  // Fallback path (v0.6.0 final_dashboard.html) only applies when
  // dashboards/ doesn't exist at all — if dashboards/ exists with
  // duplicates, the gate stays closed so Group C's dedup error fires.
  let finalDashboardWritten = false;
  const dashboardsDir = path.join(cwd, "output", "dashboards");
  let dashboardDuplicatesDetected = false;
  if (dirExists(dashboardsDir)) {
    const htmls = listChildFiles(dashboardsDir).filter((e) => e.name.endsWith(".html"));
    if (htmls.length > 0) {
      const hashes = new Set();
      for (const h of htmls) {
        // sha256OfFile returns null on read failure; unreadable files
        // contribute no hash, so a lone unreadable dashboard stays closed.
        const sig = sha256OfFile(path.join(dashboardsDir, h.name));
        if (sig) hashes.add(sig);
      }
      if (htmls.length === 1) finalDashboardWritten = hashes.size >= 1;
      else if (hashes.size >= 2) finalDashboardWritten = true;
      else dashboardDuplicatesDetected = true;
    }
  } else {
    // No dashboards/ dir — accept v0.6.0 single-file convention
    if (fileExists(path.join(cwd, "output", "final_dashboard.html"))) {
      finalDashboardWritten = true;
    }
  }

  return {
    readmeWritten,
    coverageReportWritten,
    finalDashboardWritten,
    dashboardDuplicatesDetected,
  };
}
|
|
490
|
+
|
|
491
|
+
// ───────────────────────────────────────────────────────────────────
|
|
492
|
+
// Phase-keyed dispatcher (convenience for tests + offline audit).
|
|
493
|
+
// ───────────────────────────────────────────────────────────────────
|
|
494
|
+
|
|
495
|
+
// Phase name → derive function. Convenience dispatch for tests and
// offline audits; keys must match the engine's phase identifiers exactly,
// since deriveAllMilestones keys its output by them.
export const DERIVE_BY_PHASE = {
  bootstrap: deriveBootstrapMilestones,
  rule_extraction: deriveRuleExtractionMilestones,
  skill_authoring: deriveSkillAuthoringMilestones,
  skill_testing: deriveSkillTestingMilestones,
  distillation: deriveDistillationMilestones,
  production_qc: deriveProductionQcMilestones,
  finalization: deriveFinalizationMilestones,
};
|
|
504
|
+
|
|
505
|
+
/**
 * Run every phase's derive function against the same workspace and key
 * the results by phase name.
 *
 * @param {object|string} workspace - Workspace instance or plain cwd string.
 * @returns {Record<string, object>} phase → derived milestone fields
 */
export function deriveAllMilestones(workspace) {
  return Object.fromEntries(
    Object.entries(DERIVE_BY_PHASE).map(([phase, derive]) => [phase, derive(workspace)]),
  );
}
|
|
@@ -17,4 +17,25 @@ export class Pipeline {
|
|
|
17
17
|
|
|
18
18
|
/** Restore milestone state from persisted data. Override in subclasses. */
|
|
19
19
|
importState(_data) { /* no-op by default */ }
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* v0.6.3: Phase-misfit nudge. Called after each tool execution. If the tool
|
|
23
|
+
* call looks like work that belongs to a different phase, return a short
|
|
24
|
+
* hint string. Engine appends it as a `<system-reminder>` tag on the tool
|
|
25
|
+
* result, so the agent sees the mismatch on its next turn and can self-
|
|
26
|
+
* check whether to call phase_advance.
|
|
27
|
+
*
|
|
28
|
+
* Default: no hint. Phase-specific pipelines override with patterns they
|
|
29
|
+
* recognize as out-of-phase (e.g., BOOTSTRAP shouldn't write to
|
|
30
|
+
* rule_skills/, RULE_EXTRACTION shouldn't run workflows on production samples).
|
|
31
|
+
*
|
|
32
|
+
* Keep hints terse — they consume context budget every misfit. State the
|
|
33
|
+
* mismatch + suggest the right phase + remind about phase_advance.
|
|
34
|
+
*
|
|
35
|
+
* @param {string} toolName
|
|
36
|
+
* @param {object} toolInput
|
|
37
|
+
* @param {object} result - ToolResult-like { content, isError }
|
|
38
|
+
* @returns {string|null}
|
|
39
|
+
*/
|
|
40
|
+
phaseMisfitHint(_toolName, _toolInput, _result) { return null; }
|
|
20
41
|
}
|
|
@@ -2,6 +2,7 @@ import fs from "node:fs";
|
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { Phase, PipelineEvent } from "./index.js";
|
|
4
4
|
import { Pipeline } from "./base.js";
|
|
5
|
+
import { deriveDistillationMilestones } from "./_milestone-derive.js";
|
|
5
6
|
|
|
6
7
|
export class DistillationEngine extends Pipeline {
|
|
7
8
|
constructor(workspace) {
|
|
@@ -40,26 +41,35 @@ export class DistillationEngine extends Pipeline {
|
|
|
40
41
|
}
|
|
41
42
|
|
|
42
43
|
_scanWorkflows() {
|
|
43
|
-
// v0.
|
|
44
|
-
//
|
|
45
|
-
//
|
|
46
|
-
//
|
|
44
|
+
// v0.7.0 A1: route through filesystem-derived helper. The helper
|
|
45
|
+
// recognizes all three workflow layouts seen in E2E #5:
|
|
46
|
+
// workflows/<id>/workflow_v#.py (canonical, release.js's expectation)
|
|
47
|
+
// workflows/<id>_workflow.py (DS + GLM flat layout)
|
|
48
|
+
// workflows/<id>.json (DS regex_skill manifest)
|
|
49
|
+
// Engine-emitted entries (v0.6.1 A6) are still preserved as a soft
|
|
50
|
+
// overlay — disk wins on counter membership, but accuracy /
|
|
51
|
+
// tier-assignment data set by tool wrappers is kept.
|
|
47
52
|
const engineWfTested = { ...this.workflowsTested };
|
|
48
53
|
const engineWfPassing = [...this.workflowsPassing];
|
|
49
54
|
|
|
55
|
+
const m = deriveDistillationMilestones(this._workspace);
|
|
56
|
+
// workflowsCreated becomes a {ruleId: 1} dict for backwards-compat
|
|
57
|
+
// with downstream code that uses Object.keys() / `id in workflows`.
|
|
50
58
|
this.workflowsCreated = {};
|
|
59
|
+
for (const id of m.workflowsCreated) this.workflowsCreated[id] = 1;
|
|
60
|
+
|
|
51
61
|
this.workflowsTested = {};
|
|
52
62
|
this.workflowsPassing = [];
|
|
53
63
|
this.tierAssignments = {};
|
|
64
|
+
|
|
65
|
+
// Layered: also read per-rule config.json for tier + accuracy
|
|
66
|
+
// metadata — this is auxiliary signal not represented on the
|
|
67
|
+
// filesystem at the workflow-existence level.
|
|
54
68
|
const wfDir = path.join(this._workspace.cwd, "workflows");
|
|
55
|
-
if (
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
const ruleDir = path.join(wfDir, e.name);
|
|
60
|
-
const pyFiles = fs.readdirSync(ruleDir).filter((f) => f.endsWith(".py"));
|
|
61
|
-
if (pyFiles.length > 0) this.workflowsCreated[e.name] = pyFiles.length;
|
|
62
|
-
const cfgPath = path.join(ruleDir, "config.json");
|
|
69
|
+
if (fs.existsSync(wfDir)) {
|
|
70
|
+
for (const e of fs.readdirSync(wfDir, { withFileTypes: true })) {
|
|
71
|
+
if (!e.isDirectory()) continue;
|
|
72
|
+
const cfgPath = path.join(wfDir, e.name, "config.json");
|
|
63
73
|
if (fs.existsSync(cfgPath)) {
|
|
64
74
|
try {
|
|
65
75
|
const cfg = JSON.parse(fs.readFileSync(cfgPath, "utf-8"));
|
|
@@ -71,12 +81,15 @@ export class DistillationEngine extends Pipeline {
|
|
|
71
81
|
}
|
|
72
82
|
} catch { /* skip */ }
|
|
73
83
|
}
|
|
74
|
-
} else if (e.isFile() && e.name.endsWith(".py")) {
|
|
75
|
-
this.workflowsCreated[path.parse(e.name).name] = 1;
|
|
76
84
|
}
|
|
77
85
|
}
|
|
86
|
+
// Helper-derived workflowsTested too (per-workflow test_results/ etc.)
|
|
87
|
+
for (const id of m.workflowsTested) {
|
|
88
|
+
if (!(id in this.workflowsTested)) this.workflowsTested[id] = 1.0;
|
|
89
|
+
if (!this.workflowsPassing.includes(id)) this.workflowsPassing.push(id);
|
|
90
|
+
}
|
|
78
91
|
|
|
79
|
-
// Re-merge engine-emitted entries
|
|
92
|
+
// Re-merge engine-emitted entries (v0.6.1 A6 carry-forward)
|
|
80
93
|
for (const [k, v] of Object.entries(engineWfTested)) {
|
|
81
94
|
if (!(k in this.workflowsTested)) this.workflowsTested[k] = v;
|
|
82
95
|
}
|