kc-beta 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/engine.js +132 -2
- package/src/agent/pipelines/skill-authoring.js +55 -0
- package/src/agent/session-state.js +17 -1
- package/src/agent/skill-validator.js +149 -0
- package/src/agent/tools/_workflow-result-schema.js +249 -0
- package/src/agent/tools/phase-advance.js +34 -3
- package/src/agent/tools/workflow-run.js +9 -4
- package/src/agent/workspace.js +23 -0
- package/src/model-tiers.json +32 -0
- package/src/providers.js +45 -0
- package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
- package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
package/package.json
CHANGED
package/src/agent/engine.js
CHANGED
|
@@ -69,6 +69,19 @@ export const NEXT_PHASE = {
|
|
|
69
69
|
[Phase.PRODUCTION_QC]: Phase.FINALIZATION, // E1: new 7th phase
|
|
70
70
|
};
|
|
71
71
|
|
|
72
|
+
// v0.6.2 J2: explicit linear order so `_advancePhase` can detect rollback
|
|
73
|
+
// direction (target index < current index → rollback). Mirrors NEXT_PHASE
|
|
74
|
+
// but ordered, plus FINALIZATION at the end as the terminal phase.
|
|
75
|
+
export const PHASE_ORDER = [
|
|
76
|
+
Phase.BOOTSTRAP,
|
|
77
|
+
Phase.EXTRACTION,
|
|
78
|
+
Phase.SKILL_AUTHORING,
|
|
79
|
+
Phase.SKILL_TESTING,
|
|
80
|
+
Phase.DISTILLATION,
|
|
81
|
+
Phase.PRODUCTION_QC,
|
|
82
|
+
Phase.FINALIZATION,
|
|
83
|
+
];
|
|
84
|
+
|
|
72
85
|
/**
|
|
73
86
|
* The KC Agent conversation engine.
|
|
74
87
|
*
|
|
@@ -150,7 +163,7 @@ export class AgentEngine {
|
|
|
150
163
|
});
|
|
151
164
|
|
|
152
165
|
// Session state persistence
|
|
153
|
-
this.sessionState = new SessionState(this.workspace.cwd, { statePath });
|
|
166
|
+
this.sessionState = new SessionState(this.workspace.cwd, { statePath, workspace: this.workspace });
|
|
154
167
|
|
|
155
168
|
// Task manager (ralph-loop) — sub-agents don't queue further sub-tasks,
|
|
156
169
|
// so they don't get a TaskManager.
|
|
@@ -223,6 +236,11 @@ export class AgentEngine {
|
|
|
223
236
|
historyLen: this.history?.messages?.length ?? 0,
|
|
224
237
|
tasksPending: this.taskManager?.progress?.pending ?? 0,
|
|
225
238
|
tasksInProgress: this.taskManager?.progress?.inProgress ?? 0,
|
|
239
|
+
// v0.6.2 K1: per-component breakdown so heap-analyze.js can
|
|
240
|
+
// attribute growth (history vs subagents vs event log vs cache).
|
|
241
|
+
// All values in MB. Failures inside _sampleComponents are caught
|
|
242
|
+
// and the row gets `componentsErr` instead.
|
|
243
|
+
components: this._sampleComponents(),
|
|
226
244
|
};
|
|
227
245
|
fs.mkdirSync(logDir, { recursive: true });
|
|
228
246
|
fs.appendFileSync(logPath, JSON.stringify(row) + "\n", "utf-8");
|
|
@@ -240,6 +258,89 @@ export class AgentEngine {
|
|
|
240
258
|
};
|
|
241
259
|
}
|
|
242
260
|
|
|
261
|
+
/**
|
|
262
|
+
* v0.6.2 K1: per-component heap accounting. Each value is in MB,
|
|
263
|
+
* rounded. The whole function is wrapped in a single try/catch by the
|
|
264
|
+
* caller; failures are silently dropped to keep the sampler diagnostic
|
|
265
|
+
* (never load-bearing).
|
|
266
|
+
*
|
|
267
|
+
* Components measured (by source):
|
|
268
|
+
* - history: in-memory `this.history.messages` content sizes (sum of
|
|
269
|
+
* JSON-stringified content)
|
|
270
|
+
* - eventLog: disk size of `logs/events.jsonl`
|
|
271
|
+
* - toolResults: disk size of `logs/tool_results/` (offloaded tool
|
|
272
|
+
* output, summed top-level files only — the dir is one level deep)
|
|
273
|
+
* - subagents: disk size of `sub_agents/` (one level — each subagent
|
|
274
|
+
* has its own directory tree but we just want the order of magnitude)
|
|
275
|
+
* - bundleCache: disk size of `cache/bundles/`
|
|
276
|
+
*/
|
|
277
|
+
_sampleComponents() {
|
|
278
|
+
const out = { historyMB: 0, eventLogMB: 0, toolResultsMB: 0, subagentsMB: 0, bundleCacheMB: 0 };
|
|
279
|
+
const cwd = this.workspace?.cwd;
|
|
280
|
+
if (!cwd) return out;
|
|
281
|
+
// history: walk messages, sum content string lengths (UTF-16 → bytes
|
|
282
|
+
// approx 2× length; we conservatively count length itself since most
|
|
283
|
+
// content is ASCII-heavy JSON tool output)
|
|
284
|
+
try {
|
|
285
|
+
const msgs = this.history?.messages || [];
|
|
286
|
+
let bytes = 0;
|
|
287
|
+
for (const m of msgs) {
|
|
288
|
+
const c = m?.content;
|
|
289
|
+
if (typeof c === "string") bytes += c.length;
|
|
290
|
+
else if (Array.isArray(c)) {
|
|
291
|
+
for (const part of c) {
|
|
292
|
+
if (typeof part === "string") bytes += part.length;
|
|
293
|
+
else if (part?.text) bytes += String(part.text).length;
|
|
294
|
+
else if (part?.content) bytes += String(part.content).length;
|
|
295
|
+
else if (part?.input) bytes += JSON.stringify(part.input).length;
|
|
296
|
+
}
|
|
297
|
+
} else if (c && typeof c === "object") {
|
|
298
|
+
bytes += JSON.stringify(c).length;
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
out.historyMB = Math.round(bytes / 1024 / 1024);
|
|
302
|
+
} catch { /* skip */ }
|
|
303
|
+
// events.jsonl — single file size
|
|
304
|
+
try {
|
|
305
|
+
const p = path.join(cwd, "logs", "events.jsonl");
|
|
306
|
+
out.eventLogMB = Math.round(fs.statSync(p).size / 1024 / 1024);
|
|
307
|
+
} catch { /* skip */ }
|
|
308
|
+
// logs/tool_results/ — sum file sizes one level deep (it's flat)
|
|
309
|
+
try {
|
|
310
|
+
const dir = path.join(cwd, "logs", "tool_results");
|
|
311
|
+
let total = 0;
|
|
312
|
+
for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
|
|
313
|
+
if (e.isFile()) {
|
|
314
|
+
try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
out.toolResultsMB = Math.round(total / 1024 / 1024);
|
|
318
|
+
} catch { /* skip */ }
|
|
319
|
+
// sub_agents/ — sum top-level entries (each is a dir, statSync returns
|
|
320
|
+
// dir-block size, not contents — that's fine for an order-of-magnitude
|
|
321
|
+
// signal; recursive walk would be too expensive for the sampler)
|
|
322
|
+
try {
|
|
323
|
+
const dir = path.join(cwd, "sub_agents");
|
|
324
|
+
let total = 0;
|
|
325
|
+
for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
|
|
326
|
+
try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
|
|
327
|
+
}
|
|
328
|
+
out.subagentsMB = Math.round(total / 1024 / 1024);
|
|
329
|
+
} catch { /* skip */ }
|
|
330
|
+
// cache/bundles/
|
|
331
|
+
try {
|
|
332
|
+
const dir = path.join(cwd, "cache", "bundles");
|
|
333
|
+
let total = 0;
|
|
334
|
+
for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
|
|
335
|
+
if (e.isFile()) {
|
|
336
|
+
try { total += fs.statSync(path.join(dir, e.name)).size; } catch { /* skip */ }
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
out.bundleCacheMB = Math.round(total / 1024 / 1024);
|
|
340
|
+
} catch { /* skip */ }
|
|
341
|
+
return out;
|
|
342
|
+
}
|
|
343
|
+
|
|
243
344
|
/** Stop background diagnostics. Call on graceful shutdown. */
|
|
244
345
|
stop() {
|
|
245
346
|
try { this._heapSamplerStop?.(); } catch { /* ignore */ }
|
|
@@ -280,6 +381,14 @@ export class AgentEngine {
|
|
|
280
381
|
new PhaseAdvanceTool(
|
|
281
382
|
(to, reason, opts) => this._advancePhase(to, reason, opts),
|
|
282
383
|
() => this.currentPhase, // H1: tool reads phase BEFORE its own call
|
|
384
|
+
// v0.6.2 J1: surface running subagents so the tool can refuse
|
|
385
|
+
// advance until the agent explicitly acknowledges them.
|
|
386
|
+
() => {
|
|
387
|
+
try {
|
|
388
|
+
const agentTool = this._buildTools?.core?.find((t) => t?.name === "agent_tool");
|
|
389
|
+
return agentTool?.getRunningTaskIds?.() || [];
|
|
390
|
+
} catch { return []; }
|
|
391
|
+
},
|
|
283
392
|
),
|
|
284
393
|
new DocumentParseTool(this.workspace, {
|
|
285
394
|
mineruApiUrl: this.config.mineruApiUrl,
|
|
@@ -1061,14 +1170,23 @@ export class AgentEngine {
|
|
|
1061
1170
|
return false;
|
|
1062
1171
|
}
|
|
1063
1172
|
|
|
1173
|
+
// v0.6.2 J2: detect rollback direction. PHASE_ORDER is a linear array
|
|
1174
|
+
// of all phases; if target index < current index, this is a rollback
|
|
1175
|
+
// (e.g., production_qc → skill_authoring after gates revealed gaps).
|
|
1176
|
+
const fromIdx = PHASE_ORDER.indexOf(this.currentPhase);
|
|
1177
|
+
const toIdx = PHASE_ORDER.indexOf(nextPhase);
|
|
1178
|
+
const direction = (fromIdx >= 0 && toIdx >= 0 && toIdx < fromIdx)
|
|
1179
|
+
? "rollback" : "forward";
|
|
1180
|
+
|
|
1064
1181
|
// v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
|
|
1065
1182
|
// detection so the LLM-narrated reason can be cross-checked against
|
|
1066
1183
|
// ground-truth telemetry. Phase summaries become diagnostic, not just
|
|
1067
1184
|
// narrative.
|
|
1068
1185
|
const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
|
|
1069
1186
|
const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
|
|
1187
|
+
const directionTag = direction === "rollback" ? " [ROLLBACK]" : "";
|
|
1070
1188
|
const phaseSummary =
|
|
1071
|
-
`[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
|
|
1189
|
+
`[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]${directionTag}: ${mismatchPrefix}${reason}` +
|
|
1072
1190
|
(force && nextPhase !== expected ? " (forced)" : "") +
|
|
1073
1191
|
(engineCounts ? `\n (engine) ${engineCounts}` : "");
|
|
1074
1192
|
this._phaseSummaries.push(phaseSummary);
|
|
@@ -1076,6 +1194,7 @@ export class AgentEngine {
|
|
|
1076
1194
|
from: this.currentPhase,
|
|
1077
1195
|
to: nextPhase,
|
|
1078
1196
|
reason,
|
|
1197
|
+
direction,
|
|
1079
1198
|
engineCounts: engineCounts || null,
|
|
1080
1199
|
possibleMismatch: !!mismatchPrefix,
|
|
1081
1200
|
forced: force && nextPhase !== expected,
|
|
@@ -1085,6 +1204,17 @@ export class AgentEngine {
|
|
|
1085
1204
|
this._registerToolsForPhase(this.currentPhase);
|
|
1086
1205
|
this.workspace.setPhase(this.currentPhase);
|
|
1087
1206
|
this._createTasksForPhase(this.currentPhase);
|
|
1207
|
+
|
|
1208
|
+
// v0.6.2 J2: on rollback, reset the rolled-FROM phase's lastReady
|
|
1209
|
+
// edge-trigger so that if the agent revisits it and re-flips
|
|
1210
|
+
// exit-criteria true, _maybeAutoAdvance will fire correctly. Without
|
|
1211
|
+
// this, the auto-advance edge trigger stays latched true and the
|
|
1212
|
+
// moment the agent returns to fromPhase the engine immediately
|
|
1213
|
+
// bounces them back out — defeating the rollback.
|
|
1214
|
+
if (direction === "rollback" && this._lastReady) {
|
|
1215
|
+
this._lastReady[fromPhase] = false;
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1088
1218
|
this.saveState();
|
|
1089
1219
|
|
|
1090
1220
|
// B8: Soft signal — surface any sub-agents left running from the prior
|
|
@@ -2,6 +2,7 @@ import fs from "node:fs";
|
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { Phase, PipelineEvent } from "./index.js";
|
|
4
4
|
import { Pipeline } from "./base.js";
|
|
5
|
+
import { SkillValidator } from "../skill-validator.js";
|
|
5
6
|
|
|
6
7
|
export class SkillAuthoringPipeline extends Pipeline {
|
|
7
8
|
/**
|
|
@@ -16,6 +17,13 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
16
17
|
super();
|
|
17
18
|
this._workspace = workspace;
|
|
18
19
|
this._taskManager = taskManager;
|
|
20
|
+
// v0.6.2 I2: skill validator catches malformed check_r###.py at the
|
|
21
|
+
// skill_authoring exit boundary instead of silently passing the
|
|
22
|
+
// phase and breaking in production_qc (E2E #4 unified_qc.py
|
|
23
|
+
// SyntaxError went undiagnosed for hours).
|
|
24
|
+
this._validator = new SkillValidator();
|
|
25
|
+
this._validationFailures = [];
|
|
26
|
+
this._validationSkipped = false;
|
|
19
27
|
this.totalRules = [];
|
|
20
28
|
this.skillsAuthored = [];
|
|
21
29
|
this.skillsWithScripts = [];
|
|
@@ -152,6 +160,18 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
152
160
|
(failedT > 0 ? ` (+${failedT} failed)` : "");
|
|
153
161
|
}
|
|
154
162
|
}
|
|
163
|
+
// v0.6.2 I2: validation status (only meaningful after first
|
|
164
|
+
// exitCriteriaMet call populates _validationFailures)
|
|
165
|
+
let validationLine = "";
|
|
166
|
+
if (this._validationSkipped) {
|
|
167
|
+
validationLine = `\n- Skill validation: SKIPPED (python3 not on PATH — install to enable)`;
|
|
168
|
+
} else if (this._validationFailures.length > 0) {
|
|
169
|
+
const f = this._validationFailures.slice(0, 5).map(({ filePath, error }) =>
|
|
170
|
+
`\n - ${path.relative(this._workspace.cwd, filePath)}: ${error.split("\n")[0]}`,
|
|
171
|
+
).join("");
|
|
172
|
+
validationLine = `\n- Skills failing validation (${this._validationFailures.length}):${f}` +
|
|
173
|
+
(this._validationFailures.length > 5 ? `\n - … and ${this._validationFailures.length - 5} more` : "");
|
|
174
|
+
}
|
|
155
175
|
parts.push(
|
|
156
176
|
`### Progress (rule-id coverage, D2)\n` +
|
|
157
177
|
`- Total rules in catalog: ${total}\n` +
|
|
@@ -159,6 +179,7 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
159
179
|
`- Skill directories authored: ${this.skillsAuthored.length}\n` +
|
|
160
180
|
`- Skills with scripts/: ${this.skillsWithScripts.length}` +
|
|
161
181
|
taskLine +
|
|
182
|
+
validationLine +
|
|
162
183
|
(uncovered.length > 0
|
|
163
184
|
? `\n- Missing coverage (${uncovered.length}): ${uncovered.slice(0, 15).join(", ")}${uncovered.length > 15 ? "…" : ""}`
|
|
164
185
|
: ""),
|
|
@@ -204,9 +225,43 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
204
225
|
if (completed + failed < total) return false;
|
|
205
226
|
}
|
|
206
227
|
}
|
|
228
|
+
// v0.6.2 I2: skill validator — every check_r###.py must parse and
|
|
229
|
+
// expose an entry point. Catches the unified_qc.py-style monolith
|
|
230
|
+
// and other malformed scripts before they break in production_qc.
|
|
231
|
+
// mtime cache keeps this O(1) in steady state. Failures preserved
|
|
232
|
+
// in this._validationFailures for describeState rendering.
|
|
233
|
+
const checkFiles = this._collectCheckScripts();
|
|
234
|
+
const v = this._validator.validateAll(checkFiles);
|
|
235
|
+
this._validationFailures = v.failures;
|
|
236
|
+
this._validationSkipped = v.skipped;
|
|
237
|
+
if (!v.ok) return false;
|
|
207
238
|
return this.skillsWithScripts.length >= Math.max(1, this.skillsAuthored.length * 0.5);
|
|
208
239
|
}
|
|
209
240
|
|
|
241
|
+
/**
|
|
242
|
+
* v0.6.2 I2: gather every check_r###.py path under rule_skills/. Used by
|
|
243
|
+
* the skill validator. Walks one level into each skill directory.
|
|
244
|
+
*/
|
|
245
|
+
_collectCheckScripts() {
|
|
246
|
+
const out = [];
|
|
247
|
+
const dir = path.join(this._workspace.cwd, "rule_skills");
|
|
248
|
+
if (!fs.existsSync(dir)) return out;
|
|
249
|
+
const walk = (d) => {
|
|
250
|
+
let entries;
|
|
251
|
+
try { entries = fs.readdirSync(d, { withFileTypes: true }); } catch { return; }
|
|
252
|
+
for (const e of entries) {
|
|
253
|
+
if (e.name.startsWith(".") || e.name.startsWith("__")) continue;
|
|
254
|
+
const p = path.join(d, e.name);
|
|
255
|
+
if (e.isDirectory()) { walk(p); continue; }
|
|
256
|
+
if (e.isFile() && /^check_r[\d_-]+\.py$/i.test(e.name)) {
|
|
257
|
+
out.push(p);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
};
|
|
261
|
+
walk(dir);
|
|
262
|
+
return out;
|
|
263
|
+
}
|
|
264
|
+
|
|
210
265
|
exportState() {
|
|
211
266
|
return {
|
|
212
267
|
totalRules: this.totalRules,
|
|
@@ -12,9 +12,14 @@ export class SessionState {
|
|
|
12
12
|
* @param {string} workspacePath - Session workspace directory
|
|
13
13
|
* @param {object} [opts]
|
|
14
14
|
* @param {string} [opts.statePath] - Override absolute path (used for sub-agent isolation, Bug 2)
|
|
15
|
+
* @param {Workspace} [opts.workspace] - v0.6.2 J3: optional workspace ref so
|
|
16
|
+
* save() can acquire a sync file lock on session-state.json. Without it
|
|
17
|
+
* (subagents, tests), save() falls back to lock-free writes — same
|
|
18
|
+
* behavior as pre-v0.6.2.
|
|
15
19
|
*/
|
|
16
20
|
constructor(workspacePath, opts = {}) {
|
|
17
21
|
this._path = opts.statePath || path.join(workspacePath, "session-state.json");
|
|
22
|
+
this._workspace = opts.workspace || null;
|
|
18
23
|
}
|
|
19
24
|
|
|
20
25
|
/**
|
|
@@ -46,7 +51,18 @@ export class SessionState {
|
|
|
46
51
|
pipelineMilestones: this._extractMilestones(engine.pipelines),
|
|
47
52
|
};
|
|
48
53
|
|
|
49
|
-
|
|
54
|
+
// v0.6.2 J3: acquire sync file lock if workspace ref available.
|
|
55
|
+
// session-state.json is in SHARED_COORDINATION_PATHS — concurrent
|
|
56
|
+
// writers (parallel ralph-loop workers + main saveState ticks)
|
|
57
|
+
// could otherwise interleave and corrupt the JSON.
|
|
58
|
+
const write = () => {
|
|
59
|
+
fs.writeFileSync(this._path, JSON.stringify(state, null, 2), "utf-8");
|
|
60
|
+
};
|
|
61
|
+
if (this._workspace?.withSyncFileLock) {
|
|
62
|
+
this._workspace.withSyncFileLock("session-state.json", write);
|
|
63
|
+
} else {
|
|
64
|
+
write();
|
|
65
|
+
}
|
|
50
66
|
}
|
|
51
67
|
|
|
52
68
|
/**
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* v0.6.2 I2: Skill validator (was D3c, deferred from v0.6.0/v0.6.1).
|
|
3
|
+
*
|
|
4
|
+
* E2E #4 demonstrated that broken `check_r###.py` contents go undetected
|
|
5
|
+
* until production_qc throws (e.g., `SyntaxError: unexpected character
|
|
6
|
+
* after line continuation character` from line 733 of unified_qc.py).
|
|
7
|
+
* This validator catches such breakage at the skill_authoring phase
|
|
8
|
+
* boundary instead of months later in production.
|
|
9
|
+
*
|
|
10
|
+
* Design constraints:
|
|
11
|
+
* - exitCriteriaMet is sync, so validation is sync (execFileSync).
|
|
12
|
+
* - 110 files × ~50ms subprocess = 5.5s worst case; caching by mtime
|
|
13
|
+
* keeps steady-state cost at ~0 (only re-validate freshly modified
|
|
14
|
+
* files).
|
|
15
|
+
* - Failures are diagnostic, not punitive: `force: true` on phase_advance
|
|
16
|
+
* still bypasses. The validator's job is to refuse the auto-advance,
|
|
17
|
+
* not to trap the agent.
|
|
18
|
+
*
|
|
19
|
+
* Validation rules per `check_r###.py`:
|
|
20
|
+
* 1. File ≥ 100 bytes (smoke test for empty stubs).
|
|
21
|
+
* 2. Passes `python3 -c "import ast; ast.parse(open(F).read())"` (no
|
|
22
|
+
* syntax errors).
|
|
23
|
+
* 3. Defines a function reachable by name `check_rule` or `verify`
|
|
24
|
+
* (regex match on file content).
|
|
25
|
+
*
|
|
26
|
+
* Disable mechanism: if `python3` is not on PATH, validator silently
|
|
27
|
+
* passes everything and emits a one-time warning — we don't want the
|
|
28
|
+
* gate to block on missing tooling. Gate effectively no-ops.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { execFileSync } from "node:child_process";
|
|
32
|
+
import fs from "node:fs";
|
|
33
|
+
import path from "node:path";
|
|
34
|
+
|
|
35
|
+
const ENTRY_POINT_REGEX = /^\s*(?:async\s+)?def\s+(check_rule|verify)\b/m;
const MIN_BYTES = 100;

/**
 * Synchronous validator for `check_r###.py` skill scripts.
 *
 * A file passes when it is at least MIN_BYTES long, parses with Python's
 * `ast.parse`, and defines `check_rule` or `verify` (regex on the text).
 * Results are cached per path and reused while the file's mtime and size
 * are unchanged.
 */
export class SkillValidator {
  constructor() {
    /** @type {Map<string, { mtime: number, size: number, ok: boolean, error?: string }>} */
    this._cache = new Map();
    /** @type {boolean|null} - null = untested, true/false once probed */
    this._pythonAvailable = null;
    /** @type {boolean} - one-time warning suppression */
    this._warned = false;
  }

  /**
   * Probe whether `python3` can be executed. The answer is cached after
   * the first call (a probe that errors or times out counts as "no").
   * @returns {boolean}
   */
  _probePython() {
    if (this._pythonAvailable !== null) return this._pythonAvailable;
    try {
      execFileSync("python3", ["-c", "import ast"], { stdio: "ignore", timeout: 5000 });
      this._pythonAvailable = true;
    } catch {
      this._pythonAvailable = false;
    }
    return this._pythonAvailable;
  }

  /**
   * Validate one file, reusing the cached verdict when the file's mtime
   * and size have not changed since it was last checked.
   *
   * @param {string} filePath - Path to the .py file
   * @returns {{ ok: boolean, error?: string }} `error` is present only
   *   when `ok` is false.
   */
  validateFile(filePath) {
    let stat;
    try {
      stat = fs.statSync(filePath);
    } catch {
      return { ok: false, error: "file not found" };
    }
    const mtime = stat.mtimeMs;
    const size = stat.size;
    const cached = this._cache.get(filePath);
    if (cached && cached.mtime === mtime && cached.size === size) {
      // Same shape as a fresh result: no `error` key on success.
      return cached.ok ? { ok: true } : { ok: false, error: cached.error };
    }
    const { cacheable = true, ...result } = this._runValidation(filePath, size);
    // Infrastructure failures (timeout, spawn error) say nothing about the
    // file itself, so they must not be pinned to this mtime — retry next time.
    if (cacheable) this._cache.set(filePath, { mtime, size, ...result });
    return result;
  }

  /**
   * Validate all files in a list. Returns:
   *   - ok: boolean — true iff every file passes
   *   - failures: array of { filePath, error } for each failing file
   *   - skipped: boolean — true if python3 unavailable (validator no-op'd)
   *
   * When python3 is unavailable nothing is checked: the call reports
   * success with `skipped: true` and logs a warning once per instance.
   *
   * @param {string[]} filePaths
   * @returns {{ ok: boolean, failures: Array<{filePath:string, error:string}>, skipped: boolean }}
   */
  validateAll(filePaths) {
    if (!this._probePython()) {
      if (!this._warned) {
        // eslint-disable-next-line no-console
        console.warn("[skill-validator] python3 not on PATH — skill validation skipped. " +
          "Phase gate will not catch syntax errors. Install python3 to enable.");
        this._warned = true;
      }
      return { ok: true, failures: [], skipped: true };
    }
    const failures = [];
    for (const f of filePaths) {
      const r = this.validateFile(f);
      if (!r.ok) failures.push({ filePath: f, error: r.error || "unknown" });
    }
    return { ok: failures.length === 0, failures, skipped: false };
  }

  /**
   * Manually invalidate cache for a path — used when the caller knows
   * the file changed but mtime granularity might not have caught it.
   * @param {string} filePath
   */
  invalidate(filePath) { this._cache.delete(filePath); }

  // --- Internal ---

  /**
   * Run the three validation rules against one file, uncached.
   *
   * @param {string} filePath
   * @param {number} [size] - File size if the caller already stat'ed the
   *   file; when omitted the file is stat'ed here.
   * @returns {{ ok: boolean, error?: string, cacheable?: boolean }}
   *   `cacheable: false` marks a failure caused by the subprocess itself
   *   (timeout, python3 could not be spawned) rather than by the file.
   */
  _runValidation(filePath, size) {
    // Rule 1: size check (cheap)
    let bytes = size;
    if (typeof bytes !== "number") {
      try { bytes = fs.statSync(filePath).size; }
      catch { return { ok: false, error: "stat failed" }; }
    }
    if (bytes < MIN_BYTES) {
      return { ok: false, error: `file too small (${bytes} < ${MIN_BYTES} bytes)` };
    }

    // Rule 2: ast.parse smoke test via subprocess.
    // The path travels as argv[1] instead of being spliced into the Python
    // source, so no quoting of the path is needed. The file is read as
    // bytes so Python applies its own source-encoding rules (PEP 263,
    // UTF-8 default) rather than the process locale.
    const script = [
      "import ast, sys",
      "try:",
      "    with open(sys.argv[1], 'rb') as fh:",
      "        ast.parse(fh.read(), filename=sys.argv[1])",
      "except SyntaxError as e:",
      "    print(f\"SyntaxError: {e}\", file=sys.stderr); sys.exit(1)",
      "except Exception as e:",
      "    print(f\"{type(e).__name__}: {e}\", file=sys.stderr); sys.exit(1)",
      "",
    ].join("\n");
    try {
      execFileSync("python3", ["-c", script, filePath], {
        stdio: ["ignore", "ignore", "pipe"],
        timeout: 10_000,
      });
    } catch (e) {
      const stderr = (e.stderr ? e.stderr.toString() : "") || e.message || "subprocess failed";
      const failure = { ok: false, error: stderr.trim().slice(0, 300) };
      // Exit status 1 is the script's own "does not parse" verdict. Any
      // other outcome (killed on timeout, spawn error) is not about the file.
      if (e.status !== 1) failure.cacheable = false;
      return failure;
    }

    // Rule 3: entry-point regex (after parse OK so we know file is readable)
    let content;
    try { content = fs.readFileSync(filePath, "utf-8"); }
    catch { return { ok: false, error: "read failed after parse OK" }; }
    if (!ENTRY_POINT_REGEX.test(content)) {
      return { ok: false, error: "no entry point: expected `def check_rule(...)` or `def verify(...)`" };
    }

    return { ok: true };
  }
}
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* v0.6.2 I1: Shared workflow-result normalizer + ERROR classifier.
|
|
3
|
+
*
|
|
4
|
+
* E2E #4 produced 1,150 ERROR verdicts out of 6,930 (16.6%) and
|
|
5
|
+
* verdict_stats keys leaked Python dataclass repr() strings like
|
|
6
|
+
* "VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE', ...)".
|
|
7
|
+
* The agent's batch aggregator was using repr(result) as a dict key
|
|
8
|
+
* because the workflow's Python output was a dataclass instance, not
|
|
9
|
+
* a dict.
|
|
10
|
+
*
|
|
11
|
+
* This module fixes the boundary: anything that comes out of a
|
|
12
|
+
* workflow_run tool gets normalized to a strict dict shape before being
|
|
13
|
+
* persisted or returned to the agent. Repr-strings get parsed back into
|
|
14
|
+
* structured fields. ERRORs get classified into typed buckets so we can
|
|
15
|
+
* tell "import failed" from "extraction returned wrong shape" without
|
|
16
|
+
* reading 1,150 stack traces.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* The required shape every workflow result must satisfy. Unknown extra
|
|
21
|
+
* keys are preserved.
|
|
22
|
+
*/
|
|
23
|
+
export const REQUIRED_KEYS = ["rule_id", "verdict"];
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Canonical verdict values. Anything outside this set is allowed (the
|
|
27
|
+
* worker LLM may extend) but generates a `nonstandard_verdict` warning
|
|
28
|
+
* in the result's `_warnings` array.
|
|
29
|
+
*/
|
|
30
|
+
export const STANDARD_VERDICTS = new Set([
|
|
31
|
+
"PASS", "FAIL", "NOT_APPLICABLE", "SUPPLEMENT_NEEDED", "ERROR", "UNKNOWN",
|
|
32
|
+
]);
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Recognized error_type values used by classifyError(). Add to this list
|
|
36
|
+
* when adding a new pattern below.
|
|
37
|
+
*/
|
|
38
|
+
export const ERROR_TYPES = [
|
|
39
|
+
"import_error",
|
|
40
|
+
"attribute_error",
|
|
41
|
+
"keyword_not_found",
|
|
42
|
+
"sample_unparseable",
|
|
43
|
+
"schema_violation",
|
|
44
|
+
"syntax_error",
|
|
45
|
+
"timeout",
|
|
46
|
+
"permission_error",
|
|
47
|
+
"unknown",
|
|
48
|
+
];
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Detect whether a string looks like a Python dataclass repr —
|
|
52
|
+
* `ClassName(field=value, field=value)`. Used both as a top-level
|
|
53
|
+
* detector and recursively inside dict keys.
|
|
54
|
+
*/
|
|
55
|
+
const REPR_PATTERN = /^([A-Za-z_]\w*)\((.*)\)$/s;

/**
 * Parse a Python-repr string into { class_name, fields: { ... } }.
 *
 * Only keyword entries (`name=value`) are captured; positional entries
 * are dropped. Field values are kept as the raw source text, including
 * any surrounding quotes — no re-typing is attempted. The body is split
 * on top-level commas only: commas inside brackets or inside quoted
 * strings do not separate fields.
 *
 * Example:
 *   parsePyRepr("VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE')")
 *   → { class_name: 'VerificationResult', fields: { rule_id: "'R049'", verdict: "'NOT_APPLICABLE'" } }
 *
 * @param {*} s - Candidate string.
 * @returns {{class_name: string, fields: Object<string, string>}|null}
 *   null when `s` is not a string or does not look like `Name(...)`.
 */
export function parsePyRepr(s) {
  if (typeof s !== "string") return null;
  const m = s.match(REPR_PATTERN);
  if (!m) return null;
  const className = m[1];
  const body = m[2];
  const fields = {};
  let depth = 0;        // bracket nesting outside of quotes
  let inQuote = null;   // the quote character we are inside, if any
  let escaped = false;  // previous char inside a quote was an unescaped backslash
  let buf = "";
  let key = null;

  // Commit the buffered text as the value of the pending key.
  const flush = () => {
    const value = buf.trim();
    buf = "";
    if (!value) return;
    // No `=` seen — entry was positional, skip
    if (key == null) return;
    fields[key] = value;
    key = null;
  };

  for (let i = 0; i < body.length; i++) {
    const c = body[i];
    if (inQuote) {
      buf += c;
      // Track escapes with a flag rather than peeking at the previous
      // char: in 'C:\\' the closing quote follows a backslash that is
      // itself escaped, so the quote really does end the string.
      if (escaped) escaped = false;
      else if (c === "\\") escaped = true;
      else if (c === inQuote) inQuote = null;
      continue;
    }
    if (c === "'" || c === '"') { inQuote = c; buf += c; continue; }
    if (c === "(" || c === "[" || c === "{") { depth++; buf += c; continue; }
    if (c === ")" || c === "]" || c === "}") { depth--; buf += c; continue; }
    if (c === "=" && depth === 0 && key == null) {
      key = buf.trim();
      buf = "";
      continue;
    }
    if (c === "," && depth === 0) { flush(); continue; }
    buf += c;
  }
  flush();
  return { class_name: className, fields };
}
|
|
111
|
+
|
|
112
|
+
/**
 * Recursively replace any dict key that looks like a Python repr with a
 * structured entry, mutating `obj` in place and returning that same
 * reference for chaining. Because the input itself is modified, callers
 * that ignore the return value still observe the normalization.
 *
 * For a repr key `Cls(a=1)` with value `v`, the key is removed and
 * `{ fields, count }` is appended to the array stored under `Cls`
 * (`count` is `v` when it is a number, otherwise 1). Several repr keys of
 * the same class therefore collapse into one list. Values under ordinary
 * keys, and array elements, are normalized recursively.
 *
 * A repr key is left untouched when its class-name slot cannot safely
 * hold a list: the slot is already an own property that is not an array,
 * or the class name is `__proto__`.
 *
 * @param {*} obj - Value to normalize; non-objects are returned as-is.
 * @returns {*} `obj` itself.
 */
export function normalizeReprKeys(obj) {
  if (Array.isArray(obj)) {
    obj.forEach((v, i) => { obj[i] = normalizeReprKeys(v); });
    return obj;
  }
  if (!obj || typeof obj !== "object") return obj;

  const hasOwn = (key) => Object.prototype.hasOwnProperty.call(obj, key);

  // Snapshot the keys first: buckets added below must not be revisited
  // as part of this same pass.
  for (const k of Object.keys(obj)) {
    const v = obj[k];
    const parsed = parsePyRepr(k);
    if (!parsed) {
      obj[k] = normalizeReprKeys(v);
      continue;
    }
    const slot = parsed.class_name;
    // Own-property check so inherited members (toString, constructor, …)
    // are never mistaken for an existing bucket.
    const occupied = hasOwn(slot);
    if (slot === "__proto__" || (occupied && !Array.isArray(obj[slot]))) continue;
    if (!occupied) obj[slot] = [];
    obj[slot].push({ fields: parsed.fields, count: typeof v === "number" ? v : 1 });
    delete obj[k];
  }
  return obj;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Map an ERROR result's raw output (stack trace or message) to one of
 * the ERROR_TYPES labels.
 *
 * Patterns are tried in a fixed priority order and the first match wins;
 * all matching is case-insensitive. Empty or non-string input, and text
 * that matches no pattern, yield "unknown" — the classifier prefers
 * "unknown" over a guess.
 *
 * @param {*} rawOutput - Raw stdout/stderr text of the failed workflow.
 * @returns {string} One of the ERROR_TYPES values.
 */
export function classifyError(rawOutput) {
  if (!rawOutput || typeof rawOutput !== "string") return "unknown";

  // [pattern, label] pairs, highest priority first.
  const rules = [
    [/ModuleNotFoundError|ImportError|No module named/i, "import_error"],
    [/AttributeError/i, "attribute_error"],
    [/SyntaxError|invalid syntax|unexpected character/i, "syntax_error"],
    [/PermissionError|permission denied/i, "permission_error"],
    [/timed out|timeout|Timeout/i, "timeout"],
    // sample parse failures usually mention pdfjs / docx / json
    [/pdfjs|docx|json\.decoder|JSONDecodeError|UnicodeDecodeError/i, "sample_unparseable"],
    // schema violations from our own normalizer would have a hint
    [/schema_violation|missing required key/i, "schema_violation"],
    // Common keyword-not-found signal: the workflow returned no match
    [/no match|not found|未找到|关键词未匹配/i, "keyword_not_found"],
  ];

  const hit = rules.find(([pattern]) => pattern.test(rawOutput));
  return hit ? hit[1] : "unknown";
}
|
|
163
|
+
|
|
164
|
+
/**
 * Normalize a parsed workflow-output object to the canonical dict shape.
 *  - Ensures `rule_id` and `verdict` are present.
 *  - Strips repr-string keys (delegates to normalizeReprKeys).
 *  - If verdict is "ERROR" or the parse fell back to raw_output, attaches
 *    `error_type` from classifyError().
 *  - Records issues in `_warnings: string[]` so the consumer (and the
 *    agent reading the tool result) can see them. A pre-existing
 *    `_warnings` value on the input is preserved: arrays are extended,
 *    a non-empty string is wrapped into a one-element array, and any
 *    other value is discarded so the field is always an array.
 *
 * Inputs:
 *   parsed    — what JSON.parse yielded (may already be a dict, or be
 *               the raw_output fallback object)
 *   ruleId    — what the caller knows the rule_id should be
 *   rawOutput — the original stdout (used for ERROR classification)
 *
 * Returns the normalized result. Always returns a dict with `rule_id`
 * and `verdict`. Never throws.
 */
export function normalizeWorkflowResult(parsed, ruleId, rawOutput) {
  const warnings = [];
  let result;
  if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
    // Shallow copy so the caller's object is not mutated at the top level.
    result = { ...parsed };
  } else if (typeof parsed === "string") {
    // Parsed yielded a string — could be a repr at top level
    const repr = parsePyRepr(parsed);
    if (repr) {
      // Strip Python's surrounding quote chars from string values so
      // STANDARD_VERDICTS comparisons work and downstream code doesn't
      // see "'PASS'" instead of "PASS". Conservative: only unwrap when
      // the entire value is wrapped in matching ' or " quotes.
      const stripped = {};
      for (const [k, v] of Object.entries(repr.fields)) {
        if (typeof v === "string" && /^(['"]).*\1$/s.test(v) && v.length >= 2) {
          stripped[k] = v.slice(1, -1);
        } else {
          stripped[k] = v;
        }
      }
      result = stripped;
      result._source_class = repr.class_name;
      warnings.push("toplevel_repr_string");
    } else {
      result = { raw_output: parsed.slice(0, 5000) };
      warnings.push("toplevel_string");
    }
  } else {
    // null / undefined / number / boolean / array: keep a bounded string form.
    result = { raw_output: String(parsed ?? "").slice(0, 5000) };
    warnings.push("toplevel_nonobject");
  }

  // Recursively normalize repr keys in nested dicts (verdict_stats, etc.)
  // The return value is ignored — presumably this mutates `result` in place.
  normalizeReprKeys(result);

  // rule_id: prefer the caller-supplied value (it's authoritative)
  if (ruleId) result.rule_id = ruleId;
  else if (typeof result.rule_id !== "string") {
    result.rule_id = "unknown";
    warnings.push("missing_rule_id");
  }

  // verdict: ensure present and canonical-or-warn
  if (typeof result.verdict !== "string" || result.verdict === "") {
    // If the workflow fell into raw_output fallback, mark as ERROR
    if (result.raw_output) {
      result.verdict = "ERROR";
    } else {
      result.verdict = "UNKNOWN";
      warnings.push("missing_verdict");
    }
  } else if (!STANDARD_VERDICTS.has(result.verdict)) {
    warnings.push("nonstandard_verdict");
  }

  // ERROR classification
  if (result.verdict === "ERROR") {
    const trace = rawOutput || result.raw_output || result.error || "";
    result.error_type = classifyError(trace);
  }

  if (warnings.length > 0) {
    // The workflow's own JSON may carry a `_warnings` field of any type.
    // Calling .concat on a number/object would throw, and on a string would
    // silently produce a string — both break the documented contract.
    const prior = result._warnings;
    let existing = [];
    if (Array.isArray(prior)) {
      existing = prior;
    } else if (typeof prior === "string" && prior !== "") {
      existing = [prior];
    }
    result._warnings = [...existing, ...warnings];
  }

  return result;
}
|
|
@@ -19,13 +19,17 @@ export class PhaseAdvanceTool extends BaseTool {
|
|
|
19
19
|
* @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
|
|
20
20
|
* engine's phase BEFORE the call, so it can distinguish "already there"
|
|
21
21
|
* (silent no-op, informational) from "non-adjacent refusal" (actionable).
|
|
22
|
-
*
|
|
23
|
-
*
|
|
22
|
+
* @param {() => string[]} [getRunningSubagentsFn] - v0.6.2 J1: returns the
|
|
23
|
+
* list of running subagent task_ids. When non-empty, phase_advance
|
|
24
|
+
* refuses unless `acknowledge_stale_subagents: true` is set in input
|
|
25
|
+
* (or `force: true`). Forces the agent to confront live work that
|
|
26
|
+
* started in the prior phase before declaring the phase done.
|
|
24
27
|
*/
|
|
25
|
-
constructor(advanceFn, getCurrentPhaseFn) {
|
|
28
|
+
constructor(advanceFn, getCurrentPhaseFn, getRunningSubagentsFn) {
|
|
26
29
|
super();
|
|
27
30
|
this._advance = advanceFn;
|
|
28
31
|
this._getCurrentPhase = getCurrentPhaseFn || (() => null);
|
|
32
|
+
this._getRunningSubagents = getRunningSubagentsFn || (() => []);
|
|
29
33
|
}
|
|
30
34
|
|
|
31
35
|
get name() { return "phase_advance"; }
|
|
@@ -48,6 +52,11 @@ export class PhaseAdvanceTool extends BaseTool {
|
|
|
48
52
|
type: "boolean",
|
|
49
53
|
description: "Allow non-adjacent or backward transitions. Default false.",
|
|
50
54
|
},
|
|
55
|
+
acknowledge_stale_subagents: {
|
|
56
|
+
type: "boolean",
|
|
57
|
+
description:
|
|
58
|
+
"Set to true after using agent_tool(operation=list|poll|kill) to confirm you've handled any subagents still running from the prior phase. Required when subagents are live; otherwise advance is refused (use force:true to bypass entirely).",
|
|
59
|
+
},
|
|
51
60
|
},
|
|
52
61
|
required: ["to"],
|
|
53
62
|
};
|
|
@@ -68,8 +77,30 @@ export class PhaseAdvanceTool extends BaseTool {
|
|
|
68
77
|
);
|
|
69
78
|
}
|
|
70
79
|
|
|
80
|
+
// v0.6.2 J1: stale-subagents acknowledgement gate. Refuses advance if
|
|
81
|
+
// any subagent is still running and the agent hasn't explicitly
|
|
82
|
+
// acknowledged. force:true bypasses (matches existing escape pattern).
|
|
83
|
+
const running = this._getRunningSubagents();
|
|
84
|
+
if (running.length > 0 && !input.acknowledge_stale_subagents && !input.force) {
|
|
85
|
+
return new ToolResult(
|
|
86
|
+
`Refusing to advance from ${beforePhase || "?"} to ${to}: ${running.length} subagent(s) still running from prior phase: ${running.join(", ")}. ` +
|
|
87
|
+
`Run agent_tool(operation="list") to see status, then either ` +
|
|
88
|
+
`agent_tool(operation="wait"|"kill") on each, OR pass acknowledge_stale_subagents:true ` +
|
|
89
|
+
`to advance while leaving them running (use only if they're legitimate background work).`,
|
|
90
|
+
true,
|
|
91
|
+
);
|
|
92
|
+
}
|
|
93
|
+
|
|
71
94
|
const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
|
|
72
95
|
if (advanced) {
|
|
96
|
+
// Log the ack so post-mortems can find phase advances that proceeded
|
|
97
|
+
// with live subagents
|
|
98
|
+
if (running.length > 0 && input.acknowledge_stale_subagents) {
|
|
99
|
+
return new ToolResult(
|
|
100
|
+
`Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""} — ` +
|
|
101
|
+
`acknowledged ${running.length} running subagent(s): ${running.join(", ")}.`,
|
|
102
|
+
);
|
|
103
|
+
}
|
|
73
104
|
return new ToolResult(`Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""}`);
|
|
74
105
|
}
|
|
75
106
|
|
|
@@ -2,6 +2,7 @@ import fs from "node:fs";
|
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { spawn } from "node:child_process";
|
|
4
4
|
import { BaseTool, ToolResult } from "./base.js";
|
|
5
|
+
import { normalizeWorkflowResult } from "./_workflow-result-schema.js";
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* Execute a distilled workflow script against a document.
|
|
@@ -89,14 +90,18 @@ export class WorkflowRunTool extends BaseTool {
|
|
|
89
90
|
return new ToolResult(e.message, true);
|
|
90
91
|
}
|
|
91
92
|
|
|
92
|
-
// Parse output
|
|
93
|
-
let
|
|
93
|
+
// Parse output (last stdout line as JSON)
|
|
94
|
+
let parsed;
|
|
94
95
|
try {
|
|
95
96
|
const lines = output.trim().split("\n");
|
|
96
|
-
|
|
97
|
+
parsed = JSON.parse(lines[lines.length - 1]);
|
|
97
98
|
} catch {
|
|
98
|
-
|
|
99
|
+
parsed = { raw_output: output.slice(0, 5000) };
|
|
99
100
|
}
|
|
101
|
+
// v0.6.2 I1: normalize to canonical dict shape — strips Python
|
|
102
|
+
// dataclass repr() keys, classifies ERROR results, ensures rule_id
|
|
103
|
+
// and verdict are present.
|
|
104
|
+
const resultData = normalizeWorkflowResult(parsed, ruleId, output);
|
|
100
105
|
|
|
101
106
|
// Attach confidence score
|
|
102
107
|
const extractedValue = String(resultData.extracted_value || resultData.value || "");
|
package/src/agent/workspace.js
CHANGED
|
@@ -240,6 +240,19 @@ export class Workspace {
|
|
|
240
240
|
return traceId;
|
|
241
241
|
}
|
|
242
242
|
|
|
243
|
+
/**
|
|
244
|
+
* v0.6.2 J3: Synchronous lock mirror of `withFileLock`, for callers
|
|
245
|
+
* that can't go async (SessionState.save). Locks a sibling
|
|
246
|
+
* `<relPath>.lock` file via O_CREAT|O_EXCL, with 5s timeout and 30s
|
|
247
|
+
* stale-takeover. On failure to acquire, runs fn anyway — better to
|
|
248
|
+
* lose serialization than deadlock a save call. Use sparingly; prefer
|
|
249
|
+
* `withFileLock` (async) for all paths that allow it.
|
|
250
|
+
*/
|
|
251
|
+
withSyncFileLock(relPath, fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
|
|
252
|
+
const lockPath = path.join(this.path, `${relPath}.lock`);
|
|
253
|
+
return this._withSyncLockAtPath(lockPath, fn, { timeoutMs, staleMs });
|
|
254
|
+
}
|
|
255
|
+
|
|
243
256
|
/**
|
|
244
257
|
* B5: Synchronous gitops lock. Mirror of withFileLock but sync to fit
|
|
245
258
|
* autoCommit's existing call signature. Times out and proceeds anyway
|
|
@@ -247,6 +260,16 @@ export class Workspace {
|
|
|
247
260
|
*/
|
|
248
261
|
_withGitSyncLock(fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
|
|
249
262
|
const lockPath = path.join(this.path, ".git", "kc-commit.lock");
|
|
263
|
+
return this._withSyncLockAtPath(lockPath, fn, { timeoutMs, staleMs });
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* Shared sync-lock implementation. Used by `_withGitSyncLock` (B5) and
|
|
268
|
+
* `withSyncFileLock` (J3 / v0.6.2). Same semantics: O_CREAT|O_EXCL on
|
|
269
|
+
* a sibling `.lock` file, busy-spin retry with stale takeover, run fn
|
|
270
|
+
* anyway on timeout.
|
|
271
|
+
*/
|
|
272
|
+
_withSyncLockAtPath(lockPath, fn, { timeoutMs = 5_000, staleMs = 30_000 } = {}) {
|
|
250
273
|
const start = Date.now();
|
|
251
274
|
let acquired = false;
|
|
252
275
|
while (Date.now() - start < timeoutMs) {
|
package/src/model-tiers.json
CHANGED
|
@@ -123,6 +123,38 @@
|
|
|
123
123
|
}
|
|
124
124
|
},
|
|
125
125
|
|
|
126
|
+
"deepseek": {
|
|
127
|
+
"_comment": "DeepSeek v4 family — flagship pro + cheap flash. Native 1M context but KC caps to 200K.",
|
|
128
|
+
"conductor": "deepseek-v4-pro",
|
|
129
|
+
"llm": {
|
|
130
|
+
"tier1": "deepseek-v4-pro",
|
|
131
|
+
"tier2": "deepseek-v4-pro",
|
|
132
|
+
"tier3": "deepseek-v4-flash",
|
|
133
|
+
"tier4": "deepseek-v4-flash"
|
|
134
|
+
},
|
|
135
|
+
"vlm": {
|
|
136
|
+
"tier1": "",
|
|
137
|
+
"tier2": "",
|
|
138
|
+
"tier3": ""
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
|
|
142
|
+
"xiaomi": {
|
|
143
|
+
"_comment": "Xiaomi MiMo coding plan — flagship Pro + standard + multimodal Omni. Native 1M context but KC caps to 200K. TTS variants excluded (no KC use case).",
|
|
144
|
+
"conductor": "MiMo-V2.5-Pro",
|
|
145
|
+
"llm": {
|
|
146
|
+
"tier1": "MiMo-V2.5-Pro",
|
|
147
|
+
"tier2": "MiMo-V2.5",
|
|
148
|
+
"tier3": "MiMo-V2-Pro",
|
|
149
|
+
"tier4": "MiMo-V2-Pro"
|
|
150
|
+
},
|
|
151
|
+
"vlm": {
|
|
152
|
+
"tier1": "MiMo-V2-Omni",
|
|
153
|
+
"tier2": "MiMo-V2-Omni",
|
|
154
|
+
"tier3": ""
|
|
155
|
+
}
|
|
156
|
+
},
|
|
157
|
+
|
|
126
158
|
"openrouter": {
|
|
127
159
|
"conductor": "anthropic/claude-sonnet-4-20250514",
|
|
128
160
|
"llm": {
|
package/src/providers.js
CHANGED
|
@@ -211,6 +211,51 @@ const PROVIDERS = [
|
|
|
211
211
|
zh: "MiniMax",
|
|
212
212
|
},
|
|
213
213
|
},
|
|
214
|
+
{
|
|
215
|
+
id: "deepseek",
|
|
216
|
+
name: "DeepSeek",
|
|
217
|
+
baseUrl: "https://api.deepseek.com",
|
|
218
|
+
authType: "bearer",
|
|
219
|
+
apiFormat: "openai",
|
|
220
|
+
modelsEndpoint: "/models",
|
|
221
|
+
contextLimit: 200000, // KC cap — DeepSeek v4 is native 1M; we cap to 200K
|
|
222
|
+
defaultModel: getTierConfig("deepseek").conductor || "deepseek-v4-pro",
|
|
223
|
+
defaultTiers: getTierConfig("deepseek").llm,
|
|
224
|
+
defaultVlm: getTierConfig("deepseek").vlm,
|
|
225
|
+
curatedModels: [
|
|
226
|
+
{ id: "deepseek-v4-pro", ownedBy: "deepseek" },
|
|
227
|
+
{ id: "deepseek-v4-flash", ownedBy: "deepseek" },
|
|
228
|
+
],
|
|
229
|
+
labels: {
|
|
230
|
+
en: "DeepSeek (v4 family)",
|
|
231
|
+
zh: "DeepSeek(v4 系列)",
|
|
232
|
+
},
|
|
233
|
+
},
|
|
234
|
+
{
|
|
235
|
+
id: "xiaomi",
|
|
236
|
+
name: "Xiaomi MiMo",
|
|
237
|
+
baseUrl: "https://token-plan-cn.xiaomimimo.com/v1",
|
|
238
|
+
authType: "bearer",
|
|
239
|
+
apiFormat: "openai",
|
|
240
|
+
modelsEndpoint: null, // Xiaomi coding-plan endpoint, no /models — use curated list
|
|
241
|
+
supportsCodingPlanKey: true,
|
|
242
|
+
contextLimit: 200000, // KC cap — MiMo V2.5 is native 1M
|
|
243
|
+
defaultModel: getTierConfig("xiaomi").conductor || "MiMo-V2.5-Pro",
|
|
244
|
+
defaultTiers: getTierConfig("xiaomi").llm,
|
|
245
|
+
defaultVlm: getTierConfig("xiaomi").vlm,
|
|
246
|
+
curatedModels: [
|
|
247
|
+
{ id: "MiMo-V2.5-Pro", ownedBy: "xiaomi" },
|
|
248
|
+
{ id: "MiMo-V2.5", ownedBy: "xiaomi" },
|
|
249
|
+
{ id: "MiMo-V2-Pro", ownedBy: "xiaomi" },
|
|
250
|
+
{ id: "MiMo-V2-Omni", ownedBy: "xiaomi" }, // multimodal
|
|
251
|
+
// TTS variants (MiMo-V2.5-TTS, *-VoiceClone, *-VoiceDesign, MiMo-V2-TTS)
|
|
252
|
+
// intentionally excluded — KC has no TTS use case.
|
|
253
|
+
],
|
|
254
|
+
labels: {
|
|
255
|
+
en: "Xiaomi MiMo (V2.5 family, coding plan)",
|
|
256
|
+
zh: "小米 MiMo(V2.5 系列,编程计划)",
|
|
257
|
+
},
|
|
258
|
+
},
|
|
214
259
|
{
|
|
215
260
|
id: "openrouter",
|
|
216
261
|
name: "OpenRouter",
|
|
@@ -27,6 +27,25 @@ rule-skills/
|
|
|
27
27
|
|
|
28
28
|
Not every rule needs all of these. A simple threshold check might only need SKILL.md and a script. A complex semantic rule might need detailed references and many samples. Start minimal, add as needed during testing.
|
|
29
29
|
|
|
30
|
+
## Granularity: 1 rule = 1 skill directory (default)
|
|
31
|
+
|
|
32
|
+
Default to **one rule per skill directory**. Group rules into the same file ONLY when they meet BOTH:
|
|
33
|
+
|
|
34
|
+
1. They share the same evidence (same section / same table / same field) — so locating one locates all.
|
|
35
|
+
2. They fail together — when one fails, the others almost always fail too (e.g., siblings in a required-fields list where the table itself is missing).
|
|
36
|
+
|
|
37
|
+
When grouping, name the file with the explicit range so downstream consumers (workflow-run, dashboards, finalization) can parse rule coverage by filename:
|
|
38
|
+
- ✅ `check_r013_r017.py` (R013, R014, R015, R016, R017 — same disclosure table, fail together)
|
|
39
|
+
- ❌ `check_r001_r050_r078.py` (different chapters, even if topically related — keep separate)
|
|
40
|
+
|
|
41
|
+
### Anti-pattern: the unified runner
|
|
42
|
+
|
|
43
|
+
If you find yourself writing a single `unified_qc.py` (or `batch_runner.py`, or `master_check.py`) that handles all 110 rules in one Python file, **stop**. That means your per-rule skills are wrong, not that the architecture is wrong. Fix the skills.
|
|
44
|
+
|
|
45
|
+
E2E #4 demonstrated the cost: an agent wrote `unified_qc.py` to bypass 110 individual skills it didn't trust. The result was 1,150 errors out of 6,930 production checks (16.6%) and a phase counter stuck in `production_qc` while real work happened in skill_authoring. The unified runner felt productive locally and was a global mistake.
|
|
46
|
+
|
|
47
|
+
If individual skills aren't running cleanly, the right response is to identify which ones break and fix them, not consolidate. The whole pipeline (extraction → skill_testing → distillation → production_qc) assumes one rule = one verifiable artifact.
|
|
48
|
+
|
|
30
49
|
## Writing SKILL.md
|
|
31
50
|
|
|
32
51
|
### Frontmatter
|
|
@@ -27,6 +27,25 @@ rule-skills/
|
|
|
27
27
|
|
|
28
28
|
Not every rule needs all of these. A simple threshold check might only need SKILL.md and a script. A complex semantic rule might need detailed references and many samples. Start minimal, add as needed during testing.
|
|
29
29
|
|
|
30
|
+
## 颗粒度:默认 1 条规则 = 1 个技能目录
|
|
31
|
+
|
|
32
|
+
默认**每条规则一个独立技能目录**。仅当同时满足以下两个条件时,才能把多条规则合并到同一个文件:
|
|
33
|
+
|
|
34
|
+
1. 共享同一证据(同一章节 / 同一表格 / 同一字段)——找到一条就找到了全部。
|
|
35
|
+
2. 一同成败——一条失败,其他几乎必然失败(例如必填字段表中的同辈规则,表本身缺失则全部失败)。
|
|
36
|
+
|
|
37
|
+
合并时,用显式范围命名文件,让下游消费者(workflow-run、dashboards、finalization)可以从文件名解析规则覆盖范围:
|
|
38
|
+
- ✅ `check_r013_r017.py`(R013、R014、R015、R016、R017——同一披露表格,一同失败)
|
|
39
|
+
- ❌ `check_r001_r050_r078.py`(不同章节,即使主题相关,也应分开)
|
|
40
|
+
|
|
41
|
+
### 反模式:统一运行器(unified runner)
|
|
42
|
+
|
|
43
|
+
如果你发现自己在写一个 `unified_qc.py`(或 `batch_runner.py`、`master_check.py`)把全部 110 条规则塞进一个 Python 文件里,**停下来**。这说明你的单条规则技能写错了,不是架构错了。请修复单条技能。
|
|
44
|
+
|
|
45
|
+
E2E #4 给出了代价:智能体写了一个 `unified_qc.py` 绕过它不信任的 110 个独立技能。结果是 6,930 条生产检查里出了 1,150 个错误(16.6%),阶段计数器卡在 `production_qc`,而真实工作还在 skill_authoring 里进行。统一运行器在局部看起来很高效,全局上是个错误。
|
|
46
|
+
|
|
47
|
+
如果某些独立技能跑不通,正确的应对是定位并修复出问题的那几条,而不是合并所有技能。整个流水线(extraction → skill_testing → distillation → production_qc)的前提就是「一条规则 = 一个可独立验证的产物」。
|
|
48
|
+
|
|
30
49
|
## Writing SKILL.md
|
|
31
50
|
|
|
32
51
|
### Frontmatter
|