kc-beta 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/engine.js +184 -3
- package/src/agent/pipelines/distillation.js +15 -0
- package/src/agent/pipelines/extraction.js +60 -3
- package/src/agent/pipelines/production-qc.js +63 -13
- package/src/agent/pipelines/skill-authoring.js +36 -1
- package/src/agent/task-manager.js +15 -0
- package/src/agent/tools/workflow-run.js +34 -1
package/package.json
CHANGED
package/src/agent/engine.js
CHANGED
|
@@ -165,7 +165,7 @@ export class AgentEngine {
|
|
|
165
165
|
this.pipelines = {
|
|
166
166
|
[Phase.BOOTSTRAP]: new ProjectInitializer(this.workspace),
|
|
167
167
|
[Phase.EXTRACTION]: new RuleExtractionPipeline(this.workspace),
|
|
168
|
-
[Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace),
|
|
168
|
+
[Phase.SKILL_AUTHORING]: new SkillAuthoringPipeline(this.workspace, this.taskManager),
|
|
169
169
|
[Phase.SKILL_TESTING]: new SkillTestingPipeline(this.workspace),
|
|
170
170
|
[Phase.DISTILLATION]: new DistillationPipeline(this.workspace),
|
|
171
171
|
[Phase.PRODUCTION_QC]: new ProductionQCPipeline(this.workspace),
|
|
@@ -311,7 +311,11 @@ export class AgentEngine {
|
|
|
311
311
|
// Distillation+ only (DISTILL mode)
|
|
312
312
|
distill: [
|
|
313
313
|
workerLlm,
|
|
314
|
-
new WorkflowRunTool(this.workspace, this.versionManager, this.confidence
|
|
314
|
+
new WorkflowRunTool(this.workspace, this.versionManager, this.confidence, {
|
|
315
|
+
// v0.6.1 A6: hook engine-emitted milestones so phase gates see workflow runs
|
|
316
|
+
recordMilestone: (phase, key, value) => this._recordMilestone(phase, key, value),
|
|
317
|
+
getCurrentPhase: () => this.currentPhase,
|
|
318
|
+
}),
|
|
315
319
|
new TierDowngradeTool(this.workspace, workerLlm),
|
|
316
320
|
new QCSampleTool(this.workspace),
|
|
317
321
|
],
|
|
@@ -1057,12 +1061,23 @@ export class AgentEngine {
|
|
|
1057
1061
|
return false;
|
|
1058
1062
|
}
|
|
1059
1063
|
|
|
1060
|
-
|
|
1064
|
+
// v0.6.1 B1: build engine-appended hard-counts block + heuristic mismatch
|
|
1065
|
+
// detection so the LLM-narrated reason can be cross-checked against
|
|
1066
|
+
// ground-truth telemetry. Phase summaries become diagnostic, not just
|
|
1067
|
+
// narrative.
|
|
1068
|
+
const engineCounts = this._buildEngineCountsBlock(this.currentPhase);
|
|
1069
|
+
const mismatchPrefix = this._detectSummaryMismatch(reason, this.currentPhase) ? "⚠️ POSSIBLE MISMATCH: " : "";
|
|
1070
|
+
const phaseSummary =
|
|
1071
|
+
`[${this.currentPhase.toUpperCase()} → ${nextPhase.toUpperCase()}]: ${mismatchPrefix}${reason}` +
|
|
1072
|
+
(force && nextPhase !== expected ? " (forced)" : "") +
|
|
1073
|
+
(engineCounts ? `\n (engine) ${engineCounts}` : "");
|
|
1061
1074
|
this._phaseSummaries.push(phaseSummary);
|
|
1062
1075
|
this.eventLog.append("phase_transition", {
|
|
1063
1076
|
from: this.currentPhase,
|
|
1064
1077
|
to: nextPhase,
|
|
1065
1078
|
reason,
|
|
1079
|
+
engineCounts: engineCounts || null,
|
|
1080
|
+
possibleMismatch: !!mismatchPrefix,
|
|
1066
1081
|
forced: force && nextPhase !== expected,
|
|
1067
1082
|
});
|
|
1068
1083
|
const fromPhase = this.currentPhase;
|
|
@@ -1093,6 +1108,172 @@ export class AgentEngine {
|
|
|
1093
1108
|
return true;
|
|
1094
1109
|
}
|
|
1095
1110
|
|
|
1111
|
+
/**
|
|
1112
|
+
* v0.6.1 A6: Single chokepoint for engine-emitted milestone updates.
|
|
1113
|
+
* Tools call this on successful execution to bump pipeline counters that
|
|
1114
|
+
* the phase-gate hardening (A2-A5) depends on. Without engine emission,
|
|
1115
|
+
* gates fall back to filesystem scans which can miss work that didn't
|
|
1116
|
+
* follow canonical output paths (E2E #4: `unified_qc.py` wrote to
|
|
1117
|
+
* `output/results/`, production-qc only scanned `output/qc/`).
|
|
1118
|
+
*
|
|
1119
|
+
* The mutation routes through the pipeline's existing internal state, so
|
|
1120
|
+
* exportState/importState round-trips work unchanged and the gate sees a
|
|
1121
|
+
* unified view of (filesystem-scanned + engine-emitted) signals.
|
|
1122
|
+
*
|
|
1123
|
+
* Three modes inferred from value shape:
|
|
1124
|
+
* - increment counter: pipeline[key] is number, value is number → add
|
|
1125
|
+
* - set in dict-by-id: pipeline[key] is object, value is { id, value? } → assign
|
|
1126
|
+
* - dedupe-add to array: pipeline[key] is array, value is string → push if absent
|
|
1127
|
+
*
|
|
1128
|
+
* @param {string} phase - Pipeline name (e.g., "distillation")
|
|
1129
|
+
* @param {string} key - Field on the pipeline (e.g., "workflowsTested")
|
|
1130
|
+
* @param {*} value - Shape varies by target type (see modes above)
|
|
1131
|
+
* @returns {boolean} true if a write happened
|
|
1132
|
+
*/
|
|
1133
|
+
_recordMilestone(phase, key, value) {
|
|
1134
|
+
const pipeline = this.pipelines?.[phase];
|
|
1135
|
+
if (!pipeline) return false;
|
|
1136
|
+
const target = pipeline[key];
|
|
1137
|
+
// increment counter
|
|
1138
|
+
if (typeof target === "number" && typeof value === "number") {
|
|
1139
|
+
pipeline[key] = target + value;
|
|
1140
|
+
return true;
|
|
1141
|
+
}
|
|
1142
|
+
// set on dict-by-id
|
|
1143
|
+
if (target && typeof target === "object" && !Array.isArray(target)
|
|
1144
|
+
&& value && typeof value === "object" && "id" in value) {
|
|
1145
|
+
target[value.id] = "value" in value ? value.value : true;
|
|
1146
|
+
return true;
|
|
1147
|
+
}
|
|
1148
|
+
// dedupe-add to array
|
|
1149
|
+
if (Array.isArray(target) && typeof value === "string") {
|
|
1150
|
+
if (!target.includes(value)) target.push(value);
|
|
1151
|
+
return true;
|
|
1152
|
+
}
|
|
1153
|
+
return false;
|
|
1154
|
+
}
|
|
1155
|
+
|
|
1156
|
+
/**
|
|
1157
|
+
* v0.6.1 B1: build a one-line "engine counts" block summarizing the
|
|
1158
|
+
* pipeline's ground-truth telemetry at the moment of phase advance.
|
|
1159
|
+
* Different phases surface different metrics; we keep this short so the
|
|
1160
|
+
* appended summary line stays readable.
|
|
1161
|
+
*
|
|
1162
|
+
* @param {string} fromPhase - The phase being LEFT (we summarize its work)
|
|
1163
|
+
* @returns {string} block text, or "" if pipeline has nothing to report
|
|
1164
|
+
*/
|
|
1165
|
+
_buildEngineCountsBlock(fromPhase) {
|
|
1166
|
+
const pipeline = this.pipelines?.[fromPhase];
|
|
1167
|
+
if (!pipeline) return "";
|
|
1168
|
+
const parts = [];
|
|
1169
|
+
try {
|
|
1170
|
+
switch (fromPhase) {
|
|
1171
|
+
case "extraction": {
|
|
1172
|
+
const total = pipeline._catalogRuleCount?.() ?? pipeline.rulesExtracted?.length ?? 0;
|
|
1173
|
+
parts.push(`rulesExtracted: ${pipeline.rulesExtracted?.length ?? 0}`);
|
|
1174
|
+
parts.push(`rulesWithChunkRefs: ${pipeline.rulesWithChunkRefs?.length ?? 0}/${total}`);
|
|
1175
|
+
parts.push(`rulesWithTests: ${pipeline.rulesWithTests?.length ?? 0}`);
|
|
1176
|
+
parts.push(`coverageAudited: ${pipeline.coverageAudited ? "yes" : "no"}`);
|
|
1177
|
+
break;
|
|
1178
|
+
}
|
|
1179
|
+
case "skill_authoring": {
|
|
1180
|
+
const totalRules = pipeline.totalRules?.length ?? 0;
|
|
1181
|
+
const covered = pipeline.ruleIdsCovered?.size ?? 0;
|
|
1182
|
+
parts.push(`rulesCovered: ${covered}/${totalRules}`);
|
|
1183
|
+
parts.push(`skillDirsAuthored: ${pipeline.skillsAuthored?.length ?? 0}`);
|
|
1184
|
+
if (this.taskManager) {
|
|
1185
|
+
const t = this.taskManager.countByPhase("skill_authoring");
|
|
1186
|
+
const d = this.taskManager.countByPhase("skill_authoring", "completed");
|
|
1187
|
+
const f = this.taskManager.countByPhase("skill_authoring", "failed");
|
|
1188
|
+
parts.push(`tasksCompleted: ${d}/${t}${f > 0 ? ` (+${f} failed)` : ""}`);
|
|
1189
|
+
}
|
|
1190
|
+
break;
|
|
1191
|
+
}
|
|
1192
|
+
case "skill_testing": {
|
|
1193
|
+
const total = pipeline.skillsToTest?.length ?? 0;
|
|
1194
|
+
const tested = Object.keys(pipeline.skillsTested || {}).length;
|
|
1195
|
+
const passing = pipeline.skillsPassing?.length ?? 0;
|
|
1196
|
+
parts.push(`skillsTested: ${tested}/${total}`);
|
|
1197
|
+
parts.push(`skillsPassing: ${passing}`);
|
|
1198
|
+
parts.push(`iterations: ${pipeline.iterationCount ?? 0}`);
|
|
1199
|
+
break;
|
|
1200
|
+
}
|
|
1201
|
+
case "distillation": {
|
|
1202
|
+
const total = pipeline.skillsToDistill?.length ?? 0;
|
|
1203
|
+
const created = Object.keys(pipeline.workflowsCreated || {}).length;
|
|
1204
|
+
const tested = Object.keys(pipeline.workflowsTested || {}).length;
|
|
1205
|
+
const passing = pipeline.workflowsPassing?.length ?? 0;
|
|
1206
|
+
parts.push(`workflowsCreated: ${created}/${total}`);
|
|
1207
|
+
parts.push(`workflowsTested: ${tested}/${total}`);
|
|
1208
|
+
parts.push(`workflowsPassing: ${passing}/${total}`);
|
|
1209
|
+
break;
|
|
1210
|
+
}
|
|
1211
|
+
case "production_qc": {
|
|
1212
|
+
parts.push(`batchesProcessed: ${pipeline.batchesProcessed ?? 0}`);
|
|
1213
|
+
parts.push(`documentsReviewed: ${pipeline.documentsReviewed ?? 0}`);
|
|
1214
|
+
parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
|
|
1215
|
+
break;
|
|
1216
|
+
}
|
|
1217
|
+
// bootstrap / finalization: no specific counters, fall through
|
|
1218
|
+
}
|
|
1219
|
+
} catch { /* never let summary build break phase advance */ }
|
|
1220
|
+
return parts.join(", ");
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
/**
|
|
1224
|
+
* v0.6.1 B1: heuristic mismatch detection. Conservative regex over the
|
|
1225
|
+
* LLM's free-form reason for percentages and counts, compared against
|
|
1226
|
+
* engine truth. INFORMATIONAL only — never blocks the transition. False
|
|
1227
|
+
* positives are acceptable (the warning is a hint to the human reviewer,
|
|
1228
|
+
* not a hard signal). False negatives are also acceptable (this catches
|
|
1229
|
+
* the loud, numerical claims; subtle ones still slip through).
|
|
1230
|
+
*
|
|
1231
|
+
* Returns true if the agent's reason mentions a count or percentage that
|
|
1232
|
+
* doesn't match engine state.
|
|
1233
|
+
*/
|
|
1234
|
+
_detectSummaryMismatch(reason, fromPhase) {
|
|
1235
|
+
if (!reason || typeof reason !== "string") return false;
|
|
1236
|
+
const pipeline = this.pipelines?.[fromPhase];
|
|
1237
|
+
if (!pipeline) return false;
|
|
1238
|
+
try {
|
|
1239
|
+
// Match "N/M" fractions and standalone counts
|
|
1240
|
+
const fractionMatches = [...reason.matchAll(/(\d+)\s*\/\s*(\d+)/g)];
|
|
1241
|
+
// Match "N rules / skills / workflows / tasks"
|
|
1242
|
+
const countMatches = [...reason.matchAll(/(\d+)\s*(rules?|skills?|workflows?|tasks?|条规则|个技能)/gi)];
|
|
1243
|
+
// Match accuracy claims like "95%", "0.95"
|
|
1244
|
+
const pctMatches = [...reason.matchAll(/(\d+(?:\.\d+)?)\s*%/g)];
|
|
1245
|
+
|
|
1246
|
+
// Phase-specific cross-checks (cheap conservative comparisons)
|
|
1247
|
+
if (fromPhase === "skill_authoring" && this.taskManager) {
|
|
1248
|
+
const completed = this.taskManager.countByPhase("skill_authoring", "completed");
|
|
1249
|
+
const total = this.taskManager.countByPhase("skill_authoring");
|
|
1250
|
+
for (const m of fractionMatches) {
|
|
1251
|
+
const claimedDone = parseInt(m[1], 10);
|
|
1252
|
+
const claimedTotal = parseInt(m[2], 10);
|
|
1253
|
+
if (claimedTotal === total && claimedDone > completed + 5) return true;
|
|
1254
|
+
}
|
|
1255
|
+
}
|
|
1256
|
+
if (fromPhase === "skill_testing") {
|
|
1257
|
+
const tested = Object.keys(pipeline.skillsTested || {}).length;
|
|
1258
|
+
const passing = pipeline.skillsPassing?.length ?? 0;
|
|
1259
|
+
for (const m of pctMatches) {
|
|
1260
|
+
const claimed = parseFloat(m[1]);
|
|
1261
|
+
// If claimed > 50% but engine sees 0 tested, that's suspicious
|
|
1262
|
+
if (claimed >= 50 && tested === 0 && passing === 0) return true;
|
|
1263
|
+
}
|
|
1264
|
+
}
|
|
1265
|
+
if (fromPhase === "production_qc") {
|
|
1266
|
+
const batches = pipeline.batchesProcessed ?? 0;
|
|
1267
|
+
// Any "complete" or large-count claim while batches==0 is suspicious
|
|
1268
|
+
if (batches === 0) {
|
|
1269
|
+
if (countMatches.some((m) => parseInt(m[1], 10) > 10)) return true;
|
|
1270
|
+
if (pctMatches.some((m) => parseFloat(m[1]) > 50)) return true;
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
} catch { /* informational only — never block */ }
|
|
1274
|
+
return false;
|
|
1275
|
+
}
|
|
1276
|
+
|
|
1096
1277
|
/**
|
|
1097
1278
|
* Bug 4 trigger (1) auto-detect, edge-triggered (Bug 5): only fires on a
|
|
1098
1279
|
* fresh false → true flip in `exitCriteriaMet()`. Sessions resumed in an
|
|
@@ -40,6 +40,13 @@ export class DistillationEngine extends Pipeline {
|
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
_scanWorkflows() {
|
|
43
|
+
// v0.6.1 A6: preserve engine-emitted entries across filesystem rescans.
|
|
44
|
+
// workflow_run hook bumps workflowsTested[ruleId] and adds to
|
|
45
|
+
// workflowsPassing on success — without this preservation, those entries
|
|
46
|
+
// get clobbered on the next describeState() / onToolResult() rescan.
|
|
47
|
+
const engineWfTested = { ...this.workflowsTested };
|
|
48
|
+
const engineWfPassing = [...this.workflowsPassing];
|
|
49
|
+
|
|
43
50
|
this.workflowsCreated = {};
|
|
44
51
|
this.workflowsTested = {};
|
|
45
52
|
this.workflowsPassing = [];
|
|
@@ -68,6 +75,14 @@ export class DistillationEngine extends Pipeline {
|
|
|
68
75
|
this.workflowsCreated[path.parse(e.name).name] = 1;
|
|
69
76
|
}
|
|
70
77
|
}
|
|
78
|
+
|
|
79
|
+
// Re-merge engine-emitted entries on top of filesystem-derived state
|
|
80
|
+
for (const [k, v] of Object.entries(engineWfTested)) {
|
|
81
|
+
if (!(k in this.workflowsTested)) this.workflowsTested[k] = v;
|
|
82
|
+
}
|
|
83
|
+
for (const id of engineWfPassing) {
|
|
84
|
+
if (!this.workflowsPassing.includes(id)) this.workflowsPassing.push(id);
|
|
85
|
+
}
|
|
71
86
|
}
|
|
72
87
|
|
|
73
88
|
describeState() {
|
|
@@ -11,6 +11,11 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
11
11
|
this.rulesExtracted = [];
|
|
12
12
|
this.rulesWithTests = [];
|
|
13
13
|
this.coverageAudited = false;
|
|
14
|
+
// v0.6.1 A1: track which rules in catalog.json have non-empty
|
|
15
|
+
// source_chunk_ids — D1 grounded skill_authoring prompts on these but
|
|
16
|
+
// exit didn't require them, so a sloppy extraction could leave rules
|
|
17
|
+
// unmoored.
|
|
18
|
+
this.rulesWithChunkRefs = [];
|
|
14
19
|
this._scanWorkspace();
|
|
15
20
|
}
|
|
16
21
|
|
|
@@ -28,11 +33,21 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
28
33
|
|
|
29
34
|
_scanRules() {
|
|
30
35
|
this.rulesExtracted = [];
|
|
36
|
+
this.rulesWithChunkRefs = [];
|
|
31
37
|
const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
|
|
32
38
|
if (fs.existsSync(catalogPath)) {
|
|
33
39
|
try {
|
|
34
40
|
const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
|
|
35
|
-
if (Array.isArray(data))
|
|
41
|
+
if (Array.isArray(data)) {
|
|
42
|
+
this.rulesExtracted = data.map((r, i) => r.id || `rule_${i}`);
|
|
43
|
+
// A1: collect ids whose entry has non-empty source_chunk_ids
|
|
44
|
+
for (const r of data) {
|
|
45
|
+
const ids = r?.source_chunk_ids;
|
|
46
|
+
if (Array.isArray(ids) && ids.length > 0 && r?.id) {
|
|
47
|
+
this.rulesWithChunkRefs.push(r.id);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
36
51
|
} catch { /* skip */ }
|
|
37
52
|
}
|
|
38
53
|
const skillsDir = path.join(this._workspace.cwd, "rule_skills");
|
|
@@ -67,10 +82,43 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
67
82
|
parts.push("### Exit\nExtraction complete. Proceed to SKILL_AUTHORING.");
|
|
68
83
|
}
|
|
69
84
|
|
|
70
|
-
|
|
85
|
+
const chunkRefsOk = this._chunkRefsCriterionMet();
|
|
86
|
+
parts.push(
|
|
87
|
+
`### Exit criteria\n` +
|
|
88
|
+
`- [${this.regulationsScanned ? "x" : " "}] All regulations read\n` +
|
|
89
|
+
`- [${this.rulesExtracted.length > 0 ? "x" : " "}] Rules decomposed into atomic units\n` +
|
|
90
|
+
`- [${this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) ? "x" : " "}] >=80% of rules have test stubs\n` +
|
|
91
|
+
`- [${this.coverageAudited ? "x" : " "}] Coverage audit completed\n` +
|
|
92
|
+
`- [${chunkRefsOk ? "x" : " "}] Every rule has source_chunk_ids in catalog.json (${this.rulesWithChunkRefs.length}/${this._catalogRuleCount()})`,
|
|
93
|
+
);
|
|
71
94
|
return parts.join("\n\n");
|
|
72
95
|
}
|
|
73
96
|
|
|
97
|
+
/**
|
|
98
|
+
* v0.6.1 A1: number of rules currently in catalog.json (not the union with
|
|
99
|
+
* rule_skills/ dirs that rulesExtracted carries). Used by the chunk-refs
|
|
100
|
+
* gate so we compare apples to apples.
|
|
101
|
+
*/
|
|
102
|
+
_catalogRuleCount() {
|
|
103
|
+
const catalogPath = path.join(this._workspace.cwd, "rules", "catalog.json");
|
|
104
|
+
if (!fs.existsSync(catalogPath)) return 0;
|
|
105
|
+
try {
|
|
106
|
+
const data = JSON.parse(fs.readFileSync(catalogPath, "utf-8"));
|
|
107
|
+
return Array.isArray(data) ? data.length : 0;
|
|
108
|
+
} catch { return 0; }
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* v0.6.1 A1: pass when every rule in catalog.json has a non-empty
|
|
113
|
+
* source_chunk_ids array. Empty catalog (legacy / pre-D1 sessions) passes
|
|
114
|
+
* trivially so resume of v0.6.0 sessions doesn't get trapped.
|
|
115
|
+
*/
|
|
116
|
+
_chunkRefsCriterionMet() {
|
|
117
|
+
const total = this._catalogRuleCount();
|
|
118
|
+
if (total === 0) return true; // backwards-compat for sessions pre-D1
|
|
119
|
+
return this.rulesWithChunkRefs.length >= total;
|
|
120
|
+
}
|
|
121
|
+
|
|
74
122
|
onToolResult(toolName, toolInput, result) {
|
|
75
123
|
if (result.isError) return null;
|
|
76
124
|
const wasReady = this.exitCriteriaMet();
|
|
@@ -85,7 +133,12 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
85
133
|
|
|
86
134
|
exitCriteriaMet() {
|
|
87
135
|
return this.regulationsScanned && this.rulesExtracted.length > 0 &&
|
|
88
|
-
this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) &&
|
|
136
|
+
this.rulesWithTests.length >= Math.max(this.rulesExtracted.length * 0.8, 1) &&
|
|
137
|
+
this.coverageAudited &&
|
|
138
|
+
// v0.6.1 A1: hard tracking — D1 source-context auto-attach requires
|
|
139
|
+
// catalog.json entries to carry source_chunk_ids. Without them the
|
|
140
|
+
// skill_authoring prompts are blind.
|
|
141
|
+
this._chunkRefsCriterionMet();
|
|
89
142
|
}
|
|
90
143
|
|
|
91
144
|
exportState() {
|
|
@@ -93,6 +146,7 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
93
146
|
regulationsScanned: this.regulationsScanned,
|
|
94
147
|
rulesExtracted: this.rulesExtracted,
|
|
95
148
|
rulesWithTests: this.rulesWithTests,
|
|
149
|
+
rulesWithChunkRefs: this.rulesWithChunkRefs,
|
|
96
150
|
coverageAudited: this.coverageAudited,
|
|
97
151
|
};
|
|
98
152
|
}
|
|
@@ -107,5 +161,8 @@ export class RuleExtractionPipeline extends Pipeline {
|
|
|
107
161
|
if (Array.isArray(data.rulesWithTests) && data.rulesWithTests.length > this.rulesWithTests.length) {
|
|
108
162
|
this.rulesWithTests = data.rulesWithTests;
|
|
109
163
|
}
|
|
164
|
+
if (Array.isArray(data.rulesWithChunkRefs) && data.rulesWithChunkRefs.length > this.rulesWithChunkRefs.length) {
|
|
165
|
+
this.rulesWithChunkRefs = data.rulesWithChunkRefs;
|
|
166
|
+
}
|
|
110
167
|
}
|
|
111
168
|
}
|
|
@@ -36,6 +36,11 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
_scanQcResults() {
|
|
39
|
+
// v0.6.1 A5/A6: don't reset documentsReviewed if engine emission has
|
|
40
|
+
// bumped it since last scan — workflow_run hooks call _recordMilestone
|
|
41
|
+
// and the increment lives in this same field. Other counters (batches,
|
|
42
|
+
// accuracy, issues) come solely from filesystem scan and reset cleanly.
|
|
43
|
+
const engineDocsReviewed = this.documentsReviewed;
|
|
39
44
|
this.batchesProcessed = 0;
|
|
40
45
|
this.totalDocuments = 0;
|
|
41
46
|
this.documentsReviewed = 0;
|
|
@@ -43,23 +48,57 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
43
48
|
this.confidenceDistribution = { low: 0, medium: 0, high: 0 };
|
|
44
49
|
this.issuesFound = [];
|
|
45
50
|
|
|
51
|
+
// Existing canonical path: output/qc/*.json (formal QC batch reports)
|
|
46
52
|
const qcDir = path.join(this._workspace.cwd, "output", "qc");
|
|
47
|
-
if (
|
|
53
|
+
if (fs.existsSync(qcDir)) {
|
|
54
|
+
for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).sort()) {
|
|
55
|
+
try {
|
|
56
|
+
const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
|
|
57
|
+
this.batchesProcessed++;
|
|
58
|
+
this.totalDocuments += typeof data.documents === "number" ? data.documents : (data.total || 0);
|
|
59
|
+
this.documentsReviewed += data.reviewed || 0;
|
|
60
|
+
if (data.accuracy_by_rule) Object.assign(this.accuracyByRule, data.accuracy_by_rule);
|
|
61
|
+
if (data.confidence) {
|
|
62
|
+
for (const band of ["low", "medium", "high"]) this.confidenceDistribution[band] += data.confidence[band] || 0;
|
|
63
|
+
}
|
|
64
|
+
if (Array.isArray(data.issues)) this.issuesFound.push(...data.issues);
|
|
65
|
+
} catch { /* skip */ }
|
|
66
|
+
}
|
|
67
|
+
}
|
|
48
68
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
69
|
+
// v0.6.1 A5: also pick up batch-style results in output/results/. E2E #4
|
|
70
|
+
// showed agents writing batch QC outputs to output/results/qc_*.json
|
|
71
|
+
// (e.g. unified_qc.py) instead of output/qc/, so the formal scanner
|
|
72
|
+
// missed them. Heuristic match: filename starts with "qc_" or contains
|
|
73
|
+
// "_batch_". Each match counts as one batch; total_checks → totalDocuments.
|
|
74
|
+
const resultsDir = path.join(this._workspace.cwd, "output", "results");
|
|
75
|
+
if (fs.existsSync(resultsDir)) {
|
|
76
|
+
const seen = new Set();
|
|
77
|
+
for (const f of fs.readdirSync(resultsDir).filter((f) => f.endsWith(".json"))) {
|
|
78
|
+
const lower = f.toLowerCase();
|
|
79
|
+
if (!(lower.startsWith("qc_") || lower.includes("_batch_"))) continue;
|
|
80
|
+
// Dedupe near-duplicate filenames that differ only by timestamp
|
|
81
|
+
// suffix (qc_full_batch_20260424_141642.json vs _141921.json
|
|
82
|
+
// — both are real batches, keep both. But qc_pt_x.json and
|
|
83
|
+
// qc_pt_x_<ts>.json are usually the same batch saved twice; key
|
|
84
|
+
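// on the prefix before any 8-digit date.) NOTE(review): the key below strips every _YYYYMMDD_HHMMSS run, so two timestamped files sharing a prefix also collapse into one entry — confirm this matches the "keep both" intent above.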
|
|
85
|
+
const key = f.replace(/_\d{8}_\d{6}/g, "").replace(/\.json$/, "");
|
|
86
|
+
if (seen.has(key)) continue;
|
|
87
|
+
seen.add(key);
|
|
52
88
|
this.batchesProcessed++;
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
}
|
|
89
|
+
try {
|
|
90
|
+
const data = JSON.parse(fs.readFileSync(path.join(resultsDir, f), "utf-8"));
|
|
91
|
+
// Best-effort metric extraction; tolerate missing keys
|
|
92
|
+
this.totalDocuments += typeof data.sample_count === "number" ? data.sample_count
|
|
93
|
+
: typeof data.documents === "number" ? data.documents
|
|
94
|
+
: typeof data.total === "number" ? data.total : 0;
|
|
95
|
+
} catch { /* skip */ }
|
|
96
|
+
}
|
|
61
97
|
}
|
|
62
98
|
|
|
99
|
+
// Restore engine-emitted documentsReviewed if filesystem reported less
|
|
100
|
+
if (engineDocsReviewed > this.documentsReviewed) this.documentsReviewed = engineDocsReviewed;
|
|
101
|
+
|
|
63
102
|
// Determine monitoring phase
|
|
64
103
|
if (this.batchesProcessed < 3) this.monitoringPhase = "initial";
|
|
65
104
|
else if (this.issuesFound.length > 0) this.monitoringPhase = "active";
|
|
@@ -93,7 +132,18 @@ export class ProductionQCPipeline extends Pipeline {
|
|
|
93
132
|
return null;
|
|
94
133
|
}
|
|
95
134
|
|
|
96
|
-
|
|
135
|
+
/**
|
|
136
|
+
* v0.6.1 A5: gate requires at least one batch processed (real telemetry)
|
|
137
|
+
* AND the legacy stable-monitoring criterion. Without the batch floor, the
|
|
138
|
+
* agent could declare PRODUCTION_QC done from a clean session-state file
|
|
139
|
+
* (E2E #4: phase advanced into PRODUCTION_QC, agent ran 6,930 checks via
|
|
140
|
+
* sandbox_exec to non-canonical paths, batchesProcessed stayed 0, exit
|
|
141
|
+
* fired anyway because monitoringPhase defaults can flip to "stable" with
|
|
142
|
+
* empty accuracyByRule + zero issues).
|
|
143
|
+
*/
|
|
144
|
+
exitCriteriaMet() {
|
|
145
|
+
return this.batchesProcessed > 0 && this.monitoringPhase === "stable";
|
|
146
|
+
}
|
|
97
147
|
|
|
98
148
|
exportState() {
|
|
99
149
|
return {
|
|
@@ -4,9 +4,18 @@ import { Phase, PipelineEvent } from "./index.js";
|
|
|
4
4
|
import { Pipeline } from "./base.js";
|
|
5
5
|
|
|
6
6
|
export class SkillAuthoringPipeline extends Pipeline {
|
|
7
|
-
|
|
7
|
+
/**
|
|
8
|
+
* @param {Workspace} workspace
|
|
9
|
+
* @param {TaskManager|null} [taskManager] - v0.6.1 A2: pass the engine's
|
|
10
|
+
* TaskManager so exitCriteriaMet can require task-completion parity in
|
|
11
|
+
* addition to D2 filename coverage. Subagents pass null (no taskManager
|
|
12
|
+
* in subagent scope), in which case the gate falls back to D2-only
|
|
13
|
+
* behaviour.
|
|
14
|
+
*/
|
|
15
|
+
constructor(workspace, taskManager = null) {
|
|
8
16
|
super();
|
|
9
17
|
this._workspace = workspace;
|
|
18
|
+
this._taskManager = taskManager;
|
|
10
19
|
this.totalRules = [];
|
|
11
20
|
this.skillsAuthored = [];
|
|
12
21
|
this.skillsWithScripts = [];
|
|
@@ -132,12 +141,24 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
132
141
|
"`rule_catalog` tool for any catalog edits — sandbox_exec bypasses the " +
|
|
133
142
|
"workspace file lock and races with parallel workers."
|
|
134
143
|
];
|
|
144
|
+
// v0.6.1 A2: surface task-completion parity so the agent sees the gate
|
|
145
|
+
let taskLine = "";
|
|
146
|
+
if (this._taskManager) {
|
|
147
|
+
const totalT = this._taskManager.countByPhase("skill_authoring");
|
|
148
|
+
const doneT = this._taskManager.countByPhase("skill_authoring", "completed");
|
|
149
|
+
const failedT = this._taskManager.countByPhase("skill_authoring", "failed");
|
|
150
|
+
if (totalT > 0) {
|
|
151
|
+
taskLine = `\n- Per-rule tasks completed: ${doneT}/${totalT}` +
|
|
152
|
+
(failedT > 0 ? ` (+${failedT} failed)` : "");
|
|
153
|
+
}
|
|
154
|
+
}
|
|
135
155
|
parts.push(
|
|
136
156
|
`### Progress (rule-id coverage, D2)\n` +
|
|
137
157
|
`- Total rules in catalog: ${total}\n` +
|
|
138
158
|
`- Rule ids covered by some skill: ${covered}\n` +
|
|
139
159
|
`- Skill directories authored: ${this.skillsAuthored.length}\n` +
|
|
140
160
|
`- Skills with scripts/: ${this.skillsWithScripts.length}` +
|
|
161
|
+
taskLine +
|
|
141
162
|
(uncovered.length > 0
|
|
142
163
|
? `\n- Missing coverage (${uncovered.length}): ${uncovered.slice(0, 15).join(", ")}${uncovered.length > 15 ? "…" : ""}`
|
|
143
164
|
: ""),
|
|
@@ -169,6 +190,20 @@ export class SkillAuthoringPipeline extends Pipeline {
|
|
|
169
190
|
// preserved as a secondary gate on skill depth.
|
|
170
191
|
const allCovered = this.totalRules.every((r) => this.ruleIdsCovered.has(r));
|
|
171
192
|
if (!allCovered) return false;
|
|
193
|
+
// v0.6.1 A2: tasks-parity gate. The 17-minute skill_authoring transition
|
|
194
|
+
// in E2E #4 happened because D2 fired on 20 skeleton SK01-SK20 dirs
|
|
195
|
+
// covering all 110 rule_ids by filename, while only ~5 of 110 per-rule
|
|
196
|
+
// skill_authoring tasks had actually been worked on. Now require every
|
|
197
|
+
// per-rule task in TaskManager to be in a terminal state (completed or
|
|
198
|
+
// failed). Subagents (no taskManager) skip this gate.
|
|
199
|
+
if (this._taskManager) {
|
|
200
|
+
const total = this._taskManager.countByPhase("skill_authoring");
|
|
201
|
+
if (total > 0) {
|
|
202
|
+
const completed = this._taskManager.countByPhase("skill_authoring", "completed");
|
|
203
|
+
const failed = this._taskManager.countByPhase("skill_authoring", "failed");
|
|
204
|
+
if (completed + failed < total) return false;
|
|
205
|
+
}
|
|
206
|
+
}
|
|
172
207
|
return this.skillsWithScripts.length >= Math.max(1, this.skillsAuthored.length * 0.5);
|
|
173
208
|
}
|
|
174
209
|
|
|
@@ -182,6 +182,21 @@ export class TaskManager {
|
|
|
182
182
|
return { total, completed, inProgress, pending, failed };
|
|
183
183
|
}
|
|
184
184
|
|
|
185
|
+
/**
|
|
186
|
+
* v0.6.1 A2: Phase-scoped task count. Used by SkillAuthoringPipeline's
|
|
187
|
+
* exitCriteriaMet to gate phase advance on TaskManager parity, not just
|
|
188
|
+
* filename-regex coverage. Pass a status to filter; omit for total.
|
|
189
|
+
*
|
|
190
|
+
* @param {string} phase - Phase name (e.g., "skill_authoring")
|
|
191
|
+
* @param {string|null} [status] - Optional status filter ("completed", "pending", etc.)
|
|
192
|
+
* @returns {number}
|
|
193
|
+
*/
|
|
194
|
+
countByPhase(phase, status = null) {
|
|
195
|
+
return this._tasks.filter(
|
|
196
|
+
(t) => t.phase === phase && (status == null || t.status === status),
|
|
197
|
+
).length;
|
|
198
|
+
}
|
|
199
|
+
|
|
185
200
|
/**
|
|
186
201
|
* Format task list for injection into system prompt context.
|
|
187
202
|
* Compact checklist — not conversation history.
|
|
@@ -9,12 +9,33 @@ import { BaseTool, ToolResult } from "./base.js";
|
|
|
9
9
|
* result and trace ID automatically. Saves structured result to output/results/.
|
|
10
10
|
*/
|
|
11
11
|
export class WorkflowRunTool extends BaseTool {
|
|
12
|
-
|
|
12
|
+
/**
|
|
13
|
+
* @param {Workspace} workspace
|
|
14
|
+
* @param {VersionManager} versionManager
|
|
15
|
+
* @param {ConfidenceScorer} confidenceScorer
|
|
16
|
+
* @param {object} [opts]
|
|
17
|
+
* @param {number} [opts.timeout=120]
|
|
18
|
+
* @param {(phase: string, key: string, value: any) => boolean} [opts.recordMilestone]
|
|
19
|
+
* v0.6.1 A6: callback for engine-emitted milestone updates. Called on
|
|
20
|
+
* successful workflow execution so the distillation/production_qc gates
|
|
21
|
+
* see real telemetry, not just filesystem scans of canonical paths.
|
|
22
|
+
* @param {() => string} [opts.getCurrentPhase]
|
|
23
|
+
* v0.6.1 A6: returns the engine's current phase. Used to gate
|
|
24
|
+
* production_qc-specific milestone bumps (documentsReviewed) so
|
|
25
|
+
* distillation-phase calls don't accidentally credit QC.
|
|
26
|
+
*/
|
|
27
|
+
constructor(workspace, versionManager, confidenceScorer, {
|
|
28
|
+
timeout = 120,
|
|
29
|
+
recordMilestone = null,
|
|
30
|
+
getCurrentPhase = null,
|
|
31
|
+
} = {}) {
|
|
13
32
|
super();
|
|
14
33
|
this._workspace = workspace;
|
|
15
34
|
this._versionMgr = versionManager;
|
|
16
35
|
this._confidence = confidenceScorer;
|
|
17
36
|
this._timeout = timeout;
|
|
37
|
+
this._recordMilestone = recordMilestone;
|
|
38
|
+
this._getCurrentPhase = getCurrentPhase;
|
|
18
39
|
}
|
|
19
40
|
|
|
20
41
|
get name() { return "workflow_run"; }
|
|
@@ -97,6 +118,18 @@ export class WorkflowRunTool extends BaseTool {
|
|
|
97
118
|
const resultFile = path.join(resultsDir, `${ruleId}_${path.parse(docResolved).name}.json`);
|
|
98
119
|
fs.writeFileSync(resultFile, JSON.stringify(resultData, null, 2), "utf-8");
|
|
99
120
|
|
|
121
|
+
// v0.6.1 A6: emit milestone signals so phase gates see this run.
|
|
122
|
+
// Wrapped in try/catch so milestone emission can never break a workflow.
|
|
123
|
+
try {
|
|
124
|
+
this._recordMilestone?.("distillation", "workflowsTested",
|
|
125
|
+
{ id: ruleId, value: { confidence, traceId: resultData.trace_id } });
|
|
126
|
+
this._recordMilestone?.("distillation", "workflowsPassing", ruleId);
|
|
127
|
+
const phase = this._getCurrentPhase?.();
|
|
128
|
+
if (phase === "production_qc") {
|
|
129
|
+
this._recordMilestone?.("production_qc", "documentsReviewed", 1);
|
|
130
|
+
}
|
|
131
|
+
} catch { /* never let milestone emission break workflow execution */ }
|
|
132
|
+
|
|
100
133
|
return new ToolResult(JSON.stringify(resultData, null, 2));
|
|
101
134
|
}
|
|
102
135
|
|