kc-beta 0.6.2 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +382 -19
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +566 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +130 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +97 -80
  21. package/src/agent/pipelines/skill-testing.js +106 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +18 -1
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +19 -5
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/document-chunk.js +21 -9
  29. package/src/agent/tools/phase-advance.js +37 -5
  30. package/src/agent/tools/release.js +51 -9
  31. package/src/agent/tools/rule-catalog.js +11 -1
  32. package/src/agent/tools/workspace-file.js +32 -0
  33. package/src/agent/workspace.js +39 -1
  34. package/src/cli/components.js +64 -14
  35. package/src/cli/index.js +62 -3
  36. package/src/cli/meme.js +26 -25
  37. package/src/config.js +65 -22
  38. package/src/model-tiers.json +24 -8
  39. package/src/providers.js +42 -0
  40. package/template/release/v1/README.md.tmpl +108 -0
  41. package/template/release/v1/catalog.json.tmpl +4 -0
  42. package/template/release/v1/kc_runtime/__init__.py +11 -0
  43. package/template/release/v1/kc_runtime/confidence.py +63 -0
  44. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  45. package/template/release/v1/manifest.json.tmpl +11 -0
  46. package/template/release/v1/render_dashboard.py +117 -0
  47. package/template/release/v1/run.py +212 -0
  48. package/template/release/v1/serve.sh +17 -0
  49. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +326 -0
  50. package/template/skills/en/skill-creator/SKILL.md +1 -1
  51. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +321 -0
  52. package/template/skills/zh/skill-creator/SKILL.md +1 -1
@@ -2,6 +2,7 @@ import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { PipelineEvent } from "./index.js";
4
4
  import { Pipeline } from "./base.js";
5
+ import { deriveProductionQcMilestones } from "./_milestone-derive.js";
5
6
 
6
7
  const FREQUENCY_MAP = { high: 1.0, mid: 0.5, low: 0.2 };
7
8
 
@@ -36,27 +37,31 @@ export class ProductionQCPipeline extends Pipeline {
36
37
  }
37
38
 
38
39
  _scanQcResults() {
39
- // v0.6.1 A5/A6: don't reset documentsReviewed if engine emission has
40
- // bumped it since last scan — workflow_run hooks call _recordMilestone
41
- // and the increment lives in this same field. Other counters (batches,
42
- // accuracy, issues) come solely from filesystem scan and reset cleanly.
40
+ // v0.7.0 A1: route through filesystem-derived helper. The helper
41
+ // recognizes both DS-style results (object with `results` keyed by
42
+ // rule_id, doc-paths in nested keys) AND GLM-style array-of-verdicts
43
+ // (one entry per doc with .verdict/.file/.path) — neither matched
44
+ // the v0.6.1 A5 heuristic alone, so E2E #5 saw batchesProcessed=0
45
+ // even with 1,951 verdicts on disk.
43
46
  const engineDocsReviewed = this.documentsReviewed;
44
- this.batchesProcessed = 0;
47
+ const m = deriveProductionQcMilestones(this._workspace);
48
+ this.batchesProcessed = m.batchesProcessed;
49
+ this.documentsReviewed = m.documentsReviewed;
50
+
51
+ // Layered: still extract accuracyByRule / confidence / issues from
52
+ // canonical output/qc/*.json batches when present. The helper
53
+ // doesn't try to reconstruct accuracy semantics (too schema-specific),
54
+ // but if the agent followed canonical schema, we surface it.
45
55
  this.totalDocuments = 0;
46
- this.documentsReviewed = 0;
47
56
  this.accuracyByRule = {};
48
57
  this.confidenceDistribution = { low: 0, medium: 0, high: 0 };
49
58
  this.issuesFound = [];
50
-
51
- // Existing canonical path: output/qc/*.json (formal QC batch reports)
52
59
  const qcDir = path.join(this._workspace.cwd, "output", "qc");
53
60
  if (fs.existsSync(qcDir)) {
54
61
  for (const f of fs.readdirSync(qcDir).filter((f) => f.endsWith(".json")).sort()) {
55
62
  try {
56
63
  const data = JSON.parse(fs.readFileSync(path.join(qcDir, f), "utf-8"));
57
- this.batchesProcessed++;
58
64
  this.totalDocuments += typeof data.documents === "number" ? data.documents : (data.total || 0);
59
- this.documentsReviewed += data.reviewed || 0;
60
65
  if (data.accuracy_by_rule) Object.assign(this.accuracyByRule, data.accuracy_by_rule);
61
66
  if (data.confidence) {
62
67
  for (const band of ["low", "medium", "high"]) this.confidenceDistribution[band] += data.confidence[band] || 0;
@@ -66,44 +71,26 @@ export class ProductionQCPipeline extends Pipeline {
66
71
  }
67
72
  }
68
73
 
69
- // v0.6.1 A5: also pick up batch-style results in output/results/. E2E #4
70
- // showed agents writing batch QC outputs to output/results/qc_*.json
71
- // (e.g. unified_qc.py) instead of output/qc/, so the formal scanner
72
- // missed them. Heuristic match: filename starts with "qc_" or contains
73
- // "_batch_". Each match counts as one batch; total_checks → totalDocuments.
74
- const resultsDir = path.join(this._workspace.cwd, "output", "results");
75
- if (fs.existsSync(resultsDir)) {
76
- const seen = new Set();
77
- for (const f of fs.readdirSync(resultsDir).filter((f) => f.endsWith(".json"))) {
78
- const lower = f.toLowerCase();
79
- if (!(lower.startsWith("qc_") || lower.includes("_batch_"))) continue;
80
- // Dedupe near-duplicate filenames that differ only by timestamp
81
- // suffix (qc_full_batch_20260424_141642.json vs _141921.json
82
- // — both are real batches, keep both. But qc_pt_x.json and
83
- // qc_pt_x_<ts>.json are usually the same batch saved twice; key
84
- // on the prefix before any 8-digit date.)
85
- const key = f.replace(/_\d{8}_\d{6}/g, "").replace(/\.json$/, "");
86
- if (seen.has(key)) continue;
87
- seen.add(key);
88
- this.batchesProcessed++;
89
- try {
90
- const data = JSON.parse(fs.readFileSync(path.join(resultsDir, f), "utf-8"));
91
- // Best-effort metric extraction; tolerate missing keys
92
- this.totalDocuments += typeof data.sample_count === "number" ? data.sample_count
93
- : typeof data.documents === "number" ? data.documents
94
- : typeof data.total === "number" ? data.total : 0;
95
- } catch { /* skip */ }
96
- }
97
- }
98
-
99
- // Restore engine-emitted documentsReviewed if filesystem reported less
74
+ // Restore engine-emitted documentsReviewed if disk-derived is lower
75
+ // (engine increment may know about reviews not yet flushed to disk)
100
76
  if (engineDocsReviewed > this.documentsReviewed) this.documentsReviewed = engineDocsReviewed;
101
77
 
102
- // Determine monitoring phase
78
+ // Determine monitoring phase. v0.7.0 H5 fix: empty accuracyByRule
79
+ // no longer flips to "stable" via vacuous truth — require at least
80
+ // one rule with an accuracy reading before claiming stability.
103
81
  if (this.batchesProcessed < 3) this.monitoringPhase = "initial";
104
82
  else if (this.issuesFound.length > 0) this.monitoringPhase = "active";
105
- else if (Object.values(this.accuracyByRule).every((a) => a >= this._accuracyThreshold)) this.monitoringPhase = "stable";
106
- else this.monitoringPhase = "active";
83
+ else {
84
+ const accuracies = Object.values(this.accuracyByRule);
85
+ if (accuracies.length > 0 && accuracies.every((a) => a >= this._accuracyThreshold)) {
86
+ this.monitoringPhase = "stable";
87
+ } else {
88
+ // Helper-derived batches with no accuracy data: agent ran QC but
89
+ // didn't surface accuracy schema. Treat as `active` (work
90
+ // happened, but engine can't auto-bless stability).
91
+ this.monitoringPhase = "active";
92
+ }
93
+ }
107
94
  }
108
95
 
109
96
  describeState() {
@@ -3,6 +3,7 @@ import path from "node:path";
3
3
  import { Phase, PipelineEvent } from "./index.js";
4
4
  import { Pipeline } from "./base.js";
5
5
  import { SkillValidator } from "../skill-validator.js";
6
+ import { deriveSkillAuthoringMilestones } from "./_milestone-derive.js";
6
7
 
7
8
  export class SkillAuthoringPipeline extends Pipeline {
8
9
  /**
@@ -49,83 +50,22 @@ export class SkillAuthoringPipeline extends Pipeline {
49
50
  }
50
51
 
51
52
  _scanSkills() {
52
- this.skillsAuthored = [];
53
- this.skillsWithScripts = [];
54
- // D2: rule_ids that are covered by some authored skill — whether that
55
- // skill is single-rule (rule_skills/R014/) or grouped
56
- // (rule_skills/SK02/check_r002_r007.py). Populated by _walkForRuleIds
57
- // below so the exit criterion counts DISTINCT rule coverage rather
58
- // than skill-directory count, which over-counts when skills are
59
- // grouped (session 6304673afaa0's rule_skills/ had 289 rules packed
60
- // into 23 skill files).
61
- this.ruleIdsCovered = new Set();
62
- const dir = path.join(this._workspace.cwd, "rule_skills");
63
- if (!fs.existsSync(dir)) return;
64
- for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
65
- if (!e.isDirectory() || e.name.startsWith("__")) continue;
66
- const skillPath = path.join(dir, e.name);
67
- if (fs.existsSync(path.join(skillPath, "SKILL.md")) || fs.readdirSync(skillPath).some((f) => f.endsWith(".py"))) {
68
- this.skillsAuthored.push(e.name);
69
- }
70
- const scriptsDir = path.join(skillPath, "scripts");
71
- if (fs.existsSync(scriptsDir) && fs.readdirSync(scriptsDir).length > 0) {
72
- this.skillsWithScripts.push(e.name);
73
- }
74
- this._walkForRuleIds(skillPath);
75
- }
53
+ // v0.7.0 A1: route through filesystem-derived milestone helper. The
54
+ // helper centralizes the ruleId extraction patterns (R### dirs,
55
+ // check_r###.py, range dirs R078_R128, grouped check_r###_r###.py)
56
+ // and recognizes both root-level check_*.py AND scripts/check*.py
57
+ // (per A6 — XM E2E #5 used scripts/ subdir).
58
+ const m = deriveSkillAuthoringMilestones(this._workspace);
59
+ this.skillsAuthored = [...m.skillsAuthored];
60
+ this.skillsWithScripts = [...m.skillsWithScripts];
61
+ this.ruleIdsCovered = new Set(m.ruleIdsCovered);
76
62
  }
77
63
 
78
- /**
79
- * D2: Find rule_ids referenced by any file under the skill directory.
80
- * Recognizes three naming patterns from actual sessions:
81
- * - Directory name matches a rule: rule_skills/R014/
82
- * - Single-rule script: check_r014.py
83
- * - Grouped script: check_r002_r007.py → covers R002 through R007
84
- */
85
- _walkForRuleIds(skillDir) {
86
- const dirName = path.basename(skillDir);
87
- const dirMatch = dirName.match(/^R0*(\d+)$/i);
88
- if (dirMatch) this.ruleIdsCovered.add(`R${String(parseInt(dirMatch[1], 10)).padStart(3, "0")}`);
89
-
90
- const walk = (d) => {
91
- let entries;
92
- try { entries = fs.readdirSync(d, { withFileTypes: true }); }
93
- catch { return; }
94
- for (const e of entries) {
95
- if (e.name.startsWith(".")) continue;
96
- const p = path.join(d, e.name);
97
- if (e.isDirectory()) { walk(p); continue; }
98
- // Per-rule: check_r014.py
99
- const single = e.name.match(/check_r0*(\d+)\.py$/i);
100
- if (single) {
101
- this.ruleIdsCovered.add(`R${String(parseInt(single[1], 10)).padStart(3, "0")}`);
102
- continue;
103
- }
104
- // Grouped: check_r002_r007.py, check_r002-r007.py, check_r59_r77.py
105
- const grouped = e.name.match(/check_r0*(\d+)[_-]+r0*(\d+)\.py$/i);
106
- if (grouped) {
107
- const lo = parseInt(grouped[1], 10);
108
- const hi = parseInt(grouped[2], 10);
109
- for (let n = lo; n <= hi; n++) {
110
- this.ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
111
- }
112
- continue;
113
- }
114
- // Directory names that encode ranges: R078_R128/
115
- // handled by caller passing skillDir
116
- }
117
- };
118
- // Also handle dirs named like R078_R128/
119
- const rangeDir = dirName.match(/^R0*(\d+)[_-]R0*(\d+)$/i);
120
- if (rangeDir) {
121
- const lo = parseInt(rangeDir[1], 10);
122
- const hi = parseInt(rangeDir[2], 10);
123
- for (let n = lo; n <= hi; n++) {
124
- this.ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
125
- }
126
- }
127
- walk(skillDir);
128
- }
64
+ // v0.7.0 A1: ruleId extraction moved to _milestone-derive.js
65
+ // (deriveSkillAuthoringMilestones). Pattern recognition is identical —
66
+ // single rule (R014, check_r014.py), grouped scripts
67
+ // (check_r002_r007.py), range dirs (R078_R128). Kept as a single
68
+ // canonical implementation rather than duplicating across pipelines.
129
69
 
130
70
  describeState() {
131
71
  this._scanWorkspace();
@@ -136,15 +76,37 @@ export class SkillAuthoringPipeline extends Pipeline {
136
76
  "## Phase: SKILL_AUTHORING\n" +
137
77
  "Write verification skills for each extracted rule. Skills are first-class " +
138
78
  "deliverables — they may serve as the production solution when worker LLM " +
139
- "workflows are insufficient. Follow Anthropic skill-creator format. This is " +
140
- "BUILD mode.\n\n" +
79
+ "workflows are insufficient. Follow the canonical skill-folder layout " +
80
+ "(below). This is BUILD mode.\n\n" +
81
+ // v0.7.0 D1: inline the canonical folder structure spec so the
82
+ // agent sees it in every system prompt of this phase. E2E #5
83
+ // showed three of four contestants ignored the meta-meta spec
84
+ // because it required navigating to read the SKILL.md file
85
+ // separately. Inlining costs ~250 tokens and dramatically improves
86
+ // first-attempt structural compliance.
87
+ "### Canonical skill folder layout\n" +
88
+ "```\n" +
89
+ "rule_skills/\n" +
90
+ " R014/ # one dir per rule (or grouped range)\n" +
91
+ " SKILL.md # YAML frontmatter (name+description) + methodology\n" +
92
+ " check_r014.py # entry point: def check_rule|verify|check|evaluate(...)\n" +
93
+ " references/regulation.md # verbatim regulation text (optional)\n" +
94
+ " references/interpretation.md # edge-case notes (optional)\n" +
95
+ " assets/test_cases.json # annotated samples + expected verdicts (optional)\n" +
96
+ "```\n" +
97
+ "Validator-accepted alternatives: `scripts/check_r###.py` (under scripts/) " +
98
+ "instead of root-level. SKILL.md filename is case-insensitive (skill.md " +
99
+ "is also accepted). The check.py just needs a top-level `def` at module " +
100
+ "level — entry-point name does not have to match a strict pattern.\n\n" +
141
101
  // D2: soft granularity nudge
142
102
  "**Granularity preference:** 1 rule = 1 skill directory. Group rules into " +
143
103
  "the same file ONLY when they share evidence and fail together (e.g. " +
144
104
  "siblings from the same required-fields table). When grouping, name the " +
145
105
  "file with the range: `check_r002_r007.py`. Downstream consumers " +
146
- "(workflow-run, dashboards) count rule coverage by parsing these names, " +
147
- "so the file-naming matters.\n\n" +
106
+ "(workflow-run, dashboards, release tool) count rule coverage by parsing " +
107
+ "these names, so the file-naming matters. (Read `meta-meta/work-decomposition` " +
108
+ "for the full grouping/ordering decision framework + PATTERNS.md memory " +
109
+ "discipline.)\n\n" +
148
110
  "**Do not write to rules/catalog.json via sandbox_exec.** Use the " +
149
111
  "`rule_catalog` tool for any catalog edits — sandbox_exec bypasses the " +
150
112
  "workspace file lock and races with parallel workers."
@@ -194,7 +156,38 @@ export class SkillAuthoringPipeline extends Pipeline {
194
156
  onToolResult(toolName, toolInput, result) {
195
157
  if (result.isError) return null;
196
158
  const wasReady = this.exitCriteriaMet();
197
- if (toolName === "workspace_file" && (toolInput.path || "").includes("rule_skills/")) this._scanSkills();
159
+ const writeToSkill = toolName === "workspace_file" &&
160
+ toolInput?.operation === "write" &&
161
+ (toolInput.path || "").includes("rule_skills/");
162
+ if (writeToSkill) {
163
+ this._scanSkills();
164
+ // v0.7.0 A4: validate this specific file immediately if it looks
165
+ // like a check.py. Surfaces syntax/entry-point issues in the next
166
+ // describeState rather than waiting for the phase boundary —
167
+ // E2E #5 had skill_authoring force-bypassed before exitCriteriaMet
168
+ // ever fired, so the v0.6.2 boundary-only validator never ran in
169
+ // practice.
170
+ const p = toolInput.path || "";
171
+ if (/\/check[_a-zA-Z0-9-]*\.py$/i.test(p) && /^rule_skills\//.test(p)) {
172
+ const abs = path.join(this._workspace.cwd, p);
173
+ // Invalidate any stale mtime cache entry for this path then
174
+ // re-validate. Folds the result into _validationFailures so
175
+ // describeState picks it up.
176
+ this._validator.invalidate(abs);
177
+ const r = this._validator.validateFile(abs);
178
+ if (!r.ok) {
179
+ // Replace any prior failure record for this path
180
+ this._validationFailures = this._validationFailures.filter(
181
+ (f) => f.filePath !== abs,
182
+ );
183
+ this._validationFailures.push({ filePath: abs, error: r.error || "unknown" });
184
+ } else {
185
+ this._validationFailures = this._validationFailures.filter(
186
+ (f) => f.filePath !== abs,
187
+ );
188
+ }
189
+ }
190
+ }
198
191
  if (!wasReady && this.exitCriteriaMet()) {
199
192
  return new PipelineEvent({ type: "phase_ready", message: "Skill authoring complete. Ready for SKILL_TESTING.", nextPhase: Phase.SKILL_TESTING });
200
193
  }
@@ -242,6 +235,30 @@ export class SkillAuthoringPipeline extends Pipeline {
242
235
  * v0.6.2 I2: gather every check_r###.py path under rule_skills/. Used by
243
236
  * the skill validator. Walks one level into each skill directory.
244
237
  */
238
+ /**
239
+ * v0.6.3 (#74): SKILL_AUTHORING writes per-rule check scripts under
240
+ * rule_skills/. Workflow runs against production samples or distillation
241
+ * outputs are later-phase work.
242
+ */
243
+ phaseMisfitHint(toolName, toolInput, result) {
244
+ if (result?.isError) return null;
245
+ const exitText = this.exitCriteriaMet()
246
+ ? "Skill-authoring exit criteria are MET — call phase_advance(to=\"skill_testing\") to proceed."
247
+ : "Skill-authoring not yet complete (see describeState).";
248
+
249
+ if (toolName === "workspace_file" && toolInput?.operation === "write") {
250
+ const p = toolInput.path || "";
251
+ if (p.startsWith("workflows/")) {
252
+ return `Writing under workflows/ is DISTILLATION-phase work, but engine is in SKILL_AUTHORING. ${exitText}`;
253
+ }
254
+ if (p.startsWith("output/results/")) {
255
+ return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_AUTHORING. ${exitText}`;
256
+ }
257
+ }
258
+
259
+ return null;
260
+ }
261
+
245
262
  _collectCheckScripts() {
246
263
  const out = [];
247
264
  const dir = path.join(this._workspace.cwd, "rule_skills");
@@ -2,6 +2,7 @@ import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { Phase, PipelineEvent } from "./index.js";
4
4
  import { Pipeline } from "./base.js";
5
+ import { deriveSkillAuthoringMilestones, deriveSkillTestingMilestones } from "./_milestone-derive.js";
5
6
 
6
7
  export class SkillTestingPipeline extends Pipeline {
7
8
  constructor(workspace) {
@@ -13,6 +14,11 @@ export class SkillTestingPipeline extends Pipeline {
13
14
  this.iterationCount = 0;
14
15
  this._accuracyThreshold = 0.9;
15
16
  this._maxIterations = 20;
17
+ // v0.7.1 1b: rate-limit phaseMisfitHint firing for ephemeral
18
+ // sandbox tests. Caps at ~3 nudges per phase entry so the agent
19
+ // sees the path expectation but doesn't get spammed during a
20
+ // batch run.
21
+ this._misfit_nudge_count = 0;
16
22
  this._scanWorkspace();
17
23
  }
18
24
 
@@ -33,35 +39,48 @@ export class SkillTestingPipeline extends Pipeline {
33
39
  }
34
40
 
35
41
  _loadSkills() {
36
- this.skillsToTest = [];
37
- const dir = path.join(this._workspace.cwd, "rule_skills");
38
- if (!fs.existsSync(dir)) return;
39
- for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
40
- if (e.isDirectory() && !e.name.startsWith("__")) {
41
- const p = path.join(dir, e.name);
42
- if (fs.existsSync(path.join(p, "SKILL.md")) || fs.readdirSync(p).some((f) => f.endsWith(".py"))) {
43
- this.skillsToTest.push(e.name);
44
- }
45
- }
46
- }
42
+ // v0.7.0 A1: route through filesystem-derived helper (skill_authoring's
43
+ // skillsAuthored is the canonical "what skills exist" view).
44
+ const m = deriveSkillAuthoringMilestones(this._workspace);
45
+ this.skillsToTest = [...m.skillsAuthored];
47
46
  }
48
47
 
49
48
  _loadTestResults() {
50
49
  this.skillsTested = {};
51
50
  this.skillsPassing = [];
51
+
52
+ // Layer 1 (canonical schema): output/<rule_id>.json with `accuracy` field.
53
+ // Carries the actual numeric threshold check.
52
54
  const outDir = path.join(this._workspace.cwd, "output");
53
- if (!fs.existsSync(outDir)) return;
54
- for (const f of fs.readdirSync(outDir).filter((f) => f.endsWith(".json"))) {
55
- try {
56
- const data = JSON.parse(fs.readFileSync(path.join(outDir, f), "utf-8"));
57
- if (data.accuracy != null) {
58
- const ruleId = data.rule_id || path.parse(f).name;
59
- const acc = parseFloat(data.accuracy);
60
- this.skillsTested[ruleId] = Math.max(this.skillsTested[ruleId] || 0, acc);
61
- }
62
- } catch { /* skip */ }
55
+ if (fs.existsSync(outDir)) {
56
+ for (const f of fs.readdirSync(outDir).filter((f) => f.endsWith(".json"))) {
57
+ try {
58
+ const data = JSON.parse(fs.readFileSync(path.join(outDir, f), "utf-8"));
59
+ if (data.accuracy != null) {
60
+ const ruleId = data.rule_id || path.parse(f).name;
61
+ const acc = parseFloat(data.accuracy);
62
+ this.skillsTested[ruleId] = Math.max(this.skillsTested[ruleId] || 0, acc);
63
+ }
64
+ } catch { /* skip */ }
65
+ }
63
66
  }
64
- this.skillsPassing = Object.entries(this.skillsTested).filter(([, acc]) => acc >= this._accuracyThreshold).map(([id]) => id);
67
+
68
+ // Layer 2 (helper-derived floor): per-skill test_results/, tests/, or
69
+ // assets/test_cases.json count as "tested" even without an accuracy
70
+ // reading. Without this floor, agents who tested via sandbox_exec
71
+ // (no accuracy JSON written) showed skillsTested={} despite real
72
+ // testing — exactly the E2E #5 GLM case.
73
+ const m = deriveSkillTestingMilestones(this._workspace);
74
+ for (const id of m.skillsTested) {
75
+ // Test artifact present but no numeric accuracy → record as tested
76
+ // at threshold value (just-passing). The agent can revise via
77
+ // canonical-schema JSON if needed.
78
+ if (!(id in this.skillsTested)) this.skillsTested[id] = this._accuracyThreshold;
79
+ }
80
+
81
+ this.skillsPassing = Object.entries(this.skillsTested)
82
+ .filter(([, acc]) => acc >= this._accuracyThreshold)
83
+ .map(([id]) => id);
65
84
  }
66
85
 
67
86
  _loadEvolutionLog() {
@@ -104,7 +123,71 @@ export class SkillTestingPipeline extends Pipeline {
104
123
  exitCriteriaMet() {
105
124
  const total = this.skillsToTest.length;
106
125
  if (!total) return false;
107
- return Object.keys(this.skillsTested).length >= total && this.skillsPassing.length >= total * this._accuracyThreshold;
126
+ // v0.7.0 H/C2 fix: previous gate `skillsPassing.length >= total * threshold`
127
+ // was multiplying *count* by accuracy threshold (default 0.9), so with only
128
+ // 9/10 skills passing the gate could still be met. The intent is "every
129
+ // skill passes its per-skill threshold" — count parity, not weighted.
130
+ // (Fraction-of-skills fallbacks belong in optional config, not the
131
+ // default exit criterion.)
132
+ return Object.keys(this.skillsTested).length >= total &&
133
+ this.skillsPassing.length >= total;
134
+ }
135
+
136
+ /**
137
+ * v0.6.3 (#74): SKILL_TESTING runs check scripts against test samples and
138
+ * measures accuracy. Writing distillation outputs or production results
139
+ * here means phase boundaries got skipped.
140
+ *
141
+ * v0.7.1 1b: also nudges agents who run check scripts via sandbox_exec
142
+ * but don't persist verdicts. E2E #6 v070 surfaced this — both
143
+ * conductors batched tests in one sandbox_exec call, read pass/fail
144
+ * from stdout, then declared "testing done" while engine saw
145
+ * skillsTested=0 because nothing landed in a recognized path.
146
+ */
147
+ phaseMisfitHint(toolName, toolInput, result) {
148
+ if (result?.isError) return null;
149
+ const exitText = this.exitCriteriaMet()
150
+ ? "Skill-testing exit criteria are MET — call phase_advance(to=\"distillation\")."
151
+ : "Skill-testing not yet complete.";
152
+
153
+ if (toolName === "workspace_file" && toolInput?.operation === "write") {
154
+ const p = toolInput.path || "";
155
+ if (p.startsWith("workflows/")) {
156
+ return `Writing under workflows/ is DISTILLATION-phase work, but engine is in SKILL_TESTING. ${exitText}`;
157
+ }
158
+ if (p.startsWith("output/results/")) {
159
+ return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_TESTING. ${exitText}`;
160
+ }
161
+ }
162
+
163
+ // v0.7.1 1b: sandbox_exec test-command nudge
164
+ if (toolName === "sandbox_exec") {
165
+ const cmd = String(toolInput?.command || "");
166
+ const looksLikeTest =
167
+ /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
168
+ /pytest|unittest|run_tests/i.test(cmd) ||
169
+ /python.*workflow.*\.py.*samples/i.test(cmd);
170
+ if (!looksLikeTest) return null;
171
+
172
+ const tested = Object.keys(this.skillsTested).length;
173
+ const total = this.skillsToTest.length;
174
+ // Already satisfied? Don't nudge.
175
+ if (total === 0 || tested >= total) return null;
176
+
177
+ // Rate-limit: ~3 per phase. Counter resets on phase entry
178
+ // (constructor) and on importState if available.
179
+ this._misfit_nudge_count = (this._misfit_nudge_count || 0) + 1;
180
+ if (this._misfit_nudge_count > 3) return null;
181
+
182
+ return (
183
+ `Engine derives skillsTested from rule_skills/<id>/test_results.json, ` +
184
+ `rule_skills/<id>/tests/, OR output/*.json with rule_id field. ` +
185
+ `Sandbox runs are ephemeral — record per-rule verdicts to one of ` +
186
+ `those paths before phase_advance. Currently engine sees ` +
187
+ `${tested}/${total} skills tested.`
188
+ );
189
+ }
190
+ return null;
108
191
  }
109
192
 
110
193
  exportState() {
@@ -1,9 +1,17 @@
1
1
  /**
2
2
  * Retry wrapper with exponential backoff and jitter.
3
3
  * Designed for LLM API calls — retries transient errors, fails fast on auth/validation errors.
4
+ *
5
+ * v0.6.3.1: KC_MAX_RETRIES env override. Default 10 attempts ≈ 5 min of
6
+ * exponential backoff (1+2+4+8+16+32+60+60+60+60s). E2E #5 surfaced a
7
+ * Tencent outage that lasted longer than the default; setting
8
+ * KC_MAX_RETRIES=20 buys ~10 more min (10 extra waits at the 60s cap) before the engine gives up.
4
9
  */
5
-
6
- const MAX_RETRIES = 10;
10
+ const MAX_RETRIES = (() => {
11
+ const raw = parseInt(process.env.KC_MAX_RETRIES || "", 10);
12
+ if (Number.isFinite(raw) && raw >= 0 && raw <= 50) return raw;
13
+ return 10;
14
+ })();
7
15
  const INITIAL_DELAY_MS = 1000;
8
16
  const MAX_DELAY_MS = 60000;
9
17
  const BACKOFF_MULTIPLIER = 2;
@@ -222,14 +222,26 @@ export class Scheduler {
222
222
  }
223
223
 
224
224
  /**
225
- * Count of files directly under input/ (excluding subdirs like archived/).
225
+ * Count of files directly under input/ (excluding subdirs like archived/
226
+ * and v0.7.0 F3 agent-scratch marker .kc-scratch/).
227
+ *
228
+ * Background: E2E #5 DS surfaced "📥 4 new file(s) pending in input/"
229
+ * when the agent's sandbox_exec had dropped 4 test fixtures into
230
+ * input/ during smoke-testing. The user assumed external arrivals.
231
+ * The scheduler never had a way to disambiguate.
232
+ *
233
+ * v0.7.0 F3: agent-side scratch writes go under input/.kc-scratch/
234
+ * (a sidecar dir, hidden by the standard "starts with ." filter).
235
+ * The banner counts only top-level non-hidden files, which is what
236
+ * external arrivals actually look like (schedule_fetch drops files
237
+ * directly into input/ root).
226
238
  */
227
239
  pendingInputCount() {
228
240
  const dir = path.join(this._workspace.cwd, "input");
229
241
  if (!fs.existsSync(dir)) return 0;
230
242
  try {
231
243
  return fs.readdirSync(dir, { withFileTypes: true })
232
- .filter((e) => e.isFile())
244
+ .filter((e) => e.isFile() && !e.name.startsWith("."))
233
245
  .length;
234
246
  } catch {
235
247
  return 0;
@@ -70,7 +70,24 @@ export class SessionState {
70
70
  * @returns {object} The persisted state
71
71
  */
72
72
  load() {
73
- return this._loadRaw() || {};
73
+ const raw = this._loadRaw() || {};
74
+ // v0.6.3: phase value renamed "extraction" → "rule_extraction" to
75
+ // disambiguate from data/entity extraction inside skills. Migrate old
76
+ // session-state on read so resumed workspaces don't end up in a phase
77
+ // the engine doesn't recognize. Idempotent — already-renamed values
78
+ // pass through unchanged.
79
+ if (raw.currentPhase === "extraction") raw.currentPhase = "rule_extraction";
80
+ if (raw.pipelineMilestones?.extraction && !raw.pipelineMilestones.rule_extraction) {
81
+ raw.pipelineMilestones.rule_extraction = raw.pipelineMilestones.extraction;
82
+ delete raw.pipelineMilestones.extraction;
83
+ }
84
+ if (Array.isArray(raw.phaseSummaries)) {
85
+ for (const s of raw.phaseSummaries) {
86
+ if (s?.fromPhase === "extraction") s.fromPhase = "rule_extraction";
87
+ if (s?.toPhase === "extraction") s.toPhase = "rule_extraction";
88
+ }
89
+ }
90
+ return raw;
74
91
  }
75
92
 
76
93
  /**
@@ -17,22 +17,28 @@ const BUNDLED_SKILLS_DIR = path.resolve(__dirname, "../../template/skills");
17
17
  // to default to always-visible.
18
18
  const PHASE_RELEVANT_SKILLS = {
19
19
  "bootstrap-workspace": ["bootstrap"],
20
- "rule-extraction": ["bootstrap", "extraction"],
21
- "rule-graph": ["extraction", "skill_authoring"],
22
- "task-decomposition": ["extraction", "skill_authoring", "distillation"],
20
+ "rule-extraction": ["bootstrap", "rule_extraction"],
21
+ "rule-graph": ["rule_extraction", "skill_authoring"],
22
+ "task-decomposition": ["rule_extraction", "skill_authoring", "distillation"],
23
+ // v0.7.0 B1: work-decomposition teaches the system-level decomposition
24
+ // discipline (ordering, grouping, difficulty triage, PATTERNS.md memory).
25
+ // Distinct from task-decomposition (per-rule sub-tasks). Loaded on
26
+ // rule_extraction → skill_authoring transition where the agent owns
27
+ // the TaskBoard.
28
+ "work-decomposition": ["rule_extraction", "skill_authoring"],
23
29
  "skill-authoring": ["skill_authoring", "skill_testing"],
24
30
  "skill-to-workflow": ["distillation"],
25
31
  "evolution-loop": ["skill_testing", "distillation", "production_qc"],
26
- "version-control": ["bootstrap", "extraction", "skill_authoring", "skill_testing", "distillation", "production_qc", "finalization"],
32
+ "version-control": ["bootstrap", "rule_extraction", "skill_authoring", "skill_testing", "distillation", "production_qc", "finalization"],
27
33
  "quality-control": ["production_qc", "finalization"],
28
34
  "confidence-system": ["distillation", "production_qc"],
29
35
  "dashboard-reporting": ["production_qc", "finalization"],
30
36
  "cross-document-verification": ["production_qc"],
31
37
  "corner-case-management": ["skill_testing", "distillation", "production_qc"],
32
- "data-sensibility": ["extraction", "skill_authoring"],
38
+ "data-sensibility": ["rule_extraction", "skill_authoring"],
33
39
  "entity-extraction": ["skill_authoring", "distillation"],
34
- "document-parsing": ["bootstrap", "extraction", "skill_authoring"],
35
- "document-chunking": ["bootstrap", "extraction"],
40
+ "document-parsing": ["bootstrap", "rule_extraction", "skill_authoring"],
41
+ "document-chunking": ["bootstrap", "rule_extraction"],
36
42
  "tree-processing": ["skill_authoring", "skill_testing"],
37
43
  "compliance-judgment": ["skill_authoring", "skill_testing", "production_qc"],
38
44
  "skill-creator": ["skill_authoring"],