kc-beta 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +81 -0
  2. package/LICENSE-COMMERCIAL.md +125 -0
  3. package/README.md +21 -3
  4. package/package.json +14 -5
  5. package/src/agent/context-window.js +9 -12
  6. package/src/agent/context.js +14 -1
  7. package/src/agent/document-parser.js +169 -0
  8. package/src/agent/engine.js +499 -20
  9. package/src/agent/history/event-history.js +222 -0
  10. package/src/agent/llm-client.js +55 -0
  11. package/src/agent/message-utils.js +63 -0
  12. package/src/agent/pipelines/_milestone-derive.js +511 -0
  13. package/src/agent/pipelines/base.js +21 -0
  14. package/src/agent/pipelines/distillation.js +28 -15
  15. package/src/agent/pipelines/extraction.js +103 -36
  16. package/src/agent/pipelines/finalization.js +178 -11
  17. package/src/agent/pipelines/index.js +6 -1
  18. package/src/agent/pipelines/initializer.js +74 -8
  19. package/src/agent/pipelines/production-qc.js +31 -44
  20. package/src/agent/pipelines/skill-authoring.js +152 -80
  21. package/src/agent/pipelines/skill-testing.js +67 -23
  22. package/src/agent/retry.js +10 -2
  23. package/src/agent/scheduler.js +14 -2
  24. package/src/agent/session-state.js +35 -2
  25. package/src/agent/skill-loader.js +13 -7
  26. package/src/agent/skill-validator.js +163 -0
  27. package/src/agent/task-manager.js +61 -5
  28. package/src/agent/tools/_workflow-result-schema.js +249 -0
  29. package/src/agent/tools/document-chunk.js +21 -9
  30. package/src/agent/tools/phase-advance.js +52 -6
  31. package/src/agent/tools/release.js +51 -9
  32. package/src/agent/tools/rule-catalog.js +11 -1
  33. package/src/agent/tools/workflow-run.js +9 -4
  34. package/src/agent/tools/workspace-file.js +32 -0
  35. package/src/agent/workspace.js +61 -0
  36. package/src/cli/components.js +64 -14
  37. package/src/cli/index.js +62 -3
  38. package/src/cli/meme.js +26 -25
  39. package/src/config.js +65 -22
  40. package/src/model-tiers.json +48 -0
  41. package/src/providers.js +87 -0
  42. package/template/release/v1/README.md.tmpl +108 -0
  43. package/template/release/v1/catalog.json.tmpl +4 -0
  44. package/template/release/v1/kc_runtime/__init__.py +11 -0
  45. package/template/release/v1/kc_runtime/confidence.py +63 -0
  46. package/template/release/v1/kc_runtime/doc_parser.py +127 -0
  47. package/template/release/v1/manifest.json.tmpl +11 -0
  48. package/template/release/v1/render_dashboard.py +117 -0
  49. package/template/release/v1/run.py +212 -0
  50. package/template/release/v1/serve.sh +17 -0
  51. package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
  52. package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
  53. package/template/skills/en/skill-creator/SKILL.md +1 -1
  54. package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0
  55. package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
  56. package/template/skills/zh/skill-creator/SKILL.md +1 -1
@@ -0,0 +1,163 @@
1
+ /**
2
+ * v0.6.2 I2: Skill validator (was D3c, deferred from v0.6.0/v0.6.1).
3
+ *
4
+ * E2E #4 demonstrated that broken `check_r###.py` contents go undetected
5
+ * until production_qc throws (e.g., `SyntaxError: unexpected character
6
+ * after line continuation character` from line 733 of unified_qc.py).
7
+ * This validator catches such breakage at the skill_authoring phase
8
+ * boundary instead of months later in production.
9
+ *
10
+ * Design constraints:
11
+ * - exitCriteriaMet is sync, so validation is sync (execFileSync).
12
+ * - 110 files × ~50ms subprocess = 5.5s worst case; caching by mtime
13
+ * keeps steady-state cost at ~0 (only re-validate freshly modified
14
+ * files).
15
+ * - Failures are diagnostic, not punitive: `force: true` on phase_advance
16
+ * still bypasses. The validator's job is to refuse the auto-advance,
17
+ * not to trap the agent.
18
+ *
19
+ * Validation rules per `check_*.py`:
20
+ * 1. File ≥ 100 bytes (smoke test for empty stubs).
21
+ * 2. Passes `python3 -c "import ast; ast.parse(open(F).read())"` (no
22
+ * syntax errors).
23
+ * 3. Defines at least one top-level function: any `def <name>(...)` or
24
+ * `async def <name>(...)` starting at column 0 counts (see ENTRY_POINT_REGEX).
25
+ * v0.7.0 A6 broadened this rule after E2E #5 audit found
26
+ * three sessions independently chose `def check_r###` over the
27
+ * canonical names (`check_rule`, `verify`) — the validator was too strict.
28
+ *
29
+ * Disable mechanism: if `python3` is not on PATH, validator silently
30
+ * passes everything and emits a one-time warning — we don't want the
31
+ * gate to block on missing tooling. Gate effectively no-ops.
32
+ */
33
+
34
+ import { execFileSync } from "node:child_process";
35
+ import fs from "node:fs";
36
+ import path from "node:path";
37
+
38
+ // v0.7.0 A6: entry-point check is a sanity probe, not a style enforcer.
39
+ // The validator's real signal comes from `≥ 100 bytes` + `ast.parse
40
+ // passes`. Restricting to specific verb names rejected 27/28 GLM
41
+ // scripts in E2E #5 — the cost outweighed the catch (every contestant
42
+ // converged on a different naming convention).
43
+ //
44
+ // New rule: any top-level `def \w+(...)` counts. Rejects pure-imports
45
+ // or comment-only stubs (which is what we actually wanted to catch),
46
+ // accepts anything with real logic. The check_*.py *filename* (matched
47
+ // by the path regex in `findCheckScripts`) carries the rule-id signal;
48
+ // the function name doesn't need to.
49
// Matches a top-level (column-0) `def name(` or `async def name(`.
// Python 3 identifiers may contain non-ASCII letters, and JavaScript's `\w`
// is ASCII-only even with the `u` flag, so the name is matched with Unicode
// property classes instead. The file has already passed `ast.parse` by the
// time this runs, so the class only needs to be permissive, not exact.
const ENTRY_POINT_REGEX = /^(?:async\s+)?def\s+[\p{L}\p{N}\p{M}\p{Pc}]+\s*\(/mu;
const MIN_BYTES = 100;

// Python probe executed as `python3 -c <script> <path>`.
// - The path travels as argv (sys.argv[1]) instead of being spliced into the
//   Python source, so no quoting/escaping of the path is ever needed.
// - The file is read as bytes: ast.parse() then applies Python's own source
//   encoding rules (PEP 263 cookie, UTF-8 default) instead of the locale
//   encoding that a text-mode open() would use.
const AST_PARSE_SCRIPT = [
  "import ast, sys",
  "try:",
  "    with open(sys.argv[1], 'rb') as fh:",
  "        ast.parse(fh.read())",
  "except SyntaxError as e:",
  '    print(f"SyntaxError: {e}", file=sys.stderr); sys.exit(1)',
  "except Exception as e:",
  '    print(f"{type(e).__name__}: {e}", file=sys.stderr); sys.exit(1)',
  "",
].join("\n");

/**
 * Synchronous validator for generated `check_*.py` scripts.
 *
 * Each file must (1) be at least MIN_BYTES long, (2) parse with Python's
 * `ast.parse`, and (3) contain at least one top-level `def`. Results are
 * cached per path and re-used while the file's mtime and size are unchanged.
 */
export class SkillValidator {
  constructor() {
    /** @type {Map<string, { mtime: number, size: number, ok: boolean, error?: string }>} */
    this._cache = new Map();
    /** @type {boolean|null} - null = untested, true/false once probed */
    this._pythonAvailable = null;
    /** @type {boolean} - one-time warning suppression */
    this._warned = false;
  }

  /**
   * Probe whether `python3` can be executed. The answer is cached after the
   * first call, so the subprocess is spawned at most once per instance.
   * @returns {boolean}
   */
  _probePython() {
    if (this._pythonAvailable !== null) return this._pythonAvailable;
    try {
      execFileSync("python3", ["-c", "import ast"], { stdio: "ignore", timeout: 5000 });
      this._pythonAvailable = true;
    } catch {
      // Any failure (missing binary, timeout, non-zero exit) disables the gate.
      this._pythonAvailable = false;
    }
    return this._pythonAvailable;
  }

  /**
   * Validate one file. Cached by (mtime, size): the size is part of the key
   * so a rewrite landing within the same mtime tick is still re-validated
   * when the length changed.
   * @param {string} filePath - Path to the .py file
   * @returns {{ ok: boolean, error?: string }}
   */
  validateFile(filePath) {
    let stat;
    try {
      stat = fs.statSync(filePath);
    } catch {
      return { ok: false, error: "file not found" };
    }
    const cached = this._cache.get(filePath);
    if (cached && cached.mtime === stat.mtimeMs && cached.size === stat.size) {
      return { ok: cached.ok, error: cached.error };
    }
    const result = this._runValidation(filePath);
    this._cache.set(filePath, { mtime: stat.mtimeMs, size: stat.size, ...result });
    return result;
  }

  /**
   * Validate every file in a list.
   *
   * When python3 is unavailable the whole check is skipped: a warning is
   * printed once per instance and the result reports `ok: true` with
   * `skipped: true`.
   *
   * @param {string[]} filePaths
   * @returns {{ ok: boolean, failures: Array<{filePath:string, error:string}>, skipped: boolean }}
   *   ok is true iff no file failed; failures lists each failing file.
   */
  validateAll(filePaths) {
    if (!this._probePython()) {
      if (!this._warned) {
        // eslint-disable-next-line no-console
        console.warn("[skill-validator] python3 not on PATH — skill validation skipped. " +
          "Phase gate will not catch syntax errors. Install python3 to enable.");
        this._warned = true;
      }
      return { ok: true, failures: [], skipped: true };
    }
    const failures = [];
    for (const filePath of filePaths) {
      const outcome = this.validateFile(filePath);
      if (!outcome.ok) failures.push({ filePath, error: outcome.error || "unknown" });
    }
    return { ok: failures.length === 0, failures, skipped: false };
  }

  /**
   * Drop the cached result for a path so the next validateFile() call
   * re-runs all rules regardless of mtime/size.
   * @param {string} filePath
   */
  invalidate(filePath) { this._cache.delete(filePath); }

  // --- Internal ---

  /**
   * Run the three rules, cheapest first, and stop at the first failure.
   * @param {string} filePath
   * @returns {{ ok: boolean, error?: string }}
   */
  _runValidation(filePath) {
    // Rule 1: size check (cheap, no subprocess).
    let size;
    try {
      size = fs.statSync(filePath).size;
    } catch {
      return { ok: false, error: "stat failed" };
    }
    if (size < MIN_BYTES) {
      return { ok: false, error: `file too small (${size} < ${MIN_BYTES} bytes)` };
    }

    // Rule 2: ast.parse smoke test via subprocess. Only stderr is captured;
    // it carries the one-line diagnostic printed by AST_PARSE_SCRIPT.
    try {
      execFileSync("python3", ["-c", AST_PARSE_SCRIPT, filePath], {
        stdio: ["ignore", "ignore", "pipe"],
        timeout: 10_000,
      });
    } catch (e) {
      const stderr = (e.stderr ? e.stderr.toString() : "") || e.message || "subprocess failed";
      // Cap the message so one broken file cannot flood the gate report.
      return { ok: false, error: stderr.trim().slice(0, 300) };
    }

    // Rule 3: entry-point probe (after parse OK so we know file is readable).
    let content;
    try {
      content = fs.readFileSync(filePath, "utf-8");
    } catch {
      return { ok: false, error: "read failed after parse OK" };
    }
    if (!ENTRY_POINT_REGEX.test(content)) {
      return { ok: false, error: "no callable defined: file has imports/comments only, no top-level `def`" };
    }

    return { ok: true };
  }
}
@@ -139,12 +139,23 @@ export class TaskManager {
139
139
  // --- Bulk creation from rule catalog ---
140
140
 
141
141
  /**
142
- * Phases where one-task-per-rule is the natural unit of work.
143
- * For BOOTSTRAP / EXTRACTION the unit is a regulation (one PDF → many rules);
144
- * ralph-loop shouldn't drive per-rule there because the rules don't exist yet
145
- * (or are the *output*, not the input) — see E2E #3 coverage check.
142
+ * Phases where the engine auto-creates one-task-per-rule on phase entry.
143
+ *
144
+ * v0.7.0 B2: empty by default. Agent owns TaskBoard decisions per the
145
+ * work-decomposition meta-meta skill — engine no longer assumes per-rule
146
+ * granularity is right. The agent reads the rule list from describeState
147
+ * and calls TaskCreate with whatever shape (single, grouped, range,
148
+ * non-rule) makes sense for the corpus.
149
+ *
150
+ * Override `KC_AGENT_OWNS_TASKBOARD=0` to restore v0.6.x behavior
151
+ * (engine auto-populates per-rule for skill_authoring + skill_testing).
152
+ * The override is a staged-rollout safety valve, not a long-lived
153
+ * config — slated for removal in v0.8.0 after E2E #6 validates the
154
+ * agent-owned default.
146
155
  */
147
- static PER_RULE_PHASES = new Set(["skill_authoring", "skill_testing"]);
156
+ static PER_RULE_PHASES = (process.env.KC_AGENT_OWNS_TASKBOARD === "0")
157
+ ? new Set(["skill_authoring", "skill_testing"])
158
+ : new Set();
148
159
 
149
160
  /**
150
161
  * Create one task per rule for a given phase — but only if the phase's unit
@@ -197,6 +208,51 @@ export class TaskManager {
197
208
  ).length;
198
209
  }
199
210
 
211
+ /**
212
+ * v0.7.0 A5: Reconcile per-rule tasks against disk artifacts.
213
+ *
214
+ * Background: E2E #5 DS audit found tasks.json showing 70/70 completed
215
+ * while only ~56 dirs / 36 with check_*.py existed on disk. The agent
216
+ * called markDone() optimistically but the artifacts didn't materialize
217
+ * (or were deleted later). The engine's phase gate trusted the count.
218
+ *
219
+ * Reconcile walks every "completed" task in PER_RULE_PHASES and checks
220
+ * whether the expected disk artifacts exist via a caller-supplied
221
+ * `expectsFn(task) -> boolean` predicate. Tasks whose artifacts are
222
+ * missing are flipped back to `pending` with a `reconcile_failed`
223
+ * note so the agent can re-do the work, and the gate can refuse
224
+ * advance if the per-rule artifact set is incomplete.
225
+ *
226
+ * Called from engine `_advancePhase` before `exitCriteriaMet()`.
227
+ *
228
+ * @param {(task: object) => boolean} expectsFn
229
+ * @returns {{ reconciled: number, flippedBack: string[] }}
230
+ * Number of tasks inspected, plus the IDs of tasks flipped back to
231
+ * pending. Caller logs to events.jsonl.
232
+ */
233
+ reconcileAgainstDisk(expectsFn) {
234
+ let reconciled = 0;
235
+ const flippedBack = [];
236
+ if (typeof expectsFn !== "function") return { reconciled, flippedBack };
237
+ for (const task of this._tasks) {
238
+ if (task.status !== "completed") continue;
239
+ if (!TaskManager.PER_RULE_PHASES.has(task.phase)) continue;
240
+ reconciled++;
241
+ let ok = false;
242
+ try { ok = !!expectsFn(task); }
243
+ catch { ok = false; }
244
+ if (!ok) {
245
+ task.status = "pending";
246
+ task.reconcile_failed = true;
247
+ task.summary = (task.summary ? task.summary + " | " : "") +
248
+ "v0.7.0 A5: artifacts missing on disk → flipped back to pending";
249
+ flippedBack.push(task.id);
250
+ }
251
+ }
252
+ if (flippedBack.length > 0) this.save();
253
+ return { reconciled, flippedBack };
254
+ }
255
+
200
256
  /**
201
257
  * Format task list for injection into system prompt context.
202
258
  * Compact checklist — not conversation history.
@@ -0,0 +1,249 @@
1
+ /**
2
+ * v0.6.2 I1: Shared workflow-result normalizer + ERROR classifier.
3
+ *
4
+ * E2E #4 produced 1,150 ERROR verdicts out of 6,930 (16.6%) and
5
+ * verdict_stats keys leaked Python dataclass repr() strings like
6
+ * "VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE', ...)".
7
+ * The agent's batch aggregator was using repr(result) as a dict key
8
+ * because the workflow's Python output was a dataclass instance, not
9
+ * a dict.
10
+ *
11
+ * This module fixes the boundary: anything that comes out of a
12
+ * workflow_run tool gets normalized to a strict dict shape before being
13
+ * persisted or returned to the agent. Repr-strings get parsed back into
14
+ * structured fields. ERRORs get classified into typed buckets so we can
15
+ * tell "import failed" from "extraction returned wrong shape" without
16
+ * reading 1,150 stack traces.
17
+ */
18
+
19
+ /**
20
+ * The required shape every workflow result must satisfy. Unknown extra
21
+ * keys are preserved.
22
+ */
23
+ export const REQUIRED_KEYS = ["rule_id", "verdict"];
24
+
25
+ /**
26
+ * Canonical verdict values. Anything outside this set is allowed (the
27
+ * worker LLM may extend) but generates a `nonstandard_verdict` warning
28
+ * in the result's `_warnings` array.
29
+ */
30
+ export const STANDARD_VERDICTS = new Set([
31
+ "PASS", "FAIL", "NOT_APPLICABLE", "SUPPLEMENT_NEEDED", "ERROR", "UNKNOWN",
32
+ ]);
33
+
34
+ /**
35
+ * Recognized error_type values used by classifyError(). Add to this set
36
+ * when adding a new pattern below.
37
+ */
38
+ export const ERROR_TYPES = [
39
+ "import_error",
40
+ "attribute_error",
41
+ "keyword_not_found",
42
+ "sample_unparseable",
43
+ "schema_violation",
44
+ "syntax_error",
45
+ "timeout",
46
+ "permission_error",
47
+ "unknown",
48
+ ];
49
+
50
+ /**
51
+ * Detect whether a string looks like a Python dataclass repr —
52
+ * `ClassName(field=value, field=value)`. Used both as a top-level
53
+ * detector and recursively inside dict keys.
54
+ */
55
const REPR_PATTERN = /^([A-Za-z_]\w*)\((.*)\)$/s;

/**
 * Parse a Python-repr string into { class_name, fields: { ... } }.
 *
 * The body between the outer parentheses is split on top-level commas;
 * commas inside quotes or inside (), [] and {} do not split. Each
 * `name=value` entry becomes a field whose value is the raw, trimmed source
 * text (quotes included, nothing re-typed). Positional entries (no `=`) are
 * dropped.
 *
 * Inside a quoted value a backslash escapes the next character, so a string
 * ending in an escaped backslash (`'C:\\'`) closes correctly and `\'` does
 * not close the string.
 *
 * Example:
 *   parsePyRepr("VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE')")
 *   → { class_name: 'VerificationResult', fields: { rule_id: "'R049'", verdict: "'NOT_APPLICABLE'" } }
 *
 * @param {*} s - candidate repr string
 * @returns {{ class_name: string, fields: Object<string, string> } | null}
 *   null when `s` is not a string or is not of the form `Name(...)`.
 */
export function parsePyRepr(s) {
  if (typeof s !== "string") return null;
  const match = REPR_PATTERN.exec(s);
  if (!match) return null;
  const [, className, body] = match;

  const fields = {};
  let depth = 0;       // bracket nesting outside quotes
  let inQuote = null;  // the quote char that opened the current string, if any
  let escaped = false; // previous char inside the string was an unescaped backslash
  let buf = "";
  let key = null;

  // Commit the buffered entry. Entries without a key were positional.
  const flush = () => {
    const value = buf.trim();
    buf = "";
    if (!value) return;
    if (key == null) return;
    fields[key] = value;
    key = null;
  };

  for (let i = 0; i < body.length; i++) {
    const c = body[i];
    if (inQuote) {
      buf += c;
      if (escaped) {
        escaped = false;
      } else if (c === "\\") {
        escaped = true;
      } else if (c === inQuote) {
        inQuote = null;
      }
      continue;
    }
    if (c === "'" || c === '"') { inQuote = c; buf += c; continue; }
    if (c === "(" || c === "[" || c === "{") { depth++; buf += c; continue; }
    if (c === ")" || c === "]" || c === "}") { depth--; buf += c; continue; }
    if (c === "=" && depth === 0 && key == null) {
      // First top-level `=` of the entry separates name from value.
      key = buf.trim();
      buf = "";
      continue;
    }
    if (c === "," && depth === 0) { flush(); continue; }
    buf += c;
  }
  flush();
  return { class_name: className, fields };
}
111
+
112
/**
 * Recursively replace any dict key that looks like a Python repr with a
 * structured entry, in place, and return the same object for chaining.
 *
 * - Arrays: each element is normalized and written back to its slot.
 * - Plain objects: every key for which parsePyRepr() succeeds is deleted and
 *   re-filed under its class name as `{ fields, count }` inside an array
 *   bucket; `count` is the original value when that was a number, else 1.
 *   All other values are normalized recursively.
 * - Anything else is returned untouched.
 *
 * The object really is mutated: callers that ignore the return value still
 * see the normalized keys. If the class-name slot is already occupied by a
 * non-array value, the repr key is left as-is rather than clobbering that
 * value or throwing.
 *
 * @param {*} obj
 * @returns {*} the same `obj` reference
 */
export function normalizeReprKeys(obj) {
  if (Array.isArray(obj)) {
    obj.forEach((item, index) => { obj[index] = normalizeReprKeys(item); });
    return obj;
  }
  if (!obj || typeof obj !== "object") return obj;

  // Pass 1: recurse into ordinary keys and remember the repr-shaped ones.
  // Keys are snapshotted first so buckets added in pass 2 are never revisited.
  const reprEntries = [];
  for (const key of Object.keys(obj)) {
    const parsed = parsePyRepr(key);
    if (parsed) {
      reprEntries.push({ key, parsed });
    } else {
      obj[key] = normalizeReprKeys(obj[key]);
    }
  }

  // Pass 2: move each repr key into its class-name bucket.
  for (const { key, parsed } of reprEntries) {
    const existing = obj[parsed.class_name];
    if (existing !== undefined && !Array.isArray(existing)) continue; // slot taken, keep data
    const value = obj[key];
    delete obj[key];
    const bucket = existing !== undefined ? existing : (obj[parsed.class_name] = []);
    bucket.push({ fields: parsed.fields, count: typeof value === "number" ? value : 1 });
  }
  return obj;
}
140
+
141
// Ordered (error_type, pattern) pairs; the first matching pattern wins, so
// the order encodes precedence (e.g. an ImportError trace that also mentions
// "not found" is still an import_error).
const ERROR_CLASSIFIERS = [
  ["import_error", /ModuleNotFoundError|ImportError|No module named/i],
  ["attribute_error", /AttributeError/i],
  ["syntax_error", /SyntaxError|invalid syntax|unexpected character/i],
  ["permission_error", /PermissionError|permission denied/i],
  // "timed out", "timeout", "time-out" and errno-style "ETIMEDOUT".
  ["timeout", /tim(?:e|ed)[\s_-]?out/i],
  // sample parse failures usually mention pdfjs / docx / json
  ["sample_unparseable", /pdfjs|docx|json\.decoder|JSONDecodeError|UnicodeDecodeError/i],
  // schema violations from our own normalizer would have a hint
  ["schema_violation", /schema_violation|missing required key/i],
  // Common keyword-not-found signal: the workflow returned no match
  ["keyword_not_found", /no match|not found|未找到|关键词未匹配/i],
];

// Turn the accepted input kinds into text; anything else yields "".
function errorTextOf(raw) {
  if (typeof raw === "string") return raw;
  if (raw instanceof Error) return `${raw.name}: ${raw.message}`;
  if (raw instanceof Uint8Array) return new TextDecoder().decode(raw); // Buffer included
  return "";
}

/**
 * Classify an ERROR result by inferring `error_type` from the raw output,
 * stack trace or message.
 *
 * Accepts a string, a Buffer/Uint8Array (decoded as UTF-8, e.g. captured
 * stderr) or an Error (classified on `name: message`). Patterns are tried in
 * the fixed order of ERROR_CLASSIFIERS and the first hit wins.
 *
 * Conservative — when nothing matches, or the input is empty or of another
 * type, the result is "unknown".
 *
 * @param {string|Uint8Array|Error|*} rawOutput
 * @returns {string} one of: import_error, attribute_error, syntax_error,
 *   permission_error, timeout, sample_unparseable, schema_violation,
 *   keyword_not_found, unknown
 */
export function classifyError(rawOutput) {
  const text = errorTextOf(rawOutput);
  if (!text) return "unknown";
  for (const [errorType, pattern] of ERROR_CLASSIFIERS) {
    if (pattern.test(text)) return errorType;
  }
  return "unknown";
}
163
+
164
/**
 * Normalize a parsed workflow-output object to the canonical dict shape.
 *
 * Steps:
 *  1. Shape: a non-array object is shallow-copied; a top-level Python repr
 *     string is parsed into fields (surrounding quotes stripped, class name
 *     kept in `_source_class`); any other string or non-object becomes
 *     `{ raw_output }` capped at 5000 chars.
 *  2. Repr-string dict keys are normalized via normalizeReprKeys(). Its
 *     return value is adopted, so this works whether that helper mutates in
 *     place or returns a fresh object. A failure there is recorded as the
 *     `repr_normalization_failed` warning instead of propagating.
 *  3. `rule_id`: the caller-supplied value wins; otherwise a string already
 *     on the result is kept; otherwise "unknown" plus `missing_rule_id`.
 *  4. `verdict`: when missing or empty it becomes "ERROR" if the result
 *     carries a `raw_output` fallback (even an empty one — a workflow that
 *     printed nothing still failed to produce a verdict), else "UNKNOWN"
 *     plus `missing_verdict`. Values outside STANDARD_VERDICTS are kept and
 *     flagged `nonstandard_verdict`.
 *  5. ERROR verdicts get `error_type` from classifyError().
 *  6. Warnings are appended to `_warnings`; a pre-existing non-array
 *     `_warnings` value is wrapped as a single string entry first.
 *
 * @param {*} parsed - what JSON.parse yielded (may already be a dict, or be
 *   the raw_output fallback object)
 * @param {string} [ruleId] - what the caller knows the rule_id should be
 * @param {string} [rawOutput] - the original stdout (used for ERROR
 *   classification)
 * @returns {object} normalized result; always has string `rule_id` and
 *   `verdict`.
 */
export function normalizeWorkflowResult(parsed, ruleId, rawOutput) {
  const warnings = [];
  let result;

  if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
    result = { ...parsed };
  } else if (typeof parsed === "string") {
    const repr = parsePyRepr(parsed);
    if (repr) {
      // Strip Python's surrounding quote chars from string values so
      // STANDARD_VERDICTS comparisons see "PASS" rather than "'PASS'".
      // Only unwraps when the whole value sits in matching ' or " quotes.
      result = {};
      for (const [name, value] of Object.entries(repr.fields)) {
        const quoted = typeof value === "string" && /^(['"]).*\1$/s.test(value);
        result[name] = quoted ? value.slice(1, -1) : value;
      }
      result._source_class = repr.class_name;
      warnings.push("toplevel_repr_string");
    } else {
      result = { raw_output: parsed.slice(0, 5000) };
      warnings.push("toplevel_string");
    }
  } else {
    result = { raw_output: String(parsed ?? "").slice(0, 5000) };
    warnings.push("toplevel_nonobject");
  }

  // Recursively normalize repr keys in nested dicts (verdict_stats, etc.).
  try {
    const normalized = normalizeReprKeys(result);
    if (normalized && typeof normalized === "object" && !Array.isArray(normalized)) {
      result = normalized;
    }
  } catch {
    warnings.push("repr_normalization_failed");
  }

  // rule_id: prefer the caller-supplied value (it's authoritative)
  if (ruleId) {
    result.rule_id = ruleId;
  } else if (typeof result.rule_id !== "string") {
    result.rule_id = "unknown";
    warnings.push("missing_rule_id");
  }

  // verdict: ensure present and canonical-or-warn
  if (typeof result.verdict !== "string" || result.verdict === "") {
    const fellBackToRaw = result.raw_output !== undefined && result.raw_output !== null;
    if (fellBackToRaw) {
      result.verdict = "ERROR";
    } else {
      result.verdict = "UNKNOWN";
      warnings.push("missing_verdict");
    }
  } else if (!STANDARD_VERDICTS.has(result.verdict)) {
    warnings.push("nonstandard_verdict");
  }

  // ERROR classification
  if (result.verdict === "ERROR") {
    const trace = rawOutput || result.raw_output || result.error || "";
    result.error_type = classifyError(trace);
  }

  if (warnings.length > 0) {
    let prior;
    if (Array.isArray(result._warnings)) prior = result._warnings;
    else if (result._warnings == null) prior = [];
    else prior = [String(result._warnings)]; // avoid String.prototype.concat
    result._warnings = [...prior, ...warnings];
  }

  return result;
}
@@ -194,20 +194,32 @@ export class DocumentChunkTool extends BaseTool {
194
194
  };
195
195
  }
196
196
 
197
- // For other formats (.docx, .xlsx, etc): read as UTF-8 best-effort.
198
- // Upstream agent should call document_parse first and then document_chunk
199
- // on the parsed output directly — current MVP keeps the tool surface small.
197
+ // v0.7.0 G (#91): route .docx / .doc / others through native parser
198
+ // dispatcher (mammoth / word-extractor / LibreOffice fallback).
199
+ // Replaces the prior "read as UTF-8" stub which produced binary
200
+ // garbage on .docx and forced agents to call document_parse + chunk
201
+ // separately. extractText() returns clean text or a structured
202
+ // failure that downstream can surface to the agent.
200
203
  try {
201
- const txt = fs.readFileSync(absPath, "utf-8");
204
+ const { extractText } = await import("../document-parser.js");
205
+ const result = await extractText(absPath);
206
+ if (result.ok && result.text) {
207
+ return {
208
+ source_file: baseName,
209
+ total_pages: 1,
210
+ blocks: [{ page: 1, markdown: result.text }],
211
+ parse_via: result.via,
212
+ };
213
+ }
202
214
  return {
203
- source_file: baseName,
204
- total_pages: 1,
205
- blocks: [{ page: 1, markdown: txt }],
215
+ source_file: baseName, total_pages: 0, blocks: [],
216
+ parse_error: result.error ||
217
+ `Unsupported format '${suffix}'. Install mammoth / word-extractor or rely on LibreOffice fallback.`,
206
218
  };
207
- } catch {
219
+ } catch (e) {
208
220
  return {
209
221
  source_file: baseName, total_pages: 0, blocks: [],
210
- parse_error: `Unsupported format '${suffix}'. Run document_parse first and use its output, or stick to .pdf / .md / .txt.`,
222
+ parse_error: `parse exception: ${e?.message || String(e)}`,
211
223
  };
212
224
  }
213
225
  }
@@ -19,13 +19,17 @@ export class PhaseAdvanceTool extends BaseTool {
19
19
  * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
20
20
  * engine's phase BEFORE the call, so it can distinguish "already there"
21
21
  * (silent no-op, informational) from "non-adjacent refusal" (actionable).
22
- * Before H1 both cases returned the same confusing "Either you're already
23
- * there, or transition is non-adjacent" message.
22
+ * @param {() => string[]} [getRunningSubagentsFn] - v0.6.2 J1: returns the
23
+ * list of running subagent task_ids. When non-empty, phase_advance
24
+ * refuses unless `acknowledge_stale_subagents: true` is set in input
25
+ * (or `force: true`). Forces the agent to confront live work that
26
+ * started in the prior phase before declaring the phase done.
24
27
  */
25
- constructor(advanceFn, getCurrentPhaseFn) {
28
+ constructor(advanceFn, getCurrentPhaseFn, getRunningSubagentsFn) {
26
29
  super();
27
30
  this._advance = advanceFn;
28
31
  this._getCurrentPhase = getCurrentPhaseFn || (() => null);
32
+ this._getRunningSubagents = getRunningSubagentsFn || (() => []);
29
33
  }
30
34
 
31
35
  get name() { return "phase_advance"; }
@@ -48,6 +52,11 @@ export class PhaseAdvanceTool extends BaseTool {
48
52
  type: "boolean",
49
53
  description: "Allow non-adjacent or backward transitions. Default false.",
50
54
  },
55
+ acknowledge_stale_subagents: {
56
+ type: "boolean",
57
+ description:
58
+ "Set to true after using agent_tool(operation=list|poll|kill) to confirm you've handled any subagents still running from the prior phase. Required when subagents are live; otherwise advance is refused (use force:true to bypass entirely).",
59
+ },
51
60
  },
52
61
  required: ["to"],
53
62
  };
@@ -68,15 +77,52 @@ export class PhaseAdvanceTool extends BaseTool {
68
77
  );
69
78
  }
70
79
 
80
+ // v0.6.2 J1: stale-subagents acknowledgement gate. Refuses advance if
81
+ // any subagent is still running and the agent hasn't explicitly
82
+ // acknowledged. force:true bypasses (matches existing escape pattern).
83
+ const running = this._getRunningSubagents();
84
+ if (running.length > 0 && !input.acknowledge_stale_subagents && !input.force) {
85
+ return new ToolResult(
86
+ `Refusing to advance from ${beforePhase || "?"} to ${to}: ${running.length} subagent(s) still running from prior phase: ${running.join(", ")}. ` +
87
+ `Run agent_tool(operation="list") to see status, then either ` +
88
+ `agent_tool(operation="wait"|"kill") on each, OR pass acknowledge_stale_subagents:true ` +
89
+ `to advance while leaving them running (use only if they're legitimate background work).`,
90
+ true,
91
+ );
92
+ }
93
+
71
94
  const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
72
95
  if (advanced) {
96
+ // Log the ack so post-mortems can find phase advances that proceeded
97
+ // with live subagents
98
+ if (running.length > 0 && input.acknowledge_stale_subagents) {
99
+ return new ToolResult(
100
+ `Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""} — ` +
101
+ `acknowledged ${running.length} running subagent(s): ${running.join(", ")}.`,
102
+ );
103
+ }
73
104
  return new ToolResult(`Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""}`);
74
105
  }
75
106
 
76
- // Truly refused — non-adjacent transition without force, or terminal-phase
77
- // forward attempt. Give the actionable hint.
107
+ // Truly refused — possible reasons: non-adjacent transition,
108
+ // terminal-phase forward attempt, or hard-tracking gate (source phase's
109
+ // exit criteria not met by engine telemetry).
110
+ //
111
+ // v0.7.0 A3: refusal text no longer advertises `force:true`. E2E #5
112
+ // showed every conductor reading the old refusal hint and force-bypassing
113
+ // immediately (12/12 transitions). The escape valve remains in the input
114
+ // schema (discoverable) but isn't hand-fed to the LLM here. Instead,
115
+ // direct the agent at the missing milestones it can satisfy.
78
116
  return new ToolResult(
79
- `Did not advance to ${to}. Transition is non-adjacent${beforePhase ? ` (currently in ${beforePhase})` : ""} set force:true to override, or advance to the immediate-next phase first.`,
117
+ `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
118
+ `Likely cause: source-phase exit criteria not met. ` +
119
+ `Run /status (or read the phase describeState block in this turn's system reminder) ` +
120
+ `to see which milestones are missing, then produce the disk artifacts that satisfy them — ` +
121
+ `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
122
+ `workflows/<id>/*.py, output/results/*.json, etc.). ` +
123
+ `If the transition is non-adjacent or this phase truly is done despite the gate, ` +
124
+ `re-call with the documented schema flag. The engine logged the precise reason in ` +
125
+ `events.jsonl as 'phase_advance_refused'.`,
80
126
  false,
81
127
  );
82
128
  }