kc-beta 0.6.0 → 0.6.2
- package/package.json +1 -1
- package/src/agent/engine.js +315 -4
- package/src/agent/pipelines/distillation.js +15 -0
- package/src/agent/pipelines/extraction.js +60 -3
- package/src/agent/pipelines/production-qc.js +63 -13
- package/src/agent/pipelines/skill-authoring.js +91 -1
- package/src/agent/session-state.js +17 -1
- package/src/agent/skill-validator.js +149 -0
- package/src/agent/task-manager.js +15 -0
- package/src/agent/tools/_workflow-result-schema.js +249 -0
- package/src/agent/tools/phase-advance.js +34 -3
- package/src/agent/tools/workflow-run.js +43 -5
- package/src/agent/workspace.js +23 -0
- package/src/model-tiers.json +32 -0
- package/src/providers.js +45 -0
- package/template/skills/en/meta-meta/skill-authoring/SKILL.md +19 -0
- package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +19 -0

package/src/agent/pipelines/skill-authoring.js

@@ -2,11 +2,28 @@ import fs from "node:fs";
 import path from "node:path";
 import { Phase, PipelineEvent } from "./index.js";
 import { Pipeline } from "./base.js";
+import { SkillValidator } from "../skill-validator.js";
 
 export class SkillAuthoringPipeline extends Pipeline {
-  constructor(workspace) {
+  /**
+   * @param {Workspace} workspace
+   * @param {TaskManager|null} [taskManager] - v0.6.1 A2: pass the engine's
+   *   TaskManager so exitCriteriaMet can require task-completion parity in
+   *   addition to D2 filename coverage. Subagents pass null (no taskManager
+   *   in subagent scope), in which case the gate falls back to D2-only
+   *   behaviour.
+   */
+  constructor(workspace, taskManager = null) {
     super();
     this._workspace = workspace;
+    this._taskManager = taskManager;
+    // v0.6.2 I2: skill validator catches malformed check_r###.py at the
+    // skill_authoring exit boundary instead of silently passing the
+    // phase and breaking in production_qc (E2E #4 unified_qc.py
+    // SyntaxError went undiagnosed for hours).
+    this._validator = new SkillValidator();
+    this._validationFailures = [];
+    this._validationSkipped = false;
     this.totalRules = [];
     this.skillsAuthored = [];
     this.skillsWithScripts = [];
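
How the new constructor parameter is meant to be wired, per the JSDoc above — a minimal sketch, not the engine.js call site; the helper names and the import specifier are illustrative:

```js
// Sketch only: wiring implied by the JSDoc above. The import specifier and
// helper names are illustrative, not taken from engine.js.
import { SkillAuthoringPipeline } from "kc-beta/src/agent/pipelines/skill-authoring.js";

// Main engine: pass its TaskManager so the v0.6.1 A2 parity gate participates
// in exitCriteriaMet.
function buildEnginePipeline(workspace, taskManager) {
  return new SkillAuthoringPipeline(workspace, taskManager);
}

// Subagent scope: no TaskManager available, so taskManager defaults to null
// and the exit gate falls back to D2 filename coverage only.
function buildSubagentPipeline(workspace) {
  return new SkillAuthoringPipeline(workspace);
}
```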

package/src/agent/pipelines/skill-authoring.js

@@ -132,12 +149,37 @@ export class SkillAuthoringPipeline extends Pipeline {
       "`rule_catalog` tool for any catalog edits — sandbox_exec bypasses the " +
       "workspace file lock and races with parallel workers."
     ];
+    // v0.6.1 A2: surface task-completion parity so the agent sees the gate
+    let taskLine = "";
+    if (this._taskManager) {
+      const totalT = this._taskManager.countByPhase("skill_authoring");
+      const doneT = this._taskManager.countByPhase("skill_authoring", "completed");
+      const failedT = this._taskManager.countByPhase("skill_authoring", "failed");
+      if (totalT > 0) {
+        taskLine = `\n- Per-rule tasks completed: ${doneT}/${totalT}` +
+          (failedT > 0 ? ` (+${failedT} failed)` : "");
+      }
+    }
+    // v0.6.2 I2: validation status (only meaningful after first
+    // exitCriteriaMet call populates _validationFailures)
+    let validationLine = "";
+    if (this._validationSkipped) {
+      validationLine = `\n- Skill validation: SKIPPED (python3 not on PATH — install to enable)`;
+    } else if (this._validationFailures.length > 0) {
+      const f = this._validationFailures.slice(0, 5).map(({ filePath, error }) =>
+        `\n - ${path.relative(this._workspace.cwd, filePath)}: ${error.split("\n")[0]}`,
+      ).join("");
+      validationLine = `\n- Skills failing validation (${this._validationFailures.length}):${f}` +
+        (this._validationFailures.length > 5 ? `\n - … and ${this._validationFailures.length - 5} more` : "");
+    }
     parts.push(
       `### Progress (rule-id coverage, D2)\n` +
       `- Total rules in catalog: ${total}\n` +
       `- Rule ids covered by some skill: ${covered}\n` +
       `- Skill directories authored: ${this.skillsAuthored.length}\n` +
       `- Skills with scripts/: ${this.skillsWithScripts.length}` +
+      taskLine +
+      validationLine +
       (uncovered.length > 0
         ? `\n- Missing coverage (${uncovered.length}): ${uncovered.slice(0, 15).join(", ")}${uncovered.length > 15 ? "…" : ""}`
         : ""),

package/src/agent/pipelines/skill-authoring.js

@@ -169,9 +211,57 @@ export class SkillAuthoringPipeline extends Pipeline {
     // preserved as a secondary gate on skill depth.
     const allCovered = this.totalRules.every((r) => this.ruleIdsCovered.has(r));
     if (!allCovered) return false;
+    // v0.6.1 A2: tasks-parity gate. The 17-minute skill_authoring transition
+    // in E2E #4 happened because D2 fired on 20 skeleton SK01-SK20 dirs
+    // covering all 110 rule_ids by filename, while only ~5 of 110 per-rule
+    // skill_authoring tasks had actually been worked on. Now require every
+    // per-rule task in TaskManager to be in a terminal state (completed or
+    // failed). Subagents (no taskManager) skip this gate.
+    if (this._taskManager) {
+      const total = this._taskManager.countByPhase("skill_authoring");
+      if (total > 0) {
+        const completed = this._taskManager.countByPhase("skill_authoring", "completed");
+        const failed = this._taskManager.countByPhase("skill_authoring", "failed");
+        if (completed + failed < total) return false;
+      }
+    }
+    // v0.6.2 I2: skill validator — every check_r###.py must parse and
+    // expose an entry point. Catches the unified_qc.py-style monolith
+    // and other malformed scripts before they break in production_qc.
+    // mtime cache keeps this O(1) in steady state. Failures preserved
+    // in this._validationFailures for describeState rendering.
+    const checkFiles = this._collectCheckScripts();
+    const v = this._validator.validateAll(checkFiles);
+    this._validationFailures = v.failures;
+    this._validationSkipped = v.skipped;
+    if (!v.ok) return false;
     return this.skillsWithScripts.length >= Math.max(1, this.skillsAuthored.length * 0.5);
   }
 
+  /**
+   * v0.6.2 I2: gather every check_r###.py path under rule_skills/. Used by
+   * the skill validator. Walks one level into each skill directory.
+   */
+  _collectCheckScripts() {
+    const out = [];
+    const dir = path.join(this._workspace.cwd, "rule_skills");
+    if (!fs.existsSync(dir)) return out;
+    const walk = (d) => {
+      let entries;
+      try { entries = fs.readdirSync(d, { withFileTypes: true }); } catch { return; }
+      for (const e of entries) {
+        if (e.name.startsWith(".") || e.name.startsWith("__")) continue;
+        const p = path.join(d, e.name);
+        if (e.isDirectory()) { walk(p); continue; }
+        if (e.isFile() && /^check_r[\d_-]+\.py$/i.test(e.name)) {
+          out.push(p);
+        }
+      }
+    };
+    walk(dir);
+    return out;
+  }
+
   exportState() {
     return {
       totalRules: this.totalRules,
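
The collector above keys entirely off the filename pattern, so it is worth seeing which names the regex (copied from the diff) actually admits — the sample filenames below are invented for illustration:

```js
// The filename regex from _collectCheckScripts(), exercised on made-up names.
const CHECK_SCRIPT_RE = /^check_r[\d_-]+\.py$/i;

const samples = [
  "check_r001.py",       // true  — single rule id
  "check_r001-003.py",   // true  — hyphenated range is allowed
  "CHECK_R001.PY",       // true  — the /i flag makes it case-insensitive
  "check_r001_notes.py", // false — letters after the digits are rejected
  "unified_qc.py",       // false — the E2E #4 monolith would never be collected (and so never validated)
];
for (const name of samples) {
  console.log(name, CHECK_SCRIPT_RE.test(name));
}
```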

package/src/agent/session-state.js

@@ -12,9 +12,14 @@ export class SessionState {
    * @param {string} workspacePath - Session workspace directory
    * @param {object} [opts]
    * @param {string} [opts.statePath] - Override absolute path (used for sub-agent isolation, Bug 2)
+   * @param {Workspace} [opts.workspace] - v0.6.2 J3: optional workspace ref so
+   *   save() can acquire a sync file lock on session-state.json. Without it
+   *   (subagents, tests), save() falls back to lock-free writes — same
+   *   behavior as pre-v0.6.2.
    */
   constructor(workspacePath, opts = {}) {
     this._path = opts.statePath || path.join(workspacePath, "session-state.json");
+    this._workspace = opts.workspace || null;
   }
 
   /**

package/src/agent/session-state.js

@@ -46,7 +51,18 @@ export class SessionState {
       pipelineMilestones: this._extractMilestones(engine.pipelines),
     };
 
-    fs.writeFileSync(this._path, JSON.stringify(state, null, 2), "utf-8");
+    // v0.6.2 J3: acquire sync file lock if workspace ref available.
+    // session-state.json is in SHARED_COORDINATION_PATHS — concurrent
+    // writers (parallel ralph-loop workers + main saveState ticks)
+    // could otherwise interleave and corrupt the JSON.
+    const write = () => {
+      fs.writeFileSync(this._path, JSON.stringify(state, null, 2), "utf-8");
+    };
+    if (this._workspace?.withSyncFileLock) {
+      this._workspace.withSyncFileLock("session-state.json", write);
+    } else {
+      write();
+    }
   }
 
   /**
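
A sketch of how the two save() paths differ, assuming only what the diff shows about the constructor options and the withSyncFileLock(name, fn) call shape; the Workspace stand-in, paths, and import specifier are invented:

```js
// Illustrative only. The lock-capable workspace here is a stand-in object;
// a real Workspace would take an OS-level lock on the named file before
// running the callback.
import { SessionState } from "kc-beta/src/agent/session-state.js";

const workspaceWithLock = {
  withSyncFileLock(name, fn) {
    // pretend the lock on `name` was acquired here
    return fn();
  },
};

// Main engine: save() routes its writeFileSync through withSyncFileLock,
// so parallel workers can't interleave writes to session-state.json.
const mainState = new SessionState("/tmp/ws", { workspace: workspaceWithLock });

// Subagent or test: no workspace ref, so save() falls back to the plain
// lock-free write — the pre-v0.6.2 behavior.
const subagentState = new SessionState("/tmp/ws", {
  statePath: "/tmp/ws/subagent-1/session-state.json",
});
```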

package/src/agent/skill-validator.js (new file, +149)

@@ -0,0 +1,149 @@
+/**
+ * v0.6.2 I2: Skill validator (was D3c, deferred from v0.6.0/v0.6.1).
+ *
+ * E2E #4 demonstrated that broken `check_r###.py` contents go undetected
+ * until production_qc throws (e.g., `SyntaxError: unexpected character
+ * after line continuation character` from line 733 of unified_qc.py).
+ * This validator catches such breakage at the skill_authoring phase
+ * boundary instead of months later in production.
+ *
+ * Design constraints:
+ * - exitCriteriaMet is sync, so validation is sync (execFileSync).
+ * - 110 files × ~50ms subprocess = 5.5s worst case; caching by mtime
+ *   keeps steady-state cost at ~0 (only re-validate freshly modified
+ *   files).
+ * - Failures are diagnostic, not punitive: `force: true` on phase_advance
+ *   still bypasses. The validator's job is to refuse the auto-advance,
+ *   not to trap the agent.
+ *
+ * Validation rules per `check_r###.py`:
+ * 1. File ≥ 100 bytes (smoke test for empty stubs).
+ * 2. Passes `python3 -c "import ast; ast.parse(open(F).read())"` (no
+ *    syntax errors).
+ * 3. Defines a function reachable by name `check_rule` or `verify`
+ *    (regex match on file content).
+ *
+ * Disable mechanism: if `python3` is not on PATH, validator silently
+ * passes everything and emits a one-time warning — we don't want the
+ * gate to block on missing tooling. Gate effectively no-ops.
+ */
+
+import { execFileSync } from "node:child_process";
+import fs from "node:fs";
+import path from "node:path";
+
+const ENTRY_POINT_REGEX = /^\s*(?:async\s+)?def\s+(check_rule|verify)\b/m;
+const MIN_BYTES = 100;
+
+export class SkillValidator {
+  constructor() {
+    /** @type {Map<string, { mtime: number, ok: boolean, error?: string }>} */
+    this._cache = new Map();
+    /** @type {boolean|null} - null = untested, true/false once probed */
+    this._pythonAvailable = null;
+    /** @type {boolean} - one-time warning suppression */
+    this._warned = false;
+  }
+
+  /**
+   * Probe whether python3 is available. Cached after first call.
+   * @returns {boolean}
+   */
+  _probePython() {
+    if (this._pythonAvailable !== null) return this._pythonAvailable;
+    try {
+      execFileSync("python3", ["-c", "import ast"], { stdio: "ignore", timeout: 5000 });
+      this._pythonAvailable = true;
+    } catch {
+      this._pythonAvailable = false;
+    }
+    return this._pythonAvailable;
+  }
+
+  /**
+   * Validate one file. Returns `{ ok, error? }`. Cached by mtime.
+   * @param {string} filePath - Absolute path to the .py file
+   * @returns {{ ok: boolean, error?: string }}
+   */
+  validateFile(filePath) {
+    let mtime;
+    try {
+      mtime = fs.statSync(filePath).mtimeMs;
+    } catch {
+      return { ok: false, error: "file not found" };
+    }
+    const cached = this._cache.get(filePath);
+    if (cached && cached.mtime === mtime) {
+      return { ok: cached.ok, error: cached.error };
+    }
+    const result = this._runValidation(filePath);
+    this._cache.set(filePath, { mtime, ...result });
+    return result;
+  }
+
+  /**
+   * Validate all files in a list. Returns:
+   * - ok: boolean — true iff every file passes
+   * - failures: array of { filePath, error } for each failing file
+   * - skipped: boolean — true if python3 unavailable (validator no-op'd)
+   *
+   * @param {string[]} filePaths
+   * @returns {{ ok: boolean, failures: Array<{filePath:string, error:string}>, skipped: boolean }}
+   */
+  validateAll(filePaths) {
+    if (!this._probePython()) {
+      if (!this._warned) {
+        // eslint-disable-next-line no-console
+        console.warn("[skill-validator] python3 not on PATH — skill validation skipped. " +
+          "Phase gate will not catch syntax errors. Install python3 to enable.");
+        this._warned = true;
+      }
+      return { ok: true, failures: [], skipped: true };
+    }
+    const failures = [];
+    for (const f of filePaths) {
+      const r = this.validateFile(f);
+      if (!r.ok) failures.push({ filePath: f, error: r.error || "unknown" });
+    }
+    return { ok: failures.length === 0, failures, skipped: false };
+  }
+
+  /**
+   * Manually invalidate cache for a path — used when the caller knows
+   * the file changed but mtime granularity might not have caught it.
+   */
+  invalidate(filePath) { this._cache.delete(filePath); }
+
+  // --- Internal ---
+
+  _runValidation(filePath) {
+    // Rule 1: size check (cheap)
+    let size;
+    try { size = fs.statSync(filePath).size; }
+    catch { return { ok: false, error: "stat failed" }; }
+    if (size < MIN_BYTES) {
+      return { ok: false, error: `file too small (${size} < ${MIN_BYTES} bytes)` };
+    }
+
+    // Rule 2: ast.parse smoke test via subprocess
+    try {
+      execFileSync("python3", [
+        "-c",
+        `import ast,sys\ntry:\n ast.parse(open(${JSON.stringify(filePath)}).read())\nexcept SyntaxError as e:\n print(f"SyntaxError: {e}", file=sys.stderr); sys.exit(1)\nexcept Exception as e:\n print(f"{type(e).__name__}: {e}", file=sys.stderr); sys.exit(1)\n`,
+      ], { stdio: ["ignore", "ignore", "pipe"], timeout: 10_000 });
+    } catch (e) {
+      const stderr = (e.stderr ? e.stderr.toString() : "") || e.message || "subprocess failed";
+      return { ok: false, error: stderr.trim().slice(0, 300) };
+    }
+
+    // Rule 3: entry-point regex (after parse OK so we know file is readable)
+    let content;
+    try { content = fs.readFileSync(filePath, "utf-8"); }
+    catch { return { ok: false, error: "read failed after parse OK" }; }
+    if (!ENTRY_POINT_REGEX.test(content)) {
+      return { ok: false, error: "no entry point: expected `def check_rule(...)` or `def verify(...)`" };
+    }
+
+    return { ok: true };
+  }
+}
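
Usage sketch for the new validator; the return shapes follow the JSDoc above, while the file paths and the surrounding handling are invented for illustration:

```js
// Sketch, not package code: exercising SkillValidator on two check scripts.
import { SkillValidator } from "kc-beta/src/agent/skill-validator.js";

const validator = new SkillValidator();
const result = validator.validateAll([
  "/tmp/ws/rule_skills/SK01/scripts/check_r001.py",
  "/tmp/ws/rule_skills/SK02/scripts/check_r002.py",
]);

if (result.skipped) {
  // python3 was not on PATH: everything "passes" and a single warning was logged
} else if (!result.ok) {
  for (const { filePath, error } of result.failures) {
    // error is the first 300 chars of stderr, or e.g.
    // "file too small (42 < 100 bytes)" /
    // "no entry point: expected `def check_rule(...)` or `def verify(...)`"
    console.error(`${filePath}: ${error}`);
  }
}
```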

package/src/agent/task-manager.js

@@ -182,6 +182,21 @@ export class TaskManager {
     return { total, completed, inProgress, pending, failed };
   }
 
+  /**
+   * v0.6.1 A2: Phase-scoped task count. Used by SkillAuthoringPipeline's
+   * exitCriteriaMet to gate phase advance on TaskManager parity, not just
+   * filename-regex coverage. Pass a status to filter; omit for total.
+   *
+   * @param {string} phase - Phase name (e.g., "skill_authoring")
+   * @param {string|null} [status] - Optional status filter ("completed", "pending", etc.)
+   * @returns {number}
+   */
+  countByPhase(phase, status = null) {
+    return this._tasks.filter(
+      (t) => t.phase === phase && (status == null || t.status === status),
+    ).length;
+  }
+
   /**
    * Format task list for injection into system prompt context.
    * Compact checklist — not conversation history.
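
Call shapes for countByPhase, shown against a tiny stand-in task list (the stub below only exists to make the snippet runnable; the filter body is copied from the diff):

```js
// Stand-in TaskManager with the countByPhase body from the diff above.
const taskManager = {
  _tasks: [
    { phase: "skill_authoring", status: "completed" },
    { phase: "skill_authoring", status: "pending" },
    { phase: "extraction", status: "completed" },
  ],
  countByPhase(phase, status = null) {
    return this._tasks.filter(
      (t) => t.phase === phase && (status == null || t.status === status),
    ).length;
  },
};

taskManager.countByPhase("skill_authoring");               // 2 — no status filter: phase total
taskManager.countByPhase("skill_authoring", "completed");  // 1
taskManager.countByPhase("skill_authoring", "failed");     // 0 — the pending task still blocks the A2 parity gate
```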

package/src/agent/tools/_workflow-result-schema.js (new file, +249)

@@ -0,0 +1,249 @@
+/**
+ * v0.6.2 I1: Shared workflow-result normalizer + ERROR classifier.
+ *
+ * E2E #4 produced 1,150 ERROR verdicts out of 6,930 (16.6%) and
+ * verdict_stats keys leaked Python dataclass repr() strings like
+ * "VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE', ...)".
+ * The agent's batch aggregator was using repr(result) as a dict key
+ * because the workflow's Python output was a dataclass instance, not
+ * a dict.
+ *
+ * This module fixes the boundary: anything that comes out of a
+ * workflow_run tool gets normalized to a strict dict shape before being
+ * persisted or returned to the agent. Repr-strings get parsed back into
+ * structured fields. ERRORs get classified into typed buckets so we can
+ * tell "import failed" from "extraction returned wrong shape" without
+ * reading 1,150 stack traces.
+ */
+
+/**
+ * The required shape every workflow result must satisfy. Unknown extra
+ * keys are preserved.
+ */
+export const REQUIRED_KEYS = ["rule_id", "verdict"];
+
+/**
+ * Canonical verdict values. Anything outside this set is allowed (the
+ * worker LLM may extend) but generates a `nonstandard_verdict` warning
+ * in the result's `_warnings` array.
+ */
+export const STANDARD_VERDICTS = new Set([
+  "PASS", "FAIL", "NOT_APPLICABLE", "SUPPLEMENT_NEEDED", "ERROR", "UNKNOWN",
+]);
+
+/**
+ * Recognized error_type values used by classifyError(). Add to this set
+ * when adding a new pattern below.
+ */
+export const ERROR_TYPES = [
+  "import_error",
+  "attribute_error",
+  "keyword_not_found",
+  "sample_unparseable",
+  "schema_violation",
+  "syntax_error",
+  "timeout",
+  "permission_error",
+  "unknown",
+];
+
+/**
+ * Detect whether a string looks like a Python dataclass repr —
+ * `ClassName(field=value, field=value)`. Used both as a top-level
+ * detector and recursively inside dict keys.
+ */
+const REPR_PATTERN = /^([A-Za-z_]\w*)\((.*)\)$/s;
+
+/**
+ * Parse a Python-repr string into { class_name, fields: { ... } }.
+ * Field values are kept as strings (we don't try to re-type them — the
+ * downstream consumer can JSON.parse if needed). Returns null if the
+ * input doesn't look like a repr.
+ *
+ * Example:
+ *   parsePyRepr("VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE')")
+ *   → { class_name: 'VerificationResult', fields: { rule_id: "'R049'", verdict: "'NOT_APPLICABLE'" } }
+ */
+export function parsePyRepr(s) {
+  if (typeof s !== "string") return null;
+  const m = s.match(REPR_PATTERN);
+  if (!m) return null;
+  const className = m[1];
+  const body = m[2];
+  // Tokenize on top-level commas (ignore commas inside brackets/quotes)
+  const fields = {};
+  let depth = 0;
+  let inQuote = null;
+  let buf = "";
+  let key = null;
+  const flush = () => {
+    if (!buf.trim()) return;
+    if (key == null) {
+      // No `=` seen — entry was positional, skip
+      buf = "";
+      return;
+    }
+    fields[key] = buf.trim();
+    key = null;
+    buf = "";
+  };
+  for (let i = 0; i < body.length; i++) {
+    const c = body[i];
+    if (inQuote) {
+      buf += c;
+      if (c === inQuote && body[i - 1] !== "\\") inQuote = null;
+      continue;
+    }
+    if (c === "'" || c === '"') { inQuote = c; buf += c; continue; }
+    if (c === "(" || c === "[" || c === "{") { depth++; buf += c; continue; }
+    if (c === ")" || c === "]" || c === "}") { depth--; buf += c; continue; }
+    if (c === "=" && depth === 0 && key == null) {
+      key = buf.trim();
+      buf = "";
+      continue;
+    }
+    if (c === "," && depth === 0) { flush(); continue; }
+    buf += c;
+  }
+  flush();
+  return { class_name: className, fields };
+}
+
+/**
+ * Recursively replace any dict key that looks like a Python repr with
+ * a structured object. Also handles arrays. Mutates in place but also
+ * returns the input for chaining.
+ */
+export function normalizeReprKeys(obj) {
+  if (Array.isArray(obj)) {
+    obj.forEach((v, i) => { obj[i] = normalizeReprKeys(v); });
+    return obj;
+  }
+  if (obj && typeof obj === "object") {
+    const newObj = {};
+    for (const [k, v] of Object.entries(obj)) {
+      const parsed = parsePyRepr(k);
+      if (parsed) {
+        // Merge under a class-name bucket. Multiple repr keys for the
+        // same class collapse to a counter (because verdict_stats just
+        // wanted distinct buckets).
+        const bucket = newObj[parsed.class_name] || (newObj[parsed.class_name] = []);
+        bucket.push({ fields: parsed.fields, count: typeof v === "number" ? v : 1 });
+      } else {
+        newObj[k] = normalizeReprKeys(v);
+      }
+    }
+    return newObj;
+  }
+  return obj;
+}
+
+/**
+ * Classify an ERROR result by inferring `error_type` from the raw_output
+ * stack trace or message. Returns one of ERROR_TYPES.
+ *
+ * Conservative — when in doubt, return "unknown" rather than guess wrong.
+ */
+export function classifyError(rawOutput) {
+  if (!rawOutput || typeof rawOutput !== "string") return "unknown";
+  const s = rawOutput;
+  if (/ModuleNotFoundError|ImportError|No module named/i.test(s)) return "import_error";
+  if (/AttributeError/i.test(s)) return "attribute_error";
+  if (/SyntaxError|invalid syntax|unexpected character/i.test(s)) return "syntax_error";
+  if (/PermissionError|permission denied/i.test(s)) return "permission_error";
+  if (/timed out|timeout|Timeout/i.test(s)) return "timeout";
+  // sample parse failures usually mention pdfjs / docx / json
+  if (/pdfjs|docx|json\.decoder|JSONDecodeError|UnicodeDecodeError/i.test(s)) return "sample_unparseable";
+  // schema violations from our own normalizer would have a hint
+  if (/schema_violation|missing required key/i.test(s)) return "schema_violation";
+  // Common keyword-not-found signal: the workflow returned no match
+  if (/no match|not found|未找到|关键词未匹配/i.test(s)) return "keyword_not_found";
+  return "unknown";
+}
+
+/**
+ * Normalize a parsed workflow-output object to the canonical dict shape.
+ * - Ensures `rule_id` and `verdict` are present.
+ * - Strips repr-string keys (delegates to normalizeReprKeys).
+ * - If verdict is "ERROR" or the parse fell back to raw_output, attaches
+ *   `error_type` from classifyError().
+ * - Records issues in `_warnings: string[]` so the consumer (and the
+ *   agent reading the tool result) can see them.
+ *
+ * Inputs:
+ *   parsed    — what JSON.parse yielded (may already be a dict, or be
+ *               the raw_output fallback object)
+ *   ruleId    — what the caller knows the rule_id should be
+ *   rawOutput — the original stdout (used for ERROR classification)
+ *
+ * Returns the normalized result. Always returns a dict with `rule_id`
+ * and `verdict`. Never throws.
+ */
+export function normalizeWorkflowResult(parsed, ruleId, rawOutput) {
+  const warnings = [];
+  let result;
+  if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
+    result = { ...parsed };
+  } else if (typeof parsed === "string") {
+    // Parsed yielded a string — could be a repr at top level
+    const repr = parsePyRepr(parsed);
+    if (repr) {
+      // Strip Python's surrounding quote chars from string values so
+      // STANDARD_VERDICTS comparisons work and downstream code doesn't
+      // see "'PASS'" instead of "PASS". Conservative: only unwrap when
+      // the entire value is wrapped in matching ' or " quotes.
+      const stripped = {};
+      for (const [k, v] of Object.entries(repr.fields)) {
+        if (typeof v === "string" && /^(['"]).*\1$/s.test(v) && v.length >= 2) {
+          stripped[k] = v.slice(1, -1);
+        } else {
+          stripped[k] = v;
+        }
+      }
+      result = stripped;
+      result._source_class = repr.class_name;
+      warnings.push("toplevel_repr_string");
+    } else {
+      result = { raw_output: parsed.slice(0, 5000) };
+      warnings.push("toplevel_string");
+    }
+  } else {
+    result = { raw_output: String(parsed ?? "").slice(0, 5000) };
+    warnings.push("toplevel_nonobject");
+  }
+
+  // Recursively normalize repr keys in nested dicts (verdict_stats, etc.)
+  normalizeReprKeys(result);
+
+  // rule_id: prefer the caller-supplied value (it's authoritative)
+  if (ruleId) result.rule_id = ruleId;
+  else if (typeof result.rule_id !== "string") {
+    result.rule_id = "unknown";
+    warnings.push("missing_rule_id");
+  }
+
+  // verdict: ensure present and canonical-or-warn
+  if (typeof result.verdict !== "string" || result.verdict === "") {
+    // If the workflow fell into raw_output fallback, mark as ERROR
+    if (result.raw_output) {
+      result.verdict = "ERROR";
+    } else {
+      result.verdict = "UNKNOWN";
+      warnings.push("missing_verdict");
+    }
+  } else if (!STANDARD_VERDICTS.has(result.verdict)) {
+    warnings.push("nonstandard_verdict");
+  }
+
+  // ERROR classification
+  if (result.verdict === "ERROR") {
+    const trace = rawOutput || result.raw_output || result.error || "";
+    result.error_type = classifyError(trace);
+  }
+
+  if (warnings.length > 0) {
+    result._warnings = (result._warnings || []).concat(warnings);
+  }
+
+  return result;
+}
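
What the normalizer does to the exact failure mode described in the module docstring — a repr string where a dict was expected. The call below passes the repr string through as both the parsed value and the raw stdout; whether workflow-run.js calls it exactly this way is an assumption, and the import specifier is illustrative:

```js
import { normalizeWorkflowResult } from "kc-beta/src/agent/tools/_workflow-result-schema.js";

// The repr string is the module docstring's own example.
const stdout = "VerificationResult(rule_id='R049', verdict='NOT_APPLICABLE')";
const result = normalizeWorkflowResult(stdout, "R049", stdout);

// result.rule_id       === "R049"            — caller-supplied id is authoritative
// result.verdict       === "NOT_APPLICABLE"  — quotes stripped from the repr field value
// result._source_class === "VerificationResult"
// result._warnings     includes "toplevel_repr_string"
// No error_type is attached, because the verdict is not "ERROR".
```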

package/src/agent/tools/phase-advance.js

@@ -19,13 +19,17 @@ export class PhaseAdvanceTool extends BaseTool {
    * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
    *   engine's phase BEFORE the call, so it can distinguish "already there"
    *   (silent no-op, informational) from "non-adjacent refusal" (actionable).
-   *
-   *
+   * @param {() => string[]} [getRunningSubagentsFn] - v0.6.2 J1: returns the
+   *   list of running subagent task_ids. When non-empty, phase_advance
+   *   refuses unless `acknowledge_stale_subagents: true` is set in input
+   *   (or `force: true`). Forces the agent to confront live work that
+   *   started in the prior phase before declaring the phase done.
    */
-  constructor(advanceFn, getCurrentPhaseFn) {
+  constructor(advanceFn, getCurrentPhaseFn, getRunningSubagentsFn) {
     super();
     this._advance = advanceFn;
     this._getCurrentPhase = getCurrentPhaseFn || (() => null);
+    this._getRunningSubagents = getRunningSubagentsFn || (() => []);
   }
 
   get name() { return "phase_advance"; }

package/src/agent/tools/phase-advance.js

@@ -48,6 +52,11 @@ export class PhaseAdvanceTool extends BaseTool {
           type: "boolean",
           description: "Allow non-adjacent or backward transitions. Default false.",
         },
+        acknowledge_stale_subagents: {
+          type: "boolean",
+          description:
+            "Set to true after using agent_tool(operation=list|poll|kill) to confirm you've handled any subagents still running from the prior phase. Required when subagents are live; otherwise advance is refused (use force:true to bypass entirely).",
+        },
       },
       required: ["to"],
     };

package/src/agent/tools/phase-advance.js

@@ -68,8 +77,30 @@ export class PhaseAdvanceTool extends BaseTool {
       );
     }
 
+    // v0.6.2 J1: stale-subagents acknowledgement gate. Refuses advance if
+    // any subagent is still running and the agent hasn't explicitly
+    // acknowledged. force:true bypasses (matches existing escape pattern).
+    const running = this._getRunningSubagents();
+    if (running.length > 0 && !input.acknowledge_stale_subagents && !input.force) {
+      return new ToolResult(
+        `Refusing to advance from ${beforePhase || "?"} to ${to}: ${running.length} subagent(s) still running from prior phase: ${running.join(", ")}. ` +
+        `Run agent_tool(operation="list") to see status, then either ` +
+        `agent_tool(operation="wait"|"kill") on each, OR pass acknowledge_stale_subagents:true ` +
+        `to advance while leaving them running (use only if they're legitimate background work).`,
+        true,
+      );
+    }
+
     const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
     if (advanced) {
+      // Log the ack so post-mortems can find phase advances that proceeded
+      // with live subagents
+      if (running.length > 0 && input.acknowledge_stale_subagents) {
+        return new ToolResult(
+          `Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""} — ` +
+          `acknowledged ${running.length} running subagent(s): ${running.join(", ")}.`,
+        );
+      }
       return new ToolResult(`Advanced${beforePhase ? ` from ${beforePhase}` : ""} to ${to}${input.force ? " (forced)" : ""}`);
     }
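
The three ways an agent can get past the J1 gate, shown as plain input objects for the phase_advance tool; the reason strings and the target phase are illustrative:

```js
// Refused while subagents are still running: no acknowledgement, no force.
const refusedInput = { to: "production_qc", reason: "all skills authored" };

// Accepted: the agent has run agent_tool(operation="list"), decided the
// remaining subagents are legitimate background work, and says so.
const acknowledgedInput = {
  to: "production_qc",
  reason: "skills authored; leaving doc-sync subagent running",
  acknowledge_stale_subagents: true,
};

// Last resort: force bypasses the adjacency check and the J1 gate alike.
const forcedInput = { to: "production_qc", force: true };
```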