npm - @ctxr/skill-llm-wiki - Versions diffs - 1.0.2 → 1.2.0 - Mend

@ctxr/skill-llm-wiki 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/CHANGELOG.md +128 -0
package/README.md +11 -8
package/SKILL.md +11 -11
package/guide/cli.md +3 -2
package/guide/correctness/safety.md +2 -2
package/guide/layout/in-place-mode.md +1 -1
package/guide/substrate/operators.md +1 -1
package/guide/substrate/tiered-ai.md +6 -5
package/guide/ux/user-intent.md +1 -1
package/package.json +13 -4
package/scripts/cli.mjs +92 -2
package/scripts/lib/balance.mjs +579 -0
package/scripts/lib/cluster-detect.mjs +482 -4
package/scripts/lib/contract.mjs +53 -4
package/scripts/lib/decision-log.mjs +121 -15
package/scripts/lib/draft.mjs +127 -20
package/scripts/lib/frontmatter.mjs +45 -9
package/scripts/lib/heal.mjs +5 -0
package/scripts/lib/intent.mjs +370 -4
package/scripts/lib/join-constants.mjs +22 -0
package/scripts/lib/join.mjs +917 -0
package/scripts/lib/nest-applier.mjs +395 -32
package/scripts/lib/operators.mjs +472 -38
package/scripts/lib/orchestrator.mjs +419 -12
package/scripts/lib/root-containment.mjs +351 -0
package/scripts/lib/similarity-cache.mjs +115 -20
package/scripts/lib/similarity.mjs +11 -0
package/scripts/lib/soft-dag.mjs +726 -0
package/scripts/lib/tier2-protocol.mjs +169 -37
package/scripts/lib/tiered.mjs +42 -18
package/scripts/lib/validate.mjs +22 -0

package/scripts/lib/decision-log.mjs CHANGED Viewed

@@ -16,9 +16,14 @@
 // queryable even after the op is reset.
 import {
+  appendFileSync,
+  closeSync,
   existsSync,
+  fstatSync,
   mkdirSync,
+  openSync,
   readFileSync,
+  readSync,
   renameSync,
   writeFileSync,
 } from "node:fs";
@@ -130,27 +135,112 @@ function emitEntry(entry) {
   return lines.join("\n");
 }
-// Append an entry atomically.
+// Append an entry.
+//
+// Hot path: at large-corpus scale (596 leaves → 189k pairwise
+// decisions observed) this is called once per decision. An earlier
+// implementation read the whole file, concatenated the new entry,
+// wrote to a temp, and renamed — O(file-size) per append. On a
+// 45 MB decisions.yaml that's ~22 MB of avg-read per call × 189k
+// calls ≈ 4 TB of I/O, which alone accounted for most of a 2h15m
+// build's wall-clock time.
+//
+// Durability guarantees:
+//
+//   - First call (file doesn't exist): writes header + first entry
+//     via temp+rename. The initial file materialises atomically —
+//     a crash during the first call leaves either no file or a
+//     well-formed single-entry file.
+//
+//   - Subsequent calls: best-effort `appendFileSync`. Each call is
+//     normally a single `write(2)` syscall of the serialised entry. In
+//     the common case the kernel writes the full buffer atomically,
+//     but this is NOT a formal durability contract for regular
+//     files the way temp+rename is:
+//
+//       * A crash mid-write can leave a torn trailing entry. On
+//         recovery the YAML parser will reject the truncated
+//         scalar; the audit log is recoverable by removing the
+//         last partial `- ...` block and re-running the op.
+//
+//       * Node's `writeSync`/`appendFileSync` MAY split a large
+//         buffer into multiple `write(2)` calls. Typical entry
+//         blocks here are ~200 bytes — well under typical
+//         single-write thresholds — but there is no portable
+//         small-write atomicity guarantee for regular files
+//         (POSIX's PIPE_BUF atomicity applies to pipes/FIFOs, not
+//         disk files).
+//
+//       * On Windows, `appendFileSync` has no equivalent of
+//         POSIX O_APPEND kernel serialisation under concurrent
+//         writers from multiple processes. This phase runs
+//         single-process though, so cross-process interleaving
+//         is not a concern in practice.
+//
+// The decision log is an audit trail, not a reproducibility
+// artefact — lost tail bytes on a crash are annoying but
+// recoverable, and the output tree's byte-reproducibility is
+// independent of this file's exact contents. If stronger
+// durability is needed for a specific use case, callers should
+// batch-flush to a temp file and rename on phase boundaries.
+//
+// Cost per append: O(entry-size), not O(file-size). ~200 µs vs
+// ~20 ms on a big log — a 100× speedup at scale.
 export function appendDecision(wikiRoot, entry) {
   validate(entry);
   const path = decisionLogPath(wikiRoot);
   mkdirSync(dirname(path), { recursive: true });
   const block = emitEntry(entry) + "\n";
-  let payload;
   if (!existsSync(path)) {
-    payload =
+    // First call: lay down the header atomically via temp+rename so
+    // a crash mid-creation doesn't leave an empty or orphan file.
+    const payload =
       "# skill-llm-wiki tiered-AI decision log (append-only)\n" +
       "version: 1\n" +
       "entries:\n" +
       block;
-  } else {
-    const existing = readFileSync(path, "utf8");
-    const prefix = existing.endsWith("\n") ? existing : existing + "\n";
-    payload = prefix + block;
+    const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
+    writeFileSync(tmp, payload, "utf8");
+    renameSync(tmp, path);
+    return;
+  }
+  // Subsequent appends: O(entry-size) via POSIX append. Peek at
+  // the last byte first: if the existing file doesn't end in a
+  // newline (manual edit, prior torn-tail truncation, or a
+  // creative crash), appending directly would concatenate the new
+  // entry onto the previous line and produce invalid YAML. Prefix
+  // a newline in that case — a leading blank line inside the
+  // entries[] list is harmless and parses fine.
+  const needsLeadingNewline = !endsWithNewline(path);
+  appendFileSync(path, needsLeadingNewline ? "\n" + block : block, "utf8");
+}
+// Check the last byte of the decision log without reading the
+// whole file. Uses a small anchored read rather than `readFileSync`
+// so the hot append path still pays O(1) regardless of log size.
+// An unreadable file (ENOENT, EACCES, race window) is treated as
+// "already newline-terminated" so the caller doesn't double up on
+// leading newlines on a transient read error.
+function endsWithNewline(path) {
+  let fd;
+  try {
+    fd = openSync(path, "r");
+    const { size } = fstatSync(fd);
+    if (size === 0) return true; // empty file has no trailing content to collide
+    const buf = Buffer.alloc(1);
+    readSync(fd, buf, 0, 1, size - 1);
+    return buf[0] === 0x0a; // 0x0a == '\n'
+  } catch {
+    return true;
+  } finally {
+    if (fd !== undefined) {
+      try {
+        closeSync(fd);
+      } catch {
+        /* best-effort */
+      }
+    }
   }
-  const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
-  writeFileSync(tmp, payload, "utf8");
-  renameSync(tmp, path);
 }
 // Convenience helper for cluster-NEST outcomes. The convergence
@@ -164,14 +254,18 @@ export function appendDecision(wikiRoot, entry) {
 //
 //   op_id, operator="NEST"                — as-is
 //   sources                               — leaf ids in the cluster
-//   tier_used                             — 2 (every NEST decision
-//                                           touches Tier 2 either
-//                                           via propose_structure
-//                                           or nest_decision)
+//   tier_used                             — caller-supplied (default 2
+//                                           for legacy Tier-2-touching
+//                                           NEST paths; 0 under
+//                                           `--quality-mode deterministic`
+//                                           since no sub-agent is
+//                                           consulted)
 //   similarity                            — average_affinity
 //   confidence_band                       — one of:
 //                                           "tier2-proposed",
+//                                           "tier2-and-math",
 //                                           "math-gated",
+//                                           "deterministic-math",
 //                                           "empty-partition",
 //                                           "rejected-by-metric",
 //                                           "rejected-by-gate"
@@ -187,16 +281,28 @@ export function appendDecision(wikiRoot, entry) {
 // Coercion: average_affinity may be undefined for Tier-2-proposed
 // clusters; we coerce to 0 so the finite-number validator does
 // not reject the entry.
+//
+// tier_used default: pre-deterministic-mode every NEST decision
+// touched Tier 2 via propose_structure or nest_decision, so the
+// default of 2 was correct. Under `--quality-mode deterministic`
+// Tier 2 is never consulted for math candidates; callers on that
+// path pass `tier_used: 0` so the audit trail correctly reflects
+// the fact that no sub-agent was invoked. The default remains 2
+// for backward compatibility with every existing call site.
 export function appendNestDecision(wikiRoot, entry) {
   const similarity =
     Number.isFinite(entry.similarity)
       ? entry.similarity
       : (Number.isFinite(entry.average_affinity) ? entry.average_affinity : 0);
+  const tier_used =
+    typeof entry.tier_used === "number" && Number.isInteger(entry.tier_used)
+      ? entry.tier_used
+      : 2;
   appendDecision(wikiRoot, {
     op_id: entry.op_id,
     operator: "NEST",
     sources: Array.isArray(entry.sources) ? entry.sources : [],
-    tier_used: 2,
+    tier_used,
     similarity,
     confidence_band: entry.confidence_band ?? null,
     decision: entry.decision,

package/scripts/lib/draft.mjs CHANGED Viewed

@@ -23,21 +23,52 @@
 // `needs_ai` flag on the returned draft tells the caller which entries
 // need AI review.
-// Fields we copy straight from the source frontmatter when the author
-// supplied them. Fields NOT in this list (id / type / depth_role /
-// parents / source) are always re-derived because their authoritative
-// source is the target-tree position, not the original source file.
-const AUTHORED_LEAF_FIELDS = [
+// Prototype-pollution deny-list. Mirrors POLLUTION_KEYS in
+// scripts/lib/frontmatter.mjs — the parser refuses these at parse
+// time, but the new pass-through path in draftLeafFrontmatter could
+// still surface them if a crafted candidate JSON (e.g. from
+// `scripts/cli.mjs draft-leaf` invoked with adversarial input)
+// shipped them via authored_frontmatter. Refusing here keeps the
+// invariant local to the assignment site.
+const POLLUTION_KEYS = new Set(["__proto__", "constructor", "prototype"]);
+// Fields whose authoritative source is the target-tree position (not
+// the original source file). These are ALWAYS re-derived during a
+// rebuild regardless of what the author wrote: `id` comes from the
+// filename / target slot, `type` defaults to "primary" (overlays must
+// be re-asserted explicitly via the rebuild's overlay path),
+// `depth_role` is always "leaf" for non-index leaves, and `source` is
+// recomputed from the build invocation.
+//
+// `parents` is NOT in this set — it's a hand-authored field (the
+// comment in the data object below describes the convention) and the
+// drafter pickAuthored()s it. Including it here would silently drop
+// authored parents and break the soft-DAG.
+//
+// EVERY OTHER authored field flows through verbatim. This is a
+// deny-list, not an allow-list (issue #26): consumers ship their own
+// schemas (e.g. skill-code-review's `dimensions`, `audit_surface`,
+// `languages`, `tools`) and a generic wiki framework should preserve
+// what the author wrote rather than enumerating per-consumer fields.
+const RESERVED_LEAF_FIELDS = new Set([
+  "id",
+  "type",
+  "depth_role",
+  "source",
+]);
+// Fields the drafter computes a heuristic baseline for and writes
+// explicitly in the canonical data object below. Authored values for
+// these win over the heuristic via pickAuthored(); they're listed here
+// only so the pass-through loop knows to skip them (they're already in
+// the data object — re-forwarding would be a no-op but with the wrong
+// authored-vs-heuristic precedence).
+const EXPLICITLY_HANDLED_LEAF_FIELDS = new Set([
   "focus",
   "covers",
   "tags",
-  "domains",
-  "aliases",
-  "activation",
-  "shared_covers",
-  "overlay_targets",
-  "links",
-];
+  "parents",
+]);
 export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
   const authored = candidate.authored_frontmatter || {};
@@ -71,15 +102,39 @@ export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
     },
   };
-  // Forward the remaining AUTHORED_LEAF_FIELDS verbatim. These have no
-  // heuristic analogue — when the author supplied them, we keep them;
-  // otherwise we omit the field entirely so the output stays compact.
+  // Forward EVERY authored field that isn't reserved (re-derived from
+  // target-tree position) or explicitly handled above (focus / covers
+  // / tags / parents, where authored-wins-over-drafted is enforced via
+  // pickAuthored). Issue #26: the previous allow-list dropped any
+  // consumer-specific v2 field (dimensions, audit_surface, languages,
+  // tools, …) authored at the source; the deny-list now preserves
+  // arbitrary author-shipped frontmatter VALUES (the downstream
+  // renderer applies canonical top-level key ordering and YAML
+  // formatting, so the rebuilt bytes need not match the source bytes).
   if (hasAuthored) {
-    for (const field of AUTHORED_LEAF_FIELDS) {
-      if (field === "focus" || field === "covers" || field === "tags") continue;
-      if (authored[field] !== undefined && authored[field] !== null) {
-        data[field] = authored[field];
-      }
+    for (const [field, value] of Object.entries(authored)) {
+      if (RESERVED_LEAF_FIELDS.has(field)) continue;
+      if (EXPLICITLY_HANDLED_LEAF_FIELDS.has(field)) continue;
+      // Refuse prototype-pollution keys before any assignment touches
+      // the prototype chain. Mirrors frontmatter.mjs's safeAssign.
+      if (POLLUTION_KEYS.has(field)) continue;
+      if (value === undefined || value === null) continue;
+      const sanitised = sanitiseAuthoredValue(value);
+      if (sanitised === undefined) continue;
+      // Empty arrays / empty strings DO get forwarded — distinguishing
+      // "author wrote []" from "author omitted" matters for some
+      // consumer schemas (e.g. an explicit empty file_globs[] means
+      // "this leaf opts out of glob-based activation"). Only the
+      // null/undefined case is treated as "author omitted".
+      // Use defineProperty (configurable, enumerable, writable) so the
+      // assignment never invokes a setter on Object.prototype if the
+      // POLLUTION_KEYS guard above is ever bypassed.
+      Object.defineProperty(data, field, {
+        value: sanitised,
+        configurable: true,
+        enumerable: true,
+        writable: true,
+      });
     }
   }
@@ -87,6 +142,58 @@ export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
   return { data, confidence, needs_ai: confidence < 0.6 };
 }
+// Sanitise a value pulled from authored frontmatter for assignment
+// into `data` (which is later passed to renderFrontmatter). The
+// renderer at scripts/lib/frontmatter.mjs handles plain objects,
+// arrays, and scalar primitives (string / number / boolean / null) but
+// not richer JS types — gray-matter / js-yaml can return:
+//   - Date (from YAML timestamps like `created_at: 2026-04-30`):
+//     converted to ISO string. Otherwise renderScalar(date) calls
+//     String(date) which produces the verbose JS Date toString form.
+//   - functions / symbols / class instances: rejected (return
+//     undefined so the pass-through loop skips the field).
+// Plain objects and arrays recurse so a Date nested inside an
+// authored object still gets normalised.
+function sanitiseAuthoredValue(value) {
+  if (value === null) return null;
+  if (value === undefined) return undefined;
+  const t = typeof value;
+  if (t === "string" || t === "number" || t === "boolean") return value;
+  if (t === "function" || t === "symbol" || t === "bigint") return undefined;
+  if (value instanceof Date) {
+    // YAML timestamps come back as Date; canonicalise to ISO string so
+    // a downstream rebuild round-trips the same string back into the
+    // YAML stream.
+    return value.toISOString();
+  }
+  if (Array.isArray(value)) {
+    return value.map(sanitiseAuthoredValue).filter((v) => v !== undefined);
+  }
+  if (t === "object") {
+    // Plain-object check: only recurse into objects whose prototype
+    // is Object.prototype or null. Class instances (URL, Buffer, …)
+    // are rejected — their `Object.entries` shape is rarely what a
+    // YAML frontmatter consumer wants.
+    const proto = Object.getPrototypeOf(value);
+    if (proto !== null && proto !== Object.prototype) return undefined;
+    // Use a null-prototype object as the accumulator: even if the
+    // POLLUTION_KEYS guard below were ever bypassed, an
+    // `out["__proto__"] = ...` assignment with a crafted key could not
+    // reach a prototype setter. (defineProperty would also work; null-proto is one allocation.)
+    const out = Object.create(null);
+    for (const [k, v] of Object.entries(value)) {
+      if (POLLUTION_KEYS.has(k)) continue;
+      const s = sanitiseAuthoredValue(v);
+      if (s === undefined) continue;
+      out[k] = s;
+    }
+    // Re-parent to Object.prototype before returning so downstream
+    // consumers that do `value.hasOwnProperty(...)` etc. keep working.
+    return Object.assign({}, out);
+  }
+  return undefined;
+}
 function pickAuthored(authoredVal, fallback) {
   if (authoredVal === undefined || authoredVal === null) return fallback;
   if (Array.isArray(authoredVal)) {

package/scripts/lib/frontmatter.mjs CHANGED Viewed

@@ -126,8 +126,9 @@ function parseMap(p, baseIndent) {
     const rest = text.slice(colon + 1).trim();
     p.advance();
-    if (rest === "|" || rest === ">") {
-      safeAssign(out, key, parseBlockScalar(p, baseIndent, rest === "|"), p, tok);
+    const blockHeader = blockScalarHeader(rest);
+    if (blockHeader) {
+      safeAssign(out, key, parseBlockScalar(p, baseIndent, blockHeader), p, tok);
       continue;
     }
     if (rest !== "") {
@@ -178,6 +179,12 @@ function parseSeq(p, baseIndent) {
       continue;
     }
+    const itemBlockHeader = blockScalarHeader(afterDash);
+    if (itemBlockHeader) {
+      out.push(parseBlockScalar(p, baseIndent, itemBlockHeader));
+      continue;
+    }
     const colon = findKeyColon(afterDash);
     if (colon === -1) {
       out.push(parseScalarInline(afterDash));
@@ -189,8 +196,9 @@ function parseSeq(p, baseIndent) {
     const firstRest = afterDash.slice(colon + 1).trim();
     const item = {};
-    if (firstRest === "|" || firstRest === ">") {
-      item[firstKey] = parseBlockScalar(p, baseIndent + 2, firstRest === "|");
+    const firstBlockHeader = blockScalarHeader(firstRest);
+    if (firstBlockHeader) {
+      item[firstKey] = parseBlockScalar(p, baseIndent + 2, firstBlockHeader);
     } else if (firstRest !== "") {
       item[firstKey] = parseScalarInline(firstRest);
     } else {
@@ -237,10 +245,13 @@ function parseSeq(p, baseIndent) {
         } else {
           item[subKey] = null;
         }
-      } else if (subRest === "|" || subRest === ">") {
-        item[subKey] = parseBlockScalar(p, baseIndent + 2, subRest === "|");
       } else {
-        item[subKey] = parseScalarInline(subRest);
+        const subBlockHeader = blockScalarHeader(subRest);
+        if (subBlockHeader) {
+          item[subKey] = parseBlockScalar(p, baseIndent + 2, subBlockHeader);
+        } else {
+          item[subKey] = parseScalarInline(subRest);
+        }
       }
     }
@@ -248,8 +259,29 @@ function parseSeq(p, baseIndent) {
   }
 }
-function parseBlockScalar(p, baseIndent, literal) {
+// Recognise a YAML block scalar header: `|` (literal) or `>` (folded),
+// each optionally carrying a chomping indicator (`+`/`-`) and/or an explicit
+// indentation indicator (a single digit 1-9), in either order (YAML 1.2
+// §8.1.1). Returns { literal, indent } (indent is 0 when no explicit
+// indentation indicator is given) or null. The chomping indicator affects
+// only trailing-newline nuances that do not change the value of the
+// single-line/wrapped scalars our frontmatter uses, so it is accepted for
+// tolerance but otherwise ignored. This is why a serializer-folded `id: >-`
+// (js-yaml's default line wrap) parses instead of tripping "unexpected indent".
+function blockScalarHeader(rest) {
+  const m = /^([|>])(?:(?:([+-])([1-9])?)|(?:([1-9])([+-])?))?$/.exec(rest);
+  return m
+    ? {
+        literal: m[1] === "|",
+        indent: Number(m[3] ?? m[4] ?? 0),
+      }
+    : null;
+}
+function parseBlockScalar(p, baseIndent, header) {
+  const { literal, indent } = header;
   const collected = [];
+  let contentIndent = indent > 0 ? baseIndent + indent : null;
   while (p.pos < p.lines.length) {
     const raw = p.lines[p.pos];
     if (raw.trim() === "") {
@@ -259,7 +291,11 @@ function parseBlockScalar(p, baseIndent, literal) {
     }
     const indent = raw.length - raw.trimStart().length;
     if (indent <= baseIndent) break;
-    collected.push(raw.slice(baseIndent + 2));
+    if (contentIndent == null) {
+      contentIndent = indent;
+    }
+    if (indent < contentIndent) break;
+    collected.push(raw.slice(contentIndent));
     p.pos++;
   }
   // Trim trailing empty lines

package/scripts/lib/heal.mjs CHANGED Viewed

@@ -52,6 +52,11 @@ export const FINDING_ACTIONS = Object.freeze({
   "DANGLING-LINK": "fix",
   "DANGLING-OVERLAY": "fix",
+  // X.11 root-leaf containment invariant — `fix` runs Phase 4.4.5
+  // root-containment to move outlier leaves into per-slug
+  // subcategories:
+  "LEAF-AT-WIKI-ROOT": "fix",
   // Size cap is a warning surface only:
   "SIZE-CAP": "none",
 });