@ctxr/skill-llm-wiki 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,9 +16,14 @@
16
16
  // queryable even after the op is reset.
17
17
 
18
18
  import {
19
+ appendFileSync,
20
+ closeSync,
19
21
  existsSync,
22
+ fstatSync,
20
23
  mkdirSync,
24
+ openSync,
21
25
  readFileSync,
26
+ readSync,
22
27
  renameSync,
23
28
  writeFileSync,
24
29
  } from "node:fs";
@@ -130,27 +135,112 @@ function emitEntry(entry) {
130
135
  return lines.join("\n");
131
136
  }
132
137
 
133
- // Append an entry atomically.
138
+ // Append an entry.
139
+ //
140
+ // Hot path: at large-corpus scale (596 leaves → 189k pairwise
141
+ // decisions observed) this is called once per decision. An earlier
142
+ // implementation read the whole file, concatenated the new entry,
143
+ // wrote to a temp, and renamed — O(file-size) per append. On a
144
+ // 45 MB decisions.yaml that's ~22 MB of avg-read per call × 189k
145
+ // calls ≈ 4 TB of I/O, which alone accounted for most of a 2h15m
146
+ // build's wall-clock time.
147
+ //
148
+ // Durability guarantees:
149
+ //
150
+ // - First call (file doesn't exist): writes header + first entry
151
+ // via temp+rename. The initial file materialises atomically —
152
+ // a crash during the first call leaves either no file or a
153
+ // well-formed single-entry file.
154
+ //
155
+ // - Subsequent calls: best-effort `appendFileSync`. Each call is normally
156
+ // a single `write(2)` syscall of the serialised entry. In the
157
+ // common case the kernel writes the full buffer atomically,
158
+ // but this is NOT a formal durability contract for regular
159
+ // files the way temp+rename is:
160
+ //
161
+ // * A crash mid-write can leave a torn trailing entry. On
162
+ // recovery the YAML parser will reject the truncated
163
+ // scalar; the audit log is recoverable by removing the
164
+ // last partial `- ...` block and re-running the op.
165
+ //
166
+ // * Node's `writeSync`/`appendFileSync` MAY split a large
167
+ // buffer into multiple `write(2)` calls. Typical entry
168
+ // blocks here are ~200 bytes — well under typical
169
+ // single-write thresholds — but there is no portable
170
+ // small-write atomicity guarantee for regular files
171
+ // (POSIX's PIPE_BUF atomicity applies to pipes/FIFOs, not
172
+ // disk files).
173
+ //
174
+ // * On Windows, `appendFileSync` has no equivalent of
175
+ // POSIX O_APPEND kernel serialisation under concurrent
176
+ // writers from multiple processes. This phase runs
177
+ // single-process though, so cross-process interleaving
178
+ // is not a concern in practice.
179
+ //
180
+ // The decision log is an audit trail, not a reproducibility
181
+ // artefact — lost tail bytes on a crash are annoying but
182
+ // recoverable, and the output tree's byte-reproducibility is
183
+ // independent of this file's exact contents. If stronger
184
+ // durability is needed for a specific use case, callers should
185
+ // batch-flush to a temp file and rename on phase boundaries.
186
+ //
187
+ // Cost per append: O(entry-size), not O(file-size). ~200 µs vs
188
+ // ~20 ms on a big log — a 100× speedup at scale.
134
189
  export function appendDecision(wikiRoot, entry) {
135
190
  validate(entry);
136
191
  const path = decisionLogPath(wikiRoot);
137
192
  mkdirSync(dirname(path), { recursive: true });
138
193
  const block = emitEntry(entry) + "\n";
139
- let payload;
140
194
  if (!existsSync(path)) {
141
- payload =
195
+ // First call: lay down the header atomically via temp+rename so
196
+ // a crash mid-creation doesn't leave an empty or orphan file.
197
+ const payload =
142
198
  "# skill-llm-wiki tiered-AI decision log (append-only)\n" +
143
199
  "version: 1\n" +
144
200
  "entries:\n" +
145
201
  block;
146
- } else {
147
- const existing = readFileSync(path, "utf8");
148
- const prefix = existing.endsWith("\n") ? existing : existing + "\n";
149
- payload = prefix + block;
202
+ const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
203
+ writeFileSync(tmp, payload, "utf8");
204
+ renameSync(tmp, path);
205
+ return;
206
+ }
207
+ // Subsequent appends: O(entry-size) via POSIX append. Peek at
208
+ // the last byte first: if the existing file doesn't end in a
209
+ // newline (manual edit, prior torn-tail truncation, or a
210
+ // creative crash), appending directly would concatenate the new
211
+ // entry onto the previous line and produce invalid YAML. Prefix
212
+ // a newline in that case — a leading blank line inside the
213
+ // entries[] list is harmless and parses fine.
214
+ const needsLeadingNewline = !endsWithNewline(path);
215
+ appendFileSync(path, needsLeadingNewline ? "\n" + block : block, "utf8");
216
+ }
217
+
218
+ // Check the last byte of the decision log without reading the
219
+ // whole file. Uses a small anchored read rather than `readFileSync`
220
+ // so the hot append path still pays O(1) regardless of log size.
221
+ // An unreadable file (ENOENT, EACCES, race window) is treated as
222
+ // "already newline-terminated" so the caller doesn't double up on
223
+ // leading newlines on a transient read error.
224
+ function endsWithNewline(path) {
225
+ let fd;
226
+ try {
227
+ fd = openSync(path, "r");
228
+ const { size } = fstatSync(fd);
229
+ if (size === 0) return true; // empty file has no trailing content to collide
230
+ const buf = Buffer.alloc(1);
231
+ readSync(fd, buf, 0, 1, size - 1);
232
+ return buf[0] === 0x0a; // 0x0a == '\n'
233
+ } catch {
234
+ return true;
235
+ } finally {
236
+ if (fd !== undefined) {
237
+ try {
238
+ closeSync(fd);
239
+ } catch {
240
+ /* best-effort */
241
+ }
242
+ }
150
243
  }
151
- const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
152
- writeFileSync(tmp, payload, "utf8");
153
- renameSync(tmp, path);
154
244
  }
155
245
 
156
246
  // Convenience helper for cluster-NEST outcomes. The convergence
@@ -164,14 +254,18 @@ export function appendDecision(wikiRoot, entry) {
164
254
  //
165
255
  // op_id, operator="NEST" — as-is
166
256
  // sources — leaf ids in the cluster
167
- // tier_used — 2 (every NEST decision
168
- // touches Tier 2 either
169
- // via propose_structure
170
- // or nest_decision)
257
+ // tier_used — caller-supplied (default 2
258
+ // for legacy Tier-2-touching
259
+ // NEST paths; 0 under
260
+ // `--quality-mode deterministic`
261
+ // since no sub-agent is
262
+ // consulted)
171
263
  // similarity — average_affinity
172
264
  // confidence_band — one of:
173
265
  // "tier2-proposed",
266
+ // "tier2-and-math",
174
267
  // "math-gated",
268
+ // "deterministic-math",
175
269
  // "empty-partition",
176
270
  // "rejected-by-metric",
177
271
  // "rejected-by-gate"
@@ -187,16 +281,28 @@ export function appendDecision(wikiRoot, entry) {
187
281
  // Coercion: average_affinity may be undefined for Tier-2-proposed
188
282
  // clusters; we coerce to 0 so the finite-number validator does
189
283
  // not reject the entry.
284
+ //
285
+ // tier_used default: pre-deterministic-mode every NEST decision
286
+ // touched Tier 2 via propose_structure or nest_decision, so the
287
+ // default of 2 was correct. Under `--quality-mode deterministic`
288
+ // Tier 2 is never consulted for math candidates; callers on that
289
+ // path pass `tier_used: 0` so the audit trail correctly reflects
290
+ // the fact that no sub-agent was invoked. The default remains 2
291
+ // for backward compatibility with every existing call site.
190
292
  export function appendNestDecision(wikiRoot, entry) {
191
293
  const similarity =
192
294
  Number.isFinite(entry.similarity)
193
295
  ? entry.similarity
194
296
  : (Number.isFinite(entry.average_affinity) ? entry.average_affinity : 0);
297
+ const tier_used =
298
+ typeof entry.tier_used === "number" && Number.isInteger(entry.tier_used)
299
+ ? entry.tier_used
300
+ : 2;
195
301
  appendDecision(wikiRoot, {
196
302
  op_id: entry.op_id,
197
303
  operator: "NEST",
198
304
  sources: Array.isArray(entry.sources) ? entry.sources : [],
199
- tier_used: 2,
305
+ tier_used,
200
306
  similarity,
201
307
  confidence_band: entry.confidence_band ?? null,
202
308
  decision: entry.decision,
@@ -23,21 +23,52 @@
23
23
  // `needs_ai` flag on the returned draft tells the caller which entries
24
24
  // need AI review.
25
25
 
26
- // Fields we copy straight from the source frontmatter when the author
27
- // supplied them. Fields NOT in this list (id / type / depth_role /
28
- // parents / source) are always re-derived because their authoritative
29
- // source is the target-tree position, not the original source file.
30
- const AUTHORED_LEAF_FIELDS = [
26
+ // Prototype-pollution deny-list. Mirrors POLLUTION_KEYS in
27
+ // scripts/lib/frontmatter.mjs — the parser refuses these at parse
28
+ // time, but the new pass-through path in draftLeafFrontmatter could
29
+ // still surface them if a crafted candidate JSON (e.g. from
30
+ // `scripts/cli.mjs draft-leaf` invoked with adversarial input)
31
+ // shipped them via authored_frontmatter. Refusing here keeps the
32
+ // invariant local to the assignment site.
33
+ const POLLUTION_KEYS = new Set(["__proto__", "constructor", "prototype"]);
34
+
35
+ // Fields whose authoritative source is the target-tree position (not
36
+ // the original source file). These are ALWAYS re-derived during a
37
+ // rebuild regardless of what the author wrote: `id` comes from the
38
+ // filename / target slot, `type` defaults to "primary" (overlays must
39
+ // be re-asserted explicitly via the rebuild's overlay path),
40
+ // `depth_role` is always "leaf" for non-index leaves, and `source` is
41
+ // recomputed from the build invocation.
42
+ //
43
+ // `parents` is NOT in this set — it's a hand-authored field (the
44
+ // comment in the data object below describes the convention) and the
45
+ // drafter pickAuthored()s it. Including it here would silently drop
46
+ // authored parents and break the soft-DAG.
47
+ //
48
+ // EVERY OTHER authored field flows through verbatim. This is a
49
+ // deny-list, not an allow-list (issue #26): consumers ship their own
50
+ // schemas (e.g. skill-code-review's `dimensions`, `audit_surface`,
51
+ // `languages`, `tools`) and a generic wiki framework should preserve
52
+ // what the author wrote rather than enumerating per-consumer fields.
53
+ const RESERVED_LEAF_FIELDS = new Set([
54
+ "id",
55
+ "type",
56
+ "depth_role",
57
+ "source",
58
+ ]);
59
+
60
+ // Fields the drafter computes a heuristic baseline for and writes
61
+ // explicitly in the canonical data object below. Authored values for
62
+ // these win over the heuristic via pickAuthored(); they're listed here
63
+ // only so the pass-through loop knows to skip them (they're already in
64
+ // the data object — re-forwarding would at best be a no-op and at worst
65
+ // overwrite them with the wrong authored-vs-heuristic precedence).
66
+ const EXPLICITLY_HANDLED_LEAF_FIELDS = new Set([
31
67
  "focus",
32
68
  "covers",
33
69
  "tags",
34
- "domains",
35
- "aliases",
36
- "activation",
37
- "shared_covers",
38
- "overlay_targets",
39
- "links",
40
- ];
70
+ "parents",
71
+ ]);
41
72
 
42
73
  export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
43
74
  const authored = candidate.authored_frontmatter || {};
@@ -71,15 +102,39 @@ export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
71
102
  },
72
103
  };
73
104
 
74
- // Forward the remaining AUTHORED_LEAF_FIELDS verbatim. These have no
75
- // heuristic analogue: when the author supplied them, we keep them;
76
- // otherwise we omit the field entirely so the output stays compact.
105
+ // Forward EVERY authored field that isn't reserved (re-derived from
106
+ // target-tree position) or explicitly handled above (focus / covers
107
+ // / tags / parents, where authored-wins-over-drafted is enforced via
108
+ // pickAuthored). Issue #26: the previous allow-list dropped any
109
+ // consumer-specific v2 field (dimensions, audit_surface, languages,
110
+ // tools, …) authored at the source; the deny-list now preserves
111
+ // arbitrary author-shipped frontmatter VALUES (the downstream
112
+ // renderer applies canonical top-level key ordering and YAML
113
+ // formatting, so the rebuilt bytes need not match the source bytes).
77
114
  if (hasAuthored) {
78
- for (const field of AUTHORED_LEAF_FIELDS) {
79
- if (field === "focus" || field === "covers" || field === "tags") continue;
80
- if (authored[field] !== undefined && authored[field] !== null) {
81
- data[field] = authored[field];
82
- }
115
+ for (const [field, value] of Object.entries(authored)) {
116
+ if (RESERVED_LEAF_FIELDS.has(field)) continue;
117
+ if (EXPLICITLY_HANDLED_LEAF_FIELDS.has(field)) continue;
118
+ // Refuse prototype-pollution keys before any assignment touches
119
+ // the prototype chain. Mirrors frontmatter.mjs's safeAssign.
120
+ if (POLLUTION_KEYS.has(field)) continue;
121
+ if (value === undefined || value === null) continue;
122
+ const sanitised = sanitiseAuthoredValue(value);
123
+ if (sanitised === undefined) continue;
124
+ // Empty arrays / empty strings DO get forwarded — distinguishing
125
+ // "author wrote []" from "author omitted" matters for some
126
+ // consumer schemas (e.g. an explicit empty file_globs[] means
127
+ // "this leaf opts out of glob-based activation"). Only the
128
+ // null/undefined case is treated as "author omitted".
129
+ // Use defineProperty (configurable, enumerable, writable) so the
130
+ // assignment never invokes a setter on Object.prototype if the
131
+ // POLLUTION_KEYS guard above is ever bypassed.
132
+ Object.defineProperty(data, field, {
133
+ value: sanitised,
134
+ configurable: true,
135
+ enumerable: true,
136
+ writable: true,
137
+ });
83
138
  }
84
139
  }
85
140
 
@@ -87,6 +142,58 @@ export function draftLeafFrontmatter(candidate, { categoryPath } = {}) {
87
142
  return { data, confidence, needs_ai: confidence < 0.6 };
88
143
  }
89
144
 
145
+ // Sanitise a value pulled from authored frontmatter for assignment
146
+ // into `data` (which is later passed to renderFrontmatter). The
147
+ // renderer at scripts/lib/frontmatter.mjs handles plain objects,
148
+ // arrays, and scalar primitives (string / number / boolean / null) but
149
+ // not richer JS types — gray-matter / js-yaml can return:
150
+ // - Date (from YAML timestamps like `created_at: 2026-04-30`):
151
+ // converted to ISO string. Otherwise renderScalar(date) calls
152
+ // String(date) which produces the verbose JS Date toString form.
153
+ // - functions / symbols / class instances: rejected (return
154
+ // undefined so the pass-through loop skips the field).
155
+ // Plain objects and arrays recurse so a Date nested inside an
156
+ // authored object still gets normalised.
157
+ function sanitiseAuthoredValue(value) {
158
+ if (value === null) return null;
159
+ if (value === undefined) return undefined;
160
+ const t = typeof value;
161
+ if (t === "string" || t === "number" || t === "boolean") return value;
162
+ if (t === "function" || t === "symbol" || t === "bigint") return undefined;
163
+ if (value instanceof Date) {
164
+ // YAML timestamps come back as Date; canonicalise to ISO string so
165
+ // a downstream rebuild round-trips the same string back into the
166
+ // YAML stream.
167
+ return value.toISOString();
168
+ }
169
+ if (Array.isArray(value)) {
170
+ return value.map(sanitiseAuthoredValue).filter((v) => v !== undefined);
171
+ }
172
+ if (t === "object") {
173
+ // Plain-object check: only recurse into objects whose prototype
174
+ // is Object.prototype or null. Class instances (URL, Buffer, …)
175
+ // are rejected — their `Object.entries` shape is rarely what a
176
+ // YAML frontmatter consumer wants.
177
+ const proto = Object.getPrototypeOf(value);
178
+ if (proto !== null && proto !== Object.prototype) return undefined;
179
+ // Use a null-prototype object as the accumulator so that, even if the
180
+ // POLLUTION_KEYS guard were bypassed, no setter on Object.prototype can
181
+ // be triggered by an `out["__proto__"] = ...` assignment with a crafted
182
+ // key. (defineProperty would also work; null-proto is one allocation.)
183
+ const out = Object.create(null);
184
+ for (const [k, v] of Object.entries(value)) {
185
+ if (POLLUTION_KEYS.has(k)) continue;
186
+ const s = sanitiseAuthoredValue(v);
187
+ if (s === undefined) continue;
188
+ out[k] = s;
189
+ }
190
+ // Re-parent to Object.prototype before returning so downstream
191
+ // consumers that do `value.hasOwnProperty(...)` etc. keep working.
192
+ return Object.assign({}, out);
193
+ }
194
+ return undefined;
195
+ }
196
+
90
197
  function pickAuthored(authoredVal, fallback) {
91
198
  if (authoredVal === undefined || authoredVal === null) return fallback;
92
199
  if (Array.isArray(authoredVal)) {
@@ -126,8 +126,9 @@ function parseMap(p, baseIndent) {
126
126
  const rest = text.slice(colon + 1).trim();
127
127
  p.advance();
128
128
 
129
- if (rest === "|" || rest === ">") {
130
- safeAssign(out, key, parseBlockScalar(p, baseIndent, rest === "|"), p, tok);
129
+ const blockHeader = blockScalarHeader(rest);
130
+ if (blockHeader) {
131
+ safeAssign(out, key, parseBlockScalar(p, baseIndent, blockHeader), p, tok);
131
132
  continue;
132
133
  }
133
134
  if (rest !== "") {
@@ -178,6 +179,12 @@ function parseSeq(p, baseIndent) {
178
179
  continue;
179
180
  }
180
181
 
182
+ const itemBlockHeader = blockScalarHeader(afterDash);
183
+ if (itemBlockHeader) {
184
+ out.push(parseBlockScalar(p, baseIndent, itemBlockHeader));
185
+ continue;
186
+ }
187
+
181
188
  const colon = findKeyColon(afterDash);
182
189
  if (colon === -1) {
183
190
  out.push(parseScalarInline(afterDash));
@@ -189,8 +196,9 @@ function parseSeq(p, baseIndent) {
189
196
  const firstRest = afterDash.slice(colon + 1).trim();
190
197
  const item = {};
191
198
 
192
- if (firstRest === "|" || firstRest === ">") {
193
- item[firstKey] = parseBlockScalar(p, baseIndent + 2, firstRest === "|");
199
+ const firstBlockHeader = blockScalarHeader(firstRest);
200
+ if (firstBlockHeader) {
201
+ item[firstKey] = parseBlockScalar(p, baseIndent + 2, firstBlockHeader);
194
202
  } else if (firstRest !== "") {
195
203
  item[firstKey] = parseScalarInline(firstRest);
196
204
  } else {
@@ -237,10 +245,13 @@ function parseSeq(p, baseIndent) {
237
245
  } else {
238
246
  item[subKey] = null;
239
247
  }
240
- } else if (subRest === "|" || subRest === ">") {
241
- item[subKey] = parseBlockScalar(p, baseIndent + 2, subRest === "|");
242
248
  } else {
243
- item[subKey] = parseScalarInline(subRest);
249
+ const subBlockHeader = blockScalarHeader(subRest);
250
+ if (subBlockHeader) {
251
+ item[subKey] = parseBlockScalar(p, baseIndent + 2, subBlockHeader);
252
+ } else {
253
+ item[subKey] = parseScalarInline(subRest);
254
+ }
244
255
  }
245
256
  }
246
257
 
@@ -248,8 +259,29 @@ function parseSeq(p, baseIndent) {
248
259
  }
249
260
  }
250
261
 
251
- function parseBlockScalar(p, baseIndent, literal) {
262
+ // Recognise a YAML block scalar header: `|` (literal) or `>` (folded),
263
+ // each optionally carrying a chomping indicator (`+`/`-`) and/or an explicit
264
+ // indentation indicator (a single digit 1-9), in either order (YAML 1.2
265
+ // §8.1.1). Returns { literal, indent } or null (indent is 0 when absent).
266
+ // The chomping indicator affects only trailing-newline handling, which does
267
+ // not change the value of the single-line/wrapped scalars our frontmatter
268
+ // uses, so we accept it but ignore it; the indent indicator is honoured. This
269
+ // is why a serializer-folded `id: >-` (js-yaml's default line wrap) parses
270
+ // instead of tripping "unexpected indent".
271
+ function blockScalarHeader(rest) {
272
+ const m = /^([|>])(?:(?:([+-])([1-9])?)|(?:([1-9])([+-])?))?$/.exec(rest);
273
+ return m
274
+ ? {
275
+ literal: m[1] === "|",
276
+ indent: Number(m[3] ?? m[4] ?? 0),
277
+ }
278
+ : null;
279
+ }
280
+
281
+ function parseBlockScalar(p, baseIndent, header) {
282
+ const { literal, indent } = header;
252
283
  const collected = [];
284
+ let contentIndent = indent > 0 ? baseIndent + indent : null;
253
285
  while (p.pos < p.lines.length) {
254
286
  const raw = p.lines[p.pos];
255
287
  if (raw.trim() === "") {
@@ -259,7 +291,11 @@ function parseBlockScalar(p, baseIndent, literal) {
259
291
  }
260
292
  const indent = raw.length - raw.trimStart().length;
261
293
  if (indent <= baseIndent) break;
262
- collected.push(raw.slice(baseIndent + 2));
294
+ if (contentIndent == null) {
295
+ contentIndent = indent;
296
+ }
297
+ if (indent < contentIndent) break;
298
+ collected.push(raw.slice(contentIndent));
263
299
  p.pos++;
264
300
  }
265
301
  // Trim trailing empty lines
@@ -52,6 +52,11 @@ export const FINDING_ACTIONS = Object.freeze({
52
52
  "DANGLING-LINK": "fix",
53
53
  "DANGLING-OVERLAY": "fix",
54
54
 
55
+ // X.11 root-leaf containment invariant — `fix` runs Phase 4.4.5
56
+ // root-containment to move outlier leaves into per-slug
57
+ // subcategories:
58
+ "LEAF-AT-WIKI-ROOT": "fix",
59
+
55
60
  // Size cap is a warning surface only:
56
61
  "SIZE-CAP": "none",
57
62
  });