@ctxr/skill-llm-wiki 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CHANGELOG.md +134 -0
  2. package/LICENSE +21 -0
  3. package/README.md +484 -0
  4. package/SKILL.md +252 -0
  5. package/guide/basics/concepts.md +74 -0
  6. package/guide/basics/index.md +45 -0
  7. package/guide/basics/schema.md +140 -0
  8. package/guide/cli.md +256 -0
  9. package/guide/correctness/index.md +45 -0
  10. package/guide/correctness/invariants.md +89 -0
  11. package/guide/correctness/safety.md +96 -0
  12. package/guide/history/diff.md +110 -0
  13. package/guide/history/hidden-git.md +130 -0
  14. package/guide/history/index.md +52 -0
  15. package/guide/history/remote-sync.md +113 -0
  16. package/guide/index.md +134 -0
  17. package/guide/isolation/coexistence.md +134 -0
  18. package/guide/isolation/index.md +44 -0
  19. package/guide/isolation/scale.md +251 -0
  20. package/guide/layout/in-place-mode.md +97 -0
  21. package/guide/layout/index.md +53 -0
  22. package/guide/layout/layout-contract.md +131 -0
  23. package/guide/layout/layout-modes.md +115 -0
  24. package/guide/operations/index.md +76 -0
  25. package/guide/operations/ingest/build.md +75 -0
  26. package/guide/operations/ingest/extend.md +61 -0
  27. package/guide/operations/ingest/index.md +54 -0
  28. package/guide/operations/ingest/join.md +65 -0
  29. package/guide/operations/maintain/fix.md +66 -0
  30. package/guide/operations/maintain/index.md +47 -0
  31. package/guide/operations/maintain/rebuild.md +86 -0
  32. package/guide/operations/validate.md +48 -0
  33. package/guide/substrate/index.md +47 -0
  34. package/guide/substrate/operators.md +96 -0
  35. package/guide/substrate/tiered-ai.md +363 -0
  36. package/guide/ux/index.md +44 -0
  37. package/guide/ux/preflight.md +150 -0
  38. package/guide/ux/user-intent.md +135 -0
  39. package/package.json +55 -0
  40. package/scripts/cli.mjs +893 -0
  41. package/scripts/commands/remote.mjs +93 -0
  42. package/scripts/commands/review.mjs +253 -0
  43. package/scripts/commands/sync.mjs +84 -0
  44. package/scripts/lib/chunk.mjs +421 -0
  45. package/scripts/lib/cluster-detect.mjs +516 -0
  46. package/scripts/lib/decision-log.mjs +343 -0
  47. package/scripts/lib/draft.mjs +158 -0
  48. package/scripts/lib/embeddings.mjs +366 -0
  49. package/scripts/lib/frontmatter.mjs +497 -0
  50. package/scripts/lib/git-commands.mjs +155 -0
  51. package/scripts/lib/git.mjs +486 -0
  52. package/scripts/lib/gitignore.mjs +62 -0
  53. package/scripts/lib/history.mjs +331 -0
  54. package/scripts/lib/indices.mjs +510 -0
  55. package/scripts/lib/ingest.mjs +258 -0
  56. package/scripts/lib/intent.mjs +713 -0
  57. package/scripts/lib/interactive.mjs +99 -0
  58. package/scripts/lib/migrate.mjs +126 -0
  59. package/scripts/lib/nest-applier.mjs +260 -0
  60. package/scripts/lib/operators.mjs +1365 -0
  61. package/scripts/lib/orchestrator.mjs +718 -0
  62. package/scripts/lib/paths.mjs +197 -0
  63. package/scripts/lib/preflight.mjs +213 -0
  64. package/scripts/lib/provenance.mjs +672 -0
  65. package/scripts/lib/quality-metric.mjs +269 -0
  66. package/scripts/lib/query-fixture.mjs +71 -0
  67. package/scripts/lib/rollback.mjs +95 -0
  68. package/scripts/lib/shape-check.mjs +172 -0
  69. package/scripts/lib/similarity-cache.mjs +126 -0
  70. package/scripts/lib/similarity.mjs +230 -0
  71. package/scripts/lib/snapshot.mjs +54 -0
  72. package/scripts/lib/source-frontmatter.mjs +85 -0
  73. package/scripts/lib/tier2-protocol.mjs +470 -0
  74. package/scripts/lib/tiered.mjs +453 -0
  75. package/scripts/lib/validate.mjs +362 -0
@@ -0,0 +1,421 @@
1
+ // chunk.mjs — scale-safe chunked iteration over a wiki's entries.
2
+ //
3
+ // The orchestrator's operator-convergence, classify, and plan-review
4
+ // phases (methodology sections 3.5, 3.6, 8.5) must run frontmatter-only
5
+ // at detection time. Loading every leaf body into memory defeats the
6
+ // skill's ability to handle multi-megabyte corpora.
7
+ //
8
+ // This module is the single chokepoint for "walk the wiki, give me
9
+ // every entry". It yields frontmatter-parsed records with a lazy
10
+ // `loadBody()` thunk the caller must call explicitly to read the
11
+ // body content. If the chokepoint is respected, the orchestrator's
12
+ // working set stays bounded by the largest single entry regardless of
13
+ // corpus size.
14
+ //
15
+ // Two scale guarantees this file enforces:
16
+ //
17
+ // 1. Frontmatter reads are BOUNDED per entry via a streaming fs
18
+ // reader that stops at the closing `---` fence. We never pull a
19
+ // 10 MB body into memory just to parse a 500-byte frontmatter.
20
+ //
21
+ // 2. `loadBody()` re-opens the file on demand and returns a string.
22
+ // The iterator does NOT cache it. Callers that hold the returned
23
+ // string retain its bytes; callers that let it go out of scope
24
+ // release them. Module-level metrics (inFlightBodies /
25
+ // peakInFlightBodies) let scale tests prove the discipline is
26
+ // being followed.
27
+
28
+ import {
29
+ closeSync,
30
+ openSync,
31
+ readSync,
32
+ readdirSync,
33
+ readFileSync,
34
+ } from "node:fs";
35
+ import { basename, join, relative } from "node:path";
36
+ import { parseFrontmatter } from "./frontmatter.mjs";
37
+
38
+ // Max bytes we ever read while looking for a frontmatter closing
39
+ // fence. Real frontmatters are typically <4 KB; 256 KB is a generous
40
+ // pathology ceiling. A file that somehow needs more is either
41
+ // corrupted or adversarial — the chunk API refuses to play AND
42
+ // `listChildren` tolerates this refusal by skipping the entry, so
43
+ // index generation never blocks on an adversarial file.
44
+ //
45
+ // The ceiling is deliberately much larger than typical frontmatter
46
+ // (~64×) so that hand-authored corner cases (giant `activation.file_globs`
47
+ // arrays, huge shared_covers lists at a fat parent index) do not hit
48
+ // it in practice.
49
// Upper bound on the bytes buffered while looking for the closing
// frontmatter fence.
const MAX_FRONTMATTER_BYTES = 256 * 1024;
// Size of the scratch buffer used for each positional read.
const READ_CHUNK_SIZE = 4 * 1024;
51
+
52
+ // ── Body-load discipline metrics ─────────────────────────────────────
53
+ //
54
+ // These counters track *caller discipline*, NOT actual memory
55
+ // residency. V8 has no cheap hook for measuring string residency from
56
+ // inside JavaScript, and the module cannot observe when a caller's
57
+ // reference falls out of scope. What these counters do give us:
58
+ //
59
+ // - `totalBodyLoads` — how many times `loadBody()` was called.
60
+ // Scale tests use this to prove a frontmatter-only walk never
61
+ // invoked the thunk (should be 0).
62
+ // - `inFlightBodies` — how many `loadBody()` calls have not yet
63
+ // been matched by a `releaseBody()` call. Callers that follow
64
+ // the streaming-consumer pattern (load → process → release →
65
+ // next) keep this at 0 or 1.
66
+ // - `peakInFlightBodies` — the maximum value `inFlightBodies`
67
+ // reached since the last `resetBodyMetrics()`. A streaming
68
+ // consumer's peak is 1; a naive consumer that holds every body
69
+ // sees peak == N.
70
+ //
71
+ // The metric does not prove memory residency. A caller can call
72
+ // `releaseBody()` and still hold a reference to the body string; V8
73
+ // keeps the string alive regardless of the counter. A caller can
74
+ // forget `releaseBody()` and drop the reference; the counter never
75
+ // decrements but GC still reclaims the string. The metric is a
76
+ // discipline tracker — it catches bugs where consumers accidentally
77
+ // accumulate bodies in an array, it does not measure heap pressure.
78
+ //
79
+ // Counters are PROCESS-GLOBAL. Tests that care about the value MUST
80
+ // call `resetBodyMetrics()` at the start of their scenario, and MUST
81
+ // not run in parallel with other metric-sensitive tests.
82
// loadBody() calls that have not yet been balanced by releaseBody().
let _inFlightBodies = 0;
// Highest value _inFlightBodies has reached since the last reset.
let _peakInFlightBodies = 0;
// Every loadBody() call since the last reset.
let _totalBodyLoads = 0;
// Companion counter for the streaming frontmatter reader: bumped once
// per `readFrontmatterStreaming` call. Scale tests that must prove
// `listChildren` went through `readFrontmatterStreaming` (and NOT a
// full-file readFileSync) read it to assert the rewire is still
// intact. `resetBodyMetrics()` clears it along with the body counters
// so one test cannot contaminate the next.
let _totalFrontmatterReads = 0;
91
+
92
// Puts every module-level counter (in-flight, peak, total body loads,
// total frontmatter reads) back to zero. Intended to be called at the
// start of a metric-sensitive scenario.
export function resetBodyMetrics() {
  _inFlightBodies = _peakInFlightBodies = 0;
  _totalBodyLoads = _totalFrontmatterReads = 0;
}
98
+
99
// Returns a fresh plain-object snapshot of the four module-level
// counters. The snapshot is a copy: later counter changes do not
// alter an object that was already returned.
export function getBodyMetrics() {
  const inFlightBodies = _inFlightBodies;
  const peakInFlightBodies = _peakInFlightBodies;
  const totalBodyLoads = _totalBodyLoads;
  const totalFrontmatterReads = _totalFrontmatterReads;
  return {
    inFlightBodies,
    peakInFlightBodies,
    totalBodyLoads,
    totalFrontmatterReads,
  };
}
107
+
108
+ // Called by `loadBody` thunks immediately before returning the body
109
+ // string. Increments the in-flight counter and updates the peak.
110
// Bookkeeping hook for a body load: counts the load, bumps the
// in-flight counter, and raises the recorded peak when the new
// in-flight value exceeds it.
function _markBodyLoadStart() {
  _totalBodyLoads += 1;
  _inFlightBodies += 1;
  _peakInFlightBodies = Math.max(_peakInFlightBodies, _inFlightBodies);
}
117
+
118
+ // Called by consumers when they have finished with a body string.
119
+ // Decrements `inFlightBodies`. Callers MUST call this after every
120
+ // matching `loadBody()` if they want the discipline metric to stay
121
+ // meaningful. An unbalanced release (more releases than loads)
122
+ // throws loudly so the bug surfaces instead of silently muddying
123
+ // the counter.
124
// Signals that a consumer is done with one previously loaded body by
// decrementing the in-flight counter.
//
// Throws an Error when the counter is already zero, i.e. there is no
// outstanding load to balance, so that unbalanced releases surface
// immediately instead of skewing the metric.
export function releaseBody() {
  if (_inFlightBodies !== 0) {
    _inFlightBodies -= 1;
    return;
  }
  const message =
    "chunk.mjs: releaseBody called without a matching loadBody — " +
    "consumer discipline bug";
  throw new Error(message);
}
133
+
134
+ // ── Streaming frontmatter reader ─────────────────────────────────────
135
+ //
136
+ // Reads bytes from the file in 4 KB chunks until we find the closing
137
+ // frontmatter fence OR exceed MAX_FRONTMATTER_BYTES. The reader
138
+ // operates purely on Buffers — it never decodes partial chunks as
139
+ // UTF-8, because a multi-byte codepoint split across chunk boundaries
140
+ // would emit replacement characters and corrupt both the decoded
141
+ // frontmatter and the byte offset used by `loadBody`. Only the final
142
+ // full frontmatter buffer is decoded once at the end.
143
+ //
144
+ // `bodyOffset` is authoritatively a BYTE offset (not a code-unit
145
+ // index) so `loadBody` can slice the raw file Buffer before decoding.
146
+ // Returns `null` for files that do not begin with a frontmatter
147
+ // fence; throws for files whose frontmatter has no closing fence
148
+ // within the pathology budget.
149
+ //
150
+ // Opening and closing fence line-endings must agree. A file that
151
+ // opens `---\n` must close `\n---\n`; a file that opens `---\r\n`
152
+ // must close `\r\n---\r\n`. Mixed line-endings are rejected loudly.
153
// Byte patterns for the opening and closing fences in each supported
// line-ending dialect. They are kept as Buffers so that matching is
// done on raw bytes.
const OPEN_LF = Buffer.from("---\n", "utf8");
const OPEN_CRLF = Buffer.from("---\r\n", "utf8");
const CLOSE_LF = Buffer.from("\n---\n", "utf8");
const CLOSE_CRLF = Buffer.from("\r\n---\r\n", "utf8");
157
+
158
// Reads only as much of `absPath` as is needed to capture its
// frontmatter block, working on raw bytes until the block is complete.
//
// Parameters:
//   absPath  path of the file to inspect (passed straight to openSync).
//
// Returns `null` when the file is empty, too short to hold an opening
// fence, or does not start with `---\n` / `---\r\n`. Otherwise returns
//   {
//     frontmatterText  the fenced block decoded as UTF-8, with CRLF
//                      normalised to LF for CRLF files,
//     bodyOffset       BYTE offset of the first byte after the closing
//                      fence in the original file,
//     lineEnding       "lf" | "crlf",
//   }
//
// Throws an Error when an opening fence is present but no matching
// closing fence shows up in the bytes read (the whole file, or the
// first MAX_FRONTMATTER_BYTES of it). Errors raised by openSync /
// readSync propagate unchanged; the descriptor is closed on every path.
//
// Every call increments `_totalFrontmatterReads`.
export function readFrontmatterStreaming(absPath) {
  _totalFrontmatterReads++;
  const fd = openSync(absPath, "r");
  try {
    const chunk = Buffer.alloc(READ_CHUNK_SIZE);
    let collected = Buffer.alloc(0);
    let pos = 0;
    // Dialect and matching closing fence are fixed once the opening
    // fence has been confirmed, so the two styles are never mixed.
    let style = null; // "lf" | "crlf"
    let closeFence = null;
    // First index at which a closing fence may still begin. Advanced
    // after every unsuccessful scan so bytes are not rescanned.
    let searchFrom = 0;

    while (collected.length < MAX_FRONTMATTER_BYTES) {
      const n = readSync(fd, chunk, 0, chunk.length, pos);
      if (n === 0) break;
      // Buffer.concat copies, so reusing `chunk` for the next read is
      // safe. `subarray` is used instead of the deprecated `slice`.
      collected = Buffer.concat([collected, chunk.subarray(0, n)]);
      pos += n;

      if (style === null) {
        // Need at least the 4 bytes of `---\n` before deciding anything.
        if (collected.length < OPEN_LF.length) continue;
        if (collected.subarray(0, OPEN_LF.length).equals(OPEN_LF)) {
          style = "lf";
          closeFence = CLOSE_LF;
        } else if (collected.length >= OPEN_CRLF.length) {
          // Enough bytes to settle the CRLF question for good: either
          // the file opens with `---\r\n` or it is plain markdown.
          if (!collected.subarray(0, OPEN_CRLF.length).equals(OPEN_CRLF)) {
            return null;
          }
          style = "crlf";
          closeFence = CLOSE_CRLF;
        } else if (collected.equals(OPEN_CRLF.subarray(0, collected.length))) {
          // Exactly `---\r` buffered: one more byte decides whether
          // this is a CRLF fence. Only this precise prefix keeps reading.
          continue;
        } else {
          // Not a frontmatter opening; plain markdown file.
          return null;
        }
        // The line terminator that ends the opening fence may double as
        // the one that starts the closing fence (empty frontmatter:
        // `---\n---\n`), so scanning starts at that terminator — right
        // after the three dashes — not after the whole opening fence.
        searchFrom =
          style === "crlf" ? OPEN_CRLF.length - 2 : OPEN_LF.length - 1;
      }

      const idx = collected.indexOf(closeFence, searchFrom);
      if (idx !== -1) {
        const end = idx + closeFence.length;
        // Decode once, on the complete block, so a multi-byte codepoint
        // split across read chunks can never be mangled.
        let text = collected.subarray(0, end).toString("utf8");
        if (style === "crlf") {
          // Hand LF-only text to downstream parsers; `bodyOffset`
          // below still refers to the untouched bytes on disk.
          text = text.replace(/\r\n/g, "\n");
        }
        return {
          frontmatterText: text,
          bodyOffset: end,
          lineEnding: style,
        };
      }
      // A fence that straddles the next chunk boundary can begin no
      // earlier than (closeFence.length - 1) bytes before the current end.
      searchFrom = Math.max(
        searchFrom,
        collected.length - closeFence.length + 1,
      );
    }

    // Empty file, or too short to confirm an opening fence: treat as a
    // plain file rather than reporting a misleading "too big" error.
    if (collected.length === 0 || style === null) {
      return null;
    }
    throw new Error(
      `chunk.mjs: frontmatter in ${absPath} has no closing --- fence ` +
        `within ${MAX_FRONTMATTER_BYTES} bytes`,
    );
  } finally {
    closeSync(fd);
  }
}
248
+
249
+ // ── Entry path collection ────────────────────────────────────────────
250
+ //
251
+ // Walks the wiki tree starting at `wikiRoot`, returning every `.md`
252
+ // file (including `index.md`) sorted by their absolute path. The walk
253
+ // is iterative with an explicit stack so we do not blow the call
254
+ // stack on deeply-nested corpora.
255
+ //
256
+ // Dot files and dot directories are skipped entirely. This is a
257
+ // blanket rule covering every metadata surface the skill owns
258
+ // (`.llmwiki/`, `.work/`, `.shape/`) plus any user dotfile the
259
+ // caller might reasonably not want yielded as an entry (`.git/`,
260
+ // `.github/`, `.DS_Store`, etc). There is no allow-list: if you
261
+ // want a dotfile indexed, rename it.
262
+ //
263
+ // The caller receives `onDirError(err, dir)` notifications for any
264
+ // directory that fails to enumerate (permission denied, etc).
265
+ // Default: silently swallow, because skipping an unreadable subdir
266
+ // is the safer behaviour for a walk over user-supplied paths. Tests
267
+ // use the callback to assert errors are raised for known-bad
268
+ // fixtures.
269
// Gathers the path of every `.md` file below `wikiRoot` and returns
// them as one array sorted with the default string ordering.
//
// Parameters:
//   wikiRoot         directory at which the walk starts.
//   opts.onDirError  optional `(err, dir)` callback, invoked for each
//                    directory that cannot be listed. When absent the
//                    failure is skipped without notification.
//
// Names beginning with "." are ignored, for files and directories
// alike. The walk uses an explicit work list instead of recursion.
export function collectEntryPaths(wikiRoot, opts = {}) {
  const { onDirError: reportDirError = null } = opts;
  const markdownPaths = [];
  const pendingDirs = [wikiRoot];

  while (pendingDirs.length !== 0) {
    const currentDir = pendingDirs.pop();

    let dirents;
    try {
      dirents = readdirSync(currentDir, { withFileTypes: true });
    } catch (err) {
      // An unreadable directory is skipped; the caller is told only if
      // it asked to be.
      if (reportDirError) reportDirError(err, currentDir);
      continue;
    }

    for (const dirent of dirents) {
      const { name } = dirent;
      if (name.startsWith(".")) continue;

      const childPath = join(currentDir, name);
      if (dirent.isDirectory()) {
        pendingDirs.push(childPath);
      } else if (dirent.isFile() && name.endsWith(".md")) {
        markdownPaths.push(childPath);
      }
    }
  }

  return markdownPaths.sort();
}
297
+
298
+ // ── The iterator ─────────────────────────────────────────────────────
299
+ //
300
+ // Synchronous generator that yields one entry at a time in
301
+ // deterministic sorted-path order. Callers may drive it with either
302
+ // `for (const entry of iterEntries(...))` or
303
+ // `for await (const entry of iterEntries(...))` — for-await accepts
304
+ // sync iterables transparently, so existing async callers keep
305
+ // working.
306
+ //
307
+ // Each yielded entry carries:
308
+ //
309
+ // path absolute filesystem path to the .md file
310
+ // relPath path relative to wikiRoot (POSIX separators)
311
+ // data parsed frontmatter (same shape as parseFrontmatter.data)
312
+ // type "index" | "leaf" (derived from filename basename)
313
+ // loadBody async () => string, reads and returns the body
314
+ //
315
+ // `loadBody()` is declared `async` on purpose: Phase 6's tiered AI
316
+ // may eventually back it with a remote fetch for partial-retrieval
317
+ // scenarios, and promoting the signature later is a breaking change.
318
+ // Phase 5's implementation is synchronous under the hood.
319
+ //
320
+ // `loadBody()` is NOT cached by the iterator: calling it twice reads
321
+ // the file twice. A caller that wants to hold a body does so in a
322
+ // local variable and knows exactly when it is retained.
323
+ //
324
+ // `opts.includeIndexFiles` (default: true) — set false when the
325
+ // caller only cares about leaves (operator-convergence detection).
326
+ //
327
+ // `opts.onMalformed` — callback invoked when a file has a `---`
328
+ // opening fence but reads or parses pathologically (missing closing
329
+ // fence, YAML parse error, non-object frontmatter). The callback's
330
+ // return value is ignored; to ABORT iteration on first bad file it
331
+ // must throw. The default behaviour is exactly that: the default
332
+ // callback throws, which propagates out of the generator and
333
+ // terminates the walk. To COLLECT errors and keep walking, pass an
334
+ // explicit non-throwing callback — the iterator then skips the bad
335
+ // file and continues. These are two distinct modes; choose which
336
+ // one you want by choosing whether the callback throws.
337
+ //
338
+ // `opts.onDirError` (default: null) — forwarded to
339
+ // `collectEntryPaths` for directory-level I/O errors.
340
// Synchronous generator over the frontmatter-bearing `.md` files under
// `wikiRoot`, in the order returned by `collectEntryPaths`.
//
// Parameters:
//   wikiRoot                directory to walk.
//   opts.includeIndexFiles  (default true) when false, files named
//                           `index.md` are skipped before being read.
//   opts.onMalformed        `(err) => void`, called when reading or
//                           parsing a file's frontmatter fails or the
//                           parsed data is not an object. The default
//                           rethrows, which ends the walk; a callback
//                           that returns makes the iterator skip the
//                           file and continue.
//   opts.onDirError         (default null) forwarded to
//                           `collectEntryPaths`.
//
// Yields `{ path, relPath, data, type, loadBody }`:
//   path      the path as returned by `collectEntryPaths`
//   relPath   `path` relative to `wikiRoot`, using "/" separators
//   data      `parseFrontmatter(...).data`
//   type      "index" when the basename is `index.md`, else "leaf"
//   loadBody  async thunk resolving to the text after the frontmatter;
//             nothing is cached, every call re-reads the file
//
// Files for which `readFrontmatterStreaming` returns null (no
// frontmatter) are skipped silently.
export function* iterEntries(wikiRoot, opts = {}) {
  const {
    includeIndexFiles = true,
    onMalformed = (err) => {
      throw err;
    },
    onDirError = null,
  } = opts;

  // Backslash is the path separator only on Windows. Elsewhere it is an
  // ordinary filename character, and rewriting it would make `relPath`
  // name a different file than `path`.
  const separatorIsBackslash = process.platform === "win32";

  const paths = collectEntryPaths(wikiRoot, { onDirError });
  for (const absPath of paths) {
    const nativeRel = relative(wikiRoot, absPath);
    const rel = separatorIsBackslash
      ? nativeRel.replace(/\\/g, "/")
      : nativeRel;
    const isIndex = basename(absPath) === "index.md";
    if (isIndex && !includeIndexFiles) continue;

    let captured;
    try {
      captured = readFrontmatterStreaming(absPath);
    } catch (err) {
      // If the callback returns instead of throwing, move on.
      onMalformed(err);
      continue;
    }
    if (captured === null) {
      // No frontmatter: incidental markdown, not a wiki entry.
      continue;
    }

    let parsed;
    try {
      parsed = parseFrontmatter(captured.frontmatterText, absPath);
    } catch (err) {
      onMalformed(err);
      continue;
    }
    if (!parsed.data || typeof parsed.data !== "object") {
      onMalformed(
        new Error(
          `${absPath}: frontmatter parsed to non-object ${typeof parsed.data}`,
        ),
      );
      continue;
    }

    // `bodyOffset` is a byte offset, so the raw Buffer is cut BEFORE it
    // is decoded; multi-byte characters in the frontmatter therefore
    // cannot shift the body boundary. The thunk closes over `absPath`
    // and `bodyOffset`, so repeated calls behave identically.
    const bodyOffset = captured.bodyOffset;
    const loadBody = async () => {
      // `subarray` replaces the deprecated Buffer#slice (same view
      // semantics, no copy).
      const body = readFileSync(absPath).subarray(bodyOffset).toString("utf8");
      _markBodyLoadStart();
      return body;
    };

    yield {
      path: absPath,
      relPath: rel,
      data: parsed.data,
      type: isIndex ? "index" : "leaf",
      loadBody,
    };
  }
}
409
+
410
+ // Convenience wrapper for callers that only want frontmatter (the
411
+ // common case for operator-convergence + classify). Returns an array
412
+ // of { path, relPath, data, type, loadBody } entries with the
413
+ // loadBody thunk included but never called by this helper.
414
// Drains `iterEntries(wikiRoot, opts)` into an array and returns it.
// Each element is the entry object exactly as yielded, including its
// `loadBody` thunk, which this helper never invokes. `opts` is passed
// through to `iterEntries` unchanged.
export function collectFrontmatterOnly(wikiRoot, opts = {}) {
  return Array.from(iterEntries(wikiRoot, opts));
}
421
+