@ctxr/skill-llm-wiki 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +134 -0
- package/LICENSE +21 -0
- package/README.md +484 -0
- package/SKILL.md +252 -0
- package/guide/basics/concepts.md +74 -0
- package/guide/basics/index.md +45 -0
- package/guide/basics/schema.md +140 -0
- package/guide/cli.md +256 -0
- package/guide/correctness/index.md +45 -0
- package/guide/correctness/invariants.md +89 -0
- package/guide/correctness/safety.md +96 -0
- package/guide/history/diff.md +110 -0
- package/guide/history/hidden-git.md +130 -0
- package/guide/history/index.md +52 -0
- package/guide/history/remote-sync.md +113 -0
- package/guide/index.md +134 -0
- package/guide/isolation/coexistence.md +134 -0
- package/guide/isolation/index.md +44 -0
- package/guide/isolation/scale.md +251 -0
- package/guide/layout/in-place-mode.md +97 -0
- package/guide/layout/index.md +53 -0
- package/guide/layout/layout-contract.md +131 -0
- package/guide/layout/layout-modes.md +115 -0
- package/guide/operations/index.md +76 -0
- package/guide/operations/ingest/build.md +75 -0
- package/guide/operations/ingest/extend.md +61 -0
- package/guide/operations/ingest/index.md +54 -0
- package/guide/operations/ingest/join.md +65 -0
- package/guide/operations/maintain/fix.md +66 -0
- package/guide/operations/maintain/index.md +47 -0
- package/guide/operations/maintain/rebuild.md +86 -0
- package/guide/operations/validate.md +48 -0
- package/guide/substrate/index.md +47 -0
- package/guide/substrate/operators.md +96 -0
- package/guide/substrate/tiered-ai.md +363 -0
- package/guide/ux/index.md +44 -0
- package/guide/ux/preflight.md +150 -0
- package/guide/ux/user-intent.md +135 -0
- package/package.json +55 -0
- package/scripts/cli.mjs +893 -0
- package/scripts/commands/remote.mjs +93 -0
- package/scripts/commands/review.mjs +253 -0
- package/scripts/commands/sync.mjs +84 -0
- package/scripts/lib/chunk.mjs +421 -0
- package/scripts/lib/cluster-detect.mjs +516 -0
- package/scripts/lib/decision-log.mjs +343 -0
- package/scripts/lib/draft.mjs +158 -0
- package/scripts/lib/embeddings.mjs +366 -0
- package/scripts/lib/frontmatter.mjs +497 -0
- package/scripts/lib/git-commands.mjs +155 -0
- package/scripts/lib/git.mjs +486 -0
- package/scripts/lib/gitignore.mjs +62 -0
- package/scripts/lib/history.mjs +331 -0
- package/scripts/lib/indices.mjs +510 -0
- package/scripts/lib/ingest.mjs +258 -0
- package/scripts/lib/intent.mjs +713 -0
- package/scripts/lib/interactive.mjs +99 -0
- package/scripts/lib/migrate.mjs +126 -0
- package/scripts/lib/nest-applier.mjs +260 -0
- package/scripts/lib/operators.mjs +1365 -0
- package/scripts/lib/orchestrator.mjs +718 -0
- package/scripts/lib/paths.mjs +197 -0
- package/scripts/lib/preflight.mjs +213 -0
- package/scripts/lib/provenance.mjs +672 -0
- package/scripts/lib/quality-metric.mjs +269 -0
- package/scripts/lib/query-fixture.mjs +71 -0
- package/scripts/lib/rollback.mjs +95 -0
- package/scripts/lib/shape-check.mjs +172 -0
- package/scripts/lib/similarity-cache.mjs +126 -0
- package/scripts/lib/similarity.mjs +230 -0
- package/scripts/lib/snapshot.mjs +54 -0
- package/scripts/lib/source-frontmatter.mjs +85 -0
- package/scripts/lib/tier2-protocol.mjs +470 -0
- package/scripts/lib/tiered.mjs +453 -0
- package/scripts/lib/validate.mjs +362 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
// chunk.mjs — scale-safe chunked iteration over a wiki's entries.
|
|
2
|
+
//
|
|
3
|
+
// The orchestrator's operator-convergence, classify, and plan-review
|
|
4
|
+
// phases (methodology sections 3.5, 3.6, 8.5) must run frontmatter-only
|
|
5
|
+
// at detection time. Loading every leaf body into memory defeats the
|
|
6
|
+
// skill's ability to handle multi-megabyte corpora.
|
|
7
|
+
//
|
|
8
|
+
// This module is the single chokepoint for "walk the wiki, give me
|
|
9
|
+
// every entry". It yields frontmatter-parsed records with a lazy
|
|
10
|
+
// `loadBody()` thunk the caller must call explicitly to read the
|
|
11
|
+
// body content. If the chokepoint is respected, the orchestrator's
|
|
12
|
+
// working set stays bounded by the largest single entry regardless of
|
|
13
|
+
// corpus size.
|
|
14
|
+
//
|
|
15
|
+
// Two scale guarantees this file enforces:
|
|
16
|
+
//
|
|
17
|
+
// 1. Frontmatter reads are BOUNDED per entry via a streaming fs
|
|
18
|
+
// reader that stops at the closing `---` fence. We never pull a
|
|
19
|
+
// 10 MB body into memory just to parse a 500-byte frontmatter.
|
|
20
|
+
//
|
|
21
|
+
// 2. `loadBody()` re-opens the file on demand and returns a string.
|
|
22
|
+
// The iterator does NOT cache it. Callers that hold the returned
|
|
23
|
+
// string retain its bytes; callers that let it go out of scope
|
|
24
|
+
// release them. Module-level metrics (inFlightBodies /
|
|
25
|
+
// peakInFlightBodies) let scale tests prove the discipline is
|
|
26
|
+
// being followed.
|
|
27
|
+
|
|
28
|
+
import {
|
|
29
|
+
closeSync,
|
|
30
|
+
openSync,
|
|
31
|
+
readSync,
|
|
32
|
+
readdirSync,
|
|
33
|
+
readFileSync,
|
|
34
|
+
} from "node:fs";
|
|
35
|
+
import { basename, join, relative } from "node:path";
|
|
36
|
+
import { parseFrontmatter } from "./frontmatter.mjs";
|
|
37
|
+
|
|
38
|
+
// Max bytes we ever read while looking for a frontmatter closing
|
|
39
|
+
// fence. Real frontmatters are typically <4 KB; 256 KB is a generous
|
|
40
|
+
// pathology ceiling. A file that somehow needs more is either
|
|
41
|
+
// corrupted or adversarial — the chunk API refuses to play AND
|
|
42
|
+
// `listChildren` tolerates this refusal by skipping the entry, so
|
|
43
|
+
// index generation never blocks on an adversarial file.
|
|
44
|
+
//
|
|
45
|
+
// The ceiling is deliberately much larger than typical frontmatter
|
|
46
|
+
// (~64×) so that hand-authored corner cases (giant `activation.file_globs`
|
|
47
|
+
// arrays, huge shared_covers lists at a fat parent index) do not hit
|
|
48
|
+
// it in practice.
|
|
49
|
+
// Ceiling, in bytes (256 KiB), on how much of a file is accumulated while
// searching for the closing frontmatter fence.
const MAX_FRONTMATTER_BYTES = 256 * 1024;
// Size, in bytes, of the reusable buffer that each positional read in
// `readFrontmatterStreaming` fills.
const READ_CHUNK_SIZE = 4096;
|
|
51
|
+
|
|
52
|
+
// ── Body-load discipline metrics ─────────────────────────────────────
|
|
53
|
+
//
|
|
54
|
+
// These counters track *caller discipline*, NOT actual memory
|
|
55
|
+
// residency. V8 has no cheap hook for measuring string residency from
|
|
56
|
+
// inside JavaScript, and the module cannot observe when a caller's
|
|
57
|
+
// reference falls out of scope. What these counters do give us:
|
|
58
|
+
//
|
|
59
|
+
// - `totalBodyLoads` — how many times `loadBody()` was called.
|
|
60
|
+
// Scale tests use this to prove a frontmatter-only walk never
|
|
61
|
+
// invoked the thunk (should be 0).
|
|
62
|
+
// - `inFlightBodies` — how many `loadBody()` calls have happened
|
|
63
|
+
// since the matching `releaseBody()` call. Callers that follow
|
|
64
|
+
// the streaming-consumer pattern (load → process → release →
|
|
65
|
+
// next) keep this at 0 or 1.
|
|
66
|
+
// - `peakInFlightBodies` — the maximum value `inFlightBodies`
|
|
67
|
+
// reached since the last `resetBodyMetrics()`. A streaming
|
|
68
|
+
// consumer's peak is 1; a naive consumer that holds every body
|
|
69
|
+
// sees peak == N.
|
|
70
|
+
//
|
|
71
|
+
// The metric does not prove memory residency. A caller can call
|
|
72
|
+
// `releaseBody()` and still hold a reference to the body string; V8
|
|
73
|
+
// keeps the string alive regardless of the counter. A caller can
|
|
74
|
+
// forget `releaseBody()` and drop the reference; the counter never
|
|
75
|
+
// decrements but GC still reclaims the string. The metric is a
|
|
76
|
+
// discipline tracker — it catches bugs where consumers accidentally
|
|
77
|
+
// accumulate bodies in an array, it does not measure heap pressure.
|
|
78
|
+
//
|
|
79
|
+
// Counters are PROCESS-GLOBAL. Tests that care about the value MUST
|
|
80
|
+
// call `resetBodyMetrics()` at the start of their scenario, and MUST
|
|
81
|
+
// not run in parallel with other metric-sensitive tests.
|
|
82
|
+
// `loadBody()` calls not yet balanced by a `releaseBody()` call.
let _inFlightBodies = 0;
// Highest value `_inFlightBodies` has reached since the last reset.
let _peakInFlightBodies = 0;
// Total number of `loadBody()` calls since the last reset.
let _totalBodyLoads = 0;
// Parallel counter for the streaming frontmatter reader itself.
// Scale tests that need to prove `listChildren` went through
// `readFrontmatterStreaming` (and NOT a full-file readFileSync) read
// this counter to assert the rewire is still intact. Reset with
// `resetBodyMetrics()` so cross-test contamination cannot poison it.
let _totalFrontmatterReads = 0;
|
|
91
|
+
|
|
92
|
+
/**
 * Zero all four process-global discipline counters (in-flight, peak,
 * total body loads, total frontmatter reads) in one step.
 *
 * @returns {void}
 */
export function resetBodyMetrics() {
  _inFlightBodies =
    _peakInFlightBodies =
    _totalBodyLoads =
    _totalFrontmatterReads =
      0;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Take a point-in-time snapshot of the discipline counters.
 *
 * The returned object is a fresh copy on every call; later counter
 * changes do not show up in a snapshot that was already handed out.
 *
 * @returns {{inFlightBodies: number, peakInFlightBodies: number,
 *   totalBodyLoads: number, totalFrontmatterReads: number}}
 */
export function getBodyMetrics() {
  const snapshot = {};
  snapshot.inFlightBodies = _inFlightBodies;
  snapshot.peakInFlightBodies = _peakInFlightBodies;
  snapshot.totalBodyLoads = _totalBodyLoads;
  snapshot.totalFrontmatterReads = _totalFrontmatterReads;
  return snapshot;
}
|
|
107
|
+
|
|
108
|
+
// Called by `loadBody` thunks immediately before returning the body
|
|
109
|
+
// string. Increments the in-flight counter and updates the peak.
|
|
110
|
+
/**
 * Record one body load: bump the lifetime total and the in-flight
 * count, then raise the recorded peak if the in-flight count now
 * exceeds it.
 *
 * @returns {void}
 */
function _markBodyLoadStart() {
  _totalBodyLoads += 1;
  _inFlightBodies += 1;
  _peakInFlightBodies = Math.max(_peakInFlightBodies, _inFlightBodies);
}
|
|
117
|
+
|
|
118
|
+
// Called by consumers when they have finished with a body string.
|
|
119
|
+
// Decrements `inFlightBodies`. Callers MUST call this after every
|
|
120
|
+
// matching `loadBody()` if they want the discipline metric to stay
|
|
121
|
+
// meaningful. An unbalanced release (more releases than loads)
|
|
122
|
+
// throws loudly so the bug surfaces instead of silently muddying
|
|
123
|
+
// the counter.
|
|
124
|
+
/**
 * Mark one previously loaded body as finished by decrementing the
 * in-flight counter.
 *
 * @returns {void}
 * @throws {Error} When no load is outstanding (counter already 0), so
 *   an unbalanced release surfaces instead of driving the counter
 *   negative.
 */
export function releaseBody() {
  if (_inFlightBodies !== 0) {
    _inFlightBodies -= 1;
    return;
  }
  throw new Error(
    "chunk.mjs: releaseBody called without a matching loadBody — " +
      "consumer discipline bug",
  );
}
|
|
133
|
+
|
|
134
|
+
// ── Streaming frontmatter reader ─────────────────────────────────────
|
|
135
|
+
//
|
|
136
|
+
// Reads bytes from the file in 4 KB chunks until we find the closing
|
|
137
|
+
// frontmatter fence OR exceed MAX_FRONTMATTER_BYTES. The reader
|
|
138
|
+
// operates purely on Buffers — it never decodes partial chunks as
|
|
139
|
+
// UTF-8, because a multi-byte codepoint split across chunk boundaries
|
|
140
|
+
// would emit replacement characters and corrupt both the decoded
|
|
141
|
+
// frontmatter and the byte offset used by `loadBody`. Only the final
|
|
142
|
+
// full frontmatter buffer is decoded once at the end.
|
|
143
|
+
//
|
|
144
|
+
// `bodyOffset` is authoritatively a BYTE offset (not a code-unit
|
|
145
|
+
// index) so `loadBody` can slice the raw file Buffer before decoding.
|
|
146
|
+
// Returns `null` for files that do not begin with a frontmatter
|
|
147
|
+
// fence; throws for files whose frontmatter has no closing fence
|
|
148
|
+
// within the pathology budget.
|
|
149
|
+
//
|
|
150
|
+
// Opening and closing fence line-endings must agree. A file that
|
|
151
|
+
// opens `---\n` must close `\n---\n`; a file that opens `---\r\n`
|
|
152
|
+
// must close `\r\n---\r\n`. Mixed line-endings are rejected loudly.
|
|
153
|
+
// Opening fence as raw bytes: `---` followed by an LF line terminator.
const OPEN_LF = Buffer.from("---\n");
// Opening fence as raw bytes: `---` followed by a CRLF line terminator.
const OPEN_CRLF = Buffer.from("---\r\n");
// Closing fence for LF files. The leading terminator is part of the
// pattern, so `---` only matches when it occupies a whole line.
const CLOSE_LF = Buffer.from("\n---\n");
// Closing fence for CRLF files, with the same whole-line requirement.
const CLOSE_CRLF = Buffer.from("\r\n---\r\n");
|
|
157
|
+
|
|
158
|
+
/**
 * Read only the frontmatter block of a file, never its body.
 *
 * The file is consumed in `READ_CHUNK_SIZE`-byte positional reads that
 * are accumulated as raw bytes. Nothing is decoded until the closing
 * fence has been located, so a multi-byte codepoint split across two
 * reads cannot be corrupted and `bodyOffset` stays a true byte offset.
 *
 * @param {string} absPath - Path of the file to inspect.
 * @returns {{frontmatterText: string, bodyOffset: number,
 *   lineEnding: "lf" | "crlf"} | null}
 *   `null` when the file does not begin with a `---` fence line
 *   (including empty and too-short files). Otherwise the decoded
 *   frontmatter including both fences (CRLF normalised to LF), the
 *   byte offset of the first body byte, and the detected line ending.
 * @throws {Error} When an opening fence is present but no closing
 *   fence of the same line-ending style is found before the file ends
 *   or `MAX_FRONTMATTER_BYTES` have been collected. Errors from
 *   opening or reading the file propagate unchanged.
 */
export function readFrontmatterStreaming(absPath) {
  _totalFrontmatterReads++;
  const fd = openSync(absPath, "r");
  try {
    const chunk = Buffer.alloc(READ_CHUNK_SIZE);
    let collected = Buffer.alloc(0);
    let pos = 0;
    // Style is set once the opening fence is confirmed, so we search
    // for the matching closing fence variant and never mix.
    let style = null; // "lf" | "crlf"
    let closeFence = null;
    let searchFrom = 0;

    while (collected.length < MAX_FRONTMATTER_BYTES) {
      const n = readSync(fd, chunk, 0, chunk.length, pos);
      if (n === 0) break;
      // `Buffer.concat` copies, so reusing `chunk` on the next read is safe.
      collected = Buffer.concat([collected, chunk.subarray(0, n)]);
      pos += n;

      if (style === null) {
        // Fence detection needs at least the 4 bytes of `---\n`.
        if (collected.length < OPEN_LF.length) continue;
        if (collected.subarray(0, OPEN_LF.length).equals(OPEN_LF)) {
          style = "lf";
        } else if (collected.length < OPEN_CRLF.length) {
          // Exactly 4 bytes so far. Only a literal `---\r` can still
          // become a CRLF fence; wait for one more byte in that case
          // and give up on anything else.
          if (collected.equals(OPEN_CRLF.subarray(0, collected.length))) {
            continue;
          }
          return null;
        } else if (collected.subarray(0, OPEN_CRLF.length).equals(OPEN_CRLF)) {
          style = "crlf";
        } else {
          // Not a frontmatter opening (this includes `---` followed by
          // a bare CR). Plain markdown: stop now instead of reading on
          // towards the ceiling.
          return null;
        }
        closeFence = style === "crlf" ? CLOSE_CRLF : CLOSE_LF;
        // Start at the opening fence's own line terminator (index 3 in
        // both styles). For an empty frontmatter (`---\n---\n`) that
        // terminator is also the first byte of the closing fence.
        searchFrom = OPEN_LF.length - 1;
      }

      const idx = collected.indexOf(closeFence, searchFrom);
      if (idx === -1) {
        // Every start position that could hold a complete fence has
        // been checked; resume just before the tail so a fence split
        // across two reads is still found without rescanning.
        searchFrom = Math.max(
          searchFrom,
          collected.length - closeFence.length + 1,
        );
        continue;
      }

      const end = idx + closeFence.length;
      // Decode once, now that the full frontmatter is in hand. CRLF is
      // normalised to LF for the LF-only downstream parsers.
      let text = collected.toString("utf8", 0, end);
      if (style === "crlf") {
        text = text.replace(/\r\n/g, "\n");
      }
      return {
        frontmatterText: text,
        // Byte offset just past the closing fence in the ORIGINAL file,
        // independent of the text normalisation above.
        bodyOffset: end,
        lineEnding: style,
      };
    }

    // No opening fence was ever confirmed (empty or too-short file):
    // treat as plain markdown rather than reporting "too big".
    if (style === null) {
      return null;
    }
    throw new Error(
      `chunk.mjs: frontmatter in ${absPath} has no closing --- fence ` +
        `within ${MAX_FRONTMATTER_BYTES} bytes`,
    );
  } finally {
    closeSync(fd);
  }
}
|
|
248
|
+
|
|
249
|
+
// ── Entry path collection ────────────────────────────────────────────
|
|
250
|
+
//
|
|
251
|
+
// Walks the wiki tree starting at `wikiRoot`, returning every `.md`
|
|
252
|
+
// file (including `index.md`) sorted by their absolute path. The walk
|
|
253
|
+
// is iterative with an explicit stack so we do not blow the call
|
|
254
|
+
// stack on deeply-nested corpora.
|
|
255
|
+
//
|
|
256
|
+
// Dot files and dot directories are skipped entirely. This is a
|
|
257
|
+
// blanket rule covering every metadata surface the skill owns
|
|
258
|
+
// (`.llmwiki/`, `.work/`, `.shape/`) plus any user dotfile the
|
|
259
|
+
// caller might reasonably not want yielded as an entry (`.git/`,
|
|
260
|
+
// `.github/`, `.DS_Store`, etc). There is no allow-list: if you
|
|
261
|
+
// want a dotfile indexed, rename it.
|
|
262
|
+
//
|
|
263
|
+
// The caller receives `onDirError(err, dir)` notifications for any
|
|
264
|
+
// directory that fails to enumerate (permission denied, etc).
|
|
265
|
+
// Default: silently swallow, because skipping an unreadable subdir
|
|
266
|
+
// is the safer behaviour for a walk over user-supplied paths. Tests
|
|
267
|
+
// use the callback to assert errors are raised for known-bad
|
|
268
|
+
// fixtures.
|
|
269
|
+
/**
 * Gather every `.md` file under `wikiRoot`, depth-first with an
 * explicit work list instead of recursion.
 *
 * Names starting with `.` are ignored, whether files or directories.
 * A directory that cannot be listed is skipped after being reported to
 * `opts.onDirError`, when one was supplied.
 *
 * @param {string} wikiRoot - Directory to start from.
 * @param {{onDirError?: ((err: Error, dir: string) => void) | null}} [opts]
 * @returns {string[]} Matching paths in default `Array#sort` order.
 */
export function collectEntryPaths(wikiRoot, opts = {}) {
  const { onDirError = null } = opts;
  const found = [];
  const pending = [wikiRoot];

  while (pending.length > 0) {
    const current = pending.pop();

    let dirents;
    try {
      dirents = readdirSync(current, { withFileTypes: true });
    } catch (err) {
      if (onDirError) onDirError(err, current);
      continue;
    }

    for (const dirent of dirents) {
      const { name } = dirent;
      if (name.startsWith(".")) continue;

      const childPath = join(current, name);
      if (dirent.isDirectory()) {
        pending.push(childPath);
      } else if (dirent.isFile() && name.endsWith(".md")) {
        found.push(childPath);
      }
    }
  }

  return found.sort();
}
|
|
297
|
+
|
|
298
|
+
// ── The iterator ─────────────────────────────────────────────────────
|
|
299
|
+
//
|
|
300
|
+
// Synchronous generator that yields one entry at a time in
|
|
301
|
+
// deterministic sorted-path order. Callers may drive it with either
|
|
302
|
+
// `for (const entry of iterEntries(...))` or
|
|
303
|
+
// `for await (const entry of iterEntries(...))` — for-await accepts
|
|
304
|
+
// sync iterables transparently, so existing async callers keep
|
|
305
|
+
// working.
|
|
306
|
+
//
|
|
307
|
+
// Each yielded entry carries:
|
|
308
|
+
//
|
|
309
|
+
// path absolute filesystem path to the .md file
|
|
310
|
+
// relPath path relative to wikiRoot (POSIX separators)
|
|
311
|
+
// data parsed frontmatter (same shape as parseFrontmatter.data)
|
|
312
|
+
// type "index" | "leaf" (derived from filename basename)
|
|
313
|
+
// loadBody async () => Promise<string>; reads the file and resolves with the body
|
|
314
|
+
//
|
|
315
|
+
// `loadBody()` is declared `async` on purpose: Phase 6's tiered AI
|
|
316
|
+
// may eventually back it with a remote fetch for partial-retrieval
|
|
317
|
+
// scenarios, and promoting the signature later is a breaking change.
|
|
318
|
+
// Phase 5's implementation is synchronous under the hood.
|
|
319
|
+
//
|
|
320
|
+
// `loadBody()` is NOT cached by the iterator: calling it twice reads
|
|
321
|
+
// the file twice. A caller that wants to hold a body does so in a
|
|
322
|
+
// local variable and knows exactly when it is retained.
|
|
323
|
+
//
|
|
324
|
+
// `opts.includeIndexFiles` (default: true) — set false when the
|
|
325
|
+
// caller only cares about leaves (operator-convergence detection).
|
|
326
|
+
//
|
|
327
|
+
// `opts.onMalformed` — callback invoked when a file has a `---`
|
|
328
|
+
// opening fence but reads or parses pathologically (missing closing
|
|
329
|
+
// fence, YAML parse error, non-object frontmatter). The callback's
|
|
330
|
+
// return value is ignored; to ABORT iteration on first bad file it
|
|
331
|
+
// must throw. The default behaviour is exactly that: the default
|
|
332
|
+
// callback throws, which propagates out of the generator and
|
|
333
|
+
// terminates the walk. To COLLECT errors and keep walking, pass an
|
|
334
|
+
// explicit non-throwing callback — the iterator then skips the bad
|
|
335
|
+
// file and continues. These are two distinct modes; choose which
|
|
336
|
+
// one you want by choosing whether the callback throws.
|
|
337
|
+
//
|
|
338
|
+
// `opts.onDirError` (default: null) — forwarded to
|
|
339
|
+
// `collectEntryPaths` for directory-level I/O errors.
|
|
340
|
+
/**
 * Lazily walk the wiki and yield one frontmatter-parsed record per
 * entry, in the order `collectEntryPaths` returns the paths.
 *
 * Files for which `readFrontmatterStreaming` returns `null` are
 * skipped silently. Files that fail to read or parse, or whose parsed
 * `data` is falsy or not of type "object", are reported to
 * `opts.onMalformed` and then skipped. The default `onMalformed`
 * rethrows, which ends the walk.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {object} [opts]
 * @param {boolean} [opts.includeIndexFiles=true] - Yield `index.md` files too.
 * @param {(err: Error) => void} [opts.onMalformed] - Called per bad file.
 * @param {((err: Error, dir: string) => void) | null} [opts.onDirError=null]
 *   Forwarded to `collectEntryPaths`.
 * @yields {{path: string, relPath: string, data: object,
 *   type: "index" | "leaf", loadBody: () => Promise<string>}}
 */
export function* iterEntries(wikiRoot, opts = {}) {
  const {
    includeIndexFiles = true,
    onMalformed = (err) => {
      throw err;
    },
    onDirError = null,
  } = opts;

  for (const absPath of collectEntryPaths(wikiRoot, { onDirError })) {
    // Always expose forward slashes, whatever the platform separator.
    const relPath = relative(wikiRoot, absPath).replace(/\\/g, "/");
    const type = basename(absPath) === "index.md" ? "index" : "leaf";
    if (type === "index" && !includeIndexFiles) continue;

    let fence;
    let parsed;
    try {
      fence = readFrontmatterStreaming(absPath);
      // No frontmatter fence: incidental markdown, not an entry.
      if (fence === null) continue;
      parsed = parseFrontmatter(fence.frontmatterText, absPath);
    } catch (err) {
      onMalformed(err);
      continue;
    }

    const data = parsed.data;
    if (!data || typeof data !== "object") {
      onMalformed(
        new Error(
          `${absPath}: frontmatter parsed to non-object ${typeof data}`,
        ),
      );
      continue;
    }

    // The thunk re-reads the file on every call and cuts at the byte
    // offset BEFORE decoding, so multi-byte frontmatter characters
    // cannot shift the body boundary. Nothing is cached here.
    const { bodyOffset } = fence;
    const loadBody = async () => {
      const body = readFileSync(absPath).slice(bodyOffset).toString("utf8");
      _markBodyLoadStart();
      return body;
    };

    yield { path: absPath, relPath, data, type, loadBody };
  }
}
|
|
409
|
+
|
|
410
|
+
// Convenience wrapper for callers that only want frontmatter (the
|
|
411
|
+
// common case for operator-convergence + classify). Returns an array
|
|
412
|
+
// of { path, relPath, data, type, loadBody } entries with the
|
|
413
|
+
// loadBody thunk included but never called by this helper.
|
|
414
|
+
/**
 * Drain `iterEntries` into an array for callers that only need
 * frontmatter. Each record still carries its `loadBody` thunk, but
 * this helper never invokes it.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {object} [opts] - Passed through to `iterEntries` unchanged.
 * @returns {Array<object>} Every entry `iterEntries` yields, in order.
 */
export function collectFrontmatterOnly(wikiRoot, opts = {}) {
  return Array.from(iterEntries(wikiRoot, opts));
}
|
|
421
|
+
|