@ctxr/skill-llm-wiki 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +2 -2
- package/guide/cli.md +3 -2
- package/guide/substrate/operators.md +1 -1
- package/guide/substrate/tiered-ai.md +6 -5
- package/guide/ux/user-intent.md +1 -1
- package/package.json +4 -2
- package/scripts/cli.mjs +92 -2
- package/scripts/lib/balance.mjs +579 -0
- package/scripts/lib/cluster-detect.mjs +482 -4
- package/scripts/lib/contract.mjs +31 -3
- package/scripts/lib/decision-log.mjs +121 -15
- package/scripts/lib/heal.mjs +5 -0
- package/scripts/lib/intent.mjs +370 -4
- package/scripts/lib/join-constants.mjs +22 -0
- package/scripts/lib/join.mjs +917 -0
- package/scripts/lib/nest-applier.mjs +395 -32
- package/scripts/lib/operators.mjs +472 -38
- package/scripts/lib/orchestrator.mjs +419 -12
- package/scripts/lib/root-containment.mjs +351 -0
- package/scripts/lib/similarity-cache.mjs +115 -20
- package/scripts/lib/similarity.mjs +11 -0
- package/scripts/lib/soft-dag.mjs +726 -0
- package/scripts/lib/tiered.mjs +42 -18
- package/scripts/lib/validate.mjs +22 -0
|
@@ -0,0 +1,917 @@
|
|
|
1
|
+
// join.mjs — merge N ≥ 2 existing wikis into one unified output wiki.
|
|
2
|
+
//
|
|
3
|
+
// Implements the 11-phase pipeline from guide/operations/ingest/join.md:
|
|
4
|
+
//
|
|
5
|
+
// 0 preflight (handled by orchestrator caller; not here)
|
|
6
|
+
// 1 ingest-all — read every source wiki's tree into memory
|
|
7
|
+
// 2 source-validate — validate each source; halt on errors
|
|
8
|
+
// 3 plan-union — merge per-source leaf lists into one
|
|
9
|
+
// 4 resolve-id-collisions — namespace (default) / merge / ask policies
|
|
10
|
+
// 5 merge-categories — detect matching-focus category folds
|
|
11
|
+
// (actual directory-MERGE fold is
|
|
12
|
+
// deferred — runConvergence's MERGE
|
|
13
|
+
// operator only handles sibling leaves,
|
|
14
|
+
// not entire category subtrees)
|
|
15
|
+
// 6 rewire-references — resolve links[].id / overlay_targets
|
|
16
|
+
// via id→alias→rename map. parents[]
|
|
17
|
+
// are POSIX paths, not ids; they're
|
|
18
|
+
// re-derived at phase 8 by
|
|
19
|
+
// `rebuildAllIndices` from the target's
|
|
20
|
+
// actual tree shape, not rewritten here.
|
|
21
|
+
// 7 apply-operators — runConvergence on the unified tree
|
|
22
|
+
// 8 generate-indices — rebuildAllIndices on the joined tree
|
|
23
|
+
// 9 validation — validateWiki on the joined tree
|
|
24
|
+
// 10 golden-path-union — each source's fixtures must still pass
|
|
25
|
+
// 11 commit — phase-commit via the caller's callback
|
|
26
|
+
//
|
|
27
|
+
// Source immutability: every source wiki is treated as strictly
|
|
28
|
+
// read-only. The pipeline materialises the unified output at the
|
|
29
|
+
// target path (created empty by the orchestrator before runJoin is
|
|
30
|
+
// called); sources are never touched on disk. This module assumes
|
|
31
|
+
// that precondition has already been enforced by the caller
|
|
32
|
+
// (intent.mjs's join branch refuses a non-empty target via INT-01
|
|
33
|
+
// and the CLI creates the target empty via mkdirSync when
|
|
34
|
+
// `plan.is_new_wiki` is set).
|
|
35
|
+
|
|
36
|
+
import {
|
|
37
|
+
closeSync,
|
|
38
|
+
existsSync,
|
|
39
|
+
mkdirSync,
|
|
40
|
+
openSync,
|
|
41
|
+
readFileSync,
|
|
42
|
+
readSync,
|
|
43
|
+
readdirSync,
|
|
44
|
+
writeFileSync,
|
|
45
|
+
} from "node:fs";
|
|
46
|
+
import { basename, dirname, join, relative } from "node:path";
|
|
47
|
+
import { readFrontmatterStreaming } from "./chunk.mjs";
|
|
48
|
+
import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
|
|
49
|
+
import { rebuildAllIndices } from "./indices.mjs";
|
|
50
|
+
import { runConvergence } from "./operators.mjs";
|
|
51
|
+
import { summariseFindings, validateWiki } from "./validate.mjs";
|
|
52
|
+
// Re-export the policy constants from `join-constants.mjs` so
|
|
53
|
+
// existing callers that pull them from this module keep working.
|
|
54
|
+
// The intent layer imports from `join-constants.mjs` directly to
|
|
55
|
+
// avoid loading the full pipeline on non-join CLI paths.
|
|
56
|
+
export {
|
|
57
|
+
DEFAULT_COLLISION_POLICY,
|
|
58
|
+
VALID_COLLISION_POLICIES,
|
|
59
|
+
} from "./join-constants.mjs";
|
|
60
|
+
import { DEFAULT_COLLISION_POLICY, VALID_COLLISION_POLICIES } from "./join-constants.mjs";
|
|
61
|
+
|
|
62
|
+
// Body-read streaming chunk size, 64 KiB. Sized well above the
|
|
63
|
+
// 4 KiB chunk.mjs uses for the bounded frontmatter read (where
|
|
64
|
+
// frontmatter is expected to be small and the read budget matters)
|
|
65
|
+
// — leaf bodies are much larger and can be full documents, so a
|
|
66
|
+
// bigger chunk amortises the per-readSync syscall overhead while
|
|
67
|
+
// still keeping each individual allocation predictably small.
|
|
68
|
+
// Note this is the PER-READ ceiling, not a total-memory ceiling:
|
|
69
|
+
// each leaf's body is still fully held in memory after the
|
|
70
|
+
// `Buffer.concat` that assembles the chunks into the final UTF-8
|
|
71
|
+
// string. A future optimisation could switch to lazy body loading
|
|
72
|
+
// if the holding-every-body-in-memory shape becomes the bottleneck
|
|
73
|
+
// on very large corpora.
|
|
74
|
+
// Capacity in bytes (64 KiB) of the reusable buffer that `ingestWiki` hands to each `readSync` call.
const BODY_READ_CHUNK_SIZE = 64 * 1024;
|
|
75
|
+
|
|
76
|
+
// ── Phase 1: ingest-all ──────────────────────────────────────────
|
|
77
|
+
//
|
|
78
|
+
// Read one source wiki into memory. Returns a normalised
|
|
79
|
+
// representation with `leaves[]` (non-index .md files) and
|
|
80
|
+
// `indices[]` (index.md files) — both carrying `relPath` (relative
|
|
81
|
+
// POSIX path under wikiRoot), parsed `data` (frontmatter), and
|
|
82
|
+
// `body` (everything after the closing fence). CRLF fences are
|
|
83
|
+
// handled by `readFrontmatterStreaming` which normalises to LF on
|
|
84
|
+
// the frontmatter payload; the body is sliced at the pre-normalisation
|
|
85
|
+
// byte offset and normalised to LF on read for downstream
|
|
86
|
+
// consistency with every other writer in this codebase.
|
|
87
|
+
//
|
|
88
|
+
// Files that fail to parse are collected into `malformed[]` so the
|
|
89
|
+
// caller can surface them as part of source-validate rather than
|
|
90
|
+
// silently dropping them.
|
|
91
|
+
/**
 * Phase 1 (ingest-all): read one source wiki into memory.
 *
 * Walks `wikiRoot` iteratively, skipping dot-prefixed entries, and
 * loads every `.md` file for which `readFrontmatterStreaming` returns
 * a capture and whose parsed frontmatter carries an `id`.
 *
 * @param {string} wikiRoot  Root directory of the source wiki.
 * @returns {{wikiRoot: string, leaves: object[], indices: object[], malformed: Array<{path: string, error: string}>}}
 *   `indices` holds the records of files named `index.md`, `leaves`
 *   every other record; each record is
 *   `{ relPath, absolutePath, data, body }`. Both lists are sorted by
 *   `relPath`. `malformed` lists files whose frontmatter could not be
 *   read or parsed, or whose body could not be read at all.
 */
export function ingestWiki(wikiRoot) {
  const out = {
    wikiRoot,
    leaves: [],
    indices: [],
    malformed: [],
  };
  // Plain code-unit ordering. `localeCompare` consults the host's
  // locale data, so the same tree could sort differently on two
  // machines — which defeats the determinism this sort exists for.
  const byRelPath = (a, b) => {
    if (a.relPath < b.relPath) return -1;
    if (a.relPath > b.relPath) return 1;
    return 0;
  };
  const stack = [wikiRoot];
  while (stack.length > 0) {
    const dir = stack.pop();
    let entries;
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort walk: a directory that cannot be listed is skipped.
      continue;
    }
    for (const e of entries) {
      if (e.name.startsWith(".")) continue;
      const full = join(dir, e.name);
      if (e.isDirectory()) {
        stack.push(full);
        continue;
      }
      if (!e.isFile() || !e.name.endsWith(".md")) continue;
      let captured;
      try {
        captured = readFrontmatterStreaming(full);
      } catch (err) {
        out.malformed.push({ path: full, error: err.message });
        continue;
      }
      if (!captured) continue; // plain .md with no fence — not a wiki entry
      let parsed;
      try {
        parsed = parseFrontmatter(captured.frontmatterText, full);
      } catch (err) {
        out.malformed.push({ path: full, error: err.message });
        continue;
      }
      if (!parsed?.data?.id) continue;
      // Read only the bytes after the frontmatter, in fixed-size
      // chunks: the body length is unknown up front, and a reused
      // chunk buffer keeps each individual read allocation small.
      let body;
      try {
        const chunks = [];
        const fd = openSync(full, "r");
        try {
          const buf = Buffer.alloc(BODY_READ_CHUNK_SIZE);
          let pos = captured.bodyOffset;
          while (true) {
            const n = readSync(fd, buf, 0, buf.length, pos);
            if (n === 0) break;
            // Copy the filled slice: `buf` is overwritten by the next
            // read, so keeping a view would corrupt earlier chunks.
            chunks.push(Buffer.from(buf.subarray(0, n)));
            pos += n;
          }
        } finally {
          closeSync(fd);
        }
        // Decode once, after concatenation, so a multi-byte character
        // split across two chunks is still decoded correctly.
        body = Buffer.concat(chunks).toString("utf8");
      } catch {
        // Streaming read failed: retry once with a whole-file read.
        // If that fails too, report the file instead of aborting the
        // ingest of every other entry.
        try {
          body = readFileSync(full).subarray(captured.bodyOffset).toString("utf8");
        } catch (err) {
          out.malformed.push({ path: full, error: err.message });
          continue;
        }
      }
      if (captured.lineEnding === "crlf") {
        body = body.replace(/\r\n/g, "\n");
      }
      const relPath = relPosix(relative(wikiRoot, full));
      const record = { relPath, absolutePath: full, data: parsed.data, body };
      if (e.name === "index.md") {
        out.indices.push(record);
      } else {
        out.leaves.push(record);
      }
    }
  }
  // Sort so the result does not depend on readdir order — downstream
  // collision resolution is order-sensitive.
  out.leaves.sort(byRelPath);
  out.indices.sort(byRelPath);
  return out;
}
|
|
188
|
+
|
|
189
|
+
// ── Phase 2: source-validate ─────────────────────────────────────
|
|
190
|
+
//
|
|
191
|
+
// Run `validateWiki` on each source. Any hard error halts join
|
|
192
|
+
// with "fix this source first" — joining a broken source produces
|
|
193
|
+
// a broken joined wiki. Warnings are surfaced but don't block.
|
|
194
|
+
// Malformed files collected by `ingestWiki` are folded in as
|
|
195
|
+
// synthetic PARSE findings so the caller sees a single unified
|
|
196
|
+
// list rather than two separate error channels — deduped by
|
|
197
|
+
// (code, target) against `validateWiki`'s own PARSE findings so
|
|
198
|
+
// the same file doesn't get reported twice when both channels
|
|
199
|
+
// detect the same parse failure.
|
|
200
|
+
/**
 * Phase 2 (source-validate): collect validation findings for every
 * ingested source into one report.
 *
 * @param {Array<{wikiRoot: string, malformed: Array<{path: string, error: string}>}>} ingested
 *   Results of `ingestWiki`, one per source wiki.
 * @returns {{errors: object[], warnings: object[]}}
 *   Findings with severity `"error"` / `"warning"`, each tagged with
 *   the originating `wikiRoot`. Findings of any other severity are
 *   dropped. Malformed files are appended as synthetic `PARSE` errors
 *   unless `validateWiki` already reported a `PARSE` error whose
 *   target equals the file's path.
 */
export function validateSources(ingested) {
  const errors = [];
  const warnings = [];
  for (const source of ingested) {
    const { wikiRoot } = source;
    // `code:target` keys of the errors validateWiki reported for this
    // source; used to skip duplicate synthetic PARSE entries below.
    const reported = new Set();
    for (const finding of validateWiki(wikiRoot)) {
      const tagged = { wikiRoot, ...finding };
      switch (finding.severity) {
        case "error":
          reported.add(`${finding.code}:${finding.target}`);
          errors.push(tagged);
          break;
        case "warning":
          warnings.push(tagged);
          break;
        default:
          break;
      }
    }
    for (const { path, error } of source.malformed) {
      if (!reported.has(`PARSE:${path}`)) {
        errors.push({
          wikiRoot,
          severity: "error",
          code: "PARSE",
          target: path,
          message: error,
        });
      }
    }
  }
  return { errors, warnings };
}
|
|
230
|
+
|
|
231
|
+
// ── Phase 3: plan-union ──────────────────────────────────────────
|
|
232
|
+
//
|
|
233
|
+
// Merge per-source leaf + index records into a single in-memory
|
|
234
|
+
// plan. Each record gets tagged with `sourceWiki` (the absolute
|
|
235
|
+
// path of its origin wiki) so downstream phases can reason about
|
|
236
|
+
// provenance — the `source_wikis[]` frontmatter field on merged
|
|
237
|
+
// entries, the rename-map namespacing prefix, and the
|
|
238
|
+
// golden-path-union phase's fixture tracing all rely on this.
|
|
239
|
+
//
|
|
240
|
+
// The union preserves source order (source A's entries first, then
|
|
241
|
+
// B, etc.) with lex-sorted records within each source, so the plan
|
|
242
|
+
// is byte-stable for identical inputs regardless of filesystem
|
|
243
|
+
// readdir ordering.
|
|
244
|
+
/**
 * Phase 3 (plan-union): concatenate every source's records into one
 * plan, in source order.
 *
 * @param {Array<{wikiRoot: string, leaves: object[], indices: object[]}>} ingestedSources
 * @returns {{leaves: object[], indices: object[]}}
 *   Shallow copies of the input records, each with a `sourceWiki`
 *   property set to the `wikiRoot` of the source it came from.
 */
export function planUnion(ingestedSources) {
  // Copy one record list (`leaves` or `indices`) from every source,
  // stamping each copy with its origin wiki.
  const collect = (field) =>
    ingestedSources.flatMap((source) =>
      source[field].map((record) => ({ ...record, sourceWiki: source.wikiRoot })),
    );
  return { leaves: collect("leaves"), indices: collect("indices") };
}
|
|
257
|
+
|
|
258
|
+
// ── Phase 4: resolve-id-collisions ───────────────────────────────
|
|
259
|
+
//
|
|
260
|
+
// Detect duplicate ids across sources and apply the configured
|
|
261
|
+
// collision policy. Includes BOTH leaves and indices — `validateWiki`
|
|
262
|
+
// enforces global id uniqueness across every entry type, so an
|
|
263
|
+
// index-only collision (two sources both with `auth/index.md`) would
|
|
264
|
+
// still trip `DUP-ID` at phase 9 if left alone.
|
|
265
|
+
//
|
|
266
|
+
// Index collisions: this first-cut throws a structured
|
|
267
|
+
// `JOIN-INDEX-COLLISION` the moment the collision is detected —
|
|
268
|
+
// safely renaming just the id frontmatter would immediately trip
|
|
269
|
+
// `ID-MISMATCH-DIR` (id must match parent dir basename), and
|
|
270
|
+
// directory-renaming an entire subtree to produce a unique id is
|
|
271
|
+
// a full directory-MERGE operator tracked as a follow-up. The
|
|
272
|
+
// throw asks the user to disambiguate one of the conflicting
|
|
273
|
+
// subcategories in its source wiki before re-joining. In the
|
|
274
|
+
// common case of joining topically-distinct source wikis, index
|
|
275
|
+
// collisions don't occur at all.
|
|
276
|
+
//
|
|
277
|
+
// Collision policies:
|
|
278
|
+
//
|
|
279
|
+
// `namespace` (default): rename the colliding entry to
|
|
280
|
+
// `<source-prefix>.<original-id>` where the prefix is the
|
|
281
|
+
// colliding source-wiki's basename (e.g. `reviewers.wiki` →
|
|
282
|
+
// `reviewers`). Inbound references from that source resolve
|
|
283
|
+
// via source-scoped `renameMap` rewrites during the rewire
|
|
284
|
+
// phase. The original id is NOT added to `aliases[]` — the
|
|
285
|
+
// keeper record (from the first source) retains the
|
|
286
|
+
// un-prefixed id as its live id, so an alias entry with the
|
|
287
|
+
// same value would trip `ALIAS-COLLIDES-ID` at phase 9.
|
|
288
|
+
// Never loses an entry; never shadows the keeper.
|
|
289
|
+
//
|
|
290
|
+
// `merge`: when frontmatter is compatible (same focus, same
|
|
291
|
+
// type, same depth_role), fold duplicates into the keeper —
|
|
292
|
+
// preserving the absorbed record's existing `aliases[]` on
|
|
293
|
+
// the keeper and recording both source wikis via
|
|
294
|
+
// `source_wikis[]`. Because collisions are on IDENTICAL ids,
|
|
295
|
+
// the absorbed id is NOT added to `aliases[]` (it equals the
|
|
296
|
+
// keeper's live id; a self-alias would trip
|
|
297
|
+
// `ALIAS-COLLIDES-ID` at phase 9). When frontmatter is
|
|
298
|
+
// INcompatible, fall through to `namespace` — we never lose
|
|
299
|
+
// content on a merge fallback.
|
|
300
|
+
//
|
|
301
|
+
// `ask`: halt and throw `JOIN-COLLISION-ASK` with the collision
|
|
302
|
+
// set for the caller to surface. The interactive resolution
|
|
303
|
+
// flow is deferred; first-cut just fails loud.
|
|
304
|
+
//
|
|
305
|
+
// Returns `{ plan, renameMap, mergeMap }`:
|
|
306
|
+
// - plan: the mutated plan (leaves/indices with renamed ids)
|
|
307
|
+
// - renameMap: `Map<sourceWiki, Map<oldId, newId>>` — source-
|
|
308
|
+
// scoped so 3+ sources sharing an id each get their
|
|
309
|
+
// own entry without overwriting. `rewireReferences`
|
|
310
|
+
// uses the referring entry's `sourceWiki` to pick
|
|
311
|
+
// the right rename.
|
|
312
|
+
// - mergeMap: `Map<absorbedId, keeperId>` — a flat map because
|
|
313
|
+
// every absorbed id collapses to the keeper's id
|
|
314
|
+
// regardless of the referring source's identity.
|
|
315
|
+
/**
 * Phase 4 (resolve-id-collisions): find ids shared by more than one
 * plan record and apply the collision `policy`.
 *
 * Records are grouped by `data.id`, leaves first and then indices,
 * each in plan order. The first record of a group is the keeper and
 * retains the id. Every later record is either folded into the keeper
 * (`merge` policy, same kind and identical `focus` / `type` /
 * `depth_role`) or renamed to `<prefix>.<id>` (`namespace` policy, and
 * the `merge` fallback for incompatible records). Keeper and renamed
 * records are mutated in place.
 *
 * @param {{leaves: object[], indices: object[]}} plan
 *   Unified plan; each record carries `data`, `relPath`,
 *   `absolutePath` and `sourceWiki`.
 * @param {string} [policy=DEFAULT_COLLISION_POLICY]
 *   One of `VALID_COLLISION_POLICIES`.
 * @returns {{plan: {leaves: object[], indices: object[]}, renameMap: Map<string, Map<string, string>>, mergeMap: Map<string, string>}}
 *   `plan` is the input object itself when nothing collides, otherwise
 *   a new object without the absorbed records. `renameMap` maps
 *   sourceWiki → (oldId → newId). `mergeMap` maps absorbedId →
 *   keeperId and only receives non-identity pairs.
 * @throws {Error} When `policy` is not a valid policy.
 * @throws {Error} `code === "JOIN-COLLISION-ASK"` (with `collisions`)
 *   when the policy is `ask` and any id collides.
 * @throws {Error} `code === "JOIN-INDEX-COLLISION"` when an index
 *   record would need a namespace rename. Raised before any record is
 *   mutated.
 * @throws {Error} When no free `<prefix>.<id>-<n>` id is found below
 *   n = 1000.
 */
export function resolveIdCollisions(plan, policy = DEFAULT_COLLISION_POLICY) {
  if (!VALID_COLLISION_POLICIES.includes(policy)) {
    throw new Error(
      `join: unknown id-collision policy "${policy}" (valid: ${VALID_COLLISION_POLICIES.join(", ")})`,
    );
  }
  // Group leaves AND indices by id. Each entry is tagged with `kind`
  // because only leaves can be namespace-renamed: an index that needs
  // a rename is a hard failure (renaming its directory is deferred).
  const byId = new Map();
  const track = (kind, record) => {
    const id = record.data.id;
    if (!byId.has(id)) byId.set(id, []);
    byId.get(id).push({ kind, record });
  };
  for (const leaf of plan.leaves) track("leaf", leaf);
  for (const idx of plan.indices) track("index", idx);
  const collisions = [...byId.entries()].filter(([, arr]) => arr.length > 1);
  if (collisions.length === 0) {
    return { plan, renameMap: new Map(), mergeMap: new Map() };
  }
  if (policy === "ask") {
    const err = new Error(
      `join: id collisions found and policy=ask — ` +
        `${collisions.length} colliding id(s). Resolve manually and re-invoke.`,
    );
    err.code = "JOIN-COLLISION-ASK";
    err.collisions = collisions.map(([id, arr]) => ({
      id,
      sources: arr.map((entry) => entry.record.sourceWiki),
    }));
    throw err;
  }
  // Whether `dupEntry` may be folded into `keeperEntry`. Reads only
  // fields that neither the merge nor the rename step below modifies,
  // so the answer is the same before and during the apply loop.
  const canMerge = (keeperEntry, dupEntry) =>
    policy === "merge" &&
    dupEntry.kind === keeperEntry.kind &&
    dupEntry.record.data.focus === keeperEntry.record.data.focus &&
    dupEntry.record.data.type === keeperEntry.record.data.type &&
    dupEntry.record.data.depth_role === keeperEntry.record.data.depth_role;
  // Pre-flight: reject index collisions BEFORE touching any record.
  // Checking inside the apply loop would leave earlier collision
  // groups already renamed or merged when a later group throws, so a
  // caller catching JOIN-INDEX-COLLISION would see a half-applied plan.
  for (const [id, [keeperEntry, ...rest]] of collisions) {
    for (const dupEntry of rest) {
      if (dupEntry.kind !== "index" || canMerge(keeperEntry, dupEntry)) continue;
      const err = new Error(
        `join: index id collision on "${id}" between ` +
          `${keeperEntry.record.sourceWiki} and ${dupEntry.record.sourceWiki}. ` +
          `Directory-rename for index collisions is not yet ` +
          `supported; rename one of the conflicting directories ` +
          `in its source wiki before joining.`,
      );
      err.code = "JOIN-INDEX-COLLISION";
      throw err;
    }
  }
  // Source-scoped rename map: Map<sourceWiki, Map<oldId, newId>>, so
  // several sources sharing one id each keep their own rename.
  const renameMap = new Map();
  const addRename = (sourceWiki, oldId, newId) => {
    if (!renameMap.has(sourceWiki)) renameMap.set(sourceWiki, new Map());
    renameMap.get(sourceWiki).set(oldId, newId);
  };
  const mergeMap = new Map();
  const absorbedPaths = new Set();
  // Every id currently in use, seeded with all plan ids, so a
  // generated `<prefix>.<id>` can clash neither with an unrelated
  // existing id nor with an earlier rename (two sources can yield the
  // same prefix).
  const liveIds = new Set();
  for (const leaf of plan.leaves) liveIds.add(leaf.data.id);
  for (const idx of plan.indices) liveIds.add(idx.data.id);
  const reserveId = (baseId) => {
    if (!liveIds.has(baseId)) {
      liveIds.add(baseId);
      return baseId;
    }
    for (let n = 2; n < 1000; n++) {
      const candidate = `${baseId}-${n}`;
      if (!liveIds.has(candidate)) {
        liveIds.add(candidate);
        return candidate;
      }
    }
    throw new Error(
      `join: could not disambiguate namespace-prefixed id "${baseId}" within 1000 attempts`,
    );
  };
  for (const [id, [keeperEntry, ...rest]] of collisions) {
    const keeper = keeperEntry.record;
    for (const dupEntry of rest) {
      const dup = dupEntry.record;
      if (canMerge(keeperEntry, dupEntry)) {
        // Never record an identity `{id → id}` pair: `rewireReferences`
        // consults mergeMap before the source-scoped renameMap, so a
        // self-mapping would intercept references from another source
        // that was namespace-renamed for the same id.
        if (dup.data.id !== keeper.data.id) {
          mergeMap.set(dup.data.id, keeper.data.id);
        }
        absorbedPaths.add(dup.absolutePath);
        // Inherit the absorbed record's aliases but not its id: the id
        // equals the keeper's, and a self-alias would be rejected by
        // the validator (ALIAS-COLLIDES-ID).
        keeper.data.aliases = dedupe([
          ...(keeper.data.aliases || []),
          ...(dup.data.aliases || []),
        ]);
        keeper.data.source_wikis = dedupe([
          ...(keeper.data.source_wikis || [keeper.sourceWiki]),
          dup.sourceWiki,
        ]);
        continue;
      }
      // Namespace rename (leaves only — indices were rejected above).
      // The old id is deliberately NOT added to aliases[]: the keeper
      // still owns it as its live id.
      const prefix = namespacePrefix(dup.sourceWiki);
      const newId = reserveId(`${prefix}.${id}`);
      addRename(dup.sourceWiki, dup.data.id, newId);
      dup.data.id = newId;
      // The filename must track the id (validator enforces
      // `ID-MISMATCH-FILE`), so rewrite relPath to match.
      const dir = dirname(dup.relPath);
      dup.relPath = dir === "." ? `${newId}.md` : `${dir}/${newId}.md`;
    }
  }
  const rebuiltLeaves = plan.leaves.filter(
    (l) => !absorbedPaths.has(l.absolutePath),
  );
  const rebuiltIndices = plan.indices.filter(
    (i) => !absorbedPaths.has(i.absolutePath),
  );
  return {
    plan: { leaves: rebuiltLeaves, indices: rebuiltIndices },
    renameMap,
    mergeMap,
  };
}
|
|
487
|
+
|
|
488
|
+
// ── Phase 5: merge-categories ────────────────────────────────────
|
|
489
|
+
//
|
|
490
|
+
// Detect top-level categories that share the same `focus` across
|
|
491
|
+
// source wikis. DETECT-ONLY: this helper walks every source's
|
|
492
|
+
// top-level indices, groups them by focus, and returns the
|
|
493
|
+
// multi-source groups without mutating the plan or the filesystem.
|
|
494
|
+
// No entries[] appends, no subdirectory moves — a category-level
|
|
495
|
+
// directory-MERGE operator that would actually fold two same-focus
|
|
496
|
+
// category subtrees is tracked as a follow-up; for now the joined
|
|
497
|
+
// tree retains both categories side-by-side and downstream
|
|
498
|
+
// convergence can decide whether to merge individual leaves across
|
|
499
|
+
// them.
|
|
500
|
+
/**
 * Phase 5 (merge-categories), detection only: report top-level
 * category indices that share a `focus` value.
 *
 * Nothing is mutated — neither the inputs nor the filesystem.
 *
 * @param {Array<{wikiRoot: string, indices: object[]}>} ingestedSources
 * @returns {Array<{focus: string, categories: object[]}>}
 *   One entry per focus value carried by at least two top-level
 *   indices, in first-seen order. `categories` holds shallow copies of
 *   those index records, each tagged with its `sourceWiki`.
 */
export function mergeCategoriesWithSameFocus(ingestedSources) {
  const groups = new Map();
  for (const source of ingestedSources) {
    for (const index of source.indices) {
      // A top-level category index is `<name>/index.md`: exactly two
      // path segments.
      if (index.relPath.split("/").length !== 2) continue;
      const focus = index.data.focus;
      if (!focus) continue;
      let bucket = groups.get(focus);
      if (bucket === undefined) {
        bucket = [];
        groups.set(focus, bucket);
      }
      bucket.push({ ...index, sourceWiki: source.wikiRoot });
    }
  }
  return [...groups]
    .filter(([, categories]) => categories.length >= 2)
    .map(([focus, categories]) => ({ focus, categories }));
}
|
|
521
|
+
|
|
522
|
+
// ── Phase 6: rewire-references ───────────────────────────────────
|
|
523
|
+
//
|
|
524
|
+
// Walk every leaf + index in the plan and rewrite any `links[].id`
|
|
525
|
+
// or `overlay_targets[]` entry that points at a renamed or merged
|
|
526
|
+
// id. Resolution order for each reference:
|
|
527
|
+
// 1. `mergeMap` — absorbed → keeper is source-agnostic (every
|
|
528
|
+
// source's reference to the absorbed id collapses to the
|
|
529
|
+
// same keeper id), so it's consulted first and applies flat.
|
|
530
|
+
// 2. `renameMap.get(referrerSourceWiki)` — source-scoped, so a
|
|
531
|
+
// link in source B's frontmatter pointing at "dup" resolves
|
|
532
|
+
// to B's renamed id (e.g. "b.dup"), not A's preserved "dup".
|
|
533
|
+
// This is the fix for the N>2 collision case: before the
|
|
534
|
+
// scope-by-source change, 3+ sources sharing "dup" all
|
|
535
|
+
// clobbered renameMap entries and left references pointing
|
|
536
|
+
// at the last-renamed value.
|
|
537
|
+
// Unresolvable references are left as-is; the downstream
|
|
538
|
+
// `validateWiki` flags them as `DANGLING-LINK` / `DANGLING-OVERLAY`
|
|
539
|
+
// so the user gets a single structured report at phase 9.
|
|
540
|
+
//
|
|
541
|
+
// `parents[]` entries are POSIX paths, not ids, and never resolve
|
|
542
|
+
// via the id maps — they're re-derived at phase 8 by
|
|
543
|
+
// `rebuildAllIndices` using the same path-relative rules the
|
|
544
|
+
// regular build pipeline uses.
|
|
545
|
+
/**
 * Phase 6 (rewire-references): rewrite `links[].id` and
 * `overlay_targets[]` on every plan record through the id maps.
 *
 * For each string reference, `mergeMap` is consulted first (flat,
 * source-agnostic); otherwise the rename table registered for the
 * referring record's `sourceWiki` is used. References found in
 * neither map, and non-string values, are left as they are.
 *
 * @param {{leaves: object[], indices: object[]}} plan
 *   Mutated in place: `data.links` / `data.overlay_targets` are
 *   replaced by new arrays when present.
 * @param {Map<string, Map<string, string>>} renameMap  sourceWiki → (oldId → newId).
 * @param {Map<string, string>} mergeMap  absorbedId → keeperId.
 * @returns {{leaves: object[], indices: object[]}} The same `plan` object.
 */
export function rewireReferences(plan, renameMap, mergeMap) {
  function remap(ref, sourceWiki) {
    if (typeof ref !== "string") return ref;
    if (mergeMap.has(ref)) return mergeMap.get(ref);
    const scoped = renameMap.get(sourceWiki);
    return scoped?.has(ref) ? scoped.get(ref) : ref;
  }
  for (const group of [plan.leaves, plan.indices]) {
    for (const entry of group) {
      const { data, sourceWiki } = entry;
      if (Array.isArray(data.links)) {
        // Only object links carrying a string `id` are rewritten;
        // anything else passes through untouched.
        data.links = data.links.map((link) =>
          link !== null && typeof link === "object" && typeof link.id === "string"
            ? { ...link, id: remap(link.id, sourceWiki) }
            : link,
        );
      }
      if (Array.isArray(data.overlay_targets)) {
        data.overlay_targets = data.overlay_targets.map((target) =>
          remap(target, sourceWiki),
        );
      }
    }
  }
  return plan;
}
|
|
573
|
+
|
|
574
|
+
// ── materialise-to-target ────────────────────────────────────────
|
|
575
|
+
//
|
|
576
|
+
// Intermediate step between phase 6 (rewire-references) and phase 7
|
|
577
|
+
// (apply-operators / runConvergence). Writes the unified plan into
|
|
578
|
+
// the prepared empty target directory so subsequent phases
|
|
579
|
+
// (convergence, index-generation, validation) operate on a real
|
|
580
|
+
// on-disk tree. Not one of the 11 methodology phases itself — it's
|
|
581
|
+
// the materialise step that makes phases 7+ possible. Each leaf's
|
|
582
|
+
// file is written with the (possibly rewritten) frontmatter + body;
|
|
583
|
+
// subdirectories are created as needed. Category indices are
|
|
584
|
+
// written last so directories exist before any index tries to
|
|
585
|
+
// enumerate entries[] at rebuild time.
|
|
586
|
+
//
|
|
587
|
+
// Structural fields on indices (`id`, `depth_role`, `parents`,
|
|
588
|
+
// `depth`, `entries`) are STRIPPED before writing. `rebuildAllIndices`
|
|
589
|
+
// in phase 8 re-derives every one of them from the target tree's
|
|
590
|
+
// actual shape — the source's stale values would trip
|
|
591
|
+
// `ID-MISMATCH-DIR` (source root id == `basename(sourceWiki)` which
|
|
592
|
+
// is not `basename(target)`) and `PARENTS-REQUIRED` (source subcat
|
|
593
|
+
// parents are relative to the source root, not the unified target).
|
|
594
|
+
// Authored-intent fields (`focus`, `shared_covers`, `orientation`,
|
|
595
|
+
// etc.) ARE preserved from the first source whose index lands at
|
|
596
|
+
// a given relPath — that's the closest we get to "category merge"
|
|
597
|
+
// without running the convergence MERGE operator here.
|
|
598
|
+
//
|
|
599
|
+
// Duplicate index relPaths (two sources with the same
|
|
600
|
+
// `foo/index.md`) are resolved first-wins: the first source's
|
|
601
|
+
// index contributes the authored-intent fields; subsequent writes
|
|
602
|
+
// at the same relPath are dropped SILENTLY — phase 9's validator
|
|
603
|
+
// doesn't see them (the second file never lands on disk), so no
|
|
604
|
+
// DUP-ID warning surfaces. This is intentional for the common
|
|
605
|
+
// case where two sources happen to share a top-level shape and
|
|
606
|
+
// their category indices carry near-identical metadata; if the
|
|
607
|
+
// focus values genuinely differ, the authored-intent divergence
|
|
608
|
+
// is lost. The upshot: `resolveIdCollisions` already throws
|
|
609
|
+
// `JOIN-INDEX-COLLISION` on same-id collisions, which catches
|
|
610
|
+
// the meaningful case; a same-relPath-different-focus pair would
|
|
611
|
+
// also mean same-id (the index id equals its directory basename),
|
|
612
|
+
// so the resolveIdCollisions throw fires before we reach
|
|
613
|
+
// materialisation. This silent-drop path is load-bearing only
|
|
614
|
+
// for truly identical duplicate indices.
|
|
615
|
+
//
|
|
616
|
+
// Source immutability: writes happen ONLY under `target`, never
|
|
617
|
+
// back to any source wiki.
|
|
618
|
+
/**
 * Write the unified join plan to disk under `target`.
 *
 * Leaves are written first, each with a shallow copy of its frontmatter
 * and its body. Category indices follow, subject to two filters: the
 * index's directory must be the wiki root or an ancestor directory of
 * at least one leaf, and only the first index seen for a given relPath
 * is written (later duplicates are skipped).
 *
 * The fields `id`, `depth_role`, `depth`, `parents` and `entries` are
 * removed from every index before rendering; all other fields,
 * including `type`, are kept as-is.
 *
 * @param {{ leaves: Array<{relPath: string, data: object, body: string}>,
 *           indices: Array<{relPath: string, data: object, body: string}> }} plan
 * @param {string} target  directory to write into (created if missing)
 * @returns {void}
 */
export function materialisePlan(plan, target) {
  if (!existsSync(target)) mkdirSync(target, { recursive: true });

  // The wiki root ("") always gets an index; every ancestor directory
  // of a leaf is registered while that leaf is being written.
  const liveDirs = new Set([""]);
  for (const leaf of plan.leaves) {
    const leafFile = join(target, leaf.relPath);
    mkdirSync(dirname(leafFile), { recursive: true });
    writeFileSync(leafFile, renderFrontmatter({ ...leaf.data }, leaf.body), "utf8");

    const trail = [];
    for (const segment of leaf.relPath.split("/").slice(0, -1)) {
      trail.push(segment);
      liveDirs.add(trail.join("/"));
    }
  }

  const claimed = new Set();
  for (const idx of plan.indices) {
    // Directory part of the relPath: everything before the last "/".
    const cut = idx.relPath.lastIndexOf("/");
    const dirRel = cut === -1 ? "" : idx.relPath.slice(0, cut);
    if (!liveDirs.has(dirRel) || claimed.has(idx.relPath)) continue;
    claimed.add(idx.relPath);

    const indexFile = join(target, idx.relPath);
    mkdirSync(dirname(indexFile), { recursive: true });

    // Drop the location-dependent structural fields and keep the rest
    // (`type: "index"` included). The source object is left untouched.
    const {
      id: _id,
      depth_role: _depthRole,
      depth: _depth,
      parents: _parents,
      entries: _entries,
      ...authored
    } = idx.data ?? {};
    writeFileSync(indexFile, renderFrontmatter(authored, idx.body), "utf8");
  }
}
|
|
668
|
+
|
|
669
|
+
// ── Main entry ───────────────────────────────────────────────────
|
|
670
|
+
//
|
|
671
|
+
// Orchestrator calls this after it has:
|
|
672
|
+
// - taken a pre-op snapshot on the target
|
|
673
|
+
// - confirmed target is a fresh empty directory
|
|
674
|
+
//
|
|
675
|
+
// Returns a structured phase log:
|
|
676
|
+
// {
|
|
677
|
+
// phases: [{ name, summary }],
|
|
678
|
+
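//     convergence,
//     validation: { errors, warnings },  // omitted, and `needs_tier2: true` set, when convergence parks Tier 2 requests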
|
|
679
|
+
// unified: { leaves: N, indices: M }
|
|
680
|
+
// }
|
|
681
|
+
//
|
|
682
|
+
// Phase 7/8/9 call the existing convergence/indices/validation
|
|
683
|
+
// helpers after materialisation, so the same tiered-AI quality mode
|
|
684
|
+
// applies to joined trees that applies to ordinary builds.
|
|
685
|
+
/**
 * Run the join pipeline: ingest every source wiki, validate, union,
 * resolve id collisions, rewire references, materialise into `target`,
 * then run convergence, index generation and validation on the result.
 *
 * @param {string[]} sources  source wiki roots; at least two are required
 * @param {string} target     directory the unified wiki is written into
 * @param {object} [ctx]
 * @param {string|null} [ctx.opId]         forwarded to `runConvergence`
 * @param {string} [ctx.qualityMode]       forwarded to `runConvergence`
 * @param {string} [ctx.idCollisionPolicy] forwarded to `resolveIdCollisions`
 * @param {Function|null} [ctx.onPhaseCommit]
 *   async ({ phase, summary }) => void; awaited after materialise,
 *   convergence and index generation.
 * @param {Function|null} [ctx.onPhase]
 *   ({ phase, summary }) => any; called as each phase is recorded, never
 *   awaited, and its throws / rejections are swallowed.
 * @returns {Promise<object>}
 *   `{ phases, convergence, validation: { errors, warnings }, unified }`
 *   on completion, or `{ phases, convergence, needs_tier2: true, unified }`
 *   when convergence reports pending Tier 2 requests.
 * @throws {Error} plain Error when fewer than two sources are given;
 *   `code === "JOIN-SOURCE-INVALID"` when a source fails validation;
 *   `code === "JOIN-TARGET-INVALID"` when the joined tree fails validation.
 */
export async function runJoin(sources, target, ctx = {}) {
  const {
    opId = null,
    qualityMode = "tiered-fast",
    idCollisionPolicy = DEFAULT_COLLISION_POLICY,
    // Optional per-phase commit hook. The orchestrator passes a
    // function that stages + commits between phases so the private
    // git log records the join's progression at per-phase
    // granularity (matching the build pipeline's commit cadence).
    // Shape: async ({ phase, summary }) => void. If absent, runJoin
    // runs end-to-end without intermediate commits — the shape tests
    // in `tests/unit/join.test.mjs` use that path.
    onPhaseCommit = null,
    // Optional per-phase progress hook, invoked synchronously the
    // moment each phase records its summary (i.e. BEFORE the op
    // awaits the next phase's I/O). Shape:
    //   ({ phase, summary }) => any | Promise<any>
    // The return value is ignored. Synchronous throws AND Promise
    // rejections are both swallowed — a misbehaving progress hook
    // must never halt the op. `runJoin` itself does not await the
    // hook, so async hooks must not rely on ordering against later
    // phases.
    onPhase = null,
  } = ctx;
  const commitPhase = async (phase, summary) => {
    if (onPhaseCommit) await onPhaseCommit({ phase, summary });
  };

  // Count only real arrays: a non-array with a `.length` (e.g. a string)
  // must not be reported as if it were a list of that many sources.
  const sourceCount = Array.isArray(sources) ? sources.length : 0;
  if (sourceCount < 2) {
    throw new Error(`join: at least 2 source wikis required, got ${sourceCount}`);
  }

  const phaseLog = [];
  const record = (name, summary) => {
    phaseLog.push({ name, summary });
    if (onPhase) {
      // Cover BOTH sync throws and async rejections from the hook.
      // An `async` onPhase that rejects would otherwise escape as
      // unhandledRejection and could terminate the process under
      // Node's default policy, violating the "progress hook failures
      // never halt the op" contract.
      try {
        const ret = onPhase({ phase: name, summary });
        if (ret && typeof ret.then === "function") {
          Promise.resolve(ret).catch(() => {
            /* async onPhase rejection silently swallowed */
          });
        }
      } catch {
        /* sync onPhase throw silently swallowed */
      }
    }
  };

  // Phase 1 — ingest-all.
  const ingested = sources.map((s) => ingestWiki(s));
  record(
    "ingest-all",
    `read ${ingested.length} source(s); ` +
      `${ingested.reduce((n, i) => n + i.leaves.length, 0)} leaf/leaves, ` +
      `${ingested.reduce((n, i) => n + i.indices.length, 0)} index/indices`,
  );

  // Phase 2 — source-validate.
  const vreport = validateSources(ingested);
  if (vreport.errors.length > 0) {
    const err = new Error(
      `join: source-validate failed — ${vreport.errors.length} error(s) across ${sources.length} source(s). Fix each source before joining:\n` +
        summariseFindings(vreport.errors.slice(0, 10)),
    );
    err.code = "JOIN-SOURCE-INVALID";
    err.findings = vreport.errors;
    throw err;
  }
  record(
    "source-validate",
    `0 errors, ${vreport.warnings.length} warning(s) across ${sources.length} source(s)`,
  );

  // Phase 3 — plan-union.
  const unionPlan = planUnion(ingested);
  record(
    "plan-union",
    `${unionPlan.leaves.length} leaf/leaves + ${unionPlan.indices.length} index/indices in union`,
  );

  // Phase 4 — resolve-id-collisions.
  const { plan: resolvedPlan, renameMap, mergeMap } = resolveIdCollisions(
    unionPlan,
    idCollisionPolicy,
  );
  // renameMap is keyed per source; each value is that source's own map.
  const totalRenames = [...renameMap.values()].reduce(
    (sum, perSource) => sum + perSource.size,
    0,
  );
  record(
    "resolve-id-collisions",
    `policy=${idCollisionPolicy}; ${totalRenames} rename(s), ${mergeMap.size} merge(s)`,
  );

  // Phase 5 — merge-categories. First-cut is DETECT-ONLY: the helper
  // identifies same-focus top-level categories but does not fold
  // them; only the group count is reported here. Applying category
  // merges requires a separate directory-MERGE operator that's
  // tracked as a follow-up.
  const categoryMerges = mergeCategoriesWithSameFocus(ingested);
  record(
    "merge-categories",
    `${categoryMerges.length} same-focus category group(s) detected (fold deferred to a future directory-MERGE operator)`,
  );

  // Phase 6 — rewire-references.
  rewireReferences(resolvedPlan, renameMap, mergeMap);
  record("rewire-references", `resolved via renameMap + mergeMap`);

  // Materialise the in-memory plan to the target directory before
  // phase 7 (runConvergence operates on a real on-disk tree).
  materialisePlan(resolvedPlan, target);
  record(
    "materialise",
    `wrote ${resolvedPlan.leaves.length} leaf/leaves into ${target}`,
  );
  await commitPhase(
    "join-materialise",
    `${resolvedPlan.leaves.length} leaf/leaves; policy=${idCollisionPolicy}`,
  );

  // Phase 7 — apply-operators (operator-convergence on unified tree).
  const convergence = await runConvergence(target, {
    opId,
    qualityMode,
    interactive: false,
  });
  record(
    "operator-convergence",
    `${convergence.applied.length} operator(s) applied across ${convergence.iterations} iteration(s)`,
  );
  await commitPhase(
    "join-convergence",
    `${convergence.applied.length} operator(s) applied`,
  );

  // Honour the Tier 2 suspend/resume contract. If convergence parked
  // any Tier 2 requests, we MUST stop here — finalising phases 8-11
  // (index-generation, validation, commit) on a tree that still has
  // pending decisions would produce a half-baked joined wiki. The
  // caller sees `needs_tier2: true` and no `validation` key.
  if (convergence.needs_tier2) {
    return {
      phases: phaseLog,
      convergence,
      needs_tier2: true,
      unified: {
        leaves: resolvedPlan.leaves.length,
        indices: resolvedPlan.indices.length,
      },
    };
  }

  // Phase 8 — generate-indices.
  const rebuilt = rebuildAllIndices(target);
  record("index-generation", `rebuilt ${rebuilt.length} indices`);
  await commitPhase("join-index-generation", `rebuilt ${rebuilt.length} indices`);

  // Phase 9 — validation.
  const findings = validateWiki(target);
  const errors = findings.filter((f) => f.severity === "error");
  const warnings = findings.filter((f) => f.severity === "warning");
  if (errors.length > 0) {
    const err = new Error(
      `join: target validation failed — ${errors.length} error(s).\n` +
        summariseFindings(errors.slice(0, 10)),
    );
    err.code = "JOIN-TARGET-INVALID";
    err.findings = errors;
    throw err;
  }
  record("validation", `0 errors, ${warnings.length} warning(s)`);

  // Phase 10 — golden-path-union. Source fixtures are out of scope
  // for the first-cut implementation; record a no-op and leave the
  // hook for downstream work.
  record(
    "golden-path-union",
    "skipped (fixture-regression gate lands as a follow-up)",
  );

  // Phase 11 — commit (the orchestrator handles tagging).
  return {
    phases: phaseLog,
    convergence,
    validation: { errors, warnings },
    unified: {
      leaves: resolvedPlan.leaves.length,
      indices: resolvedPlan.indices.length,
    },
  };
}
|
|
900
|
+
|
|
901
|
+
// ── Helpers ──────────────────────────────────────────────────────
|
|
902
|
+
|
|
903
|
+
/**
 * Normalise a path string to forward-slash separators.
 *
 * @param {string} p  path using `\` and/or `/` as separators
 * @returns {string}  the same path with every separator written as `/`
 */
function relPosix(p) {
  return p.replace(/[\\/]/g, "/");
}
|
|
906
|
+
|
|
907
|
+
/**
 * Return a new array holding each distinct value of `arr` once, in
 * first-occurrence order (Set membership semantics).
 *
 * @param {Iterable<*>} arr
 * @returns {Array<*>}
 */
function dedupe(arr) {
  return Array.from(new Set(arr));
}
|
|
910
|
+
|
|
911
|
+
// Namespace prefix for a source wiki: the final path component of
// `wikiRoot`, minus a trailing `.wiki` when it has one. The
// `namespace` id-collision policy uses it to rename colliding ids
// to `<prefix>.<id>`.
function namespacePrefix(wikiRoot) {
  const suffix = ".wiki";
  const name = basename(wikiRoot);
  if (!name.endsWith(suffix)) return name;
  return name.slice(0, name.length - suffix.length);
}
|