@ctxr/skill-llm-wiki 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +134 -0
- package/LICENSE +21 -0
- package/README.md +484 -0
- package/SKILL.md +252 -0
- package/guide/basics/concepts.md +74 -0
- package/guide/basics/index.md +45 -0
- package/guide/basics/schema.md +140 -0
- package/guide/cli.md +256 -0
- package/guide/correctness/index.md +45 -0
- package/guide/correctness/invariants.md +89 -0
- package/guide/correctness/safety.md +96 -0
- package/guide/history/diff.md +110 -0
- package/guide/history/hidden-git.md +130 -0
- package/guide/history/index.md +52 -0
- package/guide/history/remote-sync.md +113 -0
- package/guide/index.md +134 -0
- package/guide/isolation/coexistence.md +134 -0
- package/guide/isolation/index.md +44 -0
- package/guide/isolation/scale.md +251 -0
- package/guide/layout/in-place-mode.md +97 -0
- package/guide/layout/index.md +53 -0
- package/guide/layout/layout-contract.md +131 -0
- package/guide/layout/layout-modes.md +115 -0
- package/guide/operations/index.md +76 -0
- package/guide/operations/ingest/build.md +75 -0
- package/guide/operations/ingest/extend.md +61 -0
- package/guide/operations/ingest/index.md +54 -0
- package/guide/operations/ingest/join.md +65 -0
- package/guide/operations/maintain/fix.md +66 -0
- package/guide/operations/maintain/index.md +47 -0
- package/guide/operations/maintain/rebuild.md +86 -0
- package/guide/operations/validate.md +48 -0
- package/guide/substrate/index.md +47 -0
- package/guide/substrate/operators.md +96 -0
- package/guide/substrate/tiered-ai.md +363 -0
- package/guide/ux/index.md +44 -0
- package/guide/ux/preflight.md +150 -0
- package/guide/ux/user-intent.md +135 -0
- package/package.json +55 -0
- package/scripts/cli.mjs +893 -0
- package/scripts/commands/remote.mjs +93 -0
- package/scripts/commands/review.mjs +253 -0
- package/scripts/commands/sync.mjs +84 -0
- package/scripts/lib/chunk.mjs +421 -0
- package/scripts/lib/cluster-detect.mjs +516 -0
- package/scripts/lib/decision-log.mjs +343 -0
- package/scripts/lib/draft.mjs +158 -0
- package/scripts/lib/embeddings.mjs +366 -0
- package/scripts/lib/frontmatter.mjs +497 -0
- package/scripts/lib/git-commands.mjs +155 -0
- package/scripts/lib/git.mjs +486 -0
- package/scripts/lib/gitignore.mjs +62 -0
- package/scripts/lib/history.mjs +331 -0
- package/scripts/lib/indices.mjs +510 -0
- package/scripts/lib/ingest.mjs +258 -0
- package/scripts/lib/intent.mjs +713 -0
- package/scripts/lib/interactive.mjs +99 -0
- package/scripts/lib/migrate.mjs +126 -0
- package/scripts/lib/nest-applier.mjs +260 -0
- package/scripts/lib/operators.mjs +1365 -0
- package/scripts/lib/orchestrator.mjs +718 -0
- package/scripts/lib/paths.mjs +197 -0
- package/scripts/lib/preflight.mjs +213 -0
- package/scripts/lib/provenance.mjs +672 -0
- package/scripts/lib/quality-metric.mjs +269 -0
- package/scripts/lib/query-fixture.mjs +71 -0
- package/scripts/lib/rollback.mjs +95 -0
- package/scripts/lib/shape-check.mjs +172 -0
- package/scripts/lib/similarity-cache.mjs +126 -0
- package/scripts/lib/similarity.mjs +230 -0
- package/scripts/lib/snapshot.mjs +54 -0
- package/scripts/lib/source-frontmatter.mjs +85 -0
- package/scripts/lib/tier2-protocol.mjs +470 -0
- package/scripts/lib/tiered.mjs +453 -0
- package/scripts/lib/validate.mjs +362 -0
|
@@ -0,0 +1,672 @@
|
|
|
1
|
+
// provenance.mjs — byte-range traceability from source files to wiki
|
|
2
|
+
// leaves, with a loud LOSS-01 invariant.
|
|
3
|
+
//
|
|
4
|
+
// Git tracks file-level transitions (via rename detection). It does NOT
|
|
5
|
+
// track "which bytes of source file X became which bytes of wiki leaf Y"
|
|
6
|
+
// — especially important for the DECOMPOSE / NEST operators which split
|
|
7
|
+
// one source into several leaves. `<wiki>/.llmwiki/provenance.yaml`
|
|
8
|
+
// fills that gap as a semantic layer on top of git.
|
|
9
|
+
//
|
|
10
|
+
// Schema:
|
|
11
|
+
//
|
|
12
|
+
// version: 1
|
|
13
|
+
// corpus:
|
|
14
|
+
// root: /abs/path/to/source
|
|
15
|
+
// root_hash: sha256:...
|
|
16
|
+
// pre_commit: <sha of pre-op/<first-op-id> in the private git>
|
|
17
|
+
// ingested_at: 2026-04-14T19:53:00Z
|
|
18
|
+
// targets:
|
|
19
|
+
// api/hello.md:
|
|
20
|
+
// sources:
|
|
21
|
+
// - source_path: api/hello.md
|
|
22
|
+
// source_pre_hash: sha256:...
|
|
23
|
+
// source_size: 4900
|
|
24
|
+
// byte_range: [0, 4821]
|
|
25
|
+
// disposition: preserved
|
|
26
|
+
// discarded_ranges:
|
|
27
|
+
// - source_path: api/hello.md
|
|
28
|
+
// byte_range: [4821, 4900]
|
|
29
|
+
// reason: "trailing whitespace"
|
|
30
|
+
//
|
|
31
|
+
// Keyed by target so Build can append one entry per drafted leaf. For
|
|
32
|
+
// LOSS-01 verification we compute the reverse mapping on demand. For
|
|
33
|
+
// incremental refresh (Phase 4+ extend), the corpus entry gives us the
|
|
34
|
+
// source path + hash to compare against the current filesystem state.
|
|
35
|
+
//
|
|
36
|
+
// This module is pure: it reads and writes YAML through hand-rolled
|
|
37
|
+
// emitters (same pattern as history.mjs) so we add no dependency and
|
|
38
|
+
// the file stays deterministic across runs.
|
|
39
|
+
|
|
40
|
+
import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
|
|
41
|
+
import { dirname, join } from "node:path";
|
|
42
|
+
|
|
43
|
+
/**
 * Resolve the location of the provenance manifest inside a wiki.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {string} `<wikiRoot>/.llmwiki/provenance.yaml`, joined with the
 *   platform path separator.
 */
export function provenancePath(wikiRoot) {
  const segments = [wikiRoot, ".llmwiki", "provenance.yaml"];
  return join(...segments);
}
|
|
46
|
+
|
|
47
|
+
// In-memory document shape — identical to the serialised YAML but
|
|
48
|
+
// using JS objects for mutation. Load on first access, flush on every
|
|
49
|
+
// mutation.
|
|
50
|
+
/**
 * Build a fresh, unshared provenance document with no corpus and no targets.
 *
 * @returns {{version: number, corpus: null, targets: object}} A new object
 *   on every call, so callers may mutate it freely.
 */
function emptyDoc() {
  const blank = { version: 1, corpus: null, targets: {} };
  return blank;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Load the provenance manifest of a wiki.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {object} The parsed manifest, or an empty document when the
 *   manifest file does not exist.
 * @throws {Error} Whatever the parser throws on a malformed manifest.
 */
export function readProvenance(wikiRoot) {
  const manifest = provenancePath(wikiRoot);
  return existsSync(manifest)
    ? parseYaml(readFileSync(manifest, "utf8"))
    : emptyDoc();
}
|
|
64
|
+
|
|
65
|
+
// Atomic write via temp-file + rename, same paranoia as op-log append.
|
|
66
|
+
/**
 * Serialise `doc` and store it as the wiki's provenance manifest.
 *
 * The YAML is first written to a sibling temp file (suffixed with the
 * process id and the current timestamp) and then renamed over the final
 * path, creating the parent directory if needed.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {object} doc - Provenance document to persist.
 * @returns {void}
 */
export function writeProvenance(wikiRoot, doc) {
  const finalPath = provenancePath(wikiRoot);
  mkdirSync(dirname(finalPath), { recursive: true });
  const serialised = emitYaml(doc);
  const scratchPath = `${finalPath}.tmp.${process.pid}.${Date.now()}`;
  writeFileSync(scratchPath, serialised, "utf8");
  renameSync(scratchPath, finalPath);
}
|
|
74
|
+
|
|
75
|
+
// Record the corpus identity AND reset the targets map. Called once
|
|
76
|
+
// per operation, before any source entries are logged. `startCorpus`
|
|
77
|
+
// explicitly discards any prior targets because a new operation's
|
|
78
|
+
// provenance is a fresh document — the prior run's entries reference
|
|
79
|
+
// a pre-op commit that is no longer the current one, so carrying
|
|
80
|
+
// them forward would produce cross-op bleed and false LOSS-01
|
|
81
|
+
// failures on stale entries.
|
|
82
|
+
//
|
|
83
|
+
// `pre_commit` is the sha of the pre-op tag captured in the private
|
|
84
|
+
// git — that sha pins source file sizes even if the user edits the
|
|
85
|
+
// source mid-operation.
|
|
86
|
+
/**
 * Begin a new provenance document for an operation and persist it.
 *
 * The document starts from an empty one, so any previously recorded
 * targets are dropped.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {object} corpus - Corpus identity.
 * @param {string} corpus.root - Source root path.
 * @param {string} [corpus.root_hash] - Stored as `null` when falsy.
 * @param {string} [corpus.pre_commit] - Stored as `null` when falsy.
 * @param {string} [corpus.ingested_at] - Defaults to the current time as an
 *   ISO-8601 string when falsy.
 * @returns {object} The document that was written.
 */
export function startCorpus(
  wikiRoot,
  { root, root_hash, pre_commit, ingested_at },
) {
  const fresh = emptyDoc();
  const stamp = ingested_at || new Date().toISOString();
  fresh.corpus = {
    root,
    root_hash: root_hash || null,
    pre_commit: pre_commit || null,
    ingested_at: stamp,
  };
  writeProvenance(wikiRoot, fresh);
  return fresh;
}
|
|
100
|
+
|
|
101
|
+
// Record a single source → target mapping. `byte_range` is [startInclusive,
|
|
102
|
+
// endExclusive]; `disposition` must be one of preserved/split/merged/transformed.
|
|
103
|
+
// Idempotent by (target, source_path, byte_range): a duplicate call appends
|
|
104
|
+
// only if no existing source entry has the same triple.
|
|
105
|
+
// Closed set of dispositions a source → target mapping may carry.
const VALID_DISPOSITIONS = new Set(["preserved", "split", "merged", "transformed"]);

/**
 * Append one source → target byte-range mapping to the manifest.
 *
 * All arguments are validated before the manifest is read. A mapping whose
 * (source_path, byte_range) already exists under `target` is not appended
 * again; the manifest is rewritten either way.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} target - Non-empty target key.
 * @param {object} mapping
 * @param {string} mapping.source_path - Non-empty source path.
 * @param {string} [mapping.source_pre_hash] - Stored as `null` when falsy.
 * @param {number} [mapping.source_size] - When a number, must be a
 *   non-negative safe integer and at least the range end.
 * @param {number[]} mapping.byte_range - `[startInclusive, endExclusive]`.
 * @param {string} [mapping.disposition="preserved"] - One of
 *   VALID_DISPOSITIONS.
 * @returns {void}
 * @throws {Error} When any argument fails validation.
 */
export function recordSource(
  wikiRoot,
  target,
  {
    source_path,
    source_pre_hash,
    source_size,
    byte_range,
    disposition = "preserved",
  },
) {
  const isNonEmptyString = (v) => typeof v === "string" && v !== "";

  if (!isNonEmptyString(target)) {
    throw new Error("recordSource: target must be a non-empty string");
  }
  if (!isNonEmptyString(source_path)) {
    throw new Error("recordSource: source_path must be a non-empty string");
  }

  const isPair = Array.isArray(byte_range) && byte_range.length === 2;
  if (!isPair) {
    throw new Error(
      "recordSource: byte_range must be [startInclusive, endExclusive]",
    );
  }

  const start = byte_range[0];
  const end = byte_range[1];
  const rangeIsValid =
    Number.isSafeInteger(start) &&
    Number.isSafeInteger(end) &&
    start >= 0 &&
    end > start;
  if (!rangeIsValid) {
    throw new Error(
      `recordSource: invalid byte_range [${start}, ${end}] ` +
        "(start must be ≥ 0, end must be strictly greater than start, " +
        "both must be safe integers)",
    );
  }

  // The size is optional; it is only checked when a number was supplied.
  if (typeof source_size === "number") {
    const sizeIsValid = Number.isSafeInteger(source_size) && source_size >= 0;
    if (!sizeIsValid) {
      throw new Error(
        `recordSource: invalid source_size ${source_size} ` +
          "(must be a non-negative safe integer)",
      );
    }
    if (end > source_size) {
      throw new Error(
        `recordSource: byte_range end ${end} exceeds source_size ${source_size}`,
      );
    }
  }

  if (!VALID_DISPOSITIONS.has(disposition)) {
    throw new Error(
      `recordSource: unknown disposition "${disposition}" (valid: ${[...VALID_DISPOSITIONS].join(", ")})`,
    );
  }

  const doc = readProvenance(wikiRoot);
  if (!doc.targets[target]) {
    doc.targets[target] = { sources: [], discarded_ranges: [] };
  }
  const bucket = doc.targets[target];

  const alreadyRecorded = bucket.sources.some(
    (known) =>
      known.source_path === source_path &&
      known.byte_range[0] === start &&
      known.byte_range[1] === end,
  );
  if (!alreadyRecorded) {
    bucket.sources.push({
      source_path,
      source_pre_hash: source_pre_hash || null,
      source_size: source_size ?? null,
      byte_range: [start, end],
      disposition,
    });
  }
  writeProvenance(wikiRoot, doc);
}
|
|
187
|
+
|
|
188
|
+
// Record a byte range that was deliberately discarded (boilerplate,
|
|
189
|
+
// whitespace, license header that appears in every source, etc.). The
|
|
190
|
+
// `reason` string is mandatory so the audit trail is meaningful.
|
|
191
|
+
/**
 * Record a source byte range that was deliberately dropped.
 *
 * Discarded ranges are stored under the `_discarded` virtual target. A
 * range whose (source_path, byte_range) is already recorded is not added
 * again; the manifest is rewritten either way.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} source_path - Non-empty source path the range belongs to.
 * @param {number[]} byte_range - `[startInclusive, endExclusive]`.
 * @param {string} reason - Non-empty explanation for the audit trail.
 * @returns {void}
 * @throws {Error} When any argument fails validation.
 */
export function recordDiscarded(
  wikiRoot,
  source_path,
  byte_range,
  reason,
) {
  if (!reason || typeof reason !== "string") {
    throw new Error("recordDiscarded: reason must be a non-empty string");
  }
  if (!Array.isArray(byte_range) || byte_range.length !== 2) {
    throw new Error("recordDiscarded: byte_range must be [start, end]");
  }
  const [ds, de] = byte_range;
  if (
    !Number.isSafeInteger(ds) ||
    !Number.isSafeInteger(de) ||
    ds < 0 ||
    de <= ds
  ) {
    throw new Error(
      `recordDiscarded: invalid byte_range [${ds}, ${de}] ` +
        "(start must be ≥ 0, end must be strictly greater than start)",
    );
  }
  // Same guard as recordSource. Checked last so that every call that was
  // rejected before this guard existed still fails with the same message.
  if (!source_path || typeof source_path !== "string") {
    throw new Error("recordDiscarded: source_path must be a non-empty string");
  }

  const doc = readProvenance(wikiRoot);
  if (!doc.targets._discarded) {
    doc.targets._discarded = { sources: [], discarded_ranges: [] };
  }
  const known = doc.targets._discarded.discarded_ranges;
  const alreadyRecorded = known.some(
    (d) =>
      d.source_path === source_path &&
      d.byte_range[0] === ds &&
      d.byte_range[1] === de,
  );
  if (!alreadyRecorded) {
    known.push({ source_path, byte_range: [ds, de], reason });
  }
  writeProvenance(wikiRoot, doc);
}
|
|
236
|
+
|
|
237
|
+
// Verify that every source byte is accounted for across every target
|
|
238
|
+
// that references it, plus any explicitly-discarded ranges. Returns
|
|
239
|
+
// { ok, uncovered, overlaps, out_of_bounds } — `uncovered` lists source paths with
|
|
240
|
+
// gaps, `overlaps` lists source paths where two targets claim the
|
|
241
|
+
// same byte range. A healthy provenance manifest has both arrays empty.
|
|
242
|
+
//
|
|
243
|
+
// `lookupSourceSize` is an injected function `(source_path) => number`
|
|
244
|
+
// so callers can read sizes either from the pre-op git commit (via
|
|
245
|
+
// `gitCatFileSize`) or from the filesystem directly (tests).
|
|
246
|
+
/**
 * Check that the recorded ranges of every referenced source tile the whole
 * source: no gaps, no overlaps, nothing past the end.
 *
 * Only sources that appear in at least one `sources` or `discarded_ranges`
 * entry are examined.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {(source_path: string) => number} lookupSourceSize - Returns the
 *   byte size of a source. A result that is not a finite, non-negative
 *   number marks the source as uncovered with an explanatory reason.
 * @returns {{ok: boolean, uncovered: object[], overlaps: object[],
 *   out_of_bounds: object[]}} `ok` is true only when all three lists are
 *   empty.
 * @throws {Error} When `lookupSourceSize` is not a function.
 */
export function verifyCoverage(wikiRoot, lookupSourceSize) {
  if (typeof lookupSourceSize !== "function") {
    throw new Error(
      "verifyCoverage: lookupSourceSize must be a function(source_path) → number",
    );
  }
  const doc = readProvenance(wikiRoot);

  // Reverse index: source_path → [{ byte_range, target, kind }...]
  const sourceIndex = new Map();
  const addClaim = (source_path, claim) => {
    if (!sourceIndex.has(source_path)) sourceIndex.set(source_path, []);
    sourceIndex.get(source_path).push(claim);
  };
  for (const [target, entry] of Object.entries(doc.targets)) {
    for (const s of entry.sources || []) {
      addClaim(s.source_path, { byte_range: s.byte_range, target, kind: "preserved" });
    }
    for (const d of entry.discarded_ranges || []) {
      addClaim(d.source_path, { byte_range: d.byte_range, target, kind: "discarded" });
    }
  }

  const uncovered = [];
  const overlaps = [];
  const outOfBounds = [];
  for (const [source_path, ranges] of sourceIndex) {
    const size = lookupSourceSize(source_path);
    if (typeof size !== "number" || !Number.isFinite(size) || size < 0) {
      uncovered.push({
        source_path,
        reason: `lookupSourceSize returned ${size}; source size unknown`,
      });
      continue;
    }
    // Sweep the ranges in start order; `cursor` is the first byte not yet
    // accounted for and never moves past `size`.
    const sorted = [...ranges].sort(
      (a, b) => a.byte_range[0] - b.byte_range[0],
    );
    let cursor = 0;
    for (const r of sorted) {
      const [s, e] = r.byte_range;
      if (e > size) {
        outOfBounds.push({
          source_path,
          byte_range: [s, e],
          source_size: size,
          target: r.target,
          reason: "range end exceeds source_size",
        });
      }
      if (s < cursor) {
        // An overlapping range does not advance the cursor.
        overlaps.push({ source_path, byte_range: [s, e], target: r.target });
        continue;
      }
      if (s > cursor) {
        // A range that starts past the end of the source is already
        // reported as out-of-bounds; the gap it leaves behind can only
        // consist of bytes that exist, so clamp it to `size`.
        const gapEnd = Math.min(s, size);
        if (gapEnd > cursor) {
          uncovered.push({
            source_path,
            byte_range: [cursor, gapEnd],
            reason: "gap",
          });
        }
      }
      cursor = Math.min(size, Math.max(cursor, e));
    }
    if (cursor < size) {
      uncovered.push({
        source_path,
        byte_range: [cursor, size],
        reason: "tail not covered",
      });
    }
  }
  return {
    ok:
      uncovered.length === 0 &&
      overlaps.length === 0 &&
      outOfBounds.length === 0,
    uncovered,
    overlaps,
    out_of_bounds: outOfBounds,
  };
}
|
|
337
|
+
|
|
338
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
339
|
+
// Hand-rolled YAML emitter / parser. Same philosophy as history.mjs:
|
|
340
|
+
// the schema is fixed, a full YAML dep is overkill, determinism > DWIM.
|
|
341
|
+
// Supports only the shapes this module writes. Round-trip-safe tested in
|
|
342
|
+
// tests/unit/provenance.test.mjs.
|
|
343
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
344
|
+
|
|
345
|
+
/**
 * Decide whether a string must be emitted as a double-quoted scalar.
 *
 * Quoting is required for the empty string, for strings containing any
 * character in the reserved class below, for strings that begin with `-`,
 * a space or `?`, for strings that look like an integer, and for the
 * literals `true`, `false` and `null`.
 *
 * @param {string} value
 * @returns {boolean}
 */
function needsQuoting(value) {
  const reservedLiterals = ["true", "false", "null"];
  return (
    value === "" ||
    /[:#{}\[\],&*!|>'"%@`\n\r\t]/.test(value) ||
    /^[- ?]/.test(value) ||
    /^-?\d+$/.test(value) ||
    reservedLiterals.includes(value)
  );
}
|
|
353
|
+
|
|
354
|
+
/**
 * Render a string as a double-quoted scalar.
 *
 * Backslash, double quote, newline, carriage return and tab are written as
 * backslash escapes; every other character is copied through unchanged.
 *
 * @param {string} value
 * @returns {string} The escaped text wrapped in double quotes.
 * @throws {Error} When `value` contains a control character below U+0020
 *   other than tab, newline or carriage return.
 */
function escapeQuoted(value) {
  const escapes = new Map([
    ["\\", "\\\\"],
    ['"', '\\"'],
    ["\n", "\\n"],
    ["\r", "\\r"],
    ["\t", "\\t"],
  ]);
  let body = "";
  for (const ch of value) {
    const c = ch.charCodeAt(0);
    const isAllowedControl = c === 0x09 || c === 0x0a || c === 0x0d;
    if (c < 0x20 && !isAllowedControl) {
      throw new Error(
        `provenance emitter: control character U+${c.toString(16).padStart(4, "0")} not round-trip-safe`,
      );
    }
    body += escapes.get(ch) ?? ch;
  }
  return `"${body}"`;
}
|
|
388
|
+
|
|
389
|
+
/**
 * Serialise one scalar value.
 *
 * @param {null|undefined|boolean|number|string} value
 * @returns {string} `null` for null/undefined, `String(value)` for booleans
 *   and numbers, and the string itself — double-quoted when needsQuoting
 *   says so — for strings.
 * @throws {Error} For any other type.
 */
function emitScalar(value) {
  if (value == null) return "null";
  switch (typeof value) {
    case "boolean":
    case "number":
      return String(value);
    case "string":
      return needsQuoting(value) ? escapeQuoted(value) : value;
    default:
      throw new Error(`provenance emitter: unsupported scalar ${typeof value}`);
  }
}
|
|
400
|
+
|
|
401
|
+
/**
 * Serialise an array as a single-line flow sequence, e.g. `[0, 4821]`.
 *
 * @param {Array} arr - Elements are rendered through emitScalar.
 * @returns {string}
 */
function emitFlowArrayOfInts(arr) {
  const rendered = arr.map((n) => emitScalar(n));
  return `[${rendered.join(", ")}]`;
}
|
|
404
|
+
|
|
405
|
+
// Indentation contract shared by the emitter and the parser. The widths
// are built with repeat() on the writing side and matched with explicit
// `{n}` quantifiers on the reading side so the two cannot drift apart:
//   2 — corpus fields, target keys, the `{}` empty-targets sentinel
//   4 — `sources:` / `discarded_ranges:`
//   6 — `- key: value` list-item starters and the `[]` empty-list sentinel
//   8 — the remaining keys of a list item
const TOP_INDENT = " ".repeat(2);
const FIELD_INDENT = " ".repeat(4);
const ITEM_INDENT = " ".repeat(6);
const CONT_INDENT = " ".repeat(8);

const CORPUS_FIELD_RE = /^ {2}(\w+):\s*(.*)$/;
const TARGET_KEY_RE = /^ {2}(\S.*?):\s*$/;
const ITEM_START_RE = /^ {6}- (\w+):\s*(.*)$/;
const ITEM_CONT_RE = /^ {8}(\w+):\s*(.*)$/;

/**
 * Serialise a provenance document to the manifest's YAML subset.
 *
 * Target keys are written in sorted order. The `version` line is always
 * written as `1`.
 *
 * @param {object} doc - Document with `corpus` (object or null) and
 *   `targets` (map of target → { sources, discarded_ranges }).
 * @returns {string} YAML text ending in a newline.
 * @throws {Error} When a scalar cannot be emitted.
 */
function emitYaml(doc) {
  const lines = [
    "# skill-llm-wiki provenance manifest (version 1)",
    "version: 1",
  ];

  if (doc.corpus) {
    lines.push("corpus:");
    for (const key of ["root", "root_hash", "pre_commit", "ingested_at"]) {
      lines.push(`${TOP_INDENT}${key}: ${emitScalar(doc.corpus[key])}`);
    }
  } else {
    lines.push("corpus: null");
  }

  lines.push("targets:");
  const targetKeys = Object.keys(doc.targets).sort();
  if (targetKeys.length === 0) {
    lines.push(`${TOP_INDENT}{}`);
  }
  for (const target of targetKeys) {
    const entry = doc.targets[target];
    const sources = entry.sources || [];
    const discarded = entry.discarded_ranges || [];

    lines.push(`${TOP_INDENT}${emitScalar(target)}:`);

    lines.push(`${FIELD_INDENT}sources:`);
    if (sources.length === 0) {
      lines.push(`${ITEM_INDENT}[]`);
    }
    for (const s of sources) {
      lines.push(`${ITEM_INDENT}- source_path: ${emitScalar(s.source_path)}`);
      lines.push(`${CONT_INDENT}source_pre_hash: ${emitScalar(s.source_pre_hash)}`);
      lines.push(`${CONT_INDENT}source_size: ${emitScalar(s.source_size)}`);
      lines.push(`${CONT_INDENT}byte_range: ${emitFlowArrayOfInts(s.byte_range)}`);
      lines.push(`${CONT_INDENT}disposition: ${emitScalar(s.disposition)}`);
    }

    lines.push(`${FIELD_INDENT}discarded_ranges:`);
    if (discarded.length === 0) {
      lines.push(`${ITEM_INDENT}[]`);
    }
    for (const d of discarded) {
      lines.push(`${ITEM_INDENT}- source_path: ${emitScalar(d.source_path)}`);
      lines.push(`${CONT_INDENT}byte_range: ${emitFlowArrayOfInts(d.byte_range)}`);
      lines.push(`${CONT_INDENT}reason: ${emitScalar(d.reason)}`);
    }
  }
  return lines.join("\n") + "\n";
}

/**
 * Reverse the backslash escapes of a double-quoted scalar body.
 *
 * `\n`, `\r` and `\t` become the control characters; any other escaped
 * character stands for itself. A lone trailing backslash is kept as-is.
 *
 * @param {string} body - Text between the surrounding quotes.
 * @returns {string}
 */
function unescapeQuoted(body) {
  const named = new Map([
    ["n", "\n"],
    ["r", "\r"],
    ["t", "\t"],
  ]);
  let out = "";
  for (let i = 0; i < body.length; i++) {
    const ch = body[i];
    if (ch === "\\" && i + 1 < body.length) {
      const next = body[i + 1];
      out += named.get(next) ?? next;
      i++; // the escaped character has been consumed
    } else {
      out += ch;
    }
  }
  return out;
}

/**
 * Parse one scalar: `null`/empty → null, `true`/`false` → boolean, an
 * optionally signed run of digits → number, a double-quoted string → its
 * unescaped body, anything else → the raw string.
 *
 * @param {string} raw - Scalar text without surrounding whitespace.
 * @returns {null|boolean|number|string}
 * @throws {Error} On an integer that is not a safe integer, or on a value
 *   quoted at only one end.
 */
function parseValue(raw) {
  if (raw === "null" || raw === "") return null;
  if (raw === "true") return true;
  if (raw === "false") return false;
  if (/^-?\d+$/.test(raw)) {
    const n = Number(raw);
    if (!Number.isSafeInteger(n)) {
      throw new Error(
        `provenance parser: integer ${raw} is not a safe JavaScript integer`,
      );
    }
    return n;
  }
  // A quote at only one end is treated as a hand-edit error rather than
  // being accepted as a bare string.
  const startsQ = raw.startsWith('"');
  const endsQ = raw.endsWith('"');
  if (startsQ !== endsQ) {
    throw new Error(`provenance parser: unbalanced quote in value: ${raw}`);
  }
  if (startsQ && endsQ) {
    if (raw.length < 2) {
      throw new Error(`provenance parser: lone quote: ${raw}`);
    }
    return unescapeQuoted(raw.slice(1, -1));
  }
  return raw;
}

/**
 * Parse a flow sequence of integers such as `[0, 4821]`.
 *
 * @param {string} raw
 * @returns {number[]} Empty when the sequence has no elements.
 * @throws {Error} On a non-integer element or an unsafe integer.
 */
function parseFlowIntArray(raw) {
  const inner = raw.trim().replace(/^\[/, "").replace(/\]$/, "");
  if (inner.trim() === "") return [];
  return inner.split(",").map((piece) => {
    const trimmed = piece.trim();
    if (!/^-?\d+$/.test(trimmed)) {
      throw new Error(
        `provenance parser: non-integer in flow array: ${raw}`,
      );
    }
    const n = Number(trimmed);
    if (!Number.isSafeInteger(n)) {
      throw new Error(
        `provenance parser: integer ${trimmed} is not a safe JavaScript integer`,
      );
    }
    return n;
  });
}

/**
 * Split the raw manifest into lines and drop everything the state machine
 * should never see: blank lines, whitespace-only lines, and comment lines
 * (first non-space character is `#`) wherever they appear.
 *
 * @param {string} raw
 * @returns {string[]}
 */
function preprocess(raw) {
  return raw
    .split(/\r?\n/)
    .filter((l) => l.trim() !== "" && !/^\s*#/.test(l));
}

/**
 * Parse manifest text produced by emitYaml.
 *
 * The document must contain `version:`, `corpus:` and `targets:` in that
 * order. Line numbers in error messages count the lines that remain after
 * comments and blank lines have been removed.
 *
 * @param {string} raw
 * @returns {object} `{ version, corpus, targets }`.
 * @throws {Error} On any structural deviation from the expected layout.
 */
function parseYaml(raw) {
  const doc = emptyDoc();
  const lines = preprocess(raw);
  let i = 0;

  // version — required
  if (i >= lines.length || !lines[i].startsWith("version:")) {
    throw new Error("provenance parser: missing `version:` at top of document");
  }
  doc.version = parseValue(lines[i].slice("version:".length).trim());
  i++;

  // corpus — required (value may be `null`)
  if (i >= lines.length || !lines[i].startsWith("corpus:")) {
    throw new Error("provenance parser: missing `corpus:` after version");
  }
  const corpusRest = lines[i].slice("corpus:".length).trim();
  i++;
  if (corpusRest === "null") {
    doc.corpus = null;
  } else if (corpusRest === "") {
    doc.corpus = {};
    for (; i < lines.length; i++) {
      const field = CORPUS_FIELD_RE.exec(lines[i]);
      if (!field) break;
      doc.corpus[field[1]] = parseValue(field[2]);
    }
  } else {
    throw new Error(
      `provenance parser: corpus: must be followed by null or a block, got "${corpusRest}"`,
    );
  }

  // targets — required
  if (i >= lines.length || !lines[i].startsWith("targets:")) {
    throw new Error("provenance parser: missing `targets:` after corpus");
  }
  i++;
  // Empty targets sentinel — {} on its own indented line.
  if (i < lines.length && lines[i].trim() === "{}") {
    i++;
    if (i < lines.length) {
      throw new Error(
        `provenance parser: trailing content after empty targets: ${lines[i]}`,
      );
    }
    return doc;
  }

  while (i < lines.length) {
    const keyMatch = TARGET_KEY_RE.exec(lines[i]);
    if (!keyMatch) {
      throw new Error(
        `provenance parser: expected target key at line ${i + 1}, got: ${lines[i]}`,
      );
    }
    const targetName = parseValue(keyMatch[1]);
    i++;
    const entry = { sources: [], discarded_ranges: [] };
    doc.targets[targetName] = entry;

    // Consume the target's children. They are indented deeper than a
    // target key, so the next sibling target ends this loop instead of
    // being mistaken for an unknown field.
    while (i < lines.length && lines[i].startsWith(FIELD_INDENT)) {
      const trimmed = lines[i].trim();
      if (trimmed === "sources:") {
        const consumed = parseSourcesOrDiscarded(lines, i + 1);
        entry.sources = consumed.items;
        i = consumed.nextI;
        continue;
      }
      if (trimmed === "discarded_ranges:") {
        const consumed = parseSourcesOrDiscarded(lines, i + 1);
        entry.discarded_ranges = consumed.items;
        i = consumed.nextI;
        continue;
      }
      throw new Error(
        `provenance parser: unknown target field at line ${i + 1}: ${lines[i]}`,
      );
    }
  }
  return doc;
}

/**
 * Parse a block list of mappings starting at `lines[startI]`.
 *
 * Each item begins with a six-space-indented `- key: value` line followed
 * by zero or more eight-space-indented `key: value` lines. A lone `[]`
 * line denotes the empty list. `byte_range` values are parsed as integer
 * flow sequences; every other value as a scalar.
 *
 * @param {string[]} lines - Preprocessed manifest lines.
 * @param {number} startI - Index of the first line after the list's key.
 * @returns {{items: object[], nextI: number}} The parsed items and the
 *   index of the first line that does not belong to the list.
 */
function parseSourcesOrDiscarded(lines, startI) {
  const parseField = (key, raw) =>
    key === "byte_range" ? parseFlowIntArray(raw) : parseValue(raw);

  const items = [];
  let i = startI;
  // Empty-list sentinel.
  if (i < lines.length && lines[i].trim() === "[]") {
    return { items, nextI: i + 1 };
  }
  while (i < lines.length) {
    const head = ITEM_START_RE.exec(lines[i]);
    if (!head) break;
    const item = { [head[1]]: parseField(head[1], head[2]) };
    i++;
    for (; i < lines.length; i++) {
      const cont = ITEM_CONT_RE.exec(lines[i]);
      if (!cont) break;
      item[cont[1]] = parseField(cont[1], cont[2]);
    }
    items.push(item);
  }
  return { items, nextI: i };
}
|