@delfini/drift-engine 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,579 @@
1
+ // packages/drift-engine/src/diff-filter.ts
2
+ //
3
+ // Deterministic unified-diff pre-filter (Story P3.7.2 / FR151). Drops hunks
4
+ // that cannot carry doc-claim signal before prompt assembly: lockfile churn,
5
+ // generated output, vendored code, test fixtures (path-level), plus pure
6
+ // whitespace and import-ordering hunks (hunk-level).
7
+ //
8
+ // Pure-logic — no I/O, no clock, no randomness, no new runtime dep. ESLint
9
+ // `no-restricted-imports` on packages/drift-engine/src/**/*.ts forbids fs /
10
+ // child_process / http / https / @anthropic-ai/sdk / openai / @langchain/* /
11
+ // process.env. Path classification uses hand-written predicates — NO
12
+ // `picomatch` (Story Dev Notes §"Path classification predicates").
13
+ //
14
+ // Exposed via `index.ts` (unlike the P3.7.1 relevance internals, which run
15
+ // INSIDE `buildPrompt`). The gate for this filter lives at the CONSUMER —
16
+ // the CLI's `runLocalPrepare` and the Action's `buildAnalysisInput` call
17
+ // `filterDiff` directly at input-assembly time and must reach it through the
18
+ // package's public surface (the package `exports` map blocks deep
19
+ // `@delfini/drift-engine/src/...` imports). The default consumer path never
20
+ // invokes this module, so `buildPrompt` output stays byte-identical and the
21
+ // NFR44 snapshot gate stays green (NFR49(b) parity discipline).
22
+ // -- Public entry point ------------------------------------------------------
23
/**
 * Deterministically pre-filter a unified diff.
 *
 * The input is the same `diff --git a/<path> b/<path>` block sequence that
 * `buildPrompt`'s `AnalysisInput.diff` consumes. Anything that precedes the
 * first recognised `diff --git` header is carried through unchanged as a
 * leading segment of `keptDiff`.
 *
 * Per file, in order:
 *   1. a path-level classification (lockfile / generated / vendored /
 *      fixture) drops the whole file and records it in `droppedPaths`;
 *   2. otherwise each hunk is classified; rejected hunks are recorded in
 *      `droppedHunks` and omitted from the re-emitted file;
 *   3. if a file has hunks and every one of them was rejected, the file is
 *      recorded in `droppedPaths` instead (with the most common hunk reason),
 *      so `keptDiff` never contains a file preamble with no hunks under it
 *      as a result of hunk filtering.
 *
 * @param diff Unified-diff text.
 * @returns `{ keptDiff, droppedPaths, droppedHunks }`. The same input always
 *   yields the same output.
 */
export function filterDiff(diff) {
    const parsed = parseDiffIntoFiles(diff);
    const droppedPaths = [];
    const droppedHunks = [];
    const keptParts = parsed.preamble.length > 0 ? [parsed.preamble] : [];
    for (const file of parsed.files) {
        // Whole-file verdict first: a path-level match short-circuits any
        // per-hunk inspection.
        const pathReason = classifyPath(file.path);
        if (pathReason !== null) {
            droppedPaths.push({ path: file.path, reason: pathReason });
            continue;
        }
        // Sort this file's hunks into survivors and rejects.
        const keptHunks = [];
        const rejected = [];
        for (const hunk of file.hunks) {
            const reason = classifyHunk(hunk, file.path);
            if (reason === null) {
                keptHunks.push(hunk);
            }
            else {
                rejected.push({ path: file.path, hunkHeader: hunk.header, reason });
            }
        }
        // Nothing survived (and there was something to reject): report the file
        // as a path-level drop rather than emitting a hunk-less preamble.
        const everyHunkRejected = rejected.length > 0 && keptHunks.length === 0;
        if (everyHunkRejected) {
            droppedPaths.push({ path: file.path, reason: mostCommonHunkReason(rejected) });
            continue;
        }
        for (const entry of rejected) {
            droppedHunks.push(entry);
        }
        keptParts.push(emitFile(file, keptHunks));
    }
    return { keptDiff: keptParts.join(''), droppedPaths, droppedHunks };
}
88
/**
 * Split a unified diff into a leading preamble plus one parsed entry per
 * `diff --git` header.
 *
 * Every line-anchored `diff --git ` header whose remainder yields two path
 * tokens marks a file boundary; a file's slice runs from its header to the
 * next boundary (or end of input). Headers whose paths cannot be read are not
 * treated as boundaries, so their text stays inside the preceding slice (or
 * the preamble). Both the bare (`a/path b/path`) and the C-quoted
 * (`"a/p\303\251" "b/p\303\251"`) header dialects are handed to
 * `parseHeaderPaths`. Note that git only C-quotes names containing control
 * characters, `"`, `\` or (with core.quotePath) non-ASCII bytes; a name that
 * merely contains spaces is emitted bare.
 *
 * @param diff Unified-diff text.
 * @returns `{ preamble, files }`; with no recognised header the whole input is
 *   the preamble and `files` is empty.
 */
function parseDiffIntoFiles(diff) {
    if (diff.length === 0) {
        return { preamble: '', files: [] };
    }
    const headerPattern = /^diff --git (.+)$/gm;
    const boundaries = [];
    for (let match = headerPattern.exec(diff); match !== null; match = headerPattern.exec(diff)) {
        const parsed = parseHeaderPaths(match[1]);
        if (parsed !== null) {
            boundaries.push({ index: match.index, aPath: parsed.aPath, bPath: parsed.bPath });
        }
    }
    if (boundaries.length === 0) {
        return { preamble: diff, files: [] };
    }
    const files = boundaries.map((boundary, position) => {
        const following = boundaries[position + 1];
        const sliceEnd = following === undefined ? diff.length : following.index;
        const primaryPath = pickPrimaryPath(boundary.aPath, boundary.bPath);
        return parseFileSlice(primaryPath, diff.slice(boundary.index, sliceEnd));
    });
    return { preamble: diff.slice(0, boundaries[0].index), files };
}
128
/**
 * Choose the path a file entry is classified under: the b-side path when it is
 * non-empty and not `/dev/null`, otherwise the a-side path.
 */
function pickPrimaryPath(aPath, bPath) {
    const bSideUsable = Boolean(bPath) && bPath !== '/dev/null';
    return bSideUsable ? bPath : aPath;
}
133
/**
 * Turn the text that follows `diff --git ` into the two repo-relative paths.
 *
 * The remainder is tokenised into an a-side and a b-side token; each token is
 * C-unquoted if it is wrapped in double quotes, and then a leading `a/`
 * (a-side) or `b/` (b-side) is removed when present.
 *
 * @param rest Header text after the `diff --git ` prefix.
 * @returns `{ aPath, bPath }`, or `null` when two tokens cannot be read.
 */
function parseHeaderPaths(rest) {
    const pair = tokenizeTwoPaths(rest);
    if (pair === null) {
        return null;
    }
    const [aToken, bToken] = pair;
    const aPath = stripSidePrefix(unquoteGitPath(aToken), 'a/');
    const bPath = stripSidePrefix(unquoteGitPath(bToken), 'b/');
    return { aPath, bPath };
}
151
/**
 * Read the a-side and b-side path tokens from the remainder of a
 * `diff --git ` header.
 *
 * Git only C-quotes a path when it contains control characters, `"`, `\` or
 * (with core.quotePath) non-ASCII bytes — a path that merely contains spaces
 * is emitted bare, e.g. `a/docs/my file.md b/docs/my file.md`. Splitting such
 * a header on the first space would yield `a/docs/my` and `file.md`, so the
 * common same-name case (`a/X b/X`) is recognised first by splitting the
 * remainder at its midpoint and checking that both halves name the same path.
 * Anything else falls back to reading two space- or quote-delimited tokens.
 *
 * @param rest Header text after the `diff --git ` prefix.
 * @returns `[aToken, bToken]` (quoted tokens keep their surrounding quotes),
 *   or `null` if fewer than two tokens are present or a quote is unterminated.
 */
function tokenizeTwoPaths(rest) {
    // Same-name bare header: `a/X b/X` has odd length with the separating
    // space exactly in the middle. For names without spaces this returns the
    // same pair the generic tokenizer below would produce.
    if (rest.length % 2 === 1) {
        const mid = (rest.length - 1) / 2;
        const left = rest.slice(0, mid);
        const right = rest.slice(mid + 1);
        if (rest[mid] === ' ' &&
            left.startsWith('a/') &&
            right.startsWith('b/') &&
            left.slice(2) === right.slice(2)) {
            return [left, right];
        }
    }
    const tokens = [];
    let i = 0;
    while (i < rest.length && tokens.length < 2) {
        // Skip separator spaces before the next token.
        while (i < rest.length && rest[i] === ' ')
            i++;
        if (i >= rest.length)
            break;
        if (rest[i] === '"') {
            // Quoted token — consume to the matching unescaped closing quote.
            // A backslash always escapes the following character, so skip both.
            let j = i + 1;
            while (j < rest.length) {
                if (rest[j] === '\\') {
                    j += 2;
                    continue;
                }
                if (rest[j] === '"')
                    break;
                j++;
            }
            if (j >= rest.length)
                return null; // unterminated quote
            tokens.push(rest.slice(i, j + 1));
            i = j + 1;
        }
        else {
            // Bare token — consume to the next space.
            let j = i;
            while (j < rest.length && rest[j] !== ' ')
                j++;
            tokens.push(rest.slice(i, j));
            i = j;
        }
    }
    return tokens.length === 2 ? [tokens[0], tokens[1]] : null;
}
188
/**
 * Remove `prefix` (in practice `a/` or `b/`) from the start of an already
 * unquoted path; a path without that prefix is returned unchanged.
 */
function stripSidePrefix(p, prefix) {
    if (!p.startsWith(prefix)) {
        return p;
    }
    return p.substring(prefix.length);
}
192
/**
 * Unquote a git C-quoted path token.
 *
 * A token that is not wrapped in double quotes is returned unchanged. Inside
 * the quotes the following escapes are decoded:
 *   - named escapes `\a \b \f \n \r \t \v \" \\`;
 *   - octal byte escapes `\N`, `\NN`, `\NNN`. Git emits one escape per BYTE of
 *     the UTF-8 encoded name, so consecutive byte escapes are collected and
 *     decoded together as UTF-8 (`\303\251` → `é`). A run that is not valid
 *     UTF-8 falls back to one character per byte, and an out-of-range value
 *     (> 0o377, never produced by git) is emitted as its code unit.
 *   - any other `\x` yields `x`; a lone trailing backslash is kept.
 *
 * @param token One path token as it appears in the header.
 * @returns The decoded path text.
 */
function unquoteGitPath(token) {
    if (token.length < 2 || token[0] !== '"' || token[token.length - 1] !== '"') {
        return token;
    }
    const inner = token.slice(1, -1);
    if (!inner.includes('\\'))
        return inner;
    // Decode a run of `%xx` byte escapes as UTF-8; on malformed UTF-8 fall
    // back to mapping every byte to the code unit of the same value.
    const decodeByteRun = (run) => {
        try {
            return decodeURIComponent(run);
        }
        catch {
            return run.replace(/%([0-9a-f]{2})/g, (_match, hex) => String.fromCharCode(parseInt(hex, 16)));
        }
    };
    let out = '';
    let byteRun = ''; // pending octal bytes, percent-encoded
    for (let i = 0; i < inner.length; i++) {
        const ch = inner[i];
        const next = inner[i + 1];
        if (ch === '\\' && next !== undefined && next >= '0' && next <= '7') {
            // Octal byte escape \NNN (1–3 octal digits).
            let oct = '';
            let k = i + 1;
            while (k < inner.length && oct.length < 3 && inner[k] >= '0' && inner[k] <= '7') {
                oct += inner[k];
                k++;
            }
            const value = parseInt(oct, 8);
            if (value <= 0xff) {
                byteRun += '%' + value.toString(16).padStart(2, '0');
            }
            else {
                // Not a byte — cannot be part of a UTF-8 sequence.
                if (byteRun !== '') {
                    out += decodeByteRun(byteRun);
                    byteRun = '';
                }
                out += String.fromCharCode(value);
            }
            i = k - 1;
            continue;
        }
        // Anything other than an octal escape ends the current byte run.
        if (byteRun !== '') {
            out += decodeByteRun(byteRun);
            byteRun = '';
        }
        if (ch !== '\\') {
            out += ch;
            continue;
        }
        if (next === undefined) {
            out += '\\';
            break;
        }
        switch (next) {
            case 'a':
                out += '\x07';
                break;
            case 'b':
                out += '\b';
                break;
            case 'f':
                out += '\f';
                break;
            case 'n':
                out += '\n';
                break;
            case 'r':
                out += '\r';
                break;
            case 't':
                out += '\t';
                break;
            case 'v':
                out += '\v';
                break;
            case '"':
                out += '"';
                break;
            case '\\':
                out += '\\';
                break;
            default:
                out += next;
        }
        i++; // skip the escaped character
    }
    if (byteRun !== '') {
        out += decodeByteRun(byteRun);
    }
    return out;
}
252
/**
 * Break one file's slice of the diff into its preamble and its hunks.
 *
 * A hunk starts at every line that begins with `@@ ` and runs to the next such
 * line (or the end of the slice). Each hunk is split into `header` (the `@@`
 * line including its newline) and `body` (everything after it); a hunk with no
 * newline at all is stored as header-only with an empty body. Text before the
 * first hunk is the preamble. A slice without any `@@ ` line (for example a
 * binary, rename-only or mode-only entry) is all preamble and has no hunks.
 *
 * @param path Path the file entry is classified under.
 * @param slice The file's text, starting at its `diff --git` header.
 * @returns `{ path, rawSlice, preamble, hunks }`.
 */
function parseFileSlice(path, slice) {
    const starts = [];
    const headerAtLineStart = /^@@ /gm;
    for (let hit = headerAtLineStart.exec(slice); hit !== null; hit = headerAtLineStart.exec(slice)) {
        starts.push(hit.index);
    }
    if (starts.length === 0) {
        return { path, rawSlice: slice, preamble: slice, hunks: [] };
    }
    const hunks = starts.map((from, n) => {
        const to = n + 1 < starts.length ? starts[n + 1] : slice.length;
        const text = slice.slice(from, to);
        const eol = text.indexOf('\n');
        return eol === -1
            ? { header: text, body: '' }
            : { header: text.slice(0, eol + 1), body: text.slice(eol + 1) };
    });
    return { path, rawSlice: slice, preamble: slice.slice(0, starts[0]), hunks };
}
289
/**
 * Re-serialise one file entry with only the hunks that survived filtering.
 *
 * When the number of kept hunks equals the file's hunk count the original
 * slice is returned verbatim, which guarantees byte-equality for untouched
 * files. Otherwise the result is the file preamble followed by each kept
 * hunk's header and body, in order (just the preamble if nothing was kept).
 */
function emitFile(file, keptHunks) {
    if (keptHunks.length === file.hunks.length) {
        return file.rawSlice;
    }
    return keptHunks.reduce((text, hunk) => text + hunk.header + hunk.body, file.preamble);
}
304
+ // -- Path classification (hand-written, no globs) ----------------------------
305
/**
 * Decide whether a whole file should be dropped on the strength of its path.
 *
 * The predicates are tried in a fixed priority order — lockfile, generated,
 * vendored, fixture — and the first one that matches supplies the reason.
 *
 * @returns `'lockfile' | 'generated' | 'vendored' | 'fixture'`, or `null` when
 *   no predicate matches.
 */
function classifyPath(filePath) {
    const rules = [
        { matches: isLockfile, reason: 'lockfile' },
        { matches: isGenerated, reason: 'generated' },
        { matches: isVendored, reason: 'vendored' },
        { matches: isFixture, reason: 'fixture' },
    ];
    for (const rule of rules) {
        if (rule.matches(filePath)) {
            return rule.reason;
        }
    }
    return null;
}
316
// Lower-cased basenames of the package-manager lockfiles that are dropped at
// path level. Lookups lower-case the candidate first, so `Cargo.lock` and
// `Gemfile.lock` are covered by the entries below.
const LOCKFILE_BASENAMES = new Set([
    'pnpm-lock.yaml',
    'package-lock.json',
    'yarn.lock',
    'cargo.lock',
    'gemfile.lock',
    'poetry.lock',
    'go.sum',
    'composer.lock',
    'uv.lock',
]);
/**
 * True when the final path segment, compared case-insensitively, is one of the
 * known lockfile names. The directory part of the path is ignored.
 */
function isLockfile(filePath) {
    const fileName = filePath.slice(filePath.lastIndexOf('/') + 1);
    return LOCKFILE_BASENAMES.has(fileName.toLowerCase());
}
331
/**
 * True when the path looks like generated / build output.
 *
 * Matches:
 *   - any DIRECTORY segment named `dist` or `build`. The file's own name is
 *     deliberately excluded, so a source script such as `scripts/build` is
 *     not mistaken for build output;
 *   - a file name ending in `.generated.<ext>` or `.gen.<ext>` (this already
 *     covers `routeTree.gen.ts` and `schema.generated.ts` / `.sql`);
 *   - a file name ending in `.min.js` or `.min.css` (case-insensitive).
 *
 * @param filePath Repo-relative POSIX path.
 */
function isGenerated(filePath) {
    const segments = filePath.split('/');
    // `split` always yields at least one element; the last one is the file name.
    const fileName = segments[segments.length - 1];
    const directories = segments.slice(0, -1);
    if (directories.some((dir) => dir === 'dist' || dir === 'build'))
        return true;
    if (/\.(generated|gen)\.[a-zA-Z0-9]+$/.test(fileName))
        return true;
    return /\.min\.(js|css)$/i.test(fileName);
}
351
/**
 * True when any `/`-separated segment of the path is exactly `vendor`,
 * `third_party`, `node_modules` or `.pnpm`.
 */
function isVendored(filePath) {
    const vendoredNames = ['vendor', 'third_party', 'node_modules', '.pnpm'];
    return filePath.split('/').some((segment) => vendoredNames.includes(segment));
}
365
/**
 * True when the path points at test-fixture or snapshot material:
 *   - any path segment equal to `__fixtures__`, `test-fixtures` or
 *     `__snapshots__` (the Jest/Vitest generated-snapshot directory), or
 *   - a file name ending in `.snap` or `.snapshot`.
 */
function isFixture(filePath) {
    const parts = filePath.split('/');
    const insideFixtureTree = parts.some((part) => part === '__fixtures__' || part === 'test-fixtures' || part === '__snapshots__');
    if (insideFixtureTree) {
        return true;
    }
    const fileName = parts[parts.length - 1];
    return fileName.endsWith('.snap') || fileName.endsWith('.snapshot');
}
384
/**
 * Return the text after the last `/` of a POSIX-style path, or the whole
 * string when it contains no `/`.
 */
function basename(filePath) {
    const cut = filePath.lastIndexOf('/');
    if (cut < 0) {
        return filePath;
    }
    return filePath.substring(cut + 1);
}
388
+ // -- Hunk classification -----------------------------------------------------
389
/**
 * Decide whether a single hunk is pure noise.
 *
 * The hunk body is scanned for added (`+`) and removed (`-`) lines, and then:
 *   - a hunk with no added or removed lines is kept;
 *   - a whitespace-only hunk is dropped, unless the file is in an
 *     indentation-sensitive language (where re-indentation can change
 *     meaning and must be kept);
 *   - a hunk that only re-orders import statements is dropped;
 *   - everything else is kept.
 *
 * `--- ` / `+++ ` lines are ambiguous inside a body: they may be stray file
 * header lines (when an unrecognised `diff --git` header left the next file's
 * preamble inside this hunk), or a genuine removed/added line whose content
 * starts with `-- ` / `++ ` (e.g. a deleted SQL, Lua or Haskell comment). The
 * line counts from the `@@ -a,b +c,d @@` header settle it: while the hunk
 * still expects old/new lines the line is counted as a real change, and only
 * once that budget is spent (or the header cannot be parsed) is it skipped as
 * a file header.
 *
 * @param hunk `{ header, body }` as produced by `parseFileSlice`.
 * @param filePath Path of the file the hunk belongs to.
 * @returns `'whitespace-only'`, `'import-only'`, or `null` to keep the hunk.
 */
function classifyHunk(hunk, filePath) {
    // Remaining old-side / new-side line budget declared by the hunk header.
    // An omitted count means 1; an unparseable header leaves both at 0.
    const counts = /^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@/.exec(hunk.header);
    let oldRemaining = 0;
    let newRemaining = 0;
    if (counts !== null) {
        oldRemaining = counts[1] === undefined ? 1 : Number(counts[1]);
        newRemaining = counts[2] === undefined ? 1 : Number(counts[2]);
    }
    const adds = [];
    const dels = [];
    for (const line of hunk.body.split('\n')) {
        // "\ No newline at end of file" markers carry no content.
        if (line.startsWith('\\'))
            continue;
        // Context line (an empty line is a context line whose leading space
        // was stripped): consumes budget on both sides, contributes nothing.
        if (line.length === 0 || line[0] === ' ') {
            oldRemaining--;
            newRemaining--;
            continue;
        }
        if (line[0] === '+') {
            if (newRemaining <= 0 && line.startsWith('+++ '))
                continue; // stray `+++ b/...` file header
            adds.push(line.slice(1));
            newRemaining--;
        }
        else if (line[0] === '-') {
            if (oldRemaining <= 0 && line.startsWith('--- '))
                continue; // stray `--- a/...` file header
            dels.push(line.slice(1));
            oldRemaining--;
        }
    }
    if (adds.length === 0 && dels.length === 0) {
        // Context-only hunk — no change at all. Should not appear in a real
        // unified diff but treat as kept (not noise) to be safe.
        return null;
    }
    // Whitespace-only dropping is UNSAFE for indentation-significant languages:
    // a Python/YAML dedent trims-equal but changes control flow / structure,
    // so whitespace changes only count as noise where indentation is cosmetic.
    if (!isIndentationSensitive(filePath) && isWhitespaceOnly(adds, dels)) {
        return 'whitespace-only';
    }
    if (isImportOnly(adds, dels))
        return 'import-only';
    return null;
}
423
// File extensions (lower-case, without the dot) of languages in which leading
// indentation carries meaning. A whitespace-only hunk in such a file may be a
// real semantic change, so it is never dropped. The list is conservative —
// when unsure, keep the hunk.
const INDENTATION_SENSITIVE_EXTS = new Set([
    'py',
    'pyi',
    'yaml',
    'yml',
    'hs',
    'fs',
    'fsx',
    'nim',
    'coffee',
    'sass',
    'styl',
    'pug',
    'jade',
    'haml',
    'slim',
]);
// Whole file names (lower-case) that are whitespace-significant regardless of
// extension.
const INDENTATION_SENSITIVE_BASENAMES = new Set([
    'makefile',
    'gnumakefile',
]);
/**
 * True when whitespace changes in this file must be treated as meaningful.
 *
 * The final path segment is lower-cased and then checked against, in order:
 * the whitespace-significant file names, the `.mk` suffix (makefile
 * fragments), and finally the extension after the last `.`. A name without any
 * `.` that is not a known makefile name is not indentation-sensitive.
 */
function isIndentationSensitive(filePath) {
    const name = filePath.slice(filePath.lastIndexOf('/') + 1).toLowerCase();
    if (INDENTATION_SENSITIVE_BASENAMES.has(name) || name.endsWith('.mk')) {
        return true;
    }
    const dotAt = name.lastIndexOf('.');
    return dotAt !== -1 && INDENTATION_SENSITIVE_EXTS.has(name.slice(dotAt + 1));
}
460
/**
 * True when the hunk changes nothing but leading/trailing whitespace.
 *
 * The added and removed lines are paired by position; the hunk qualifies only
 * if both sides are non-empty, have the same number of lines, and every pair
 * is equal after `trim()`. That covers re-indentation, trailing-whitespace
 * clean-up and CRLF/LF normalisation. A re-ordering of otherwise identical
 * lines does NOT qualify, because the positional pairs no longer match.
 *
 * @param adds Added lines with the leading `+` removed.
 * @param dels Removed lines with the leading `-` removed.
 */
function isWhitespaceOnly(adds, dels) {
    const pairable = adds.length > 0 && adds.length === dels.length;
    return pairable && adds.every((added, idx) => added.trim() === dels[idx].trim());
}
479
/**
 * True when the hunk is nothing more than a re-ordering of import statements.
 *
 * Both conditions must hold:
 *   1. every added and every removed line normalises to an ES (TS/JS) or
 *      Python import statement — a hunk that mixes imports with anything else
 *      is kept;
 *   2. the normalised lines form the same multiset on both sides.
 *
 * Whole normalised lines are compared, not just the module specifiers, so a
 * binding change such as `import { foo } from './a'` →
 * `import { foo, bar } from './a'` is NOT import-only and is retained.
 *
 * @param adds Added lines with the leading `+` removed.
 * @param dels Removed lines with the leading `-` removed.
 */
function isImportOnly(adds, dels) {
    if (adds.length === 0 || dels.length === 0) {
        return false;
    }
    const added = normaliseImportLines(adds);
    if (added === null) {
        return false;
    }
    const removed = normaliseImportLines(dels);
    // `multisetsEqual` rejects differing lengths itself.
    return removed !== null && multisetsEqual(added, removed);
}
505
/**
 * Normalise a list of lines that are expected to be import statements.
 *
 * Each line has its leading whitespace removed and a trailing `;` (together
 * with any whitespace after it) stripped. Lines that are empty after that are
 * skipped. If any remaining line is not recognised as an import statement the
 * whole result is `null`.
 *
 * @returns The normalised import lines in input order, or `null` when at least
 *   one non-empty line is not an import.
 */
function normaliseImportLines(lines) {
    const normalised = [];
    for (const line of lines) {
        const candidate = line.trimStart().replace(/;\s*$/, '');
        if (candidate === '') {
            continue;
        }
        if (extractImportSource(candidate) === null) {
            return null;
        }
        normalised.push(candidate);
    }
    return normalised;
}
522
/**
 * True when `a` and `b` contain the same items with the same multiplicities,
 * regardless of order.
 */
function multisetsEqual(a, b) {
    if (a.length !== b.length) {
        return false;
    }
    // Tally `a`, then cancel each item of `b` against the tally.
    const outstanding = new Map();
    a.forEach((item) => {
        outstanding.set(item, (outstanding.get(item) ?? 0) + 1);
    });
    for (const item of b) {
        const left = outstanding.get(item);
        if (left === undefined) {
            return false;
        }
        if (left > 1) {
            outstanding.set(item, left - 1);
        }
        else {
            outstanding.delete(item);
        }
    }
    return outstanding.size === 0;
}
539
/**
 * Extract the imported module from a single (already left-trimmed) line.
 *
 * Recognised forms, tried in this order:
 *   - ES: `import ... from 'x'`, `import 'x'`, `import type ... from 'x'`;
 *   - Python: `from X import ...`;
 *   - Python: `import X` / `import X as Y`.
 *
 * @returns The module specifier / module name, or `null` when the line is not
 *   one of the recognised import forms.
 */
function extractImportSource(line) {
    const importForms = [
        /^import\s+(?:type\s+)?(?:[^'"]+?\s+from\s+)?['"]([^'"]+)['"]/,
        /^from\s+(\S+)\s+import\s+/,
        /^import\s+(\S+)(?:\s+as\s+\S+)?\s*$/,
    ];
    for (const form of importForms) {
        const found = form.exec(line);
        if (found !== null) {
            return found[1];
        }
    }
    return null;
}
556
+ // -- File-wide hunk-drop promotion ------------------------------------------
557
/**
 * Pick the reason that occurs most often among a file's dropped hunks.
 *
 * Ties are broken deterministically in favour of the reason that appears
 * first in `droppedHunks`: the tally `Map` iterates in first-insertion order
 * and a later reason only wins with a strictly higher count.
 *
 * @param droppedHunks Non-empty list of `{ reason, ... }` records.
 */
function mostCommonHunkReason(droppedHunks) {
    const tally = new Map();
    for (const { reason } of droppedHunks) {
        tally.set(reason, (tally.get(reason) ?? 0) + 1);
    }
    let winner = droppedHunks[0].reason;
    let winnerCount = 0;
    for (const [reason, count] of tally) {
        if (count > winnerCount) {
            winner = reason;
            winnerCount = count;
        }
    }
    return winner;
}
@@ -0,0 +1,119 @@
1
+ /**
2
+ * Canonicalise a doc-scope value to a deduped POSIX `string[]`.
3
+ *
4
+ * - `null` / `undefined` coerce to `[]` (defensive — JSON config loaders
5
+ * commonly produce these at the boundary).
6
+ * - A single `string` wraps as `[value]`. It is NOT comma/newline-split —
7
+ * delimited-string splitting is a per-surface concern (e.g. Lite's
8
+ * `docs_path` is split in `readPipelineInputs()`), deliberately kept out
9
+ * of the pure algebra.
10
+ * - Each entry is `.trim()`-ed before further processing so `' docs '`
11
+ * and `'docs'` dedupe to one entry (matches `validateDocScopeEntry`'s
12
+ * own trim — keeps validate/normalize aligned).
13
+ * - Backslashes are normalised to forward slashes (the persisted dialect is
14
+ * POSIX). Trailing slashes are stripped; `//` runs collapse; `./` and
15
+ * `..` segments resolve via the inline POSIX normaliser. So `'./docs'`
16
+ * and `'docs'` dedupe, `'docs//api'` becomes `'docs/api'`, and
17
+ * `'docs/sub/../api/*.md'` becomes `'docs/api/*.md'` (which the matcher
18
+ * can actually match).
19
+ * - Entries are deduped, preserving first-occurrence order.
20
+ * - Entries that collapse to nothing (`''`, `'/'`, `'.'`, `'./'`) are
21
+ * dropped — these are tautological or empty, and `validateDocScopeEntry`
22
+ * would otherwise have to special-case them.
23
+ *
24
+ * Non-emptiness of the OUTPUT is NOT enforced here — that is a schema /
25
+ * validation concern at each surface (`docScopeSchema.min(1)`,
26
+ * `writeDocScope`).
27
+ *
28
+ * `normalizeDocScope` is intentionally NOT a security gate: an escape entry
29
+ * like `'../secrets'` survives (validation is `validateDocScopeEntry`'s
30
+ * job). The matcher in `isFileInDocScope` then can't match it against any
31
+ * real in-tree file path, so the worst-case outcome is "silent no-match,"
32
+ * not exfiltration.
33
+ */
34
+ export declare function normalizeDocScope(input: string | string[] | null | undefined): string[];
35
+ /**
36
+ * Validate a single doc-scope entry. Returns `null` on success, or a
37
+ * human-readable error string on failure.
38
+ *
39
+ * Ports the @delfini/cli `validatePath` + `longestStaticPrefix` repo-escape
40
+ * technique (the richest existing implementation) — reworked to be PURE and
41
+ * RELATIVE-root based. `repoRootRel` is a relative marker (callers pass
42
+ * `'.'`); we never resolve against an absolute filesystem path or use
43
+ * `path.sep`.
44
+ *
45
+ * Rejects:
46
+ * - absolute paths (POSIX `/...` and Windows-drive `C:\...` / `C:/...`),
47
+ * - entries containing ASCII control characters (CR, LF, TAB, NUL, etc.)
48
+ * — these survive a JSON round-trip but can never be a real path; the
49
+ * matcher silently no-ops them, which is a worse failure mode than a
50
+ * loud rejection,
51
+ * - entries whose normalisation escapes the repo root (`../`, mid-path
52
+ * traversal, AND traversal hidden inside a glob portion such as
53
+ * `**\/../../x` — the CLI's static-prefix-only check could not catch the
54
+ * last case, so we normalise the FULL entry, which is strictly stronger),
55
+ * - empty / whitespace-only entries.
56
+ *
57
+ * NOTE: this validator is layered, not auto-invoked by `normalizeDocScope`
58
+ * or `isFileInDocScope`. Each surface must call it at the persistence
59
+ * boundary (`writeDocScope`, the Zod refine for the FR88g contract, the
60
+ * Web settings list-editor). Bypassing it produces silent matcher
61
+ * no-matches, not insecure behaviour — but callers should treat it as
62
+ * mandatory at user-input boundaries.
63
+ */
64
+ export declare function validateDocScopeEntry(entry: string, repoRootRel: string): string | null;
65
+ /**
66
+ * Classify a doc-scope entry by SHAPE — a pure string heuristic, NOT a
67
+ * filesystem check (this module cannot `stat`):
68
+ * - `'glob'` — contains glob magic (decided by picomatch's own scanner, so
69
+ * the classification dialect matches the matching dialect).
70
+ * - `'dir'` — `.` / `''` (repo-root tautology), OR last segment starts
71
+ * with a `.` (hidden directory pattern: `.github`, `.husky`, `.vscode`,
72
+ * `.changeset`, etc.), OR last segment has no `.` at all.
73
+ * - `'file'` — not a glob, not dot-prefix, AND last segment contains a `.`
74
+ * (heuristic: it looks like `name.ext`).
75
+ *
76
+ * KNOWN LIMITATION: versioned directories like `docs/v1.2` are misclassified
77
+ * as files by the dot-in-last-segment heuristic (we'd need a real extension
78
+ * registry to distinguish `v1.2` from `index.md`). Users who scope a
79
+ * versioned doc tree should prefer an explicit glob form (e.g.
80
+ * `docs/v1.2/<globstar>/*.md`). The predicate's dir/file branches degrade
81
+ * silently here — there is no authoritative fs-expander rescue for the
82
+ * smart-skip path-shape use case.
83
+ */
84
+ export declare function classifyEntry(entry: string): 'dir' | 'file' | 'glob';
85
+ /**
86
+ * True iff `filePath` falls within any entry of `scope`. Both `filePath` and
87
+ * the scope entries are repo-relative POSIX paths.
88
+ *
89
+ * Per-entry strategy keys off `classifyEntry`:
90
+ * - `'dir'` -> matches the recursive subtree (`docs` ⇒ `docs/**`).
91
+ * - `'file'` -> exact path match.
92
+ * - `'glob'` -> picomatch semantics.
93
+ *
94
+ * The predicate is PATH-SHAPE-ONLY — it does not filter by `.md` extension.
95
+ * The `.md`-only restriction on directory expansion belongs to the expanders
96
+ * (CLI `expandDocScope`, Action/Web git-trees match), which keeps this
97
+ * predicate usable by smart-skip on arbitrary changed-file paths.
98
+ *
99
+ * Matching is `dot: false, nocase: true`:
100
+ * - case-insensitive matching aligns with the CLI expander's existing
101
+ * `caseSensitiveMatch: false` (fs realism on Windows/macOS), so the same
102
+ * repo cloned across platforms returns identical in-scope decisions —
103
+ * the dialect-parity invariant the ADR exists to enforce. The header's
104
+ * "platform-independent results" promise IS the case-insensitive choice.
105
+ * - `dot: false` matches the CLI expander default. Dot-prefix hidden
106
+ * directories still match via the `classifyEntry` → `'dir'` path
107
+ * (entry `'.github'` becomes pattern `'.github/**'`, which picomatch
108
+ * matches against `.github/workflows/x.yml` even with `dot: false`
109
+ * because the literal `.github` prefix is present in the pattern).
110
+ *
111
+ * The `filePath` is defensively normalised: backslashes converted to
112
+ * forward slashes, leading `/` and `./` runs stripped, `..` segments
113
+ * resolved — so callers feeding webhook payloads (`/docs/a.md` from
114
+ * `URL.pathname`), Windows-style paths (`docs\a.md`), or composed paths
115
+ * (`./docs/sub/../a.md`) all collapse to the same canonical form before
116
+ * matching.
117
+ */
118
+ export declare function isFileInDocScope(filePath: string, scope: string[]): boolean;
119
+ //# sourceMappingURL=doc-scope.d.ts.map