@delfini/drift-engine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +172 -0
- package/dist/diff-filter.d.ts +33 -0
- package/dist/diff-filter.d.ts.map +1 -0
- package/dist/diff-filter.js +579 -0
- package/dist/doc-scope.d.ts +119 -0
- package/dist/doc-scope.d.ts.map +1 -0
- package/dist/doc-scope.js +260 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +46 -0
- package/dist/prompt-budget.d.ts +2 -0
- package/dist/prompt-budget.d.ts.map +1 -0
- package/dist/prompt-budget.js +16 -0
- package/dist/prompt-builder.d.ts +21 -0
- package/dist/prompt-builder.d.ts.map +1 -0
- package/dist/prompt-builder.js +267 -0
- package/dist/reconcile.d.ts +17 -0
- package/dist/reconcile.d.ts.map +1 -0
- package/dist/reconcile.js +290 -0
- package/dist/relevance.d.ts +73 -0
- package/dist/relevance.d.ts.map +1 -0
- package/dist/relevance.js +266 -0
- package/dist/schema.d.ts +293 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +50 -0
- package/dist/types.d.ts +81 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/package.json +39 -0
- package/src/prompt.md +360 -0
|
@@ -0,0 +1,579 @@
|
|
|
1
|
+
// packages/drift-engine/src/diff-filter.ts
|
|
2
|
+
//
|
|
3
|
+
// Deterministic unified-diff pre-filter (Story P3.7.2 / FR151). Drops hunks
|
|
4
|
+
// that cannot carry doc-claim signal before prompt assembly: lockfile churn,
|
|
5
|
+
// generated output, vendored code, test fixtures (path-level), plus pure
|
|
6
|
+
// whitespace and import-ordering hunks (hunk-level).
|
|
7
|
+
//
|
|
8
|
+
// Pure-logic — no I/O, no clock, no randomness, no new runtime dep. ESLint
|
|
9
|
+
// `no-restricted-imports` on packages/drift-engine/src/**/*.ts forbids fs /
|
|
10
|
+
// child_process / http / https / @anthropic-ai/sdk / openai / @langchain/* /
|
|
11
|
+
// process.env. Path classification uses hand-written predicates — NO
|
|
12
|
+
// `picomatch` (Story Dev Notes §"Path classification predicates").
|
|
13
|
+
//
|
|
14
|
+
// Exposed via `index.ts` (unlike the P3.7.1 relevance internals, which run
|
|
15
|
+
// INSIDE `buildPrompt`). The gate for this filter lives at the CONSUMER —
|
|
16
|
+
// the CLI's `runLocalPrepare` and the Action's `buildAnalysisInput` call
|
|
17
|
+
// `filterDiff` directly at input-assembly time and must reach it through the
|
|
18
|
+
// package's public surface (the package `exports` map blocks deep
|
|
19
|
+
// `@delfini/drift-engine/src/...` imports). The default consumer path never
|
|
20
|
+
// invokes this module, so `buildPrompt` output stays byte-identical and the
|
|
21
|
+
// NFR44 snapshot gate stays green (NFR49(b) parity discipline).
|
|
22
|
+
// -- Public entry point ------------------------------------------------------
|
|
23
|
+
/**
 * Deterministically strip noise from a unified-diff string.
 *
 * Input is the `diff --git` block sequence that `AnalysisInput.diff` carries.
 * Anything that precedes the first recognised `diff --git` header is passed
 * through untouched at the front of the result.
 *
 * Per file, in order:
 *   1. a path-level classification (lockfile / generated / vendored /
 *      fixture) drops the whole file;
 *   2. otherwise each hunk is classified and noise hunks are dropped;
 *   3. if that leaves a file with no hunks at all (and it had some), the file
 *      is reported as a path-level drop instead, using the most common hunk
 *      reason, so no orphan preamble ends up in `keptDiff`.
 *
 * @param diff Unified diff text.
 * @returns `{ keptDiff, droppedPaths, droppedHunks }` where `droppedPaths`
 *          entries are `{ path, reason }` and `droppedHunks` entries are
 *          `{ path, hunkHeader, reason }`.
 */
export function filterDiff(diff) {
  const parsed = parseDiffIntoFiles(diff);
  const pathDrops = [];
  const hunkDrops = [];
  // Start from the leading non-diff segment (an empty string when absent).
  let output = parsed.preamble;

  for (const entry of parsed.files) {
    const wholeFileReason = classifyPath(entry.path);
    if (wholeFileReason !== null) {
      pathDrops.push({ path: entry.path, reason: wholeFileReason });
      continue;
    }

    const survivors = [];
    const rejected = [];
    for (const candidate of entry.hunks) {
      const verdict = classifyHunk(candidate, entry.path);
      if (verdict === null) {
        survivors.push(candidate);
      } else {
        rejected.push({
          path: entry.path,
          hunkHeader: candidate.header,
          reason: verdict,
        });
      }
    }

    // Nothing survived: report the file itself rather than its hunks.
    if (survivors.length === 0 && rejected.length > 0) {
      pathDrops.push({ path: entry.path, reason: mostCommonHunkReason(rejected) });
      continue;
    }

    for (const drop of rejected) {
      hunkDrops.push(drop);
    }
    output += emitFile(entry, survivors);
  }

  return {
    keptDiff: output,
    droppedPaths: pathDrops,
    droppedHunks: hunkDrops,
  };
}
|
|
88
|
+
/**
 * Split a unified diff into a leading preamble plus one parsed record per
 * `diff --git` block.
 *
 * Every line that starts with `diff --git ` and whose remainder yields two
 * path tokens is a file boundary; a file's slice runs from its boundary to
 * the next boundary (or end of input). Headers whose paths cannot be read are
 * not treated as boundaries, so their text stays inside the preceding slice.
 *
 * Header dialects: git writes the two paths bare in the common case and
 * C-quoted (`"a/..." "b/..."`) when a path contains control bytes, a double
 * quote, a backslash, or - with `core.quotePath` on, the default - non-ASCII
 * bytes.
 *
 * @param diff Unified diff text.
 * @returns `{ preamble, files }`; `files` holds the `parseFileSlice` results.
 */
function parseDiffIntoFiles(diff) {
  if (diff.length === 0) {
    return { preamble: '', files: [] };
  }

  const headerPattern = /^diff --git (.+)$/gm;
  const cuts = [];
  for (let hit = headerPattern.exec(diff); hit !== null; hit = headerPattern.exec(diff)) {
    const parsed = parseHeaderPaths(hit[1]);
    if (parsed !== null) {
      cuts.push({ index: hit.index, aPath: parsed.aPath, bPath: parsed.bPath });
    }
  }

  if (cuts.length === 0) {
    // No recognisable header: the whole input is preamble.
    return { preamble: diff, files: [] };
  }

  const files = cuts.map((cut, n) => {
    const stop = n + 1 < cuts.length ? cuts[n + 1].index : diff.length;
    const chosenPath = pickPrimaryPath(cut.aPath, cut.bPath);
    return parseFileSlice(chosenPath, diff.slice(cut.index, stop));
  });

  return { preamble: diff.slice(0, cuts[0].index), files };
}
|
|
128
|
+
/**
 * Choose the path used to classify a file: the b-side when it is non-empty
 * and not `/dev/null`, otherwise the a-side.
 */
function pickPrimaryPath(aPath, bPath) {
  const bUsable = Boolean(bPath) && bPath !== '/dev/null';
  return bUsable ? bPath : aPath;
}
|
|
133
|
+
/**
 * Turn the text after `diff --git ` into the two repo-relative paths.
 *
 * The remainder is tokenised into an a-side and a b-side token; each token is
 * unquoted (git C-quoting) and then has its `a/` or `b/` side prefix removed.
 *
 * @param rest Header text following the `diff --git ` prefix.
 * @returns `{ aPath, bPath }`, or `null` when two tokens cannot be read.
 */
function parseHeaderPaths(rest) {
  const pair = tokenizeTwoPaths(rest);
  if (pair === null) {
    return null;
  }
  const [rawA, rawB] = pair;
  const aPath = stripSidePrefix(unquoteGitPath(rawA), 'a/');
  const bPath = stripSidePrefix(unquoteGitPath(rawB), 'b/');
  return { aPath, bPath };
}
|
|
151
|
+
/**
 * Read the a-side and b-side path tokens from the remainder of a
 * `diff --git ` header.
 *
 * git only C-quotes a path when it contains control bytes, `"`, `\` or (with
 * `core.quotePath`) non-ASCII bytes. A path that merely contains SPACES is
 * written bare, e.g. `a/my dir/x.ts b/my dir/x.ts`, so splitting on the first
 * space would mis-read it. Bare remainders are therefore resolved in this
 * order:
 *   1. same name on both sides (the non-rename case): the remainder is
 *      `<left> <right>` with the separator exactly in the middle and the two
 *      halves equal once `a/` / `b/` are removed;
 *   2. rename with the default prefixes: split at the first ` b/`;
 *   3. otherwise fall back to plain space / quote tokenisation.
 *
 * Quoted tokens are returned WITH their surrounding quotes (unquoting is the
 * caller's job).
 *
 * @param rest Header text following the `diff --git ` prefix.
 * @returns `[aToken, bToken]`, or `null` if two tokens cannot be read
 *          (including an unterminated quote).
 */
function tokenizeTwoPaths(rest) {
  // Heuristics for bare paths. Skipped as soon as any quote is present,
  // because then the quote-aware scanner below is authoritative.
  if (!rest.includes('"') && rest.includes(' ')) {
    if (rest.length % 2 === 1) {
      const mid = (rest.length - 1) / 2;
      if (rest[mid] === ' ') {
        const left = rest.slice(0, mid);
        const right = rest.slice(mid + 1);
        const leftName = left.startsWith('a/') ? left.slice(2) : left;
        const rightName = right.startsWith('b/') ? right.slice(2) : right;
        if (left.length > 0 && leftName === rightName) {
          return [left, right];
        }
      }
    }
    if (rest.startsWith('a/')) {
      const split = rest.indexOf(' b/');
      if (split > 0) {
        return [rest.slice(0, split), rest.slice(split + 1)];
      }
    }
  }

  const tokens = [];
  let i = 0;
  while (i < rest.length && tokens.length < 2) {
    while (i < rest.length && rest[i] === ' ') {
      i++;
    }
    if (i >= rest.length) {
      break;
    }
    if (rest[i] === '"') {
      // Quoted token: scan to the matching closing quote, stepping over
      // backslash escapes so an escaped `\"` does not end the token.
      let j = i + 1;
      while (j < rest.length) {
        if (rest[j] === '\\') {
          j += 2;
          continue;
        }
        if (rest[j] === '"') {
          break;
        }
        j++;
      }
      if (j >= rest.length) {
        return null; // unterminated quote
      }
      tokens.push(rest.slice(i, j + 1));
      i = j + 1;
    } else {
      // Bare token: runs to the next space.
      let j = i;
      while (j < rest.length && rest[j] !== ' ') {
        j++;
      }
      tokens.push(rest.slice(i, j));
      i = j;
    }
  }
  return tokens.length === 2 ? [tokens[0], tokens[1]] : null;
}
|
|
188
|
+
/** Remove `prefix` from the front of `p` when present; otherwise return `p` unchanged. */
function stripSidePrefix(p, prefix) {
  if (!p.startsWith(prefix)) {
    return p;
  }
  return p.substring(prefix.length);
}
|
|
192
|
+
/**
 * Unquote a git C-quoted path token.
 *
 * A token that is not wrapped in double quotes is returned unchanged. Inside
 * the quotes:
 *   - `\a \b \f \n \r \t \v \" \\` are the named escapes git emits;
 *   - `\NNN` (1-3 octal digits) is ONE BYTE of the path. git escapes
 *     non-ASCII names byte by byte, so consecutive octal escapes are collected
 *     and decoded together as UTF-8 (`\303\251` -> `é`). A run that is not
 *     valid UTF-8 falls back to one character per byte;
 *   - any other escaped character is kept as that character;
 *   - a dangling trailing backslash is kept literally.
 *
 * @param token Path token, possibly C-quoted.
 * @returns The unquoted path text.
 */
function unquoteGitPath(token) {
  const quoted = token.length >= 2 && token[0] === '"' && token[token.length - 1] === '"';
  if (!quoted) {
    return token;
  }
  const inner = token.slice(1, -1);
  if (!inner.includes('\\')) {
    return inner; // dominant case: nothing to decode
  }

  const named = new Map([
    ['a', '\x07'],
    ['b', '\b'],
    ['f', '\f'],
    ['n', '\n'],
    ['r', '\r'],
    ['t', '\t'],
    ['v', '\v'],
    ['"', '"'],
    ['\\', '\\'],
  ]);

  let out = '';
  let pendingBytes = [];
  // Decode and append the octal-escaped bytes gathered so far.
  const flushBytes = () => {
    if (pendingBytes.length === 0) {
      return;
    }
    let decoded;
    try {
      // `fatal` makes malformed input throw instead of yielding U+FFFD;
      // `ignoreBOM` keeps a leading BOM as part of the name.
      decoded = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true }).decode(
        Uint8Array.from(pendingBytes),
      );
    } catch {
      // Not UTF-8 (or no TextDecoder available): one char per byte.
      decoded = String.fromCharCode(...pendingBytes);
    }
    out += decoded;
    pendingBytes = [];
  };

  for (let i = 0; i < inner.length; i++) {
    const ch = inner[i];
    if (ch !== '\\') {
      flushBytes();
      out += ch;
      continue;
    }
    const next = inner[i + 1];
    if (next === undefined) {
      flushBytes();
      out += '\\';
      break;
    }
    if (next >= '0' && next <= '7') {
      let k = i + 1;
      let value = 0;
      let digits = 0;
      while (k < inner.length && digits < 3 && inner[k] >= '0' && inner[k] <= '7') {
        value = value * 8 + (inner.charCodeAt(k) - 48);
        k++;
        digits++;
      }
      if (value <= 0xff) {
        pendingBytes.push(value);
      } else {
        // Out of byte range (git never emits this): keep as a code unit.
        flushBytes();
        out += String.fromCharCode(value);
      }
      i = k - 1;
      continue;
    }
    flushBytes();
    out += named.get(next) ?? next;
    i++; // consume the escaped character as well
  }
  flushBytes();
  return out;
}
|
|
252
|
+
/**
 * Break one file's diff slice into its preamble and hunks.
 *
 * A hunk starts at every line beginning with `@@ ` and runs to the next such
 * line (or the end of the slice). Each hunk is stored as `header` (the `@@`
 * line including its newline) and `body` (everything after it). A hunk with
 * no newline at all is kept as a header with an empty body.
 *
 * A slice without any hunk (e.g. binary, rename-only or mode-only entries) is
 * returned with the whole slice as preamble and an empty hunk list.
 *
 * @param path  Path the slice belongs to; copied into the result.
 * @param slice Text from a `diff --git` header up to the next one.
 * @returns `{ path, rawSlice, preamble, hunks }`.
 */
function parseFileSlice(path, slice) {
  const starts = Array.from(slice.matchAll(/^@@ /gm), (hit) => hit.index);

  if (starts.length === 0) {
    return { path, rawSlice: slice, preamble: slice, hunks: [] };
  }

  const hunks = starts.map((from, n) => {
    const to = n + 1 < starts.length ? starts[n + 1] : slice.length;
    const text = slice.slice(from, to);
    const eol = text.indexOf('\n');
    return eol === -1
      ? { header: text, body: '' }
      : { header: text.slice(0, eol + 1), body: text.slice(eol + 1) };
  });

  return { path, rawSlice: slice, preamble: slice.slice(0, starts[0]), hunks };
}
|
|
289
|
+
/**
 * Render a parsed file with only `keptHunks`.
 *
 * When the kept count equals the file's hunk count the original slice is
 * returned verbatim; otherwise the result is the file preamble followed by
 * each kept hunk's header and body in order (just the preamble when no hunk
 * is kept).
 */
function emitFile(file, keptHunks) {
  const total = file.hunks.length;
  if (keptHunks.length === total) {
    return file.rawSlice;
  }
  return keptHunks.reduce((text, piece) => text + piece.header + piece.body, file.preamble);
}
|
|
304
|
+
// -- Path classification (hand-written, no globs) ----------------------------
|
|
305
|
+
/**
 * Path-level classification. Rules are tried in a fixed priority order and
 * the first match wins.
 *
 * @returns `'lockfile'`, `'generated'`, `'vendored'`, `'fixture'`, or `null`
 *          when no rule matches.
 */
function classifyPath(filePath) {
  const rules = [
    ['lockfile', isLockfile],
    ['generated', isGenerated],
    ['vendored', isVendored],
    ['fixture', isFixture],
  ];
  for (const [label, matches] of rules) {
    if (matches(filePath)) {
      return label;
    }
  }
  return null;
}
|
|
316
|
+
// Lower-cased basenames of the dependency lockfiles recognised by the filter.
const LOCKFILE_BASENAMES = new Set([
  'pnpm-lock.yaml',
  'package-lock.json',
  'yarn.lock',
  'cargo.lock',
  'gemfile.lock',
  'poetry.lock',
  'go.sum',
  'composer.lock',
  'uv.lock',
]);

/** True when the file's basename, compared case-insensitively, is a known lockfile name. */
function isLockfile(filePath) {
  return LOCKFILE_BASENAMES.has(basename(filePath).toLowerCase());
}
|
|
331
|
+
/**
 * True when the path looks like generated output.
 *
 * Matches:
 *   - any DIRECTORY segment named `dist` or `build`. The final segment (the
 *     file name) is deliberately excluded, so a source file or script that is
 *     itself called `build` / `dist` is not mistaken for build output;
 *   - a file name ending in `.generated.<ext>` or `.gen.<ext>` (this already
 *     covers `routeTree.gen.ts` and `schema.generated.ts` / `.sql`);
 *   - a file name ending in `.min.js` / `.min.css` (case-insensitive).
 *
 * @param filePath Repo-relative POSIX path.
 */
function isGenerated(filePath) {
  const segments = filePath.split('/');
  const fileName = segments[segments.length - 1];
  const directories = segments.slice(0, -1);

  if (directories.some((dir) => dir === 'dist' || dir === 'build')) {
    return true;
  }
  if (/\.(generated|gen)\.[a-zA-Z0-9]+$/.test(fileName)) {
    return true;
  }
  return /\.min\.(js|css)$/i.test(fileName);
}
|
|
351
|
+
/**
 * True when any `/`-separated segment of the path is exactly `vendor`,
 * `third_party`, `node_modules` or `.pnpm`.
 */
function isVendored(filePath) {
  const markers = ['vendor', 'third_party', 'node_modules', '.pnpm'];
  return filePath.split('/').some((segment) => markers.includes(segment));
}
|
|
365
|
+
/**
 * True when the path is a test fixture or snapshot: any segment is
 * `__fixtures__`, `test-fixtures` or `__snapshots__`, or the basename ends
 * in `.snap` / `.snapshot`.
 */
function isFixture(filePath) {
  const fixtureSegments = ['__fixtures__', 'test-fixtures', '__snapshots__'];
  if (filePath.split('/').some((segment) => fixtureSegments.includes(segment))) {
    return true;
  }
  const fileName = basename(filePath);
  return /\.snap$/.test(fileName) || /\.snapshot$/.test(fileName);
}
|
|
384
|
+
/** Return the text after the last `/`, or the whole string when it has no `/`. */
function basename(filePath) {
  const cut = filePath.lastIndexOf('/') + 1;
  return cut === 0 ? filePath : filePath.slice(cut);
}
|
|
388
|
+
// -- Hunk classification -----------------------------------------------------
|
|
389
|
+
/**
 * Decide whether a hunk is pure noise.
 *
 * The hunk body is split into added (`+`) and removed (`-`) lines; context
 * lines, empty lines and `\ No newline at end of file` markers are ignored.
 *
 * Lines are classified by their FIRST character only. A hunk body never
 * contains `---` / `+++` file headers (those sit in the file preamble, before
 * the first `@@`), so a body line such as `--- note` is a removed line whose
 * content is `-- note` (e.g. an SQL/Lua comment) and `+++ x` is an added line
 * `++ x`. They are counted as changes; skipping them could make a hunk look
 * whitespace-only or import-only when it is not.
 *
 * @param hunk     `{ header, body }` record of one hunk.
 * @param filePath Path of the file the hunk belongs to.
 * @returns `'whitespace-only'`, `'import-only'`, or `null` to keep the hunk.
 */
function classifyHunk(hunk, filePath) {
  const adds = [];
  const dels = [];
  for (const line of hunk.body.split('\n')) {
    if (line.length === 0) {
      continue;
    }
    const marker = line[0];
    if (marker === '+') {
      adds.push(line.slice(1));
    } else if (marker === '-') {
      dels.push(line.slice(1));
    }
    // Anything else (context ' ', '\' markers) carries no change.
  }

  if (adds.length === 0 && dels.length === 0) {
    // No changed line at all: keep rather than guess.
    return null;
  }

  // Re-indentation is only treated as noise where indentation is not
  // significant; see isIndentationSensitive.
  if (!isIndentationSensitive(filePath) && isWhitespaceOnly(adds, dels)) {
    return 'whitespace-only';
  }
  if (isImportOnly(adds, dels)) {
    return 'import-only';
  }
  return null;
}
|
|
423
|
+
// File extensions (lower-case, without the dot) for which a hunk is never
// dropped as whitespace-only, because leading whitespace can change meaning.
const INDENTATION_SENSITIVE_EXTS = new Set([
  'py',
  'pyi',
  'yaml',
  'yml',
  'hs',
  'fs',
  'fsx',
  'nim',
  'coffee',
  'sass',
  'styl',
  'pug',
  'jade',
  'haml',
  'slim',
]);

// Whole basenames (lower-case) treated the same way.
const INDENTATION_SENSITIVE_BASENAMES = new Set([
  'makefile',
  'gnumakefile',
]);

/**
 * True when the file's lower-cased basename is a listed makefile name, ends
 * in `.mk`, or has a listed extension (the text after its last dot).
 */
function isIndentationSensitive(filePath) {
  const lowered = basename(filePath).toLowerCase();
  if (INDENTATION_SENSITIVE_BASENAMES.has(lowered) || lowered.endsWith('.mk')) {
    return true;
  }
  const dotAt = lowered.lastIndexOf('.');
  return dotAt !== -1 && INDENTATION_SENSITIVE_EXTS.has(lowered.slice(dotAt + 1));
}
|
|
460
|
+
/**
 * True when the added and removed lines pair up one-to-one, in order, and
 * every pair is identical once leading and trailing whitespace is trimmed.
 *
 * Pure additions, pure removals and unequal line counts are never
 * whitespace-only. Because pairing is positional, a re-ordering of otherwise
 * identical lines does not qualify.
 */
function isWhitespaceOnly(adds, dels) {
  const pairCount = adds.length;
  if (pairCount === 0 || dels.length !== pairCount) {
    return false;
  }
  return adds.every((added, k) => added.trim() === dels[k].trim());
}
|
|
479
|
+
/**
 * True when a hunk only re-orders import statements.
 *
 * Both sides must be non-empty, every added and removed line must be
 * recognised as an import statement, and the normalised lines must form the
 * same multiset on both sides. Whole normalised lines are compared, so a
 * change to what is imported from an unchanged module still differs and the
 * hunk is kept.
 */
function isImportOnly(adds, dels) {
  if (adds.length === 0 || dels.length === 0) {
    return false;
  }
  const added = normaliseImportLines(adds);
  if (added === null) {
    return false;
  }
  const removed = normaliseImportLines(dels);
  // multisetsEqual also rejects sides of different size.
  return removed !== null && multisetsEqual(added, removed);
}
|
|
505
|
+
/**
 * Normalise a list of lines that are expected to all be import statements.
 *
 * Each line has its leading whitespace and a trailing `;` (plus any
 * whitespace after it) removed. Lines that are empty afterwards are skipped.
 *
 * @returns The normalised lines, or `null` as soon as one non-empty line is
 *          not recognised as an import statement.
 */
function normaliseImportLines(lines) {
  const kept = [];
  for (const original of lines) {
    const candidate = original.replace(/^\s+/, '').replace(/;\s*$/, '');
    if (candidate.length !== 0) {
      if (extractImportSource(candidate) === null) {
        return null;
      }
      kept.push(candidate);
    }
  }
  return kept;
}
|
|
522
|
+
/**
 * True when `a` and `b` contain the same items with the same multiplicities,
 * regardless of order.
 */
function multisetsEqual(a, b) {
  if (a.length !== b.length) {
    return false;
  }
  // Tally `a`, then cancel each item of `b` against the tally.
  const tally = new Map();
  a.forEach((item) => {
    tally.set(item, (tally.get(item) ?? 0) + 1);
  });
  for (const item of b) {
    const remaining = tally.get(item);
    if (remaining === undefined) {
      return false;
    }
    if (remaining > 1) {
      tally.set(item, remaining - 1);
    } else {
      tally.delete(item);
    }
  }
  return tally.size === 0;
}
|
|
539
|
+
/**
 * Return the module a single import statement refers to, or `null` when the
 * line is not recognised as an import.
 *
 * Patterns are tried in order:
 *   1. ES: `import ... from 'x'`, `import type ... from 'x'`, `import 'x'`;
 *   2. Python: `from X import ...`;
 *   3. Python: `import X` / `import X as Y` (nothing else on the line).
 */
function extractImportSource(line) {
  const patterns = [
    /^import\s+(?:type\s+)?(?:[^'"]+?\s+from\s+)?['"]([^'"]+)['"]/,
    /^from\s+(\S+)\s+import\s+/,
    /^import\s+(\S+)(?:\s+as\s+\S+)?\s*$/,
  ];
  for (const pattern of patterns) {
    const hit = pattern.exec(line);
    if (hit !== null) {
      return hit[1];
    }
  }
  return null;
}
|
|
556
|
+
// -- File-wide hunk-drop promotion ------------------------------------------
|
|
557
|
+
/**
 * Return the reason that occurs most often among the dropped hunks. On equal
 * counts the reason that appeared first in the list wins.
 *
 * Expects a non-empty list (the first element is read unconditionally).
 */
function mostCommonHunkReason(droppedHunks) {
  let winner = droppedHunks[0].reason;
  const tally = new Map();
  for (const { reason } of droppedHunks) {
    tally.set(reason, (tally.get(reason) ?? 0) + 1);
  }
  // A Map iterates in insertion order, i.e. first-seen order, so requiring a
  // strictly greater count keeps the earliest reason on ties.
  let top = 0;
  for (const [reason, count] of tally) {
    if (count > top) {
      winner = reason;
      top = count;
    }
  }
  return winner;
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonicalise a doc-scope value to a deduped POSIX `string[]`.
|
|
3
|
+
*
|
|
4
|
+
* - `null` / `undefined` coerce to `[]` (defensive — JSON config loaders
|
|
5
|
+
* commonly produce these at the boundary).
|
|
6
|
+
* - A single `string` wraps as `[value]`. It is NOT comma/newline-split —
|
|
7
|
+
* delimited-string splitting is a per-surface concern (e.g. Lite's
|
|
8
|
+
* `docs_path` is split in `readPipelineInputs()`), deliberately kept out
|
|
9
|
+
* of the pure algebra.
|
|
10
|
+
* - Each entry is `.trim()`-ed before further processing so `' docs '`
|
|
11
|
+
* and `'docs'` dedupe to one entry (matches `validateDocScopeEntry`'s
|
|
12
|
+
* own trim — keeps validate/normalize aligned).
|
|
13
|
+
* - Backslashes are normalised to forward slashes (the persisted dialect is
|
|
14
|
+
* POSIX). Trailing slashes are stripped; `//` runs collapse; `./` and
|
|
15
|
+
* `..` segments resolve via the inline POSIX normaliser. So `'./docs'`
|
|
16
|
+
* and `'docs'` dedupe, `'docs//api'` becomes `'docs/api'`, and
|
|
17
|
+
* `'docs/sub/../api/*.md'` becomes `'docs/api/*.md'` (which the matcher
|
|
18
|
+
* can actually match).
|
|
19
|
+
* - Entries are deduped, preserving first-occurrence order.
|
|
20
|
+
* - Entries that collapse to nothing (`''`, `'/'`, `'.'`, `'./'`) are
|
|
21
|
+
* dropped — these are tautological or empty, and `validateDocScopeEntry`
|
|
22
|
+
* would otherwise have to special-case them.
|
|
23
|
+
*
|
|
24
|
+
* Non-emptiness of the OUTPUT is NOT enforced here — that is a schema /
|
|
25
|
+
* validation concern at each surface (`docScopeSchema.min(1)`,
|
|
26
|
+
* `writeDocScope`).
|
|
27
|
+
*
|
|
28
|
+
* `normalizeDocScope` is intentionally NOT a security gate: an escape entry
|
|
29
|
+
* like `'../secrets'` survives (validation is `validateDocScopeEntry`'s
|
|
30
|
+
* job). The matcher in `isFileInDocScope` then can't match it against any
|
|
31
|
+
* real in-tree file path, so the worst-case outcome is "silent no-match,"
|
|
32
|
+
* not exfiltration.
|
|
33
|
+
*/
|
|
34
|
+
export declare function normalizeDocScope(input: string | string[] | null | undefined): string[];
|
|
35
|
+
/**
|
|
36
|
+
* Validate a single doc-scope entry. Returns `null` on success, or a
|
|
37
|
+
* human-readable error string on failure.
|
|
38
|
+
*
|
|
39
|
+
* Ports the @delfini/cli `validatePath` + `longestStaticPrefix` repo-escape
|
|
40
|
+
* technique (the richest existing implementation) — reworked to be PURE and
|
|
41
|
+
* RELATIVE-root based. `repoRootRel` is a relative marker (callers pass
|
|
42
|
+
* `'.'`); we never resolve against an absolute filesystem path or use
|
|
43
|
+
* `path.sep`.
|
|
44
|
+
*
|
|
45
|
+
* Rejects:
|
|
46
|
+
* - absolute paths (POSIX `/...` and Windows-drive `C:\...` / `C:/...`),
|
|
47
|
+
* - entries containing ASCII control characters (CR, LF, TAB, NUL, etc.)
|
|
48
|
+
* — these survive a JSON round-trip but can never be a real path; the
|
|
49
|
+
* matcher silently no-ops them, which is a worse failure mode than a
|
|
50
|
+
* loud rejection,
|
|
51
|
+
* - entries whose normalisation escapes the repo root (`../`, mid-path
|
|
52
|
+
* traversal, AND traversal hidden inside a glob portion such as
|
|
53
|
+
* `**\/../../x` — the CLI's static-prefix-only check could not catch the
|
|
54
|
+
* last case, so we normalise the FULL entry, which is strictly stronger),
|
|
55
|
+
* - empty / whitespace-only entries.
|
|
56
|
+
*
|
|
57
|
+
* NOTE: this validator is layered, not auto-invoked by `normalizeDocScope`
|
|
58
|
+
* or `isFileInDocScope`. Each surface must call it at the persistence
|
|
59
|
+
* boundary (`writeDocScope`, the Zod refine for the FR88g contract, the
|
|
60
|
+
* Web settings list-editor). Bypassing it produces silent matcher
|
|
61
|
+
* no-matches, not insecure behaviour — but callers should treat it as
|
|
62
|
+
* mandatory at user-input boundaries.
|
|
63
|
+
*/
|
|
64
|
+
export declare function validateDocScopeEntry(entry: string, repoRootRel: string): string | null;
|
|
65
|
+
/**
|
|
66
|
+
* Classify a doc-scope entry by SHAPE — a pure string heuristic, NOT a
|
|
67
|
+
* filesystem check (this module cannot `stat`):
|
|
68
|
+
* - `'glob'` — contains glob magic (decided by picomatch's own scanner, so
|
|
69
|
+
* the classification dialect matches the matching dialect).
|
|
70
|
+
* - `'dir'` — `.` / `''` (repo-root tautology), OR last segment starts
|
|
71
|
+
* with a `.` (hidden directory pattern: `.github`, `.husky`, `.vscode`,
|
|
72
|
+
* `.changeset`, etc.), OR last segment has no `.` at all.
|
|
73
|
+
* - `'file'` — not a glob, not dot-prefix, AND last segment contains a `.`
|
|
74
|
+
* (heuristic: it looks like `name.ext`).
|
|
75
|
+
*
|
|
76
|
+
* KNOWN LIMITATION: versioned directories like `docs/v1.2` are misclassified
|
|
77
|
+
* as files by the dot-in-last-segment heuristic (we'd need a real extension
|
|
78
|
+
* registry to distinguish `v1.2` from `index.md`). Users who scope a
|
|
79
|
+
* versioned doc tree should prefer an explicit glob form (e.g.
|
|
80
|
+
* `docs/v1.2/<globstar>/*.md`). The predicate's dir/file branches degrade
|
|
81
|
+
* silently here — there is no authoritative fs-expander rescue for the
|
|
82
|
+
* smart-skip path-shape use case.
|
|
83
|
+
*/
|
|
84
|
+
export declare function classifyEntry(entry: string): 'dir' | 'file' | 'glob';
|
|
85
|
+
/**
|
|
86
|
+
* True iff `filePath` falls within any entry of `scope`. Both `filePath` and
|
|
87
|
+
* the scope entries are repo-relative POSIX paths.
|
|
88
|
+
*
|
|
89
|
+
* Per-entry strategy keys off `classifyEntry`:
|
|
90
|
+
* - `'dir'` -> matches the recursive subtree (`docs` ⇒ `docs/**`).
|
|
91
|
+
* - `'file'` -> exact path match.
|
|
92
|
+
* - `'glob'` -> picomatch semantics.
|
|
93
|
+
*
|
|
94
|
+
* The predicate is PATH-SHAPE-ONLY — it does not filter by `.md` extension.
|
|
95
|
+
* The `.md`-only restriction on directory expansion belongs to the expanders
|
|
96
|
+
* (CLI `expandDocScope`, Action/Web git-trees match), which keeps this
|
|
97
|
+
* predicate usable by smart-skip on arbitrary changed-file paths.
|
|
98
|
+
*
|
|
99
|
+
* Matching is `dot: false, nocase: true`:
|
|
100
|
+
* - case-insensitive matching aligns with the CLI expander's existing
|
|
101
|
+
* `caseSensitiveMatch: false` (fs realism on Windows/macOS), so the same
|
|
102
|
+
* repo cloned across platforms returns identical in-scope decisions —
|
|
103
|
+
* the dialect-parity invariant the ADR exists to enforce. The header's
|
|
104
|
+
* "platform-independent results" promise IS the case-insensitive choice.
|
|
105
|
+
* - `dot: false` matches the CLI expander default. Dot-prefix hidden
|
|
106
|
+
* directories still match via the `classifyEntry` → `'dir'` path
|
|
107
|
+
* (entry `'.github'` becomes pattern `'.github/**'`, which picomatch
|
|
108
|
+
* matches against `.github/workflows/x.yml` even with `dot: false`
|
|
109
|
+
* because the literal `.github` prefix is present in the pattern).
|
|
110
|
+
*
|
|
111
|
+
* The `filePath` is defensively normalised: backslashes converted to
|
|
112
|
+
* forward slashes, leading `/` and `./` runs stripped, `..` segments
|
|
113
|
+
* resolved — so callers feeding webhook payloads (`/docs/a.md` from
|
|
114
|
+
* `URL.pathname`), Windows-style paths (`docs\a.md`), or composed paths
|
|
115
|
+
* (`./docs/sub/../a.md`) all collapse to the same canonical form before
|
|
116
|
+
* matching.
|
|
117
|
+
*/
|
|
118
|
+
export declare function isFileInDocScope(filePath: string, scope: string[]): boolean;
|
|
119
|
+
//# sourceMappingURL=doc-scope.d.ts.map
|