@delfini/drift-engine 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ import type { Addition, AnalysisResult, Contradiction, DocFile } from './types.js';
2
/**
 * Inclusive, 1-indexed line range of a located quote, expressed in absolute
 * lines (body line number plus the front-matter line count).
 */
+ export interface LocatedRange {
3
/** Absolute line on which the matched quote begins. */
+ start: number;
4
/** Absolute line on which the matched quote ends; equals `start` for a quote with no internal newline. */
+ end: number;
5
+ }
6
/**
 * Finds the first occurrence of `quote` in `docBody` after both have been
 * normalised (CRLF→LF, trailing spaces/tabs removed from every line).
 *
 * @param quote - Verbatim excerpt to search for.
 * @param docBody - Document body text to search in.
 * @param frontMatterLineCount - Offset added to the 1-indexed body line numbers.
 * @returns The located line range, or `null` when the normalised quote is
 *   empty or does not occur in the normalised body.
 */
+ export declare function locateQuote(quote: string, docBody: string, frontMatterLineCount: number): LocatedRange | null;
7
/** Callback the reconciliation functions invoke with a message each time they drop a finding. */
+ export type WarnFn = (message: string) => void;
8
/**
 * Partitions findings by whether they carry an applicable doc patch.
 *
 * - `proposedReplacement` that is `null` or whitespace-only → `narrativeOnly`.
 * - `proposedReplacement` equal to `quotedDocText` after line-ending /
 *   trailing-whitespace normalisation → dropped, reported through `onWarn`.
 * - Anything else → `kept`.
 *
 * @param contradictions - Findings to partition; input order is preserved in both outputs.
 * @param onWarn - Receives one message per dropped no-op finding; defaults to a no-op.
 */
+ export declare function filterActionableContradictions(contradictions: Contradiction[], onWarn?: WarnFn): {
9
/** Findings with a concrete replacement that differs from the quoted text. */
+ kept: Contradiction[];
10
/** Findings with no replacement text (`null` or whitespace-only). */
+ narrativeOnly: Contradiction[];
11
+ };
12
/**
 * Removes findings whose `[targetLineStart, targetLineEnd]` range overlaps an
 * already-kept finding on the same `targetDocPath`. Within a doc, findings are
 * considered in descending `confidence` order (ties by input order), so the
 * highest-confidence finding of each overlap wins.
 *
 * @param contradictions - Findings to dedupe.
 * @param onWarn - Receives one message per dropped finding; defaults to a no-op.
 * @returns Kept findings, grouped by doc path in first-seen order and, within
 *   each doc, ordered by descending confidence.
 */
+ export declare function dedupeOverlappingContradictions(contradictions: Contradiction[], onWarn?: WarnFn): Contradiction[];
13
/**
 * Finds the first markdown heading line (1–6 `#` markers followed by
 * whitespace) whose text, after removing the markers and surrounding
 * whitespace, equals the trimmed `anchorSection`.
 *
 * @param anchorSection - Heading text to look for, without `#` markers.
 * @param docBody - Document body text to search in.
 * @param frontMatterLineCount - Offset added to the 1-indexed body line number.
 * @returns The heading's line number, or `null` when `anchorSection` is blank
 *   or no heading matches.
 */
+ export declare function locateAnchorHeading(anchorSection: string, docBody: string, frontMatterLineCount: number): number | null;
14
/**
 * Grounds each additive finding against its target doc: findings whose
 * `targetDocPath` is not among `docs`, or whose `anchorSection` heading cannot
 * be located, are dropped; the rest are returned as copies with `anchorLine`
 * overwritten by the located line number.
 *
 * @param additions - Additive findings to ground.
 * @param docs - Analysed docs, looked up by `path`.
 * @param onWarn - Receives one message per dropped finding; defaults to a no-op.
 */
+ export declare function reconcileAdditiveAnchors(additions: Addition[], docs: DocFile[], onWarn?: WarnFn): Addition[];
15
/**
 * Grounds each finding's `quotedDocText` in its target doc: findings whose
 * `targetDocPath` is not among `docs`, or whose quote cannot be located, are
 * dropped; the rest are returned as copies with `targetLineStart` /
 * `targetLineEnd` overwritten by the located range.
 *
 * @param contradictions - Findings to ground.
 * @param docs - Analysed docs, looked up by `path`.
 * @param onWarn - Receives one message per dropped finding; defaults to a no-op.
 */
+ export declare function reconcileLineNumbers(contradictions: Contradiction[], docs: DocFile[], onWarn?: WarnFn): Contradiction[];
16
/**
 * Validates `rawJson` against the analysis schema and runs the reconciliation
 * pipeline: line-number grounding, actionability filter, overlap dedup (on the
 * actionable findings only), and additive-anchor grounding.
 *
 * @param rawJson - Unvalidated analysis output.
 * @param docs - Analysed docs the findings are grounded against.
 * @param onWarn - Receives one message per dropped finding; defaults to a no-op.
 * @returns The reconciled result; `narrativeOnlyContradictions` is present
 *   only when at least one narrative-only finding exists.
 * @throws When schema validation of `rawJson` fails.
 */
+ export declare function validateAndReconcile(rawJson: unknown, docs: DocFile[], onWarn?: WarnFn): AnalysisResult;
17
+ //# sourceMappingURL=reconcile.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reconcile.d.ts","sourceRoot":"","sources":["../src/reconcile.ts"],"names":[],"mappings":"AAyBA,OAAO,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,YAAY,CAAA;AAgBlF,MAAM,WAAW,YAAY;IAI3B,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;CACZ;AAOD,wBAAgB,WAAW,CACzB,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,MAAM,EACf,oBAAoB,EAAE,MAAM,GAC3B,YAAY,GAAG,IAAI,CAsBrB;AAED,MAAM,MAAM,MAAM,GAAG,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;AA0B9C,wBAAgB,8BAA8B,CAC5C,cAAc,EAAE,aAAa,EAAE,EAC/B,MAAM,GAAE,MAAiB,GACxB;IAAE,IAAI,EAAE,aAAa,EAAE,CAAC;IAAC,aAAa,EAAE,aAAa,EAAE,CAAA;CAAE,CAqB3D;AAuCD,wBAAgB,+BAA+B,CAC7C,cAAc,EAAE,aAAa,EAAE,EAC/B,MAAM,GAAE,MAAiB,GACxB,aAAa,EAAE,CAmCjB;AAUD,wBAAgB,mBAAmB,CACjC,aAAa,EAAE,MAAM,EACrB,OAAO,EAAE,MAAM,EACf,oBAAoB,EAAE,MAAM,GAC3B,MAAM,GAAG,IAAI,CAmBf;AAMD,wBAAgB,wBAAwB,CACtC,SAAS,EAAE,QAAQ,EAAE,EACrB,IAAI,EAAE,OAAO,EAAE,EACf,MAAM,GAAE,MAAiB,GACxB,QAAQ,EAAE,CAyBZ;AAOD,wBAAgB,oBAAoB,CAClC,cAAc,EAAE,aAAa,EAAE,EAC/B,IAAI,EAAE,OAAO,EAAE,EACf,MAAM,GAAE,MAAiB,GACxB,aAAa,EAAE,CA2BjB;AAeD,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,OAAO,EAChB,IAAI,EAAE,OAAO,EAAE,EACf,MAAM,GAAE,MAAiB,GACxB,cAAc,CAkChB"}
@@ -0,0 +1,290 @@
1
+ // Story 3.9b — orchestrator-side line-range reconciliation.
2
+ //
3
+ // architecture.md L268-269 — "Guardrail 2 — Citation Grounding (deterministic):
4
+ // every cited evidence excerpt must appear verbatim (CRLF/whitespace
5
+ // normalized) in the input docs. Findings whose citations don't ground are
6
+ // downgraded from DRIFT to NEEDS CLARIFICATION (no-fabrication principle)."
7
+ //
8
+ // MVP implementation: drop ungrounded findings (instead of synthesising a
9
+ // clarification). The clarification leg is unreachable in v1 (the orchestrator
10
+ // only emits drift) and surfacing a synthesised clarification with no
11
+ // concrete proposed_replacement re-introduces the noise this guardrail exists
12
+ // to suppress. Operators see drops via the `core.warning` callback so the
13
+ // silent-drop rate is observable in the Actions log.
14
+ //
15
+ // Mechanism:
16
+ // 1. The LLM saw doc lines prefixed with absolute line numbers (Story 3.9b
17
+ // Task 3) and emitted both `targetLineStart` / `targetLineEnd` AND a
18
+ // verbatim `quotedDocText` excerpt.
19
+ // 2. We `indexOf` the quote in the doc body (after CRLF→LF + per-line
20
+ // trailing-whitespace normalisation).
21
+ // 3. If the quote is found, derive the line range from the match and
22
+ // overwrite the LLM's claimed range. The code, not the LLM, is the
23
+ // source of truth for line numbers.
24
+ // 4. If not found, drop the finding.
25
+ import { analysisSchema } from './schema.js';
26
+ // Whitespace normalisation matched to `apps/web/src/server/reviews/compare-forgiving.ts`
27
+ // (Story 4.10): CRLF→LF and per-line trailing-whitespace trim. Keeping the
28
+ // shape symmetric means a quote that grounds in the orchestrator also matches
29
+ // the slice the FR102 commit-splicer reads — divergence here would create a
30
+ // "grounded in analysis but missing at commit-time" failure mode.
31
/**
 * Canonicalise text for quote matching: every CRLF pair becomes LF, then
 * trailing spaces and tabs are removed from each line.
 */
function normaliseLineEndings(text) {
    const unixText = text.replace(/\r\n/g, '\n');
    const trimmedLines = [];
    for (const rawLine of unixText.split('\n')) {
        trimmedLines.push(rawLine.replace(/[ \t]+$/, ''));
    }
    return trimmedLines.join('\n');
}
38
+ // Locate the verbatim quote in the doc body. Returns absolute line numbers
39
+ // (offset by `frontMatterLineCount`), or `null` if the quote isn't found.
40
+ // First-match-wins on duplicate occurrences — there's no obvious better tie-
41
+ // breaker without re-introducing the LLM line range as a hint, which defeats
42
+ // the whole point of this reconciliation pass.
43
/**
 * Locate the first verbatim occurrence of `quote` in `docBody`.
 *
 * Both strings are normalised first (CRLF→LF, per-line trailing whitespace
 * removed). The returned range is 1-indexed and shifted by
 * `frontMatterLineCount`; `null` means the normalised quote is empty or
 * absent.
 */
export function locateQuote(quote, docBody, frontMatterLineCount) {
    const haystack = normaliseLineEndings(docBody);
    const needle = normaliseLineEndings(quote);
    if (needle.length === 0) {
        return null;
    }
    const offset = haystack.indexOf(needle);
    if (offset === -1) {
        return null;
    }
    const newlinesIn = (text) => text.split('\n').length - 1;
    // Line of the match start = newlines preceding the match, plus one.
    const firstBodyLine = newlinesIn(haystack.slice(0, offset)) + 1;
    // The match ends as many lines later as the quote has internal newlines.
    const lastBodyLine = firstBodyLine + newlinesIn(needle);
    return {
        start: firstBodyLine + frontMatterLineCount,
        end: lastBodyLine + frontMatterLineCount,
    };
}
64
+ // Partition drift findings by actionability:
65
+ // - null or whitespace-only `proposedReplacement` → returned in `narrativeOnly`.
66
+ // The LLM correctly detected drift but had no concrete doc patch to
67
+ // suggest (typically because the doc rule is right and the code is the
68
+ // violation — resolution is to fix code, not docs). These are NOT
69
+ // apply-eligible but are real findings the user should see; the Skill's
70
+ // CLI report surfaces them under "Manual review required". The Action's
71
+ // hosted-review consumers ignore the `narrativeOnly` arm to preserve
72
+ // Stream 4a auto-resolve semantics (a null-replacement on the hosted
73
+ // surface would block auto-resolve forever).
74
+ // - byte-equal to `quotedDocText` after CRLF/whitespace normalisation
75
+ // → silently dropped. This is genuine no-op noise — a stale-comment
76
+ // artefact where the LLM senses contradiction on a code comment but the
77
+ // doc is already correct. Accepting it would produce no diff. Drop is
78
+ // logged via `onWarn` for observability but the finding is discarded
79
+ // (not surfaced as narrative-only) because there is nothing to convey
80
+ // to the user — the doc and the proposed wording are the same string.
81
+ //
82
+ // Drops in the byte-equal branch are surfaced via `onWarn` (operator-
83
+ // observable in the Actions log) for the same reason `reconcileLineNumbers`
84
+ // warns on ungrounded drops — silent-drop rate is the only signal that the
85
+ // LLM is producing this shape. Null/whitespace drops do NOT warn — they
86
+ // flow through `narrativeOnly` instead, which is the operator-observable
87
+ // surface for that case.
88
/**
 * Split findings into apply-eligible (`kept`) and `narrativeOnly`.
 *
 * A finding is narrative-only when its `proposedReplacement` is `null` or
 * blank. A finding whose replacement equals its `quotedDocText` after
 * normalisation would produce no diff, so it is discarded and reported via
 * `onWarn` (with an 80-character, single-line preview of the quote).
 */
export function filterActionableContradictions(contradictions, onWarn = () => { }) {
    const kept = [];
    const narrativeOnly = [];
    for (const finding of contradictions) {
        const replacement = finding.proposedReplacement;
        const hasPatch = replacement !== null && replacement.trim() !== '';
        if (!hasPatch) {
            narrativeOnly.push(finding);
            continue;
        }
        const isNoOp = normaliseLineEndings(replacement) === normaliseLineEndings(finding.quotedDocText);
        if (!isNoOp) {
            kept.push(finding);
            continue;
        }
        const quote = finding.quotedDocText;
        const preview = quote.slice(0, 80).replace(/\n/g, ' ');
        onWarn(`Reconciliation dropped finding for "${finding.targetDocPath}" — proposedReplacement is byte-equal to quotedDocText (no-op; preview: "${preview}${quote.length > 80 ? '…' : ''}").`);
    }
    return { kept, narrativeOnly };
}
106
+ // Drop overlapping contradictions before they ship to the platform. The
107
+ // downstream splicer (`findOverlappingRanges` in splice-helpers.ts) rejects
108
+ // the entire Approve-and-Commit batch when any two findings on the same doc
109
+ // share an overlapping `[targetLineStart..targetLineEnd]` range — by design,
110
+ // because applying two replacements to the same span clobbers each other.
111
+ //
112
+ // Observed Sonnet-4.6 behaviour: when a section block contains multiple
113
+ // conceptual drifts (e.g. the 3-line TanStack stack block where each line is
114
+ // a separate drift target), the model emits N findings ALL targeting the
115
+ // same line range, each with its own proposed replacement. The reviewer
116
+ // hits "Approve and Commit" → splicer rejects the whole batch. The result
117
+ // is a dead-end for the demo.
118
+ //
119
+ // Strategy: keep the highest-confidence finding per overlap group; drop the
120
+ // rest with a warning so operators see the silent-drop rate in the Actions
121
+ // log. Tie-break by first-seen so behaviour is deterministic for fixtures.
122
+ //
123
+ // This is a SAFETY NET, not a model fix. The prompt instructs the LLM to
124
+ // emit one consolidated finding per overlapping span, but compliance is
125
+ // best-effort. The dedup pass guarantees the platform never receives
126
+ // findings that the splicer will reject.
127
+ //
128
+ // Story 4.26 AC6 (Path B) — additive findings are NOT deduped here. The
129
+ // canonical gate is `findOverlappingRanges` in `apps/web/src/server/reviews/
130
+ // lib/splice-helpers.ts`, which runs server-side after FR88d and before any
131
+ // DB write. Its tests at `apps/web/src/server/reviews/lib/__tests__/
132
+ // splice-helpers.test.ts` (Story 4.25 — "Story 4.25 — additive findings
133
+ // expand the conflict taxonomy" describe block at L396+) cover the three
134
+ // additive overlap classes from the AC2 prompt rewrite:
135
+ // 1. additive anchor inside a drift range — rejected
136
+ // 2. additive anchors at the same line with the SAME insertionMode —
137
+ // rejected (ambiguous)
138
+ // 3. additive anchors at the same line with DIFFERENT insertionMode
139
+ // values ('before' vs 'after') — accepted (deterministic before/after)
140
+ // Adding a sibling additive-dedup pass at the orchestrator was considered
141
+ // (AC6 Path A) but rejected for the lower-lift / single-canonical-gate
142
+ // posture — the splicer already catches every overlap the LLM can emit.
143
/**
 * Keep at most one finding per overlapping line range within each doc.
 *
 * Findings are bucketed by `targetDocPath`; inside a bucket they are visited
 * from highest to lowest `confidence` (input order breaks ties). A finding is
 * dropped, with an `onWarn` message, when its inclusive
 * `[targetLineStart, targetLineEnd]` range intersects one already kept.
 */
export function dedupeOverlappingContradictions(contradictions, onWarn = () => { }) {
    // Bucket by doc path; Map iteration preserves first-seen path order.
    const buckets = new Map();
    for (const finding of contradictions) {
        const bucket = buckets.get(finding.targetDocPath);
        if (bucket === undefined) {
            buckets.set(finding.targetDocPath, [finding]);
        }
        else {
            bucket.push(finding);
        }
    }
    const survivors = [];
    for (const bucket of buckets.values()) {
        const ranked = bucket.map((finding, position) => ({ finding, position }));
        ranked.sort((left, right) => {
            if (right.finding.confidence !== left.finding.confidence)
                return right.finding.confidence - left.finding.confidence;
            return left.position - right.position;
        });
        const winners = [];
        for (const { finding } of ranked) {
            let blocker;
            for (const winner of winners) {
                if (finding.targetLineStart <= winner.targetLineEnd && finding.targetLineEnd >= winner.targetLineStart) {
                    blocker = winner;
                    break;
                }
            }
            if (blocker) {
                onWarn(`Reconciliation dropped overlapping finding on "${finding.targetDocPath}" lines ${finding.targetLineStart}-${finding.targetLineEnd} — already covered by a higher-confidence finding at lines ${blocker.targetLineStart}-${blocker.targetLineEnd}.`);
            }
            else {
                winners.push(finding);
            }
        }
        for (const winner of winners) {
            survivors.push(winner);
        }
    }
    return survivors;
}
173
+ // Story 4.25 — locate an anchor section heading in the doc body and return
174
+ // its absolute line number. Mirrors `locateQuote`'s contract (1-indexed,
175
+ // offset by frontMatterLineCount, first-match-wins) but searches for a
176
+ // markdown heading line whose visible text equals the anchor section. We
177
+ // match any line that strips down to the anchor text after removing leading
178
+ // `#` markers and surrounding whitespace — that lets the LLM emit
179
+ // "Technology Stack & Versions" and ground against `## Technology Stack &
180
+ // Versions` in the doc.
181
/**
 * Locate the first markdown ATX heading whose visible text equals
 * `anchorSection` and return its absolute line number.
 *
 * Contract (mirrors `locateQuote`): 1-indexed, offset by
 * `frontMatterLineCount`, first-match-wins; `null` when the anchor is blank
 * or no heading matches.
 *
 * Two markdown details are honoured so the anchor grounds on a real heading:
 *  - Lines inside fenced code blocks (``` or ~~~) are skipped. A shell or
 *    Python comment such as `# Setup` inside a fence is not a heading, and
 *    matching it would anchor the insertion inside a code sample.
 *  - An optional ATX closing sequence (`## Title ##`) is not part of the
 *    heading text, so `Title` grounds against it. The literal remainder
 *    (`Title ##`) is still accepted, so anchors that matched before keep
 *    matching.
 *
 * @param anchorSection Heading text to find, without leading `#` markers.
 * @param docBody Document body to search.
 * @param frontMatterLineCount Offset added to the body line number.
 * @returns Absolute line number of the heading, or `null`.
 */
export function locateAnchorHeading(anchorSection, docBody, frontMatterLineCount) {
    const wantedText = anchorSection.trim();
    if (wantedText.length === 0)
        return null;
    const lines = normaliseLineEndings(docBody).split('\n');
    const HEADING_RE = /^\s*#{1,6}\s/;
    // Fence detection is deliberately as indentation-tolerant as HEADING_RE so
    // that fences nested in list items are recognised too.
    const FENCE_RE = /^\s*(`{3,}|~{3,})(.*)$/;
    // Marker run (e.g. "```") of the currently open fence, or null when outside.
    let openFence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const fence = FENCE_RE.exec(line);
        if (openFence !== null) {
            // A fence closes on a run of the same character, at least as long
            // as the opener, with nothing after it.
            if (fence !== null &&
                fence[1][0] === openFence[0] &&
                fence[1].length >= openFence.length &&
                fence[2].trim() === '') {
                openFence = null;
            }
            continue;
        }
        if (fence !== null) {
            // A backtick run followed by more backticks on the same line is
            // inline code, not a fence opener.
            const isInlineCode = fence[1][0] === '`' && fence[2].includes('`');
            if (!isInlineCode) {
                openFence = fence[1];
            }
            continue;
        }
        // Require an actual heading prefix: `replace` below returns the line
        // unchanged when it doesn't match, so without this gate a prose line
        // equal to the anchor (e.g. a bare TOC entry) would match.
        if (!HEADING_RE.test(line))
            continue;
        const stripped = line.replace(/^\s*#{1,6}\s*/, '').trim();
        const withoutClosingSequence = stripped.replace(/\s+#+$/, '').trim();
        if (stripped === wantedText || withoutClosingSequence === wantedText) {
            return i + 1 + frontMatterLineCount;
        }
    }
    return null;
}
203
+ // Story 4.25 — drop additive findings whose anchor heading can't be located
204
+ // in the doc body (no-fabrication invariant); overwrite `anchorLine` with
205
+ // the located absolute line number for findings whose anchor IS located.
206
+ // The LLM emits a section heading text; the orchestrator derives the line.
207
/**
 * Ground additive findings against the analysed docs.
 *
 * For each finding the target doc is looked up by path and the anchor heading
 * is located in its content. Findings with an unknown doc path or an
 * unlocatable heading are dropped (one `onWarn` message each); the others are
 * returned as shallow copies whose `anchorLine` is the located line number.
 */
export function reconcileAdditiveAnchors(additions, docs, onWarn = () => { }) {
    return additions.flatMap((addition) => {
        const doc = docs.find((candidate) => candidate.path === addition.targetDocPath);
        if (!doc) {
            onWarn(`Reconciliation dropped additive finding for unknown doc path "${addition.targetDocPath}" (no doc with this path was analysed).`);
            return [];
        }
        const anchorLine = locateAnchorHeading(addition.anchorSection, doc.content, doc.frontMatterLineCount);
        if (anchorLine !== null) {
            return [{ ...addition, anchorLine }];
        }
        const heading = addition.anchorSection;
        const preview = heading.slice(0, 80);
        onWarn(`Reconciliation dropped additive finding for "${addition.targetDocPath}" — anchor section heading not found in doc body (preview: "${preview}${heading.length > 80 ? '…' : ''}").`);
        return [];
    });
}
225
+ // Reconcile each contradiction's line range against the doc body via
226
+ // `locateQuote(quotedDocText)`. Findings whose quote can't be located are
227
+ // dropped (returned list is shorter than input). Findings whose quote is
228
+ // located have their `targetLineStart` / `targetLineEnd` overwritten — the
229
+ // LLM's emitted numbers become advisory.
230
/**
 * Ground each finding's quote in its target doc and rewrite its line range.
 *
 * Findings with an unknown doc path or a quote that cannot be located are
 * dropped (one `onWarn` message each). The others are returned as shallow
 * copies whose `targetLineStart` / `targetLineEnd` come from the located
 * match, replacing whatever range the finding arrived with.
 */
export function reconcileLineNumbers(contradictions, docs, onWarn = () => { }) {
    return contradictions.flatMap((finding) => {
        const doc = docs.find((candidate) => candidate.path === finding.targetDocPath);
        if (!doc) {
            onWarn(`Reconciliation dropped finding for unknown doc path "${finding.targetDocPath}" (no doc with this path was analysed).`);
            return [];
        }
        const range = locateQuote(finding.quotedDocText, doc.content, doc.frontMatterLineCount);
        if (range !== null) {
            return [{ ...finding, targetLineStart: range.start, targetLineEnd: range.end }];
        }
        // Log only a short, single-line preview: the quote is arbitrary doc
        // content and the log this ends up in may be public.
        const quote = finding.quotedDocText;
        const preview = quote.slice(0, 80).replace(/\n/g, ' ');
        onWarn(`Reconciliation dropped finding for "${finding.targetDocPath}" — quotedDocText not found in doc body (preview: "${preview}${quote.length > 80 ? '…' : ''}").`);
        return [];
    });
}
254
+ // Public surface — composed reconciliation entry point for the analysis core.
255
+ // Validates the LLM's raw JSON output against `analysisSchema`, then runs the
256
+ // four-stage reconciliation pipeline (line-number grounding, actionability
257
+ // filter, overlap dedup, additive-anchor grounding) in the same order the
258
+ // Action's orchestrator used pre-extraction.
259
+ //
260
+ // Throws on schema-validation failure (`analysisSchema.parse` throws). The
261
+ // `@delfini/cli` `local-finalize` command keys exit-code 3 off this throw via
262
+ // FR145's one-retry contract.
263
+ //
264
+ // `onWarn` defaults to a no-op. The Action wires `core.warning` so dropped
265
+ // findings (ungrounded quotes, unactionable replacements, overlap losers,
266
+ // missing additive anchors) show up in the Actions log.
267
/**
 * Validate raw analysis output and run the full reconciliation pipeline.
 *
 * Stages, in order: schema validation (throws on failure), line-number
 * grounding, actionability partition, overlap dedup of the actionable
 * findings, additive-anchor grounding. Every stage reports its drops through
 * `onWarn`.
 *
 * Narrative-only findings are passed through without dedup, and the
 * `narrativeOnlyContradictions` key is attached only when there is at least
 * one, so the result shape is unchanged when there are none.
 */
export function validateAndReconcile(rawJson, docs, onWarn = () => { }) {
    const parsed = analysisSchema.parse(rawJson);
    const grounded = reconcileLineNumbers(parsed.contradictions, docs, onWarn);
    const partition = filterActionableContradictions(grounded, onWarn);
    const result = {
        contradictions: dedupeOverlappingContradictions(partition.kept, onWarn),
        additions: reconcileAdditiveAnchors(parsed.additions, docs, onWarn),
        rawConfidence: parsed.rawConfidence,
    };
    if (partition.narrativeOnly.length > 0) {
        result.narrativeOnlyContradictions = partition.narrativeOnly;
    }
    return result;
}
@@ -0,0 +1,73 @@
1
+ import type { DocFile } from './types.js';
2
/** Relevance of one doc to a diff: the total score plus the per-signal contributions it is the sum of. */
+ export interface DocRelevanceScore {
3
/** Path of the scored doc. */
+ path: string;
4
/** Sum of the four `breakdown` values. */
+ score: number;
5
/** Contribution of each individual signal. */
+ breakdown: {
6
/** 20 when the doc's own path appears in a `diff --git` header, otherwise 0. */
+ docPathInDiff: number;
7
/** 10 for every file path from the diff headers that the doc content mentions. */
+ fileOverlap: number;
8
/** 3 for every diff identifier that occurs in the doc content, capped at 30. */
+ identifierOverlap: number;
9
/** 5 for every heading line that contains at least one diff identifier. */
+ headingOverlap: number;
10
+ };
11
+ }
12
/** Outcome of `selectRelevantDocs`. */
+ export interface SelectRelevantDocsResult {
13
/** Docs that met the threshold (or every doc when filtering is bypassed), in input order. */
+ kept: DocFile[];
14
/** Score records of the docs that fell below the threshold, in input order. */
+ dropped: DocRelevanceScore[];
15
+ }
16
/**
 * Scores how relevant `doc` is to `diff` by summing four signals: doc path in
 * a diff header, diff file paths mentioned in the doc, diff identifiers
 * occurring in the doc, and doc headings containing diff identifiers.
 *
 * @param doc - Doc to score (its `path` and `content` are read).
 * @param diff - Diff text the doc is scored against.
 * @returns The total score with its per-signal breakdown.
 */
+ export declare function scoreDocRelevance(doc: DocFile, diff: string): DocRelevanceScore;
17
/** A heading-delimited slice of a doc, as produced by `splitIntoSections`. */
+ export interface DocSection {
18
/** The section's lines as split on `\n`; for a headed section the first entry is the heading line. */
+ lines: string[];
19
/** 0-based index of the section's first line within the content's `\n`-split line array. */
+ startLineIndex: number;
20
+ }
21
/** Record of a section that was not selected. */
+ export interface DroppedSection {
22
/** `startLineIndex` of the dropped section. */
+ startLineIndex: number;
23
/** Relevance score the section received. */
+ score: number;
24
+ /**
25
+ * Set by `rankedFillSections` (Story P3.7.3) when the dropped record is
26
+ * cross-doc (ranked-fill operates over a flat candidate list from any
27
+ * number of docs). `selectRelevantSections` operates per-doc and leaves
28
+ * this undefined — its caller already knows which doc the section came
29
+ * from.
30
+ */
31
+ docPath?: string;
32
+ }
33
/** Outcome of `selectRelevantSections`. */
+ export interface SelectRelevantSectionsResult {
34
/** Sections that met the threshold (or every section when filtering is bypassed), in document order. */
+ kept: DocSection[];
35
/** Records of the sections that fell below the threshold, in document order. */
+ dropped: DroppedSection[];
36
+ }
37
/**
 * Splits `content` on `\n` into heading-delimited sections. A new section
 * starts at every line beginning with 1–6 `#` characters followed by
 * whitespace; lines before the first heading form a leading section.
 *
 * @param content - Doc content to split.
 * @returns The sections in document order; always at least one, even for empty content.
 */
+ export declare function splitIntoSections(content: string): DocSection[];
38
+ /**
39
+ * Select the heading-delimited sections of a single doc whose relevance score
40
+ * is at/above `threshold`. Mirrors `selectRelevantDocs`:
41
+ * - inclusive lower bound (`score >= threshold` keeps the section)
42
+ * - `threshold <= 0` / non-finite → keep every section (no-op fast-path)
43
+ *
44
+ * Each section reuses the same four signals as `scoreDocRelevance`, applied to
45
+ * the section content — EXCEPT `docPathInDiff`, which is a whole-document
46
+ * property (the doc itself was edited in the diff). It is computed once per doc
47
+ * and added to every section's score, so a doc that appears in the diff header
48
+ * is retained whole.
 *
 * @param doc - Doc whose `content` is split into sections and whose `path` feeds `docPathInDiff`.
 * @param diff - Diff text the sections are scored against.
 * @param threshold - Minimum score (inclusive) a section needs to be kept.
 * @returns Kept sections and dropped-section records, both in document order.
49
+ */
50
+ export declare function selectRelevantSections(doc: DocFile, diff: string, threshold: number): SelectRelevantSectionsResult;
51
/** One section competing for inclusion in `rankedFillSections`. */
+ export interface RankedFillCandidate {
52
/** Doc the section belongs to; its `path` is the second ranking key. */
+ doc: DocFile;
53
/** The section itself; its `startLineIndex` is the third ranking key. */
+ section: DocSection;
54
/** Score of the section; the primary ranking key (higher ranks first). */
+ score: number;
55
+ }
56
/** Outcome of `rankedFillSections`; both arrays preserve the input order of the candidates. */
+ export interface RankedFillResult {
57
/** Candidates that fit within the budget (or every candidate when no budget applies). */
+ included: RankedFillCandidate[];
58
/** Candidates that did not fit within the budget. */
+ dropped: RankedFillCandidate[];
59
+ }
60
/**
 * Packs candidates into a budget in rank order: score descending, then doc
 * path ascending (plain `<` string comparison), then section start line
 * ascending. A candidate is included when its measured cost still fits the
 * remaining budget; one that does not fit is skipped and lower-ranked
 * candidates are still tried.
 *
 * @param candidates - Sections to choose from.
 * @param budgetTokens - Total cost allowed; a non-finite or non-positive value
 *   includes every candidate without calling `measure`.
 * @param measure - Returns the cost of a candidate.
 * @returns Included and dropped candidates, each in input order.
 */
+ export declare function rankedFillSections(candidates: RankedFillCandidate[], budgetTokens: number, measure: (candidate: RankedFillCandidate) => number): RankedFillResult;
61
+ /**
62
+ * Filter docs by relevance score against the diff. Docs with
63
+ * `score >= threshold` are kept; docs below are dropped.
64
+ *
65
+ * The threshold is INCLUSIVE on the lower bound — a doc scoring exactly the
66
+ * threshold value is kept, not dropped.
67
+ *
68
+ * Fast-path: `threshold <= 0` or non-finite (NaN, Infinity) returns every
69
+ * doc in `kept` with `dropped` empty. This makes the function observably
70
+ * no-op for the default `buildPrompt` call path (NFR44 parity).
 *
 * @param docs - Docs to filter; the array itself is never mutated.
 * @param diff - Diff text each doc is scored against.
 * @param threshold - Minimum score (inclusive) a doc needs to be kept.
 * @returns Kept docs and the score records of dropped docs, both in input order.
71
+ */
72
+ export declare function selectRelevantDocs(docs: DocFile[], diff: string, threshold: number): SelectRelevantDocsResult;
73
+ //# sourceMappingURL=relevance.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"relevance.d.ts","sourceRoot":"","sources":["../src/relevance.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAA;AAEzC,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAA;IACZ,KAAK,EAAE,MAAM,CAAA;IACb,SAAS,EAAE;QACT,aAAa,EAAE,MAAM,CAAA;QACrB,WAAW,EAAE,MAAM,CAAA;QACnB,iBAAiB,EAAE,MAAM,CAAA;QACzB,cAAc,EAAE,MAAM,CAAA;KACvB,CAAA;CACF;AAED,MAAM,WAAW,wBAAwB;IACvC,IAAI,EAAE,OAAO,EAAE,CAAA;IACf,OAAO,EAAE,iBAAiB,EAAE,CAAA;CAC7B;AAED,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,GAAG,iBAAiB,CAe/E;AAoHD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,cAAc,EAAE,MAAM,CAAA;CACvB;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,MAAM,CAAA;IACtB,KAAK,EAAE,MAAM,CAAA;IACb;;;;;;OAMG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB;AAED,MAAM,WAAW,4BAA4B;IAC3C,IAAI,EAAE,UAAU,EAAE,CAAA;IAClB,OAAO,EAAE,cAAc,EAAE,CAAA;CAC1B;AAQD,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE,CAe/D;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,sBAAsB,CACpC,GAAG,EAAE,OAAO,EACZ,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,GAChB,4BAA4B,CAwB9B;AAmBD,MAAM,WAAW,mBAAmB;IAClC,GAAG,EAAE,OAAO,CAAA;IACZ,OAAO,EAAE,UAAU,CAAA;IACnB,KAAK,EAAE,MAAM,CAAA;CACd;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,mBAAmB,EAAE,CAAA;IAC/B,OAAO,EAAE,mBAAmB,EAAE,CAAA;CAC/B;AAED,wBAAgB,kBAAkB,CAChC,UAAU,EAAE,mBAAmB,EAAE,EACjC,YAAY,EAAE,MAAM,EACpB,OAAO,EAAE,CAAC,SAAS,EAAE,mBAAmB,KAAK,MAAM,GAClD,gBAAgB,CAyDlB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,kBAAkB,CAChC,IAAI,EAAE,OAAO,EAAE,EACf,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,GAChB,wBAAwB,CAe1B"}
@@ -0,0 +1,266 @@
1
+ // packages/drift-engine/src/relevance.ts
2
+ //
3
+ // Doc-relevance scoring + selection. Pure-logic — no I/O, no clock, no
4
+ // randomness. Consumed by `buildPrompt` when `BuildPromptOptions.relevanceThreshold`
5
+ // is set; bypassed entirely at default (NFR44 parity preserved).
6
+ //
7
+ // Internal helper — not exposed via `index.ts`. Tests reach it via the
8
+ // relative `../src/relevance.js` import, same convention as `reconcile`
9
+ // internal helpers.
10
/**
 * Score how relevant a doc is to a diff.
 *
 * Four independent signals are computed from the doc's path/content and the
 * diff's file paths/identifiers; the total is their plain sum. The individual
 * contributions are returned alongside the total as `breakdown`.
 */
export function scoreDocRelevance(doc, diff) {
    const pathsInDiff = extractDiffFilePaths(diff);
    const identifiersInDiff = extractIdentifiers(diff);
    const { path, content } = doc;
    const breakdown = {
        docPathInDiff: scoreDocPathInDiff(path, diff),
        fileOverlap: scoreFileOverlap(content, pathsInDiff),
        identifierOverlap: scoreIdentifierOverlap(content, identifiersInDiff),
        headingOverlap: scoreHeadingOverlap(content, identifiersInDiff),
    };
    let score = 0;
    for (const contribution of Object.values(breakdown)) {
        score += contribution;
    }
    return { path, score, breakdown };
}
25
/**
 * Award 5 points for every heading line (1–6 `#` followed by whitespace) that
 * contains at least one identifier also present in `diffIdentifiers`. A
 * heading counts once no matter how many of its identifiers match.
 */
function scoreHeadingOverlap(docContent, diffIdentifiers) {
    const headingLines = docContent
        .split(/\r?\n/)
        .filter((line) => /^#{1,6}\s+/.test(line));
    const matchingHeadings = headingLines.filter((heading) => [...extractIdentifiers(heading)].some((ident) => diffIdentifiers.has(ident)));
    return matchingHeadings.length * 5;
}
41
/**
 * Return 20 when `docPath` is the a-side or the b-side path of a
 * `diff --git a/<path> b/<path>` header line in `diff`, otherwise 0.
 *
 * Both sides are checked so that a doc renamed TO `docPath` (header
 * `a/old.md b/new.md`) is recognised as edited, not only a doc renamed away
 * from it. The match is anchored to the start of a line, and the b-side to
 * the end of the line, so header-looking text inside diff content and longer
 * paths that merely start with `docPath` do not count.
 *
 * @param docPath Repository-relative path of the doc.
 * @param diff Unified diff text.
 * @returns 20 on a header match, 0 otherwise.
 */
function scoreDocPathInDiff(docPath, diff) {
    // Escape regex metacharacters so the path is matched literally.
    const escaped = docPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // a-side: the path is followed by " b/". b-side: the path runs to the end
    // of the header line (tolerating a CR before the newline).
    const pattern = new RegExp(`^diff --git (?:a/${escaped} b/|a/.* b/${escaped}\\r?$)`, 'm');
    return pattern.test(diff) ? 20 : 0;
}
47
+ // Extracts file paths from `diff --git a/<path> b/<path>` headers where both
48
+ // sides match. Known limitations (deliberate scope for Tier 2):
49
+ // - Renames are NOT extracted (`a/old.ts b/new.ts` — sides differ)
50
+ // - Binary-file headers ARE matched on the diff line (harmless — binary
51
+ // paths rarely appear verbatim in .md docs)
52
+ // - Paths containing whitespace (quoted in git's diff format) are NOT
53
+ // extracted — `\S+` cannot span the space; out of scope for Tier 2
54
/**
 * Collect the file paths of every `diff --git a/<path> b/<path>` header in
 * `diff` whose two sides are identical.
 *
 * The b-side is anchored to the end of the header line. Without that anchor
 * the backreference matches a mere prefix, so a rename such as
 * `a/src/pay.ts b/src/pay.ts.bak` would be reported as `src/pay.ts` even
 * though renames are meant to be skipped. A CR before the newline is
 * tolerated.
 *
 * @param diff Unified diff text.
 * @returns Set of paths, in order of first appearance.
 */
function extractDiffFilePaths(diff) {
    const paths = new Set();
    const headerPattern = /^diff --git a\/(\S+) b\/\1\r?$/gm;
    for (const header of diff.matchAll(headerPattern)) {
        paths.add(header[1]);
    }
    return paths;
}
63
+ // Common keywords we filter out to avoid false positives. Not an exhaustive
64
+ // list — just the highest-frequency offenders. The 3-char minimum already
65
+ // drops `if`, `do`, `or`, `is`, etc.
66
/**
 * Lower-case tokens that `extractIdentifiers` discards because they are too
 * common to signal relevance. Lookups are done on the lower-cased token.
 */
const COMMON_KEYWORDS = new Set([
    // English function words.
    'the', 'and', 'for', 'with', 'this', 'that', 'from', 'into', 'have', 'has',
    'are', 'was', 'were', 'will', 'can', 'not', 'but', 'all', 'any',
    // Counting words and short generic verbs / adjectives.
    'one', 'two', 'use', 'add', 'get', 'set', 'put', 'new', 'old', 'now',
    // Declaration and module keywords.
    'let', 'var', 'const', 'function', 'return', 'import', 'export',
    // Literals and built-in type names.
    'true', 'false', 'null', 'undefined', 'void', 'string', 'number',
    'boolean', 'object', 'array', 'type', 'interface',
]);
74
/**
 * Extract the distinct identifier-like tokens of `text`.
 *
 * A token starts with a letter or underscore and continues with letters,
 * digits, `_` or `-`, so camelCase, PascalCase, snake_case and kebab-case
 * names are each captured whole. A leading `-` is never part of a token.
 *
 * Trailing hyphens are stripped before the token is judged, so `count--`
 * yields `count`, and `i--` or `the-` are rejected by the length / keyword
 * checks instead of surviving as junk tokens that can never match a doc.
 *
 * Tokens shorter than 3 characters, and tokens whose lower-cased form is in
 * `COMMON_KEYWORDS`, are discarded.
 *
 * @param text Text to tokenise (a diff, or a heading line).
 * @returns Set of surviving tokens, in order of first appearance.
 */
function extractIdentifiers(text) {
    const tokenPattern = /[a-zA-Z_][a-zA-Z0-9_-]{2,}/g;
    const out = new Set();
    for (const match of text.matchAll(tokenPattern)) {
        // `-` is a separator at the end of a token just as it is at the front.
        const token = match[0].replace(/-+$/, '');
        if (token.length < 3)
            continue;
        if (!COMMON_KEYWORDS.has(token.toLowerCase())) {
            out.add(token);
        }
    }
    return out;
}
91
/**
 * Award 3 points for every diff identifier that occurs anywhere in the doc
 * content, capped at 30.
 *
 * The check is a plain substring test (no word boundaries), so an identifier
 * also counts when it appears inside a longer name; the cap limits the noise
 * that can add.
 */
function scoreIdentifierOverlap(docContent, diffIdentifiers) {
    const hits = [...diffIdentifiers].filter((ident) => docContent.includes(ident));
    return Math.min(hits.length * 3, 30);
}
103
/**
 * Award 10 points for every diff file path that the doc content mentions.
 *
 * A mention only counts when the path is not immediately continued: the next
 * character must not be a word character, `/` or `-`, nor a `.` followed by a
 * word character. So `src/pay.ts` is not found in `src/pay.ts.map`,
 * `src/payments` or `src/pay.ts-old`, but is found in `src/pay.ts.` at the
 * end of a sentence.
 */
function scoreFileOverlap(docContent, diffFilePaths) {
    let mentionedPaths = 0;
    for (const filePath of diffFilePaths) {
        // Escape regex metacharacters so the path is matched literally.
        const literal = filePath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const mention = new RegExp(literal + '(?![\\w/-])(?!\\.\\w)');
        if (mention.test(docContent)) {
            mentionedPaths += 1;
        }
    }
    return mentionedPaths * 10;
}
120
+ // Split a doc body into heading-delimited sections. Splitting on the single
121
+ // char `\n` (not `/\r?\n/`) is lossless — `sections.flatMap(s => s.lines)
122
+ // .join('\n')` reconstructs `content` byte-for-byte (a CRLF line keeps its
123
+ // trailing `\r` as the last char of its line element). Content before the
124
+ // first heading is a leading section; an all-body doc is one leading section;
125
+ // empty content is one empty section. Nothing is dropped on the floor.
126
/**
 * Split doc content into heading-delimited sections.
 *
 * The content is split on `\n` only, so a CRLF line keeps its `\r` and
 * joining every section's lines with `\n` reproduces the input. A line
 * starting with 1–6 `#` followed by whitespace opens a new section; any lines
 * before the first heading form a leading section. Empty content yields one
 * section holding a single empty line.
 */
export function splitIntoSections(content) {
    const allLines = content.split('\n');
    const sections = [];
    for (let index = 0; index < allLines.length; index++) {
        const line = allLines[index];
        const opensSection = /^#{1,6}\s+/.test(line) || sections.length === 0;
        if (opensSection) {
            sections.push({ lines: [line], startLineIndex: index });
        }
        else {
            sections[sections.length - 1].lines.push(line);
        }
    }
    return sections;
}
146
+ /**
147
+ * Select the heading-delimited sections of a single doc whose relevance score
148
+ * is at/above `threshold`. Mirrors `selectRelevantDocs`:
149
+ * - inclusive lower bound (`score >= threshold` keeps the section)
150
+ * - `threshold <= 0` / non-finite → keep every section (no-op fast-path)
151
+ *
152
+ * Each section reuses the same four signals as `scoreDocRelevance`, applied to
153
+ * the section content — EXCEPT `docPathInDiff`, which is a whole-document
154
+ * property (the doc itself was edited in the diff). It is computed once per doc
155
+ * and added to every section's score, so a doc that appears in the diff header
156
+ * is retained whole.
157
+ */
158
/**
 * Keep the sections of one doc whose relevance score reaches `threshold`.
 *
 * A non-finite or non-positive threshold bypasses scoring and keeps every
 * section. Otherwise each section's score is the doc-level `docPathInDiff`
 * signal (computed once) plus the file, identifier and heading overlap of
 * the section's own text; `score >= threshold` keeps the section, anything
 * lower is recorded in `dropped` with its start line and score.
 */
export function selectRelevantSections(doc, diff, threshold) {
    const sections = splitIntoSections(doc.content);
    const filteringEnabled = Number.isFinite(threshold) && threshold > 0;
    if (!filteringEnabled) {
        return { kept: sections, dropped: [] };
    }
    const pathsInDiff = extractDiffFilePaths(diff);
    const identifiersInDiff = extractIdentifiers(diff);
    const wholeDocBonus = scoreDocPathInDiff(doc.path, diff);
    const result = { kept: [], dropped: [] };
    for (const section of sections) {
        const text = section.lines.join('\n');
        const score = wholeDocBonus +
            scoreFileOverlap(text, pathsInDiff) +
            scoreIdentifierOverlap(text, identifiersInDiff) +
            scoreHeadingOverlap(text, identifiersInDiff);
        if (score < threshold) {
            result.dropped.push({ startLineIndex: section.startLineIndex, score });
        }
        else {
            result.kept.push(section);
        }
    }
    return result;
}
183
/**
 * Choose which candidates fit into a token budget, best-ranked first.
 *
 * - A non-finite or non-positive `budgetTokens` expresses no constraint:
 *   every candidate is included and `measure` is never called.
 * - Otherwise candidates are visited by score descending, then doc path
 *   ascending, then section start line ascending. Paths are compared with
 *   `<` (code units) rather than `localeCompare`, so the ranking does not
 *   depend on the host's locale.
 * - A candidate is included when its measured cost still fits; one that does
 *   not fit is skipped and lower-ranked candidates are still tried.
 *
 * Both `included` and `dropped` list candidates in their input order.
 */
export function rankedFillSections(candidates, budgetTokens, measure) {
    const budgetApplies = Number.isFinite(budgetTokens) && budgetTokens > 0;
    if (!budgetApplies) {
        return { included: [...candidates], dropped: [] };
    }
    const compareRank = (left, right) => {
        if (right.score !== left.score) {
            return right.score - left.score;
        }
        const leftPath = left.doc.path;
        const rightPath = right.doc.path;
        if (leftPath !== rightPath)
            return leftPath < rightPath ? -1 : 1;
        return left.section.startLineIndex - right.section.startLineIndex;
    };
    // Rank input positions, not the candidates, so input order can be
    // restored when the result is assembled.
    const rankedPositions = candidates
        .map((_, position) => position)
        .sort((p, q) => compareRank(candidates[p], candidates[q]));
    const chosen = new Set();
    let spentTokens = 0;
    for (const position of rankedPositions) {
        const cost = measure(candidates[position]);
        if (spentTokens + cost <= budgetTokens) {
            spentTokens += cost;
            chosen.add(position);
        }
    }
    const included = [];
    const dropped = [];
    for (let position = 0; position < candidates.length; position++) {
        (chosen.has(position) ? included : dropped).push(candidates[position]);
    }
    return { included, dropped };
}
239
+ /**
240
+ * Filter docs by relevance score against the diff. Docs with
241
+ * `score >= threshold` are kept; docs below are dropped.
242
+ *
243
+ * The threshold is INCLUSIVE on the lower bound — a doc scoring exactly the
244
+ * threshold value is kept, not dropped.
245
+ *
246
+ * Fast-path: `threshold <= 0` or non-finite (NaN, Infinity) returns every
247
+ * doc in `kept` with `dropped` empty. This makes the function observably
248
+ * no-op for the default `buildPrompt` call path (NFR44 parity).
249
+ */
250
/**
 * Keep the docs whose relevance score against `diff` reaches `threshold`.
 *
 * The bound is inclusive (`score >= threshold` keeps the doc). A non-finite
 * or non-positive threshold bypasses scoring entirely: a copy of `docs` is
 * returned as `kept` and nothing is dropped. Dropped docs are reported as
 * their score records.
 */
export function selectRelevantDocs(docs, diff, threshold) {
    const filteringEnabled = Number.isFinite(threshold) && threshold > 0;
    if (!filteringEnabled) {
        return { kept: [...docs], dropped: [] };
    }
    const result = { kept: [], dropped: [] };
    for (const doc of docs) {
        const relevance = scoreDocRelevance(doc, diff);
        if (relevance.score < threshold) {
            result.dropped.push(relevance);
        }
        else {
            result.kept.push(doc);
        }
    }
    return result;
}