docrev 0.9.13 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -9
- package/.gitattributes +1 -1
- package/CHANGELOG.md +149 -149
- package/PLAN-tables-and-postprocess.md +850 -850
- package/README.md +391 -391
- package/bin/rev.js +11 -11
- package/bin/rev.ts +145 -145
- package/completions/rev.bash +127 -127
- package/completions/rev.ps1 +210 -210
- package/completions/rev.zsh +207 -207
- package/dev_notes/stress2/build_adversarial.ts +186 -186
- package/dev_notes/stress2/drift_matcher.ts +62 -62
- package/dev_notes/stress2/probe_anchors.ts +35 -35
- package/dev_notes/stress2/project/discussion.before.md +3 -3
- package/dev_notes/stress2/project/discussion.md +3 -3
- package/dev_notes/stress2/project/methods.before.md +20 -20
- package/dev_notes/stress2/project/methods.md +20 -20
- package/dev_notes/stress2/project/rev.yaml +5 -5
- package/dev_notes/stress2/project/sections.yaml +4 -4
- package/dev_notes/stress2/sections.yaml +5 -5
- package/dev_notes/stress2/trace_placement.ts +50 -50
- package/dev_notes/stresstest_boundaries.ts +27 -27
- package/dev_notes/stresstest_drift_apply.ts +43 -43
- package/dev_notes/stresstest_drift_compare.ts +43 -43
- package/dev_notes/stresstest_drift_v2.ts +54 -54
- package/dev_notes/stresstest_inspect.ts +54 -54
- package/dev_notes/stresstest_pstyle.ts +55 -55
- package/dev_notes/stresstest_section_debug.ts +23 -23
- package/dev_notes/stresstest_split.ts +70 -70
- package/dev_notes/stresstest_trace.ts +19 -19
- package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
- package/dist/lib/build.d.ts +38 -1
- package/dist/lib/build.d.ts.map +1 -1
- package/dist/lib/build.js +68 -30
- package/dist/lib/build.js.map +1 -1
- package/dist/lib/commands/build.d.ts.map +1 -1
- package/dist/lib/commands/build.js +38 -5
- package/dist/lib/commands/build.js.map +1 -1
- package/dist/lib/commands/utilities.js +164 -164
- package/dist/lib/commands/word-tools.js +8 -8
- package/dist/lib/grammar.js +3 -3
- package/dist/lib/pdf-comments.js +44 -44
- package/dist/lib/plugins.js +57 -57
- package/dist/lib/pptx-themes.js +115 -115
- package/dist/lib/spelling.js +2 -2
- package/dist/lib/templates.js +387 -387
- package/dist/lib/themes.js +51 -51
- package/eslint.config.js +27 -27
- package/lib/anchor-match.ts +276 -276
- package/lib/annotations.ts +644 -644
- package/lib/build.ts +1300 -1251
- package/lib/citations.ts +160 -160
- package/lib/commands/build.ts +833 -801
- package/lib/commands/citations.ts +515 -515
- package/lib/commands/comments.ts +1050 -1050
- package/lib/commands/context.ts +174 -174
- package/lib/commands/core.ts +309 -309
- package/lib/commands/doi.ts +435 -435
- package/lib/commands/file-ops.ts +372 -372
- package/lib/commands/history.ts +320 -320
- package/lib/commands/index.ts +87 -87
- package/lib/commands/init.ts +259 -259
- package/lib/commands/merge-resolve.ts +378 -378
- package/lib/commands/preview.ts +178 -178
- package/lib/commands/project-info.ts +244 -244
- package/lib/commands/quality.ts +517 -517
- package/lib/commands/response.ts +454 -454
- package/lib/commands/section-boundaries.ts +82 -82
- package/lib/commands/sections.ts +451 -451
- package/lib/commands/sync.ts +706 -706
- package/lib/commands/text-ops.ts +449 -449
- package/lib/commands/utilities.ts +448 -448
- package/lib/commands/verify-anchors.ts +272 -272
- package/lib/commands/word-tools.ts +340 -340
- package/lib/comment-realign.ts +517 -517
- package/lib/config.ts +84 -84
- package/lib/crossref.ts +781 -781
- package/lib/csl.ts +191 -191
- package/lib/dependencies.ts +98 -98
- package/lib/diff-engine.ts +465 -465
- package/lib/doi-cache.ts +115 -115
- package/lib/doi.ts +897 -897
- package/lib/equations.ts +506 -506
- package/lib/errors.ts +346 -346
- package/lib/format.ts +541 -541
- package/lib/git.ts +326 -326
- package/lib/grammar.ts +303 -303
- package/lib/image-registry.ts +180 -180
- package/lib/import.ts +911 -911
- package/lib/journals.ts +543 -543
- package/lib/merge.ts +633 -633
- package/lib/orcid.ts +144 -144
- package/lib/pdf-comments.ts +263 -263
- package/lib/pdf-import.ts +524 -524
- package/lib/plugins.ts +362 -362
- package/lib/postprocess.ts +188 -188
- package/lib/pptx-color-filter.lua +37 -37
- package/lib/pptx-template.ts +469 -469
- package/lib/pptx-themes.ts +483 -483
- package/lib/protect-restore.ts +520 -520
- package/lib/rate-limiter.ts +94 -94
- package/lib/response.ts +197 -197
- package/lib/restore-references.ts +240 -240
- package/lib/review.ts +327 -327
- package/lib/schema.ts +417 -417
- package/lib/scientific-words.ts +73 -73
- package/lib/sections.ts +335 -335
- package/lib/slides.ts +756 -756
- package/lib/spelling.ts +334 -334
- package/lib/templates.ts +526 -526
- package/lib/themes.ts +742 -742
- package/lib/trackchanges.ts +247 -247
- package/lib/tui.ts +450 -450
- package/lib/types.ts +550 -550
- package/lib/undo.ts +250 -250
- package/lib/utils.ts +69 -69
- package/lib/variables.ts +179 -179
- package/lib/word-extraction.ts +806 -806
- package/lib/word.ts +643 -643
- package/lib/wordcomments.ts +817 -817
- package/package.json +137 -137
- package/scripts/postbuild.js +28 -28
- package/skill/REFERENCE.md +431 -431
- package/skill/SKILL.md +258 -258
- package/tsconfig.json +26 -26
- package/types/index.d.ts +525 -525
package/lib/anchor-match.ts
CHANGED
|
@@ -1,276 +1,276 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Anchor matching primitives shared between sync (insertion) and
|
|
3
|
-
* verify-anchors (drift reporting). The functions are pure: given an
|
|
4
|
-
* anchor string and surrounding context, locate candidate positions in
|
|
5
|
-
* a target text using progressively looser strategies.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
/**
 * Name of the matching strategy that produced an anchor search result.
 * `findAnchorInText` reports one of these so callers can tell a clean
 * hit from a looser approximation.
 */
export type AnchorStrategy =
  // Case-insensitive substring hit on the anchor as given.
  | 'direct'
  // Hit after collapsing whitespace runs in both anchor and text.
  | 'normalized'
  // Hit against the text with CriticMarkup removed.
  | 'stripped'
  // Only the leading words of the anchor were found.
  | 'partial-start'
  // Leading words found, and only in the CriticMarkup-stripped text.
  | 'partial-start-stripped'
  // Position derived from both the preceding and following context.
  | 'context-both'
  // Position derived from the preceding context alone.
  | 'context-before'
  // Position derived from the following context alone.
  | 'context-after'
  // A fragment of the anchor (split on a separator) was found.
  | 'split-match'
  // The anchor was empty and context could not place it.
  | 'empty-anchor'
  // No strategy produced a position.
  | 'failed';
|
|
20
|
-
|
|
21
|
-
/** Outcome of an anchor search. */
export interface AnchorSearchResult {
  /** Candidate start offsets; empty when nothing was found. */
  occurrences: number[];

  /**
   * The anchor text (or the fragment of it) that was located, or `null`
   * when the position came from context alone or the search failed.
   */
  matchedAnchor: string | null;

  /** Strategy that produced `occurrences`. */
  strategy: AnchorStrategy;

  /**
   * Present and `true` when the search ran against the
   * CriticMarkup-stripped text instead of the raw text.
   */
  stripped?: boolean;
}
|
|
27
|
-
|
|
28
|
-
/**
 * Strip CriticMarkup so the matcher sees plain prose instead of
 * `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
 * underneath previously imported track changes.
 *
 * Every rule matches its body lazily up to the nearest closing delimiter,
 * so marked-up text may itself contain `+`, `-`, `~` or `<` (for example
 * `{--well-known--}` or `{++a + b++}`) and may span several lines.
 *
 * @param text - Markdown that may contain CriticMarkup.
 * @returns `text` with insertions and substitutions replaced by their new
 *   text, deletions and comments removed, and `[text]{.mark}` highlights
 *   reduced to `text`.
 */
export function stripCriticMarkup(text: string): string {
  return text
    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1') // insertions: keep new text
    .replace(/\{--([\s\S]*?)--\}/g, '') // deletions: remove old text
    .replace(/\{~~([\s\S]*?)~>([\s\S]*?)~~\}/g, '$2') // substitutions: keep new text
    .replace(/\{>>[\s\S]*?<<\}/g, '') // comments: remove (comment text may contain '<')
    .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
}
|
|
41
|
-
|
|
42
|
-
// Documentation for `findAllOccurrences` (defined after `scoreContextAt`):
// returns every starting index where `needle` occurs in `haystack`.
// Empty needles return no occurrences (empty matches are not useful
// for anchor placement).
|
|
47
|
-
/**
 * Rate how closely the text around a candidate position agrees with the
 * docx-side `before` / `after` context. `verify-anchors` uses the score to
 * separate "several hits, but context singles one out" from "several hits
 * and context does not help".
 *
 * For each side that is supplied, the comparison is case-insensitive and
 * awards 2 points per context word longer than three characters that
 * appears in the neighbouring window, plus 5 points when the 30 context
 * characters nearest the anchor appear there verbatim.
 *
 * @param pos - Candidate start offset of the anchor in `text`.
 * @param text - Target text being searched.
 * @param before - Context expected before the anchor; `''` skips this side.
 * @param after - Context expected after the anchor; `''` skips this side.
 * @param anchorLen - Anchor length; the "after" window starts at `pos + anchorLen`.
 * @returns The accumulated score, 0 if no context was provided.
 */
export function scoreContextAt(
  pos: number,
  text: string,
  before: string,
  after: string,
  anchorLen: number,
): number {
  // Shared scoring rule for one side: word hits are worth 2 each, the
  // verbatim edge of the context is worth 5.
  const rate = (windowText: string, contextLower: string, edge: string): number => {
    const wordHits = contextLower
      .split(/\s+/)
      .filter(word => word.length > 3 && windowText.includes(word)).length;
    return wordHits * 2 + (windowText.includes(edge) ? 5 : 0);
  };

  let total = 0;

  if (before) {
    const wanted = before.toLowerCase();
    // Window reaches 20 characters further back than the context is long.
    const windowStart = Math.max(0, pos - before.length - 20);
    const preceding = text.slice(windowStart, pos).toLowerCase();
    total += rate(preceding, wanted, wanted.slice(-30));
  }

  if (after) {
    const wanted = after.toLowerCase();
    const windowStart = pos + anchorLen;
    const following = text.slice(windowStart, windowStart + after.length + 20).toLowerCase();
    total += rate(following, wanted, wanted.slice(0, 30));
  }

  return total;
}
|
|
84
|
-
|
|
85
|
-
/**
 * Return every starting index where `needle` occurs in `haystack`.
 *
 * The scan advances one character past each hit, so overlapping matches
 * are all reported (`'aa'` in `'aaa'` yields `[0, 1]`). The comparison is
 * exact; callers lower-case both arguments when they want a
 * case-insensitive search.
 *
 * @param haystack - Text to search.
 * @param needle - Substring to look for.
 * @returns Ascending start offsets; empty when `needle` is empty (empty
 *   matches are not useful for anchor placement) or never occurs.
 */
export function findAllOccurrences(haystack: string, needle: string): number[] {
  // `!needle` already covers the empty string, so no separate length check.
  if (!needle) return [];
  const occurrences: number[] = [];
  let idx = haystack.indexOf(needle);
  while (idx !== -1) {
    occurrences.push(idx);
    idx = haystack.indexOf(needle, idx + 1);
  }
  return occurrences;
}
|
|
95
|
-
|
|
96
|
-
/**
 * Find `needleLower` in `haystackLower` while treating every run of
 * whitespace on either side as a single space, and report the hits as
 * offsets into `haystackLower` itself (not into the collapsed copy).
 */
function findWhitespaceNormalizedOccurrences(haystackLower: string, needleLower: string): number[] {
  const whitespace = /\s/;
  const collapsedChars: string[] = [];
  // originalOffsets[i] is the haystack offset of the i-th collapsed character;
  // a collapsed space maps to the first character of its whitespace run.
  const originalOffsets: number[] = [];
  let inWhitespaceRun = false;
  for (let i = 0; i < haystackLower.length; i++) {
    const ch = haystackLower.charAt(i);
    if (whitespace.test(ch)) {
      if (!inWhitespaceRun) {
        collapsedChars.push(' ');
        originalOffsets.push(i);
      }
      inWhitespaceRun = true;
    } else {
      collapsedChars.push(ch);
      originalOffsets.push(i);
      inWhitespaceRun = false;
    }
  }

  const collapsedNeedle = needleLower.replace(/\s+/g, ' ');
  const hits: number[] = [];
  for (const hit of findAllOccurrences(collapsedChars.join(''), collapsedNeedle)) {
    const original = originalOffsets[hit];
    if (original !== undefined) hits.push(original);
  }
  return hits;
}

/**
 * Locate an insertion point from surrounding context alone. Tries, in
 * order: before + after together, before only, after only. Returns
 * `null` when no context was given or none of it is present.
 */
function locateByContext(textLower: string, before: string, after: string): AnchorSearchResult | null {
  const beforeLower = before.toLowerCase();
  const afterLower = after.toLowerCase();

  if (before && after) {
    const beforeTail = beforeLower.slice(-50);
    const afterHead = afterLower.slice(0, 50);
    // Try each place the before-context occurs, not just the first one:
    // an earlier, unrelated repeat must not hide a later position where
    // both sides agree.
    for (const beforeIdx of findAllOccurrences(textLower, beforeTail)) {
      const searchStart = beforeIdx + beforeTail.length;
      const afterIdx = textLower.indexOf(afterHead, searchStart);
      if (afterIdx === -1) break; // nothing later in the text can match either
      // The after-context must start within 500 characters of the before-context.
      if (afterIdx - searchStart < 500) {
        return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
      }
    }
  }

  if (before) {
    const beforeTail = beforeLower.slice(-30);
    const beforeIdx = textLower.lastIndexOf(beforeTail);
    if (beforeIdx !== -1) {
      return {
        occurrences: [beforeIdx + beforeTail.length],
        matchedAnchor: null,
        strategy: 'context-before',
      };
    }
  }

  if (after) {
    const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
    if (afterIdx !== -1) {
      return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
    }
  }

  return null;
}

/**
 * Find candidate positions for `anchor` in `text`, falling back through
 * progressively looser strategies (whitespace normalization, stripped
 * CriticMarkup, partial-prefix, surrounding context, word splitting).
 * All comparisons are case-insensitive.
 *
 * The returned `strategy` lets callers distinguish a clean direct hit
 * from a fuzzy approximation — useful for drift reporting.
 *
 * Offsets refer to `text`, except when the result has `stripped: true`,
 * in which case they refer to `stripCriticMarkup(text)`.
 *
 * @param anchor - Text the comment was attached to; may be empty.
 * @param text - Target text to search.
 * @param before - Context that preceded the anchor in the source document.
 * @param after - Context that followed the anchor in the source document.
 * @returns Candidate offsets plus the strategy that found them. An empty
 *   `occurrences` comes with strategy `'empty-anchor'` (blank anchor, context
 *   did not help) or `'failed'`.
 */
export function findAnchorInText(
  anchor: string,
  text: string,
  before: string = '',
  after: string = ''
): AnchorSearchResult {
  const textLower = text.toLowerCase();

  // Empty anchor: skip directly to context-based matching
  if (!anchor || anchor.trim().length === 0) {
    const placed = locateByContext(textLower, before || '', after || '');
    if (placed) return placed;
    return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
  }

  const anchorLower = anchor.toLowerCase();

  // Strategy 1: direct match
  let occurrences = findAllOccurrences(textLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
  }

  // Strategy 2: normalized whitespace. Hits are mapped back to offsets in
  // `text` so callers can use them like the offsets of any other strategy.
  occurrences = findWhitespaceNormalizedOccurrences(textLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'normalized' };
  }

  // Strategy 3: match in stripped CriticMarkup version
  const strippedLower = stripCriticMarkup(text).toLowerCase();
  occurrences = findAllOccurrences(strippedLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
  }

  // Strategy 4: first N words of anchor (long anchors). Trim first so
  // leading whitespace does not produce an empty first "word".
  const words = anchor.trim().split(/\s+/);
  if (words.length > 3) {
    for (let n = Math.min(6, words.length); n >= 3; n--) {
      const partial = words.slice(0, n).join(' ');
      const partialLower = partial.toLowerCase();
      // Prefixes shorter than 15 characters are too generic to trust.
      if (partialLower.length < 15) continue;

      occurrences = findAllOccurrences(textLower, partialLower);
      if (occurrences.length > 0) {
        return { occurrences, matchedAnchor: partial, strategy: 'partial-start' };
      }
      occurrences = findAllOccurrences(strippedLower, partialLower);
      if (occurrences.length > 0) {
        return {
          occurrences,
          matchedAnchor: partial,
          strategy: 'partial-start-stripped',
          stripped: true,
        };
      }
    }
  }

  // Strategy 5: context (before/after) only
  const placed = locateByContext(textLower, before || '', after || '');
  if (placed) return placed;

  // Strategy 6: split anchor on transition characters
  const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
  for (const sep of splitPatterns) {
    if (!anchor.includes(sep)) continue;
    const parts = anchor.split(sep).filter(p => p.length >= 4);
    for (const part of parts) {
      occurrences = findAllOccurrences(textLower, part.toLowerCase());
      // A fragment that occurs five or more times is too common to be useful.
      if (occurrences.length > 0 && occurrences.length < 5) {
        return { occurrences, matchedAnchor: part, strategy: 'split-match' };
      }
    }
  }

  return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
}
|
|
249
|
-
|
|
250
|
-
/**
 * Coarse match quality reported per comment by `verify-anchors`:
 * a clean hit, a fuzzy/drifted hit, a context-only placement, or no hit.
 */
export type AnchorMatchQuality = 'clean' | 'drift' | 'context-only' | 'unmatched';

/**
 * Classify a strategy as a clean hit, a fuzzy/drifted hit, a
 * context-only placement, or no hit.
 *
 * @param strategy - Strategy reported by the anchor search.
 * @param occurrences - Number of candidate positions the search returned.
 * @returns `'unmatched'` whenever `occurrences` is 0, otherwise the
 *   quality bucket of `strategy`.
 */
export function classifyStrategy(strategy: AnchorStrategy, occurrences: number): AnchorMatchQuality {
  if (occurrences === 0) {
    return 'unmatched';
  }
  if (strategy === 'direct' || strategy === 'normalized') {
    return 'clean';
  }
  if (
    strategy === 'stripped' ||
    strategy === 'partial-start' ||
    strategy === 'partial-start-stripped' ||
    strategy === 'split-match'
  ) {
    return 'drift';
  }
  if (strategy === 'context-both' || strategy === 'context-before' || strategy === 'context-after') {
    return 'context-only';
  }
  // 'empty-anchor', 'failed', and anything unrecognised.
  return 'unmatched';
}
|
|
1
|
+
/**
|
|
2
|
+
* Anchor matching primitives shared between sync (insertion) and
|
|
3
|
+
* verify-anchors (drift reporting). The functions are pure: given an
|
|
4
|
+
* anchor string and surrounding context, locate candidate positions in
|
|
5
|
+
* a target text using progressively looser strategies.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Name of the matching strategy that produced an anchor search result.
 * `findAnchorInText` reports one of these so callers can tell a clean
 * hit from a looser approximation.
 */
export type AnchorStrategy =
  // Case-insensitive substring hit on the anchor as given.
  | 'direct'
  // Hit after collapsing whitespace runs in both anchor and text.
  | 'normalized'
  // Hit against the text with CriticMarkup removed.
  | 'stripped'
  // Only the leading words of the anchor were found.
  | 'partial-start'
  // Leading words found, and only in the CriticMarkup-stripped text.
  | 'partial-start-stripped'
  // Position derived from both the preceding and following context.
  | 'context-both'
  // Position derived from the preceding context alone.
  | 'context-before'
  // Position derived from the following context alone.
  | 'context-after'
  // A fragment of the anchor (split on a separator) was found.
  | 'split-match'
  // The anchor was empty and context could not place it.
  | 'empty-anchor'
  // No strategy produced a position.
  | 'failed';
|
|
20
|
+
|
|
21
|
+
/** Outcome of an anchor search. */
export interface AnchorSearchResult {
  /** Candidate start offsets; empty when nothing was found. */
  occurrences: number[];

  /**
   * The anchor text (or the fragment of it) that was located, or `null`
   * when the position came from context alone or the search failed.
   */
  matchedAnchor: string | null;

  /** Strategy that produced `occurrences`. */
  strategy: AnchorStrategy;

  /**
   * Present and `true` when the search ran against the
   * CriticMarkup-stripped text instead of the raw text.
   */
  stripped?: boolean;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Strip CriticMarkup so the matcher sees plain prose instead of
 * `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
 * underneath previously imported track changes.
 *
 * Every rule matches its body lazily up to the nearest closing delimiter,
 * so marked-up text may itself contain `+`, `-`, `~` or `<` (for example
 * `{--well-known--}` or `{++a + b++}`) and may span several lines.
 *
 * @param text - Markdown that may contain CriticMarkup.
 * @returns `text` with insertions and substitutions replaced by their new
 *   text, deletions and comments removed, and `[text]{.mark}` highlights
 *   reduced to `text`.
 */
export function stripCriticMarkup(text: string): string {
  return text
    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1') // insertions: keep new text
    .replace(/\{--([\s\S]*?)--\}/g, '') // deletions: remove old text
    .replace(/\{~~([\s\S]*?)~>([\s\S]*?)~~\}/g, '$2') // substitutions: keep new text
    .replace(/\{>>[\s\S]*?<<\}/g, '') // comments: remove (comment text may contain '<')
    .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
}
|
|
41
|
+
|
|
42
|
+
// Documentation for `findAllOccurrences` (defined after `scoreContextAt`):
// returns every starting index where `needle` occurs in `haystack`.
// Empty needles return no occurrences (empty matches are not useful
// for anchor placement).
|
|
47
|
+
/**
 * Rate how closely the text around a candidate position agrees with the
 * docx-side `before` / `after` context. `verify-anchors` uses the score to
 * separate "several hits, but context singles one out" from "several hits
 * and context does not help".
 *
 * For each side that is supplied, the comparison is case-insensitive and
 * awards 2 points per context word longer than three characters that
 * appears in the neighbouring window, plus 5 points when the 30 context
 * characters nearest the anchor appear there verbatim.
 *
 * @param pos - Candidate start offset of the anchor in `text`.
 * @param text - Target text being searched.
 * @param before - Context expected before the anchor; `''` skips this side.
 * @param after - Context expected after the anchor; `''` skips this side.
 * @param anchorLen - Anchor length; the "after" window starts at `pos + anchorLen`.
 * @returns The accumulated score, 0 if no context was provided.
 */
export function scoreContextAt(
  pos: number,
  text: string,
  before: string,
  after: string,
  anchorLen: number,
): number {
  // Shared scoring rule for one side: word hits are worth 2 each, the
  // verbatim edge of the context is worth 5.
  const rate = (windowText: string, contextLower: string, edge: string): number => {
    const wordHits = contextLower
      .split(/\s+/)
      .filter(word => word.length > 3 && windowText.includes(word)).length;
    return wordHits * 2 + (windowText.includes(edge) ? 5 : 0);
  };

  let total = 0;

  if (before) {
    const wanted = before.toLowerCase();
    // Window reaches 20 characters further back than the context is long.
    const windowStart = Math.max(0, pos - before.length - 20);
    const preceding = text.slice(windowStart, pos).toLowerCase();
    total += rate(preceding, wanted, wanted.slice(-30));
  }

  if (after) {
    const wanted = after.toLowerCase();
    const windowStart = pos + anchorLen;
    const following = text.slice(windowStart, windowStart + after.length + 20).toLowerCase();
    total += rate(following, wanted, wanted.slice(0, 30));
  }

  return total;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Return every starting index where `needle` occurs in `haystack`.
 *
 * The scan advances one character past each hit, so overlapping matches
 * are all reported (`'aa'` in `'aaa'` yields `[0, 1]`). The comparison is
 * exact; callers lower-case both arguments when they want a
 * case-insensitive search.
 *
 * @param haystack - Text to search.
 * @param needle - Substring to look for.
 * @returns Ascending start offsets; empty when `needle` is empty (empty
 *   matches are not useful for anchor placement) or never occurs.
 */
export function findAllOccurrences(haystack: string, needle: string): number[] {
  // `!needle` already covers the empty string, so no separate length check.
  if (!needle) return [];
  const occurrences: number[] = [];
  let idx = haystack.indexOf(needle);
  while (idx !== -1) {
    occurrences.push(idx);
    idx = haystack.indexOf(needle, idx + 1);
  }
  return occurrences;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Find `needleLower` in `haystackLower` while treating every run of
 * whitespace on either side as a single space, and report the hits as
 * offsets into `haystackLower` itself (not into the collapsed copy).
 */
function findWhitespaceNormalizedOccurrences(haystackLower: string, needleLower: string): number[] {
  const whitespace = /\s/;
  const collapsedChars: string[] = [];
  // originalOffsets[i] is the haystack offset of the i-th collapsed character;
  // a collapsed space maps to the first character of its whitespace run.
  const originalOffsets: number[] = [];
  let inWhitespaceRun = false;
  for (let i = 0; i < haystackLower.length; i++) {
    const ch = haystackLower.charAt(i);
    if (whitespace.test(ch)) {
      if (!inWhitespaceRun) {
        collapsedChars.push(' ');
        originalOffsets.push(i);
      }
      inWhitespaceRun = true;
    } else {
      collapsedChars.push(ch);
      originalOffsets.push(i);
      inWhitespaceRun = false;
    }
  }

  const collapsedNeedle = needleLower.replace(/\s+/g, ' ');
  const hits: number[] = [];
  for (const hit of findAllOccurrences(collapsedChars.join(''), collapsedNeedle)) {
    const original = originalOffsets[hit];
    if (original !== undefined) hits.push(original);
  }
  return hits;
}

/**
 * Locate an insertion point from surrounding context alone. Tries, in
 * order: before + after together, before only, after only. Returns
 * `null` when no context was given or none of it is present.
 */
function locateByContext(textLower: string, before: string, after: string): AnchorSearchResult | null {
  const beforeLower = before.toLowerCase();
  const afterLower = after.toLowerCase();

  if (before && after) {
    const beforeTail = beforeLower.slice(-50);
    const afterHead = afterLower.slice(0, 50);
    // Try each place the before-context occurs, not just the first one:
    // an earlier, unrelated repeat must not hide a later position where
    // both sides agree.
    for (const beforeIdx of findAllOccurrences(textLower, beforeTail)) {
      const searchStart = beforeIdx + beforeTail.length;
      const afterIdx = textLower.indexOf(afterHead, searchStart);
      if (afterIdx === -1) break; // nothing later in the text can match either
      // The after-context must start within 500 characters of the before-context.
      if (afterIdx - searchStart < 500) {
        return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
      }
    }
  }

  if (before) {
    const beforeTail = beforeLower.slice(-30);
    const beforeIdx = textLower.lastIndexOf(beforeTail);
    if (beforeIdx !== -1) {
      return {
        occurrences: [beforeIdx + beforeTail.length],
        matchedAnchor: null,
        strategy: 'context-before',
      };
    }
  }

  if (after) {
    const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
    if (afterIdx !== -1) {
      return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
    }
  }

  return null;
}

/**
 * Find candidate positions for `anchor` in `text`, falling back through
 * progressively looser strategies (whitespace normalization, stripped
 * CriticMarkup, partial-prefix, surrounding context, word splitting).
 * All comparisons are case-insensitive.
 *
 * The returned `strategy` lets callers distinguish a clean direct hit
 * from a fuzzy approximation — useful for drift reporting.
 *
 * Offsets refer to `text`, except when the result has `stripped: true`,
 * in which case they refer to `stripCriticMarkup(text)`.
 *
 * @param anchor - Text the comment was attached to; may be empty.
 * @param text - Target text to search.
 * @param before - Context that preceded the anchor in the source document.
 * @param after - Context that followed the anchor in the source document.
 * @returns Candidate offsets plus the strategy that found them. An empty
 *   `occurrences` comes with strategy `'empty-anchor'` (blank anchor, context
 *   did not help) or `'failed'`.
 */
export function findAnchorInText(
  anchor: string,
  text: string,
  before: string = '',
  after: string = ''
): AnchorSearchResult {
  const textLower = text.toLowerCase();

  // Empty anchor: skip directly to context-based matching
  if (!anchor || anchor.trim().length === 0) {
    const placed = locateByContext(textLower, before || '', after || '');
    if (placed) return placed;
    return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
  }

  const anchorLower = anchor.toLowerCase();

  // Strategy 1: direct match
  let occurrences = findAllOccurrences(textLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
  }

  // Strategy 2: normalized whitespace. Hits are mapped back to offsets in
  // `text` so callers can use them like the offsets of any other strategy.
  occurrences = findWhitespaceNormalizedOccurrences(textLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'normalized' };
  }

  // Strategy 3: match in stripped CriticMarkup version
  const strippedLower = stripCriticMarkup(text).toLowerCase();
  occurrences = findAllOccurrences(strippedLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
  }

  // Strategy 4: first N words of anchor (long anchors). Trim first so
  // leading whitespace does not produce an empty first "word".
  const words = anchor.trim().split(/\s+/);
  if (words.length > 3) {
    for (let n = Math.min(6, words.length); n >= 3; n--) {
      const partial = words.slice(0, n).join(' ');
      const partialLower = partial.toLowerCase();
      // Prefixes shorter than 15 characters are too generic to trust.
      if (partialLower.length < 15) continue;

      occurrences = findAllOccurrences(textLower, partialLower);
      if (occurrences.length > 0) {
        return { occurrences, matchedAnchor: partial, strategy: 'partial-start' };
      }
      occurrences = findAllOccurrences(strippedLower, partialLower);
      if (occurrences.length > 0) {
        return {
          occurrences,
          matchedAnchor: partial,
          strategy: 'partial-start-stripped',
          stripped: true,
        };
      }
    }
  }

  // Strategy 5: context (before/after) only
  const placed = locateByContext(textLower, before || '', after || '');
  if (placed) return placed;

  // Strategy 6: split anchor on transition characters
  const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
  for (const sep of splitPatterns) {
    if (!anchor.includes(sep)) continue;
    const parts = anchor.split(sep).filter(p => p.length >= 4);
    for (const part of parts) {
      occurrences = findAllOccurrences(textLower, part.toLowerCase());
      // A fragment that occurs five or more times is too common to be useful.
      if (occurrences.length > 0 && occurrences.length < 5) {
        return { occurrences, matchedAnchor: part, strategy: 'split-match' };
      }
    }
  }

  return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
}
|
|
249
|
+
|
|
250
|
+
/**
 * Coarse match quality reported per comment by `verify-anchors`:
 * a clean hit, a fuzzy/drifted hit, a context-only placement, or no hit.
 */
export type AnchorMatchQuality = 'clean' | 'drift' | 'context-only' | 'unmatched';

/**
 * Classify a strategy as a clean hit, a fuzzy/drifted hit, a
 * context-only placement, or no hit.
 *
 * @param strategy - Strategy reported by the anchor search.
 * @param occurrences - Number of candidate positions the search returned.
 * @returns `'unmatched'` whenever `occurrences` is 0, otherwise the
 *   quality bucket of `strategy`.
 */
export function classifyStrategy(strategy: AnchorStrategy, occurrences: number): AnchorMatchQuality {
  if (occurrences === 0) {
    return 'unmatched';
  }
  if (strategy === 'direct' || strategy === 'normalized') {
    return 'clean';
  }
  if (
    strategy === 'stripped' ||
    strategy === 'partial-start' ||
    strategy === 'partial-start-stripped' ||
    strategy === 'split-match'
  ) {
    return 'drift';
  }
  if (strategy === 'context-both' || strategy === 'context-before' || strategy === 'context-after') {
    return 'context-only';
  }
  // 'empty-anchor', 'failed', and anything unrecognised.
  return 'unmatched';
}
|