codex-overleaf-link 1.3.0 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -143
- package/extension/src/shared/compatibility.js +1 -1
- package/extension/src/shared/i18n.js +44 -20
- package/extension/src/shared/lineReferences.js +4 -6
- package/extension/src/shared/sessionState.js +31 -5
- package/extension/src/shared/storageDb.js +21 -1
- package/extension/src/shared/undoOperations.js +9 -1
- package/native-host/src/codexHome.js +83 -7
- package/native-host/src/codexSessionRunner.js +3 -0
- package/native-host/src/localSkills.js +46 -7
- package/native-host/src/skills/annotated-rewrite/SKILL.md +71 -0
- package/native-host/src/textPatch.js +669 -16
- package/package.json +1 -1
|
@@ -7,12 +7,53 @@ function computeTextPatches(oldText, newText) {
|
|
|
7
7
|
return [];
|
|
8
8
|
}
|
|
9
9
|
|
|
10
|
-
const
|
|
11
|
-
if (
|
|
12
|
-
return
|
|
10
|
+
const groups = computeLineAnchoredChangeGroups(oldValue, newValue);
|
|
11
|
+
if (!groups.length) {
|
|
12
|
+
return [computeSingleTextPatch(oldValue, newValue)];
|
|
13
13
|
}
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
const patches = [];
|
|
16
|
+
for (const group of groups) {
|
|
17
|
+
patches.push(...computeNaturalGroupPatches(group));
|
|
18
|
+
}
|
|
19
|
+
return patches.length ? patches : [computeSingleTextPatch(oldValue, newValue)];
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// Produces the review-sized patches for a single changed group.
//
// The group is tokenised, measured and classified; the classification then
// selects a builder:
//   - `paragraph_rewrite` -> `computeParagraphPatches`
//   - `sentence_rewrite`  -> `computeSentencePatches`
//   - `small_edit`        -> `coalesceTokenPatches` (only when token patches
//                            exist and are non-empty)
// Every other classification, and any paragraph/sentence builder result that
// is `null` or empty, falls back to one patch covering the whole group.
function computeNaturalGroupPatches(group) {
  const tokenPatches = computeTokenAnchoredPatches(
    group.oldText,
    group.newText,
    group.oldStart
  );
  const metrics = computeGroupMetrics(group, tokenPatches);
  const classification = classifyChangedGroup(group, tokenPatches, metrics);

  let built = null;
  switch (classification.type) {
    case 'paragraph_rewrite':
      built = computeParagraphPatches(group);
      break;
    case 'sentence_rewrite':
      built = computeSentencePatches(group, tokenPatches);
      break;
    case 'small_edit':
      if (tokenPatches && tokenPatches.length) {
        // The coalescer's result is returned as-is, without the fallback.
        return coalesceTokenPatches(group, tokenPatches);
      }
      break;
    default:
      // `annotated_block`, `fallback` and anything unknown use the group patch.
      break;
  }

  if (built && built.length) {
    return built;
  }
  return singleGroupPatch(group);
}
|
|
17
58
|
|
|
18
59
|
function computeSingleTextPatch(oldValue, newValue, offset = 0) {
|
|
@@ -41,7 +82,7 @@ function computeSingleTextPatch(oldValue, newValue, offset = 0) {
|
|
|
41
82
|
};
|
|
42
83
|
}
|
|
43
84
|
|
|
44
|
-
function
|
|
85
|
+
function computeLineAnchoredChangeGroups(oldValue, newValue) {
|
|
45
86
|
const oldParts = splitTextParts(oldValue);
|
|
46
87
|
const newParts = splitTextParts(newValue);
|
|
47
88
|
const MAX_PARTS = 5000;
|
|
@@ -58,7 +99,7 @@ function computeLineAnchoredPatches(oldValue, newValue) {
|
|
|
58
99
|
}
|
|
59
100
|
|
|
60
101
|
const edits = computePartEdits(oldParts, newParts);
|
|
61
|
-
const
|
|
102
|
+
const groups = [];
|
|
62
103
|
let oldOffset = 0;
|
|
63
104
|
let newOffset = 0;
|
|
64
105
|
let group = null;
|
|
@@ -91,20 +132,13 @@ function computeLineAnchoredPatches(oldValue, newValue) {
|
|
|
91
132
|
}
|
|
92
133
|
flushGroup();
|
|
93
134
|
|
|
94
|
-
return
|
|
135
|
+
return groups;
|
|
95
136
|
|
|
96
137
|
function flushGroup() {
|
|
97
138
|
if (!group) {
|
|
98
139
|
return;
|
|
99
140
|
}
|
|
100
|
-
|
|
101
|
-
const tokenPatches = computeTokenAnchoredPatches(group.oldText, group.newText, group.oldStart);
|
|
102
|
-
if (tokenPatches) {
|
|
103
|
-
patches.push(...tokenPatches);
|
|
104
|
-
} else {
|
|
105
|
-
patches.push(computeSingleTextPatch(group.oldText, group.newText, group.oldStart));
|
|
106
|
-
}
|
|
107
|
-
}
|
|
141
|
+
groups.push(group);
|
|
108
142
|
group = null;
|
|
109
143
|
}
|
|
110
144
|
}
|
|
@@ -282,6 +316,625 @@ function computePartEdits(oldParts, newParts) {
|
|
|
282
316
|
return edits;
|
|
283
317
|
}
|
|
284
318
|
|
|
319
|
+
// Counts the `\n`-separated lines of `text` that contain at least one
// non-whitespace character. `null`/`undefined` are treated as the empty
// string; any other value is coerced with `String`.
function countNonEmptyLines(text) {
  return String(text ?? '')
    .split('\n')
    .filter(line => line.trim() !== '')
    .length;
}
|
|
329
|
+
|
|
330
|
+
// Counts sentence-terminating punctuation in `text`.
//
// Counted characters:
//   - CJK / fullwidth terminators: U+3002 (ideographic full stop),
//     U+FF1F (fullwidth question mark), U+FF01 (fullwidth exclamation mark);
//   - ASCII `?` and `!`;
//   - ASCII `.` unless it sits between two ASCII digits (a decimal point such
//     as `1.23`).
//
// The fullwidth characters are written as escapes so that they cannot be
// folded into their ASCII look-alikes by tooling; previously the fullwidth
// branch compared against ASCII `?` / `!`, so fullwidth question and
// exclamation marks were never counted.
//
// `null`/`undefined` are treated as the empty string.
function countSentenceTerminators(text) {
  const value = String(text ?? '');
  let count = 0;
  for (let index = 0; index < value.length; index += 1) {
    const char = value[index];
    if (char === '\u3002' || char === '\uFF1F' || char === '\uFF01') {
      count += 1;
      continue;
    }
    if (char === '?' || char === '!') {
      count += 1;
      continue;
    }
    if (char === '.') {
      const isDecimalPoint = /[0-9]/.test(value[index - 1] || '')
        && /[0-9]/.test(value[index + 1] || '');
      if (!isDecimalPoint) {
        count += 1;
      }
    }
  }
  return count;
}
|
|
353
|
+
|
|
354
|
+
// True when some line of `text` consists solely of a `% [original]` marker
// (optional whitespace before `%`, between `%` and `[`, and after `]`).
// `null`/`undefined` are treated as the empty string.
function hasOriginalMarkerLine(text) {
  const lines = String(text ?? '').split('\n');
  for (const line of lines) {
    if (/^\s*%\s*\[original\]\s*$/.test(line)) {
      return true;
    }
  }
  return false;
}
|
|
359
|
+
|
|
360
|
+
// True when `text` contains a `% [original]` marker line and, on some line
// after the FIRST such marker, a `% [revised]` marker line. A `% [revised]`
// line that only appears before the first `% [original]` does not count.
// `null`/`undefined` are treated as the empty string.
function hasLaterRevisedMarkerLine(text) {
  const isOriginalMarker = line => /^\s*%\s*\[original\]\s*$/.test(line);
  const isRevisedMarker = line => /^\s*%\s*\[revised\]\s*$/.test(line);

  const lines = String(text ?? '').split('\n');
  const firstOriginal = lines.findIndex(line => isOriginalMarker(line));
  if (firstOriginal === -1) {
    return false;
  }
  return lines
    .slice(firstOriginal + 1)
    .some(line => isRevisedMarker(line));
}
|
|
370
|
+
|
|
371
|
+
// True when any line of `text` is a `% [original]` or a `% [revised]` marker
// line (the marker must be the only non-whitespace content on its line).
// `null`/`undefined` are treated as the empty string.
function hasAnyAnnotatedMarker(text) {
  for (const line of String(text ?? '').split('\n')) {
    if (/^\s*%\s*\[original\]\s*$/.test(line)) {
      return true;
    }
    if (/^\s*%\s*\[revised\]\s*$/.test(line)) {
      return true;
    }
  }
  return false;
}
|
|
378
|
+
|
|
379
|
+
// Splits `text` on blank-line separators (a newline, optional whitespace, and
// another newline) into an alternating list of segments:
//   [content, separator, content, separator, ..., content]
// Each segment is `{ text, start }`, where `start` is its offset in the input.
// The result always has an odd number of segments (content at even indices,
// separators at odd indices); content segments may be empty strings.
// `null`/`undefined` are treated as the empty string.
function splitParagraphs(text) {
  const value = String(text ?? '');
  const segments = [];
  let cursor = 0;

  for (const found of value.matchAll(/\n\s*\n/g)) {
    const separatorText = found[0];
    segments.push(
      { text: value.slice(cursor, found.index), start: cursor },
      { text: separatorText, start: found.index }
    );
    cursor = found.index + separatorText.length;
  }

  // Trailing content after the last separator (or the whole input).
  segments.push({ text: value.slice(cursor), start: cursor });
  return segments;
}
|
|
396
|
+
|
|
397
|
+
// Abbreviations (stored lowercase, without the final dot) after which a `.`
// is conservatively treated as NOT ending a sentence. Lookups lowercase the
// candidate word first, so capitalised forms such as `Fig.`, `Eq.` or `No.`
// are covered too.
const NON_TERMINAL_ABBREVIATIONS = new Set([
  // Latin / general connectives.
  'e.g', 'i.e', 'cf', 'vs', 'etc', 'al',
  // Cross-reference and structural labels.
  'fig', 'figs', 'eq', 'eqs', 'sec', 'secs',
  'thm', 'lem', 'def', 'prop', 'cor', 'ref',
  'no', 'vol', 'pp', 'ch', 'app',
  // Qualifiers.
  'resp', 'approx',
  // Titles.
  'mr', 'ms', 'mrs', 'dr', 'prof', 'st'
]);
|
|
405
|
+
|
|
406
|
+
// True when `text[index]` is a backslash immediately followed by an ASCII
// letter or `@`, i.e. the position looks like the start of a LaTeX command
// name such as `\textbf` or `\@foo`.
function isLatexCommandStart(text, index) {
  if (text[index] !== '\\') {
    return false;
  }
  const following = text[index + 1] || '';
  return /[A-Za-z@]/.test(following);
}
|
|
409
|
+
|
|
410
|
+
// Reports whether the whitespace-delimited run of characters around `index`
// looks like a URL: it either contains `://` or begins with `www.`
// (case-insensitive). A `.` inside such a run is never treated as a confident
// sentence boundary by the caller.
//
// The run extends left from `index` until whitespace (or the start of the
// text) and right from `index` until whitespace (or the end of the text).
function isInsideUrl(text, index) {
  const isWhitespace = char => /\s/.test(char);

  let left = index;
  for (; left > 0; left -= 1) {
    if (isWhitespace(text[left - 1])) {
      break;
    }
  }

  let right = index;
  for (; right < text.length; right += 1) {
    if (isWhitespace(text[right])) {
      break;
    }
  }

  const token = text.slice(left, right);
  if (/:\/\//.test(token)) {
    return true;
  }
  return /^www\./i.test(token);
}
|
|
425
|
+
|
|
426
|
+
// Reports whether the `.` at `index` finishes a known abbreviation (for
// example `e.g.` or `Fig.`) instead of closing a sentence.
//
// The candidate word is the maximal run of ASCII letters and dots that ends
// immediately before `index`; it is lowercased and looked up in
// `NON_TERMINAL_ABBREVIATIONS`. An empty candidate is never an abbreviation.
function completesAbbreviation(text, index) {
  let start = index;
  for (; start > 0; start -= 1) {
    if (!/[A-Za-z.]/.test(text[start - 1])) {
      break;
    }
  }

  const candidate = text.slice(start, index).toLowerCase();
  if (candidate.length === 0) {
    return false;
  }
  return NON_TERMINAL_ABBREVIATIONS.has(candidate);
}
|
|
436
|
+
|
|
437
|
+
// Decides whether the ASCII terminator (`.`, `?` or `!`) at `index` is a
// confident sentence boundary.
//
// Requirements:
//   1. The terminator is followed by end-of-string, whitespace, or the start
//      of a LaTeX command (`isLatexCommandStart`).
//   2. For `.` only: it is not inside a URL-like run (`isInsideUrl`) and does
//      not complete a known abbreviation (`completesAbbreviation`).
//
// A decimal point such as the one in `1.23` needs no separate check: it is
// followed by a digit, which already fails requirement 1. The former explicit
// "digit on both sides" test could therefore never be reached with a digit
// after the dot and has been removed.
//
// Returns `true` for a confident boundary, `false` otherwise.
function isConfidentAsciiBoundary(text, index) {
  const next = text[index + 1];
  const followedByBoundary = next === undefined
    || /\s/.test(next)
    || isLatexCommandStart(text, index + 1);
  if (!followedByBoundary) {
    return false;
  }

  // `?` and `!` have no URL / abbreviation exceptions.
  if (text[index] !== '.') {
    return true;
  }
  return !isInsideUrl(text, index) && !completesAbbreviation(text, index);
}
|
|
464
|
+
|
|
465
|
+
// Splits `text` into ordered sentence spans `[{text, start, end}]` that
|
|
466
|
+
// partition the input exactly (concatenated they equal `text`). Each span
|
|
467
|
+
// includes its trailing terminator and the whitespace up to the next
|
|
468
|
+
// sentence. Conservative: when no confident boundary is found the whole input
|
|
469
|
+
// is returned as a single span.
|
|
470
|
+
function splitSentences(text) {
|
|
471
|
+
const value = String(text ?? '');
|
|
472
|
+
if (value.length === 0) {
|
|
473
|
+
return [{ text: '', start: 0, end: 0 }];
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
const spans = [];
|
|
477
|
+
let spanStart = 0;
|
|
478
|
+
let index = 0;
|
|
479
|
+
|
|
480
|
+
while (index < value.length) {
|
|
481
|
+
const char = value[index];
|
|
482
|
+
let isBoundary = false;
|
|
483
|
+
|
|
484
|
+
if (char === '。' || char === '?' || char === '!') {
|
|
485
|
+
// CJK terminators are unambiguous: they never appear in decimals, URLs,
|
|
486
|
+
// or LaTeX command names, so they always end a sentence.
|
|
487
|
+
isBoundary = true;
|
|
488
|
+
} else if (char === '.' || char === '?' || char === '!') {
|
|
489
|
+
isBoundary = isConfidentAsciiBoundary(value, index);
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
if (isBoundary) {
|
|
493
|
+
// Absorb trailing whitespace up to the next sentence into this span.
|
|
494
|
+
let spanEnd = index + 1;
|
|
495
|
+
while (spanEnd < value.length && /\s/.test(value[spanEnd])) {
|
|
496
|
+
spanEnd += 1;
|
|
497
|
+
}
|
|
498
|
+
if (spanEnd < value.length) {
|
|
499
|
+
spans.push({
|
|
500
|
+
text: value.slice(spanStart, spanEnd),
|
|
501
|
+
start: spanStart,
|
|
502
|
+
end: spanEnd
|
|
503
|
+
});
|
|
504
|
+
spanStart = spanEnd;
|
|
505
|
+
index = spanEnd;
|
|
506
|
+
continue;
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
index += 1;
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
spans.push({
|
|
514
|
+
text: value.slice(spanStart),
|
|
515
|
+
start: spanStart,
|
|
516
|
+
end: value.length
|
|
517
|
+
});
|
|
518
|
+
|
|
519
|
+
return spans;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
// Gathers the size metrics used to classify a changed group.
//
// `group` supplies `oldText` and `newText`; `tokenPatches` is either an array
// of patches (`{ from, to, insert }`) or `null`.
//
// Returned fields:
//   - oldNonEmptyLineCount / newNonEmptyLineCount / maxNonEmptyLineCount
//   - changedSpanChars: the longer of the old and new text lengths
//   - tokenPatchCount: number of token patches, or `null` without them
//   - totalTokenChangedChars: per patch, the larger of the replaced length
//     (`to - from`) and the inserted length, summed; `null` without patches
//   - oldSentenceTerminatorCount / newSentenceTerminatorCount
function computeGroupMetrics(group, tokenPatches) {
  const { oldText, newText } = group;
  const oldNonEmptyLineCount = countNonEmptyLines(oldText);
  const newNonEmptyLineCount = countNonEmptyLines(newText);

  let tokenPatchCount = null;
  let totalTokenChangedChars = null;
  if (tokenPatches !== null) {
    tokenPatchCount = tokenPatches.length;
    totalTokenChangedChars = 0;
    for (const patch of tokenPatches) {
      totalTokenChangedChars += Math.max(patch.to - patch.from, patch.insert.length);
    }
  }

  return {
    oldNonEmptyLineCount,
    newNonEmptyLineCount,
    maxNonEmptyLineCount: Math.max(oldNonEmptyLineCount, newNonEmptyLineCount),
    changedSpanChars: Math.max(oldText.length, newText.length),
    tokenPatchCount,
    totalTokenChangedChars,
    oldSentenceTerminatorCount: countSentenceTerminators(oldText),
    newSentenceTerminatorCount: countSentenceTerminators(newText)
  };
}
|
|
541
|
+
|
|
542
|
+
// Finds the single sentence of `group.oldText` that holds every token patch.
//
// The old text is segmented with `splitSentences`; each patch's `from`/`to`
// are made group-relative by subtracting `group.oldStart`, and a sentence
// "contains" a patch when `from >= start` and `to <= end`.
//
// Result:
//   - `fitsOneSpan`: true iff exactly one sentence contains every patch.
//     Zero matches, or more than one (possible for a zero-length patch
//     sitting on a sentence boundary), give `false`.
//   - `spanChars` / `spanTokenCount`: character length and
//     `splitTextTokens` count of that sentence.
//   - `spanStart` / `spanEnd`: its group-relative `[start, end)` offsets.
// When `fitsOneSpan` is false (including `tokenPatches` being `null` or
// empty) all numeric fields are `0`.
function resolveTokenPatchSentenceSpan(group, tokenPatches) {
  const noSpan = {
    fitsOneSpan: false,
    spanChars: 0,
    spanTokenCount: 0,
    spanStart: 0,
    spanEnd: 0
  };
  if (tokenPatches === null || tokenPatches.length === 0) {
    return noSpan;
  }

  const base = group.oldStart;
  const holdsEveryPatch = sentence => tokenPatches.every(patch => (
    patch.from - base >= sentence.start && patch.to - base <= sentence.end
  ));

  const candidates = splitSentences(group.oldText).filter(holdsEveryPatch);
  if (candidates.length !== 1) {
    return noSpan;
  }

  const [sentence] = candidates;
  return {
    fitsOneSpan: true,
    spanChars: sentence.text.length,
    spanTokenCount: splitTextTokens(sentence.text).length,
    spanStart: sentence.start,
    spanEnd: sentence.end
  };
}
|
|
599
|
+
|
|
600
|
+
// Assigns a changed group to a review granularity.
//
// Inputs: `group` (`{ oldStart, oldText, newText }`), `tokenPatches` (array or
// `null`) and `metrics` (the object produced by `computeGroupMetrics`).
//
// Returns `{ type }`, decided by the first rule that matches:
//   1. `annotated_block`   - the new text has a `% [original]` marker line
//                            followed later by a `% [revised]` marker line,
//                            and the group spans >= 3 non-empty lines.
//   2. `paragraph_rewrite` - >= 3 non-empty lines; or both old and new text
//                            have >= 2 sentence terminators; or a dense token
//                            rewrite (>= 6 token patches, >= 160 changed-span
//                            chars, >= 2 patches per non-empty line).
//   3. `sentence_rewrite`  - >= 3 token patches that all fit one sentence
//                            span of >= 80 chars or >= 12 tokens.
//   4. `small_edit`        - <= 2 token patches; or < 80 token-changed chars
//                            on <= 2 non-empty lines with no annotation
//                            marker in the new text.
//   5. `fallback`          - everything else.
// Rules 3 and 4 need token patches, so with `tokenPatches === null` only
// `annotated_block`, `paragraph_rewrite` and `fallback` can result.
function classifyChangedGroup(group, tokenPatches, metrics) {
  const revisedText = group.newText;
  const {
    maxNonEmptyLineCount: lineCount,
    changedSpanChars,
    tokenPatchCount: patchCount,
    totalTokenChangedChars,
    oldSentenceTerminatorCount,
    newSentenceTerminatorCount
  } = metrics;
  const hasTokenPatches = tokenPatches !== null;

  // Rule 1: annotated original/revised block.
  if (
    hasOriginalMarkerLine(revisedText)
    && hasLaterRevisedMarkerLine(revisedText)
    && lineCount >= 3
  ) {
    return { type: 'annotated_block' };
  }

  // Rule 2: paragraph-sized rewrite.
  const isDenseTokenRewrite = hasTokenPatches
    && patchCount >= 6
    && changedSpanChars >= 160
    && patchCount / Math.max(1, lineCount) >= 2;
  const isMultiSentence = oldSentenceTerminatorCount >= 2
    && newSentenceTerminatorCount >= 2;
  if (lineCount >= 3 || isMultiSentence || isDenseTokenRewrite) {
    return { type: 'paragraph_rewrite' };
  }

  // The remaining rules all depend on token patches.
  if (!hasTokenPatches) {
    return { type: 'fallback' };
  }

  // Rule 3: several edits confined to one sufficiently long sentence.
  const sentenceSpan = resolveTokenPatchSentenceSpan(group, tokenPatches);
  const isLongSentence = sentenceSpan.spanChars >= 80
    || sentenceSpan.spanTokenCount >= 12;
  if (patchCount >= 3 && sentenceSpan.fitsOneSpan && isLongSentence) {
    return { type: 'sentence_rewrite' };
  }

  // Rule 4: small edit.
  if (patchCount <= 2) {
    return { type: 'small_edit' };
  }
  const isLightTouch = totalTokenChangedChars < 80
    && lineCount <= 2
    && !hasAnyAnnotatedMarker(revisedText);
  if (isLightTouch) {
    return { type: 'small_edit' };
  }

  // Rule 5.
  return { type: 'fallback' };
}
|
|
677
|
+
|
|
678
|
+
// Fallback builder: one patch replacing the whole changed group.
//
// Delegates to `computeSingleTextPatch`, passing `group.oldStart` as the
// offset for the group's old and new text, and wraps the result in a
// one-element array so that it has the same shape as the other builders.
function singleGroupPatch(group) {
  const { oldText, newText, oldStart } = group;
  const wholeGroupPatch = computeSingleTextPatch(oldText, newText, oldStart);
  return [wholeGroupPatch];
}
|
|
685
|
+
|
|
686
|
+
// Builds one patch per changed paragraph of a group (spec §4).
//
// Both `group.oldText` and `group.newText` are segmented with
// `splitParagraphs`, which yields content segments at even indices and
// blank-line separators at odd indices. Paragraphs are paired by position,
// which is only sound when both sides have the same number of segments and
// every separator is textually identical; otherwise `null` is returned so the
// caller can fall back to a whole-group patch.
//
// Each changed pair becomes a `computeSingleTextPatch` whose offset is
// `group.oldStart` plus the old paragraph's start within the group. Unchanged
// pairs are skipped, so the result may be an empty array.
function computeParagraphPatches(group) {
  const before = splitParagraphs(group.oldText);
  const after = splitParagraphs(group.newText);

  const isSeparatorSlot = position => position % 2 === 1;
  const sameStructure = before.length === after.length
    && before.every((segment, position) => (
      !isSeparatorSlot(position) || segment.text === after[position].text
    ));
  if (!sameStructure) {
    return null;
  }

  const patches = [];
  before.forEach((oldParagraph, position) => {
    if (isSeparatorSlot(position)) {
      return;
    }
    const newParagraph = after[position];
    if (oldParagraph.text !== newParagraph.text) {
      patches.push(computeSingleTextPatch(
        oldParagraph.text,
        newParagraph.text,
        group.oldStart + oldParagraph.start
      ));
    }
  });
  return patches;
}
|
|
729
|
+
|
|
730
|
+
// Builds the single sentence-sized patch for a `sentence_rewrite` group
// (spec §5).
//
// `resolveTokenPatchSentenceSpan` identifies the one sentence span `[a, b)`
// of `group.oldText` that holds every token patch. The text before `a` and
// from `b` onwards is expected to be unchanged, so the new sentence is what
// remains of `group.newText` once that same prefix and suffix are stripped.
//
// Returns `[patch]`, built by `computeSingleTextPatch` from the old and new
// sentence with offset `group.oldStart + a`. Returns `null` when no single
// span exists, or (defensively) when `group.newText` does not start with the
// old prefix, does not end with the old suffix, or is too short to hold both
// without overlap.
function computeSentencePatches(group, tokenPatches) {
  const { fitsOneSpan, spanStart, spanEnd } = resolveTokenPatchSentenceSpan(
    group,
    tokenPatches
  );
  if (!fitsOneSpan) {
    return null;
  }

  const { oldText, newText, oldStart } = group;
  const untouchedHead = oldText.slice(0, spanStart);
  const untouchedTail = oldText.slice(spanEnd);

  // If anything outside the sentence differs, a change leaked out of the span
  // and a sentence-only patch would be wrong.
  const outsideIsIntact = newText.length >= untouchedHead.length + untouchedTail.length
    && newText.startsWith(untouchedHead)
    && newText.endsWith(untouchedTail);
  if (!outsideIsIntact) {
    return null;
  }

  const oldSentence = oldText.slice(spanStart, spanEnd);
  const newSentence = newText.slice(
    untouchedHead.length,
    newText.length - untouchedTail.length
  );
  return [computeSingleTextPatch(oldSentence, newSentence, oldStart + spanStart)];
}
|
|
772
|
+
|
|
773
|
+
// Short function words that may appear in the gap between two token patches
// without preventing them from being merged (spec §7). Matching is done on
// the lowercased token.
const COALESCE_FILLER_WORDS = new Set([
  // Articles and conjunctions.
  'a', 'an', 'the', 'and', 'or', 'but', 'nor', 'so', 'yet',
  // Prepositions and copulas.
  'of', 'to', 'in', 'on', 'at', 'by', 'as', 'is', 'are', 'was', 'were', 'be',
  'for', 'with',
  // Demonstratives and pronouns.
  'that', 'this', 'it', 'its', 'we', 'our'
]);

// Decides whether `gap` (the unchanged text between two token patches) is
// mere connective filler. It is when it is at most 40 characters long and
// every token from `splitTextTokens` is whitespace, contains no ASCII letter
// or digit (punctuation / symbols), or is one of `COALESCE_FILLER_WORDS`.
// A gap that yields no tokens counts as filler.
function isCoalesceFillerGap(gap) {
  if (gap.length > 40) {
    return false;
  }

  for (const { text: piece } of splitTextTokens(gap)) {
    const isWhitespace = /^\s+$/.test(piece);
    const isSymbolOnly = !/[A-Za-z0-9]/.test(piece);
    const isAllowed = isWhitespace
      || isSymbolOnly
      || COALESCE_FILLER_WORDS.has(piece.toLowerCase());
    if (!isAllowed) {
      return false;
    }
  }
  return true;
}
|
|
805
|
+
|
|
806
|
+
// Conservative coalescing of neighbouring token patches (spec §7).
//
// A patch joins the run in progress when all of the following hold:
//   - it lies entirely inside one sentence span of `group.oldText`, and that
//     span holds at least 3 token patches in total;
//   - it is in the same sentence span as the run;
//   - the unchanged text between it and the previous patch of the run passes
//     `isCoalesceFillerGap`.
// Runs of two or more patches are merged with `mergeTokenPatchRun`; all other
// patches are passed through untouched, in their original order.
//
// Inputs with fewer than 3 patches, and `null`, are returned as-is (same
// reference). Patch offsets stay absolute; `group.oldStart` is only used to
// index into `group.oldText`.
function coalesceTokenPatches(group, tokenPatches) {
  if (tokenPatches === null || tokenPatches.length < 3) {
    return tokenPatches;
  }

  const base = group.oldStart;
  const sentences = splitSentences(group.oldText);

  // Sentence index that fully contains each patch, or -1 when the patch is
  // not cleanly inside a single span. Resolved once per patch.
  const owners = tokenPatches.map(patch => sentences.findIndex(sentence => (
    patch.from - base >= sentence.start && patch.to - base <= sentence.end
  )));

  // Patches per sentence, for the ">= 3 in the span" gate.
  const tally = new Map();
  for (const owner of owners) {
    tally.set(owner, (tally.get(owner) || 0) + 1);
  }

  const output = [];
  let pending = [];
  let pendingOwner = -1;

  const emitPending = () => {
    if (pending.length === 1) {
      output.push(pending[0]);
    } else if (pending.length > 1) {
      output.push(mergeTokenPatchRun(group, pending));
    }
    pending = [];
  };

  tokenPatches.forEach((patch, position) => {
    const owner = owners[position];
    const eligible = owner !== -1 && tally.get(owner) >= 3;

    if (pending.length > 0) {
      const tail = pending[pending.length - 1];
      const gap = group.oldText.slice(tail.to - base, patch.from - base);
      if (eligible && owner === pendingOwner && isCoalesceFillerGap(gap)) {
        pending.push(patch);
        return;
      }
      emitPending();
    }

    // Start a new run with an eligible patch; emit an ineligible one directly.
    if (eligible) {
      pending = [patch];
      pendingOwner = owner;
    } else {
      pendingOwner = -1;
      output.push(patch);
    }
  });
  emitPending();

  return output;
}
|
|
892
|
+
|
|
893
|
+
// Collapses a run of adjacent token patches into one patch covering
// `[first.from, last.to)`.
//
// Between consecutive patches, the unchanged text of `group.oldText` (indexed
// relative to `group.oldStart`) is copied into both sides: `expected` becomes
// the patches' `expected` strings joined by those gaps, and `insert` becomes
// the patches' `insert` strings joined by the same gaps.
function mergeTokenPatchRun(group, run) {
  const [head, ...rest] = run;
  let expected = head.expected;
  let insert = head.insert;
  let cursor = head.to;

  for (const patch of rest) {
    const gap = group.oldText.slice(
      cursor - group.oldStart,
      patch.from - group.oldStart
    );
    expected += gap + patch.expected;
    insert += gap + patch.insert;
    cursor = patch.to;
  }

  return {
    from: head.from,
    to: cursor,
    expected,
    insert
  };
}
|
|
921
|
+
|
|
285
922
|
// Exported API of this module. The names and their order are unchanged.
const exportedApi = {
  computeTextPatches,
  computeSingleTextPatch,
  computeLineAnchoredChangeGroups,
  computeTokenAnchoredPatches,
  computeGroupMetrics,
  classifyChangedGroup,
  splitParagraphs,
  splitSentences,
  hasOriginalMarkerLine,
  hasLaterRevisedMarkerLine,
  hasAnyAnnotatedMarker,
  countNonEmptyLines,
  countSentenceTerminators,
  singleGroupPatch,
  computeParagraphPatches,
  computeSentencePatches,
  coalesceTokenPatches
};

module.exports = exportedApi;
|