codex-overleaf-link 1.3.0 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,12 +7,53 @@ function computeTextPatches(oldText, newText) {
7
7
  return [];
8
8
  }
9
9
 
10
- const linePatches = computeLineAnchoredPatches(oldValue, newValue);
11
- if (linePatches.length) {
12
- return linePatches;
10
+ const groups = computeLineAnchoredChangeGroups(oldValue, newValue);
11
+ if (!groups.length) {
12
+ return [computeSingleTextPatch(oldValue, newValue)];
13
13
  }
14
14
 
15
- return [computeSingleTextPatch(oldValue, newValue)];
15
+ const patches = [];
16
+ for (const group of groups) {
17
+ patches.push(...computeNaturalGroupPatches(group));
18
+ }
19
+ return patches.length ? patches : [computeSingleTextPatch(oldValue, newValue)];
20
+ }
21
+
22
// Produces the patches for one changed group at its natural review
// granularity. The group is tokenized, measured and classified, and the
// classification selects the builder:
//   - `paragraph_rewrite` -> computeParagraphPatches
//   - `sentence_rewrite`  -> computeSentencePatches
//   - `small_edit`        -> coalesceTokenPatches (only with token patches)
//   - anything else       -> one patch covering the whole group
// The paragraph and sentence builders may return `null` or an empty array; in
// both cases the whole-group patch is used instead.
//
// @param {{oldStart: number, oldText: string, newText: string}} group
// @returns {Array} the patches for this group (never empty on the
//   whole-group path, which always yields exactly one patch).
function computeNaturalGroupPatches(group) {
  const { oldText, newText, oldStart } = group;
  const tokenPatches = computeTokenAnchoredPatches(oldText, newText, oldStart);
  const metrics = computeGroupMetrics(group, tokenPatches);
  const { type } = classifyChangedGroup(group, tokenPatches, metrics);

  // Keeps a builder's result only when it actually produced patches.
  const orWholeGroup = candidate => (
    candidate && candidate.length ? candidate : singleGroupPatch(group)
  );

  switch (type) {
    case 'paragraph_rewrite':
      return orWholeGroup(computeParagraphPatches(group));
    case 'sentence_rewrite':
      return orWholeGroup(computeSentencePatches(group, tokenPatches));
    case 'small_edit':
      if (tokenPatches && tokenPatches.length) {
        return coalesceTokenPatches(group, tokenPatches);
      }
      return singleGroupPatch(group);
    default:
      // 'annotated_block', 'fallback', and any unrecognized type.
      return singleGroupPatch(group);
  }
}
17
58
 
18
59
  function computeSingleTextPatch(oldValue, newValue, offset = 0) {
@@ -41,7 +82,7 @@ function computeSingleTextPatch(oldValue, newValue, offset = 0) {
41
82
  };
42
83
  }
43
84
 
44
- function computeLineAnchoredPatches(oldValue, newValue) {
85
+ function computeLineAnchoredChangeGroups(oldValue, newValue) {
45
86
  const oldParts = splitTextParts(oldValue);
46
87
  const newParts = splitTextParts(newValue);
47
88
  const MAX_PARTS = 5000;
@@ -58,7 +99,7 @@ function computeLineAnchoredPatches(oldValue, newValue) {
58
99
  }
59
100
 
60
101
  const edits = computePartEdits(oldParts, newParts);
61
- const patches = [];
102
+ const groups = [];
62
103
  let oldOffset = 0;
63
104
  let newOffset = 0;
64
105
  let group = null;
@@ -91,20 +132,13 @@ function computeLineAnchoredPatches(oldValue, newValue) {
91
132
  }
92
133
  flushGroup();
93
134
 
94
- return patches;
135
+ return groups;
95
136
 
96
137
  function flushGroup() {
97
138
  if (!group) {
98
139
  return;
99
140
  }
100
- if (group.oldText !== group.newText) {
101
- const tokenPatches = computeTokenAnchoredPatches(group.oldText, group.newText, group.oldStart);
102
- if (tokenPatches) {
103
- patches.push(...tokenPatches);
104
- } else {
105
- patches.push(computeSingleTextPatch(group.oldText, group.newText, group.oldStart));
106
- }
107
- }
141
+ groups.push(group);
108
142
  group = null;
109
143
  }
110
144
  }
@@ -282,6 +316,625 @@ function computePartEdits(oldParts, newParts) {
282
316
  return edits;
283
317
  }
284
318
 
319
// Counts the lines of `text` (split on '\n') that contain something other
// than whitespace. `null`/`undefined` are treated as the empty string.
function countNonEmptyLines(text) {
  return String(text ?? '')
    .split('\n')
    .filter(line => line.trim() !== '')
    .length;
}
329
+
330
// Counts sentence-terminating characters in `text`.
//
// Counted:
//   - the CJK terminators U+3002 (ideographic full stop), U+FF1F (full-width
//     question mark) and U+FF01 (full-width exclamation mark);
//   - ASCII `?` and `!`;
//   - ASCII `.`, except when it sits directly between two ASCII digits
//     (a decimal point such as `1.23`).
//
// The full-width characters are written as escapes so that Unicode
// normalization of this source file cannot silently collapse them into their
// ASCII look-alikes.
//
// @param {*} text - value to scan; `null`/`undefined` count as ''.
// @returns {number} number of terminators found.
function countSentenceTerminators(text) {
  const value = String(text ?? '');
  const isDigit = char => /[0-9]/.test(char || '');
  let count = 0;

  for (let index = 0; index < value.length; index += 1) {
    const char = value[index];
    if (char === '\u3002' || char === '\uFF1F' || char === '\uFF01') {
      count += 1;
    } else if (char === '?' || char === '!') {
      count += 1;
    } else if (char === '.') {
      const isDecimalPoint = isDigit(value[index - 1]) && isDigit(value[index + 1]);
      if (!isDecimalPoint) {
        count += 1;
      }
    }
  }
  return count;
}
353
+
354
// True when some line of `text` consists solely of a `% [original]` marker
// (optional whitespace around the `%` and around the bracketed word).
function hasOriginalMarkerLine(text) {
  const originalMarker = /^\s*%\s*\[original\]\s*$/;
  for (const line of String(text ?? '').split('\n')) {
    if (originalMarker.test(line)) {
      return true;
    }
  }
  return false;
}
359
+
360
// True when `text` has a `% [revised]` marker line somewhere after its first
// `% [original]` marker line. Without an `% [original]` line the answer is
// always false.
function hasLaterRevisedMarkerLine(text) {
  const originalMarker = /^\s*%\s*\[original\]\s*$/;
  const revisedMarker = /^\s*%\s*\[revised\]\s*$/;
  let seenOriginal = false;

  for (const line of String(text ?? '').split('\n')) {
    if (!seenOriginal) {
      // Still looking for the first `% [original]` line.
      seenOriginal = originalMarker.test(line);
      continue;
    }
    if (revisedMarker.test(line)) {
      return true;
    }
  }
  return false;
}
370
+
371
// True when any line of `text` is an annotation marker line, i.e. consists
// solely of `% [original]` or `% [revised]` (whitespace-tolerant).
function hasAnyAnnotatedMarker(text) {
  const isMarkerLine = line => {
    if (/^\s*%\s*\[original\]\s*$/.test(line)) {
      return true;
    }
    return /^\s*%\s*\[revised\]\s*$/.test(line);
  };
  const lines = String(text ?? '').split('\n');
  return lines.some(isMarkerLine);
}
378
+
379
// Splits `text` into alternating content / blank-line-separator segments:
// `[content, separator, content, ...]`. Each segment is `{text, start}` with
// `start` the offset into `text`; concatenating the segment texts in order
// reproduces the input. A separator is a newline, optional whitespace, and
// another newline. The result always begins and ends with a content segment
// (possibly empty), so its length is odd.
function splitParagraphs(text) {
  const value = String(text ?? '');
  const segments = [];
  let cursor = 0;

  for (const separator of value.matchAll(/\n\s*\n/g)) {
    const separatorText = separator[0];
    segments.push(
      { text: value.slice(cursor, separator.index), start: cursor },
      { text: separatorText, start: separator.index }
    );
    cursor = separator.index + separatorText.length;
  }

  segments.push({ text: value.slice(cursor), start: cursor });
  return segments;
}
396
+
397
// Abbreviations (stored lowercase, without their final `.`) whose trailing
// dot is conservatively treated as NOT ending a sentence. Lookups lowercase
// the candidate word first, so capitalized forms such as `Fig.`, `Eq.` and
// `No.` are covered as well.
const NON_TERMINAL_ABBREVIATIONS = new Set([
  'e.g', 'i.e', 'cf', 'vs', 'etc', 'al',
  'fig', 'figs', 'eq', 'eqs', 'sec', 'secs',
  'thm', 'lem', 'def', 'prop', 'cor', 'ref',
  'no', 'vol', 'pp', 'ch', 'app',
  'resp', 'approx',
  'mr', 'ms', 'mrs', 'dr', 'prof', 'st'
]);
405
+
406
// True when `text[index]` is a backslash immediately followed by an ASCII
// letter or `@`, i.e. the start of a LaTeX control word such as `\cite`.
function isLatexCommandStart(text, index) {
  if (text[index] !== '\\') {
    return false;
  }
  const following = text[index + 1] || '';
  return /[A-Za-z@]/.test(following);
}
409
+
410
// Reports whether the whitespace-delimited run of characters around `index`
// looks like a URL: it either contains `://` or begins with `www.`
// (case-insensitive). Callers use this to avoid treating a `.` inside such a
// run as a sentence boundary.
function isInsideUrl(text, index) {
  const isWhitespace = char => /\s/.test(char);

  // Walk left to the first character of the run.
  let runStart = index;
  while (runStart > 0) {
    if (isWhitespace(text[runStart - 1])) {
      break;
    }
    runStart -= 1;
  }

  // Walk right to one past the last character of the run.
  let runEnd = index;
  while (runEnd < text.length) {
    if (isWhitespace(text[runEnd])) {
      break;
    }
    runEnd += 1;
  }

  const run = text.slice(runStart, runEnd);
  if (/:\/\//.test(run)) {
    return true;
  }
  return /^www\./i.test(run);
}
425
+
426
// Reports whether the `.` at `index` finishes a known abbreviation (for
// example `e.g.` or `Fig.`) instead of closing a sentence. The candidate word
// is the maximal run of ASCII letters and dots that ends just before `index`;
// it is lowercased and looked up in NON_TERMINAL_ABBREVIATIONS.
function completesAbbreviation(text, index) {
  let wordStart = index;
  while (wordStart > 0) {
    if (!/[A-Za-z.]/.test(text[wordStart - 1])) {
      break;
    }
    wordStart -= 1;
  }

  const word = text.slice(wordStart, index).toLowerCase();
  if (word === '') {
    return false;
  }
  return NON_TERMINAL_ABBREVIATIONS.has(word);
}
436
+
437
// Decides whether the ASCII terminator (`.`, `?` or `!`) at `index` is a
// confident sentence boundary.
//
// The terminator must be followed by end-of-string, whitespace, or the start
// of a LaTeX command. A `.` is additionally rejected when it is a decimal
// point, sits inside a URL-like run, or completes a known abbreviation.
function isConfidentAsciiBoundary(text, index) {
  const next = text[index + 1];
  const atEndOfText = next === undefined;
  if (!atEndOfText && !/\s/.test(next) && !isLatexCommandStart(text, index + 1)) {
    return false;
  }

  if (text[index] !== '.') {
    // `?` and `!` need no further checks.
    return true;
  }

  const prev = text[index - 1];
  // Defensive: after the check above `next` is never a digit, so this guard
  // cannot currently fire; it documents that `1.23` is not a boundary.
  const isDecimalPoint = /[0-9]/.test(prev || '') && /[0-9]/.test(next || '');
  return !(
    isDecimalPoint
    || isInsideUrl(text, index)
    || completesAbbreviation(text, index)
  );
}
464
+
465
// Splits `text` into ordered sentence spans `[{text, start, end}]` that
// partition the input exactly: concatenating the span texts reproduces
// `text`, and each span's `end` is the next span's `start`. A span includes
// its terminator and the whitespace that follows it.
//
// Boundaries:
//   - CJK terminators U+3002, U+FF1F and U+FF01 always end a sentence. They
//     are written as escapes so Unicode normalization of this file cannot
//     collapse the full-width forms into ASCII `?` / `!`.
//   - ASCII `.`, `?` and `!` end a sentence only when
//     `isConfidentAsciiBoundary` accepts them.
//
// Conservative: when no boundary is found, or the only boundary is at the very
// end of the input, the whole input is returned as a single span.
//
// @param {*} text - value to split; `null`/`undefined` count as ''.
// @returns {Array<{text: string, start: number, end: number}>} at least one
//   span; for '' a single empty span.
function splitSentences(text) {
  const value = String(text ?? '');
  if (value.length === 0) {
    return [{ text: '', start: 0, end: 0 }];
  }

  const CJK_TERMINATORS = '\u3002\uFF1F\uFF01';
  const ASCII_TERMINATORS = '.?!';
  const spans = [];
  let spanStart = 0;
  let index = 0;

  while (index < value.length) {
    const char = value[index];
    const isBoundary = CJK_TERMINATORS.includes(char)
      || (ASCII_TERMINATORS.includes(char) && isConfidentAsciiBoundary(value, index));

    if (isBoundary) {
      // Absorb trailing whitespace up to the next sentence into this span.
      let spanEnd = index + 1;
      while (spanEnd < value.length && /\s/.test(value[spanEnd])) {
        spanEnd += 1;
      }
      // Only cut when text remains; a terminator that closes the input stays
      // part of the final span pushed after the loop.
      if (spanEnd < value.length) {
        spans.push({
          text: value.slice(spanStart, spanEnd),
          start: spanStart,
          end: spanEnd
        });
        spanStart = spanEnd;
        index = spanEnd;
        continue;
      }
    }

    index += 1;
  }

  spans.push({
    text: value.slice(spanStart),
    start: spanStart,
    end: value.length
  });

  return spans;
}
521
+
522
// Gathers the size metrics used to classify a changed group.
//
// @param {{oldText: string, newText: string}} group
// @param {?Array<{from: number, to: number, insert: string}>} tokenPatches -
//   token patches for the group, or `null` when none are available.
// @returns {Object} line counts for both sides and their maximum, the larger
//   of the two text lengths, the token-patch count and total changed chars
//   (both `null` when `tokenPatches` is `null`), and the sentence-terminator
//   counts for both sides.
function computeGroupMetrics(group, tokenPatches) {
  const { oldText, newText } = group;
  const oldNonEmptyLineCount = countNonEmptyLines(oldText);
  const newNonEmptyLineCount = countNonEmptyLines(newText);
  const hasTokenPatches = tokenPatches !== null;

  // A patch's size is the larger of the range it replaces and its insert.
  const addPatchSize = (sum, patch) => (
    sum + Math.max(patch.to - patch.from, patch.insert.length)
  );

  return {
    oldNonEmptyLineCount,
    newNonEmptyLineCount,
    maxNonEmptyLineCount: Math.max(oldNonEmptyLineCount, newNonEmptyLineCount),
    changedSpanChars: Math.max(oldText.length, newText.length),
    tokenPatchCount: hasTokenPatches ? tokenPatches.length : null,
    totalTokenChangedChars: hasTokenPatches
      ? tokenPatches.reduce(addPatchSize, 0)
      : null,
    oldSentenceTerminatorCount: countSentenceTerminators(oldText),
    newSentenceTerminatorCount: countSentenceTerminators(newText)
  };
}
541
+
542
// Finds the one sentence span of `group.oldText` that contains every token
// patch, if such a span exists and is unique.
//
// `group.oldText` is segmented with `splitSentences`; a span "contains" a
// patch when the patch's old range, made group-relative by subtracting
// `group.oldStart`, lies within `[span.start, span.end]`.
//
// Returns an object with:
//   - `fitsOneSpan`: true iff exactly one span contains every patch. Several
//     spans can qualify when a zero-length patch sits on a span boundary;
//     that case is reported as false.
//   - `spanChars` / `spanTokenCount`: character length and `splitTextTokens`
//     token count of that span, or 0 when `fitsOneSpan` is false.
//   - `spanStart` / `spanEnd`: group-relative `[start, end)` of that span, or
//     0 when `fitsOneSpan` is false.
//
// `fitsOneSpan` is false whenever `tokenPatches` is `null` or empty.
function resolveTokenPatchSentenceSpan(group, tokenPatches) {
  const noSingleSpan = {
    fitsOneSpan: false,
    spanChars: 0,
    spanTokenCount: 0,
    spanStart: 0,
    spanEnd: 0
  };
  if (tokenPatches === null || tokenPatches.length === 0) {
    return noSingleSpan;
  }

  const { oldStart, oldText } = group;
  const containsEveryPatch = span => tokenPatches.every(patch => (
    patch.from - oldStart >= span.start && patch.to - oldStart <= span.end
  ));

  const candidates = splitSentences(oldText).filter(containsEveryPatch);
  if (candidates.length !== 1) {
    // Either no span holds all patches, or the choice is ambiguous.
    return noSingleSpan;
  }

  const [sentence] = candidates;
  return {
    fitsOneSpan: true,
    spanChars: sentence.text.length,
    spanTokenCount: splitTextTokens(sentence.text).length,
    spanStart: sentence.start,
    spanEnd: sentence.end
  };
}
599
+
600
// Classifies a changed group into a review granularity. Has no side effects.
//
// @param {{oldStart: number, oldText: string, newText: string}} group
// @param {?Array} tokenPatches - result of
//   `computeTokenAnchoredPatches(group.oldText, group.newText, group.oldStart)`,
//   or `null`.
// @param {Object} metrics - result of `computeGroupMetrics(group, tokenPatches)`.
// @returns {{type: string}} `type` is the first matching entry of:
//   1. `annotated_block`   - new text has an `% [original]` marker line, a
//      later `% [revised]` marker line, and at least 3 non-empty lines on one
//      side.
//   2. `paragraph_rewrite` - at least 3 non-empty lines on one side, or both
//      sides have >= 2 sentence terminators, or the token rewrite is dense
//      (>= 6 patches, >= 160 changed-span chars, >= 2 patches per line).
//   3. `sentence_rewrite`  - >= 3 token patches that all fit one sentence
//      span of >= 80 chars or >= 12 tokens.
//   4. `small_edit`        - <= 2 token patches, or < 80 changed token chars
//      on at most 2 lines with no annotation marker in the new text.
//   5. `fallback`          - everything else.
//   With `tokenPatches === null` only `annotated_block`, `paragraph_rewrite`
//   (via the line-count or terminator branch) and `fallback` are reachable.
function classifyChangedGroup(group, tokenPatches, metrics) {
  const { newText } = group;
  const {
    maxNonEmptyLineCount,
    changedSpanChars,
    tokenPatchCount,
    totalTokenChangedChars,
    oldSentenceTerminatorCount,
    newSentenceTerminatorCount
  } = metrics;
  const hasTokenPatches = tokenPatches !== null;

  if (
    hasOriginalMarkerLine(newText)
    && hasLaterRevisedMarkerLine(newText)
    && maxNonEmptyLineCount >= 3
  ) {
    return { type: 'annotated_block' };
  }

  const spansManyLines = maxNonEmptyLineCount >= 3;
  const isMultiSentenceOnBothSides = oldSentenceTerminatorCount >= 2
    && newSentenceTerminatorCount >= 2;
  const isDenseTokenRewrite = hasTokenPatches
    && tokenPatchCount >= 6
    && changedSpanChars >= 160
    && tokenPatchCount / Math.max(1, maxNonEmptyLineCount) >= 2;
  if (spansManyLines || isMultiSentenceOnBothSides || isDenseTokenRewrite) {
    return { type: 'paragraph_rewrite' };
  }

  // Every remaining category depends on token patches.
  if (!hasTokenPatches) {
    return { type: 'fallback' };
  }

  const sentenceSpan = resolveTokenPatchSentenceSpan(group, tokenPatches);
  const isLongEnoughSentence = sentenceSpan.spanChars >= 80
    || sentenceSpan.spanTokenCount >= 12;
  if (tokenPatchCount >= 3 && sentenceSpan.fitsOneSpan && isLongEnoughSentence) {
    return { type: 'sentence_rewrite' };
  }

  if (tokenPatchCount <= 2) {
    return { type: 'small_edit' };
  }
  const isCompactUnannotatedChange = totalTokenChangedChars < 80
    && maxNonEmptyLineCount <= 2
    && !hasAnyAnnotatedMarker(newText);
  if (isCompactUnannotatedChange) {
    return { type: 'small_edit' };
  }

  return { type: 'fallback' };
}
677
+
678
// Whole-group fallback: one patch replacing `group.oldText` with
// `group.newText`. `group.oldStart` is passed to `computeSingleTextPatch` as
// its offset argument. The patch is wrapped in a one-element array so that
// every patch builder has the same array-shaped result.
function singleGroupPatch(group) {
  const { oldText, newText, oldStart } = group;
  const wholeGroupPatch = computeSingleTextPatch(oldText, newText, oldStart);
  return [wholeGroupPatch];
}
685
+
686
// Builds one patch per changed paragraph of a group.
//
// Both `group.oldText` and `group.newText` are segmented with
// `splitParagraphs` into `[content, separator, content, ...]`. Paragraphs are
// paired by position, which is only sound when both sides have the same
// number of segments and every separator (odd index) is textually identical.
// Each differing content pair (even index) yields one `computeSingleTextPatch`
// whose offset is `group.oldStart` plus the old paragraph's start.
//
// @returns {?Array} the patches (empty when no paragraph differs), or `null`
//   when the separator structure differs and pairing would be ambiguous.
function computeParagraphPatches(group) {
  const oldSegments = splitParagraphs(group.oldText);
  const newSegments = splitParagraphs(group.newText);
  if (oldSegments.length !== newSegments.length) {
    return null;
  }

  const isSeparator = position => position % 2 === 1;
  const separatorChanged = oldSegments.some((segment, position) => (
    isSeparator(position) && segment.text !== newSegments[position].text
  ));
  if (separatorChanged) {
    return null;
  }

  const patches = [];
  oldSegments.forEach((oldParagraph, position) => {
    if (isSeparator(position)) {
      return;
    }
    const newParagraph = newSegments[position];
    if (oldParagraph.text === newParagraph.text) {
      return;
    }
    patches.push(computeSingleTextPatch(
      oldParagraph.text,
      newParagraph.text,
      group.oldStart + oldParagraph.start
    ));
  });
  return patches;
}
729
+
730
// Builds one sentence-level patch for a group whose token patches all fall
// inside a single sentence span `[spanStart, spanEnd)` of `group.oldText`.
//
// The text before and after that span is expected to be unchanged, so the new
// sentence is whatever remains of `group.newText` once the old head and tail
// are stripped. The patch covers only the old sentence, with
// `group.oldStart + spanStart` as its offset.
//
// @returns {?Array} `[patch]`, or `null` when no unique sentence span exists
//   or the head/tail of the new text do not match the old ones.
function computeSentencePatches(group, tokenPatches) {
  const { fitsOneSpan, spanStart, spanEnd } = resolveTokenPatchSentenceSpan(
    group,
    tokenPatches
  );
  if (!fitsOneSpan) {
    return null;
  }

  const { oldText, newText, oldStart } = group;
  const unchangedHead = oldText.slice(0, spanStart);
  const unchangedTail = oldText.slice(spanEnd);

  // Head and tail must both be present in the new text without overlapping;
  // otherwise a change leaked outside the sentence span.
  const headAndTailIntact = newText.startsWith(unchangedHead)
    && newText.endsWith(unchangedTail)
    && newText.length >= unchangedHead.length + unchangedTail.length;
  if (!headAndTailIntact) {
    return null;
  }

  const oldSentence = oldText.slice(spanStart, spanEnd);
  const newSentence = newText.slice(
    unchangedHead.length,
    newText.length - unchangedTail.length
  );
  return [computeSingleTextPatch(oldSentence, newSentence, oldStart + spanStart)];
}
772
+
773
// Short function words (lowercase) that may appear in the gap between two
// token patches without preventing the patches from being merged. Together
// with whitespace and punctuation they make up "connective filler".
const COALESCE_FILLER_WORDS = new Set([
  'a', 'an', 'the',
  'and', 'or', 'but', 'nor', 'so', 'yet',
  'of', 'to', 'in', 'on', 'at', 'by', 'as',
  'is', 'are', 'was', 'were', 'be',
  'for', 'with',
  'that', 'this', 'it', 'its', 'we', 'our'
]);
781
+
782
// True when the unchanged text between two token patches is short connective
// filler: at most 40 characters made up only of whitespace, punctuation or
// symbols, and words listed in COALESCE_FILLER_WORDS. An empty gap counts as
// filler.
//
// A token is treated as a word when it contains any Unicode letter or number.
// Testing only for ASCII letters and digits would classify non-ASCII words
// (for example CJK text) as punctuation and let substantive text pass as
// filler.
//
// @param {string} gap - text between two adjacent token patches.
// @returns {boolean}
function isCoalesceFillerGap(gap) {
  if (gap.length > 40) {
    return false;
  }
  for (const token of splitTextTokens(gap)) {
    const text = token.text;
    if (/^\s+$/.test(text)) {
      continue;
    }
    if (!/[\p{L}\p{N}]/u.test(text)) {
      // Pure punctuation / symbols.
      continue;
    }
    if (COALESCE_FILLER_WORDS.has(text.toLowerCase())) {
      continue;
    }
    // A word that is not on the filler list blocks the merge.
    return false;
  }
  return true;
}
805
+
806
// Conservatively merges neighbouring token patches of a group.
//
// Two consecutive patches are merged when all of the following hold:
//   - both lie in the same sentence span of `group.oldText` (the first span
//     from `splitSentences` that fully contains the patch's group-relative
//     range);
//   - that span holds at least 3 token patches in total;
//   - the unchanged text between them passes `isCoalesceFillerGap`.
// Runs of two or more mergeable patches are combined by `mergeTokenPatchRun`;
// all other patches are passed through as-is, in their original order.
//
// @param {{oldStart: number, oldText: string}} group
// @param {?Array<{from: number, to: number}>} tokenPatches
// @returns {?Array} `tokenPatches` itself when it is `null` or has fewer than
//   3 entries; otherwise a new array of merged and pass-through patches.
function coalesceTokenPatches(group, tokenPatches) {
  if (tokenPatches === null || tokenPatches.length < 3) {
    return tokenPatches;
  }

  const { oldStart, oldText } = group;
  const spans = splitSentences(oldText);

  // Sentence-span index for each patch, or -1 when no single span holds it.
  const spanIndexes = tokenPatches.map(patch => spans.findIndex(span => (
    patch.from - oldStart >= span.start && patch.to - oldStart <= span.end
  )));

  // Tally patches per span so the ">= 3 in the span" gate is known up front.
  const patchesPerSpan = new Map();
  for (const spanIndex of spanIndexes) {
    patchesPerSpan.set(spanIndex, (patchesPerSpan.get(spanIndex) || 0) + 1);
  }

  const result = [];
  let run = [];
  let runSpanIndex = -1;

  // Emits the pending run: a lone patch unchanged, a longer run merged.
  const flushRun = () => {
    if (run.length === 1) {
      result.push(run[0]);
    } else if (run.length > 1) {
      result.push(mergeTokenPatchRun(group, run));
    }
    run = [];
  };

  tokenPatches.forEach((patch, position) => {
    const spanIndex = spanIndexes[position];
    const eligibleSpan = spanIndex !== -1 && patchesPerSpan.get(spanIndex) >= 3;

    if (run.length > 0) {
      const prev = run[run.length - 1];
      const gap = oldText.slice(prev.to - oldStart, patch.from - oldStart);
      if (eligibleSpan && spanIndex === runSpanIndex && isCoalesceFillerGap(gap)) {
        run.push(patch);
        return;
      }
      flushRun();
    }

    // No pending run here: either open a new one or pass the patch through.
    if (eligibleSpan) {
      run = [patch];
      runSpanIndex = spanIndex;
    } else {
      runSpanIndex = -1;
      result.push(patch);
    }
  });
  flushRun();

  return result;
}
892
+
893
// Collapses a run of adjacent token patches into a single patch covering
// `[first.from, last.to)`. Both `expected` and `insert` are built by joining
// the individual patches' values with the unchanged text of `group.oldText`
// that lies between consecutive patches, so the merged patch rewrites the
// whole stretch while leaving the gaps as they were.
function mergeTokenPatchRun(group, run) {
  const { oldText, oldStart } = group;
  const [first, ...rest] = run;

  let expected = first.expected;
  let insert = first.insert;
  let previous = first;

  for (const current of rest) {
    const gap = oldText.slice(previous.to - oldStart, current.from - oldStart);
    expected += gap + current.expected;
    insert += gap + current.insert;
    previous = current;
  }

  return {
    from: first.from,
    to: previous.to,
    expected,
    insert
  };
}
921
+
285
922
  module.exports = {
286
- computeTextPatches
923
+ computeTextPatches,
924
+ computeSingleTextPatch,
925
+ computeLineAnchoredChangeGroups,
926
+ computeTokenAnchoredPatches,
927
+ computeGroupMetrics,
928
+ classifyChangedGroup,
929
+ splitParagraphs,
930
+ splitSentences,
931
+ hasOriginalMarkerLine,
932
+ hasLaterRevisedMarkerLine,
933
+ hasAnyAnnotatedMarker,
934
+ countNonEmptyLines,
935
+ countSentenceTerminators,
936
+ singleGroupPatch,
937
+ computeParagraphPatches,
938
+ computeSentencePatches,
939
+ coalesceTokenPatches
287
940
  };