@createiq/htmldiff 1.2.0-beta.1 → 1.2.0-beta.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +249 -52
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +72 -18
- package/dist/HtmlDiff.d.mts +72 -18
- package/dist/HtmlDiff.mjs +244 -52
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/HtmlDiff.ts +172 -48
- package/src/ThreeWayDiff.ts +58 -11
- package/src/ThreeWayTable.ts +143 -9
- package/test/HtmlDiff.spec.ts +15 -0
- package/test/HtmlDiff.threeWay.spec.ts +232 -6
- package/test/HtmlDiff.threeWay.tables.spec.ts +111 -1
- package/test/Utils.spec.ts +3 -3
package/package.json
CHANGED
package/src/HtmlDiff.ts
CHANGED
|
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
|
|
|
71
71
|
*/
|
|
72
72
|
export type ThreeWayOptions = AnalyzeOptions
|
|
73
73
|
|
|
74
|
+
/**
|
|
75
|
+
* Opinionated options that align htmldiff's output with Microsoft Word's
|
|
76
|
+
* track-changes rendering for legal-document rewrites.
|
|
77
|
+
*
|
|
78
|
+
* The library's bare default (`orphanMatchThreshold = 0`) keeps every
|
|
79
|
+
* LCS match, however small — which fragments long sentence rewrites
|
|
80
|
+
* into many tiny ins/del pairs around stray word matches ("of", "the",
|
|
81
|
+
* "shall"). Word collapses those into a single coarse del+ins, which is
|
|
82
|
+
* dramatically more readable for legal text.
|
|
83
|
+
*
|
|
84
|
+
* 0.25 was tuned empirically against a customer Word reference (US
|
|
85
|
+
* Commercial One CP, May 2026):
|
|
86
|
+
* - short edits (typo / one-word insert): output identical to
|
|
87
|
+
* threshold=0 — inter-match distances are tiny so every match
|
|
88
|
+
* trivially clears the bar;
|
|
89
|
+
* - long rewrites (the "Specified Indebtedness" rewrite in the
|
|
90
|
+
* reference): previously produced 6 dels + 5 ins fragmented around
|
|
91
|
+
* stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
|
|
92
|
+
* Word's 1+1 and a major readability win;
|
|
93
|
+
* - higher values (0.3+) collapsed short edits containing inline
|
|
94
|
+
* formatting changes into a single block — too aggressive.
|
|
95
|
+
*
|
|
96
|
+
* Consumers rendering legal documents should spread this into their
|
|
97
|
+
* options:
|
|
98
|
+
* `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
|
|
99
|
+
* `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
|
|
100
|
+
*
|
|
101
|
+
* Other consumers (machine-readable diff, exact-token alignment) can
|
|
102
|
+
* keep the bare default.
|
|
103
|
+
*/
|
|
104
|
+
export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
|
|
105
|
+
orphanMatchThreshold: 0.25,
|
|
106
|
+
}
|
|
107
|
+
|
|
74
108
|
export default class HtmlDiff {
|
|
75
109
|
/**
|
|
76
110
|
* This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
|
|
@@ -162,7 +196,22 @@ export default class HtmlDiff {
|
|
|
162
196
|
// constructor overload that would re-leak the parameter we just hid.
|
|
163
197
|
private tablePreprocessDepth = 0
|
|
164
198
|
|
|
165
|
-
|
|
199
|
+
/**
|
|
200
|
+
* Tracks currently-open formatting-tag wraps. Each entry pairs the
|
|
201
|
+
* opening tag (so a later closing tag can find its match) with the
|
|
202
|
+
* styling info needed to RE-OPEN the wrap if an overlapping
|
|
203
|
+
* formatting-tag close forces it to split. Without the styling info,
|
|
204
|
+
* an overlap like `<strong>X</strong>` ↔ `<u>X</u>` produces an
|
|
205
|
+
* unclosable wrap (the closing tag for the outer wrap arrives while
|
|
206
|
+
* an inner wrap is still on the stack); see `insertTag`'s closing
|
|
207
|
+
* handler for the split logic.
|
|
208
|
+
*/
|
|
209
|
+
private specialTagDiffStack: Array<{
|
|
210
|
+
tag: string
|
|
211
|
+
styledTagNames: string
|
|
212
|
+
cssClass: string
|
|
213
|
+
metadata: WrapMetadata | undefined
|
|
214
|
+
}> = []
|
|
166
215
|
private newWords: string[] = []
|
|
167
216
|
private oldWords: string[] = []
|
|
168
217
|
/**
|
|
@@ -234,8 +283,31 @@ export default class HtmlDiff {
|
|
|
234
283
|
this.newText = newText
|
|
235
284
|
}
|
|
236
285
|
|
|
237
|
-
|
|
238
|
-
|
|
286
|
+
/**
|
|
287
|
+
* Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
|
|
288
|
+
* `executeThreeWay`, with two intentional exceptions documented
|
|
289
|
+
* inline below. Consumers wanting Word-aligned output should spread
|
|
290
|
+
* `WORD_ALIGNED_OPTIONS` into the third argument.
|
|
291
|
+
*
|
|
292
|
+
* Note: unlike `analyze`, `execute` runs `build()` which performs
|
|
293
|
+
* full table preprocessing — `tablePreprocessDepth` stays at 0 so
|
|
294
|
+
* the recursive cell diff can happen. Callers can't override that.
|
|
295
|
+
*/
|
|
296
|
+
static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
|
|
297
|
+
const inner = new HtmlDiff(oldText, newText)
|
|
298
|
+
if (options.blockExpressions) {
|
|
299
|
+
for (const expr of options.blockExpressions) inner.addBlockExpression(expr)
|
|
300
|
+
}
|
|
301
|
+
if (options.repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy
|
|
302
|
+
if (options.orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = options.orphanMatchThreshold
|
|
303
|
+
if (options.ignoreWhitespaceDifferences !== undefined) {
|
|
304
|
+
inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
|
|
305
|
+
}
|
|
306
|
+
// `useProjections` is intentionally NOT plumbed here — the 2-way
|
|
307
|
+
// path's build() runs its own heuristic. `analyze` honours it; if
|
|
308
|
+
// you need to force it for a 2-way result, route through `analyze`
|
|
309
|
+
// and consume the operations directly.
|
|
310
|
+
return inner.build()
|
|
239
311
|
}
|
|
240
312
|
|
|
241
313
|
/**
|
|
@@ -320,22 +392,6 @@ export default class HtmlDiff {
|
|
|
320
392
|
return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
|
|
321
393
|
}
|
|
322
394
|
|
|
323
|
-
/**
|
|
324
|
-
* Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
|
|
325
|
-
* version CP sent back), and V3 (Me's current draft), produces a
|
|
326
|
-
* single attributed HTML output where CP's and Me's changes are
|
|
327
|
-
* distinguished by `data-author` ('cp' or 'me') and matching
|
|
328
|
-
* `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
|
|
329
|
-
* CP's proposal" case (Me deleted text CP had inserted) gets a
|
|
330
|
-
* dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
|
|
331
|
-
*
|
|
332
|
-
* Coordinates the symmetric-projection decision (D1) across both
|
|
333
|
-
* internal `analyze` calls so V2 tokenises identically on each side
|
|
334
|
-
* of the spine. When `useProjections` is left undefined, the decision
|
|
335
|
-
* is the conjunction of both pair-wise heuristics — project iff both
|
|
336
|
-
* pairs would project on their own. Pass an explicit boolean to
|
|
337
|
-
* override.
|
|
338
|
-
*/
|
|
339
395
|
/**
|
|
340
396
|
* Three-way HTML diff against a shared genesis. Produces attributed
|
|
341
397
|
* HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
|
|
@@ -424,6 +480,25 @@ export default class HtmlDiff {
|
|
|
424
480
|
* buffer. Reusing the instance keeps the formatting-tag stack
|
|
425
481
|
* (`specialTagDiffStack`) coherent across segments — a `<strong>`
|
|
426
482
|
* opened in one segment and closed in another stays balanced.
|
|
483
|
+
*
|
|
484
|
+
* Edge case: an ins/del segment can open a formatting wrap whose
|
|
485
|
+
* matching closer ends up in an equal segment (`<strong>` deleted
|
|
486
|
+
* by CP but `</strong>` kept by both — buildSegments emits the open
|
|
487
|
+
* as del-cp and the close as equal). Equal segments bypass
|
|
488
|
+
* `insertTag` and push raw, so the stack entry for the open is
|
|
489
|
+
* never popped. Rather than throw — which forces the caller's UI
|
|
490
|
+
* into an error boundary — close every leftover wrap with `</ins>`
|
|
491
|
+
* at the end of emission.
|
|
492
|
+
*
|
|
493
|
+
* Caveat: the `</ins>` close is honest for the mod-wrap that the
|
|
494
|
+
* opener pushed (every formatting opener emits an inner `<ins…>`
|
|
495
|
+
* postInject regardless of whether the outer segment is ins or
|
|
496
|
+
* del). For del-segment formatting openers the outer `<del>` may
|
|
497
|
+
* itself be left open by the same emission imbalance; this fixup
|
|
498
|
+
* doesn't address that. Downstream browsers/DOMParser normalise
|
|
499
|
+
* mildly-malformed HTML by closing dangling tags, so the rendered
|
|
500
|
+
* output is usually acceptable — but the warning IS the signal
|
|
501
|
+
* that the input had a real imbalance worth investigating.
|
|
427
502
|
*/
|
|
428
503
|
private static emitSegments(segments: Segment[]): string {
|
|
429
504
|
const emitter = new HtmlDiff('', '')
|
|
@@ -436,18 +511,21 @@ export default class HtmlDiff {
|
|
|
436
511
|
// insertTag mutates its `words` array; pass a copy.
|
|
437
512
|
emitter.insertTag(tag, baseClass, [...seg.words], metadata)
|
|
438
513
|
}
|
|
439
|
-
// Stack-balance invariant: every special-case opening tag pushed onto
|
|
440
|
-
// `specialTagDiffStack` during emission must have been matched by a
|
|
441
|
-
// closing tag. An unbalanced stack means the input had unbalanced
|
|
442
|
-
// formatting tags AND a Replace at an inconvenient position — the
|
|
443
|
-
// output would be silently malformed (half-closed `<ins>`). Fail
|
|
444
|
-
// loudly so the caller can investigate rather than ship broken HTML.
|
|
445
514
|
if (emitter.specialTagDiffStack.length > 0) {
|
|
446
|
-
|
|
515
|
+
// Log once so we can spot bad inputs in dev tools, but don't
|
|
516
|
+
// throw — the caller's only fallback was to crash the React
|
|
517
|
+
// tree, which is worse than emitting slightly-imperfect HTML.
|
|
518
|
+
// eslint-disable-next-line no-console
|
|
519
|
+
console.warn(
|
|
447
520
|
`HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} ` +
|
|
448
|
-
'unclosed formatting
|
|
449
|
-
'
|
|
521
|
+
'unclosed formatting wrap(s) on the stack. Closing defensively. ' +
|
|
522
|
+
'This usually means a formatting tag opens in a del/ins segment ' +
|
|
523
|
+
'and its matching closer is in an equal segment.'
|
|
450
524
|
)
|
|
525
|
+
while (emitter.specialTagDiffStack.length > 0) {
|
|
526
|
+
emitter.content.push('</ins>')
|
|
527
|
+
emitter.specialTagDiffStack.pop()
|
|
528
|
+
}
|
|
451
529
|
}
|
|
452
530
|
return emitter.content.join('')
|
|
453
531
|
}
|
|
@@ -827,8 +905,13 @@ export default class HtmlDiff {
|
|
|
827
905
|
// if there are nonTags, the index of the last tag is the index before the first nonTag.
|
|
828
906
|
const indexLastTagInFirstTagBlock = indexOfFirstNonTag === -1 ? words.length - 1 : indexOfFirstNonTag - 1
|
|
829
907
|
|
|
830
|
-
|
|
831
|
-
|
|
908
|
+
// Pre-injection sits BEFORE the extracted tag-block content (used
|
|
909
|
+
// by closing tags so `</ins></strong>` reads left-to-right).
|
|
910
|
+
// Post-injection sits AFTER (used by opening tags so the rendered
|
|
911
|
+
// order is `<strong><ins ...>` and by the overlap-split case so
|
|
912
|
+
// the re-opened `<ins>`s sit AFTER the actual closing tag).
|
|
913
|
+
let preInject = ''
|
|
914
|
+
let postInject = ''
|
|
832
915
|
|
|
833
916
|
// handle opening tag
|
|
834
917
|
if (HtmlDiff.SpecialCaseOpeningTagRegex.test(words[0])) {
|
|
@@ -840,10 +923,11 @@ export default class HtmlDiff {
|
|
|
840
923
|
}
|
|
841
924
|
const styledTagNames = Array.from(tagNames).join(' ')
|
|
842
925
|
|
|
843
|
-
this.specialTagDiffStack.push(words[0])
|
|
844
926
|
// Carry the caller's metadata into the formatting-tag wrapper so
|
|
845
927
|
// a 3-way author tag survives a `<strong>`/`<em>` content edit.
|
|
846
|
-
|
|
928
|
+
const styledCssClass = `mod ${styledTagNames}`
|
|
929
|
+
this.specialTagDiffStack.push({ tag: words[0], styledTagNames, cssClass: styledCssClass, metadata })
|
|
930
|
+
postInject = `<ins${Utils.composeTagAttributes(styledCssClass, metadata ?? {})}>`
|
|
847
931
|
if (tag === HtmlDiff.DelTag) {
|
|
848
932
|
words.shift()
|
|
849
933
|
|
|
@@ -855,7 +939,6 @@ export default class HtmlDiff {
|
|
|
855
939
|
}
|
|
856
940
|
// handle closing tag
|
|
857
941
|
else if (HtmlDiff.SpecialCaseClosingTagsSet.has(words[0].toLowerCase())) {
|
|
858
|
-
const openingTag = this.specialTagDiffStack.length === 0 ? null : this.specialTagDiffStack.pop()
|
|
859
942
|
// For delete operations: when the tag block contains a mix of formatting and
|
|
860
943
|
// non-formatting closing tags (e.g. </strong></div>), compare against the first
|
|
861
944
|
// closing tag (the formatting one) rather than the last tag in the block.
|
|
@@ -870,19 +953,39 @@ export default class HtmlDiff {
|
|
|
870
953
|
tagIndexToCompare = 0
|
|
871
954
|
}
|
|
872
955
|
}
|
|
873
|
-
const openingAndClosingTagsMatch =
|
|
874
|
-
!!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
|
|
875
956
|
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
957
|
+
// Search the stack for a matching opener (LIFO). When the match
|
|
958
|
+
// is the top entry, this is the normal balanced case and we
|
|
959
|
+
// emit a single `</ins>` before the closing tag. When the match
|
|
960
|
+
// is below an unmatched opener — i.e. another formatting wrap
|
|
961
|
+
// opened after it but hasn't been closed yet — the wraps
|
|
962
|
+
// overlap in source order, which has no valid LIFO HTML
|
|
963
|
+
// expression. Resolve by SPLITTING the wraps: close everything
|
|
964
|
+
// above the match (their `<ins>`s and the match's `<ins>`), then
|
|
965
|
+
// re-open the above wraps with fresh `<ins>` tags AFTER the
|
|
966
|
+
// closing tag emits. The above wraps continue to apply until
|
|
967
|
+
// their own closing tag arrives.
|
|
968
|
+
const closingTagName = Utils.getTagName(words[tagIndexToCompare])
|
|
969
|
+
let matchIdx = -1
|
|
970
|
+
for (let i = this.specialTagDiffStack.length - 1; i >= 0; i--) {
|
|
971
|
+
if (Utils.getTagName(this.specialTagDiffStack[i].tag) === closingTagName) {
|
|
972
|
+
matchIdx = i
|
|
973
|
+
break
|
|
974
|
+
}
|
|
879
975
|
}
|
|
880
976
|
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
977
|
+
if (matchIdx >= 0) {
|
|
978
|
+
const aboveEntries = this.specialTagDiffStack.splice(matchIdx + 1)
|
|
979
|
+
this.specialTagDiffStack.pop() // pop the matched entry
|
|
980
|
+
// One `</ins>` per above entry, then one for the match itself.
|
|
981
|
+
preInject = '</ins>'.repeat(aboveEntries.length + 1)
|
|
982
|
+
for (const entry of aboveEntries) {
|
|
983
|
+
postInject += `<ins${Utils.composeTagAttributes(entry.cssClass, entry.metadata ?? {})}>`
|
|
984
|
+
this.specialTagDiffStack.push(entry) // their wrap continues via the new <ins>
|
|
985
|
+
}
|
|
885
986
|
}
|
|
987
|
+
// No match in stack — orphan closing tag, drop the `<ins>` work
|
|
988
|
+
// and just let the tag itself flow through extractConsecutiveWords.
|
|
886
989
|
|
|
887
990
|
if (tag === HtmlDiff.DelTag) {
|
|
888
991
|
words.shift()
|
|
@@ -893,7 +996,7 @@ export default class HtmlDiff {
|
|
|
893
996
|
}
|
|
894
997
|
}
|
|
895
998
|
|
|
896
|
-
if (words.length === 0 && specialCaseTagInjection.length === 0) {
|
|
999
|
+
if (words.length === 0 && preInject.length === 0 && postInject.length === 0) {
|
|
897
1000
|
break
|
|
898
1001
|
}
|
|
899
1002
|
|
|
@@ -909,11 +1012,7 @@ export default class HtmlDiff {
|
|
|
909
1012
|
!HtmlDiff.SpecialCaseClosingTagsSet.has(x.toLowerCase())
|
|
910
1013
|
: Utils.isTag
|
|
911
1014
|
|
|
912
|
-
|
|
913
|
-
this.content.push(specialCaseTagInjection + this.extractConsecutiveWords(words, isTagForExtraction).join(''))
|
|
914
|
-
} else {
|
|
915
|
-
this.content.push(this.extractConsecutiveWords(words, isTagForExtraction).join('') + specialCaseTagInjection)
|
|
916
|
-
}
|
|
1015
|
+
this.content.push(preInject + this.extractConsecutiveWords(words, isTagForExtraction).join('') + postInject)
|
|
917
1016
|
|
|
918
1017
|
if (words.length === 0) continue
|
|
919
1018
|
|
|
@@ -1020,6 +1119,31 @@ export default class HtmlDiff {
|
|
|
1020
1119
|
continue
|
|
1021
1120
|
}
|
|
1022
1121
|
|
|
1122
|
+
// Never orphan-reject a match whose tokens are ALL HTML tags.
|
|
1123
|
+
// Tag tokens are structural; rejecting `</strong>` / `</em>` as
|
|
1124
|
+
// an orphan match between two content deletions merges the tag
|
|
1125
|
+
// into the deletion, leaving the matching opener unclosed —
|
|
1126
|
+
// browsers then auto-close the opener at the END of the
|
|
1127
|
+
// deletion, producing visually-wrong output (e.g. the body of
|
|
1128
|
+
// a section deletion rendered as bold-italic because the
|
|
1129
|
+
// closing `</strong></em>` ended up after the body deletion
|
|
1130
|
+
// rather than after the heading). The orphan threshold is
|
|
1131
|
+
// designed for stray word matches between heavily-edited spans,
|
|
1132
|
+
// not for formatting boundaries.
|
|
1133
|
+
let allTags = true
|
|
1134
|
+
for (let i = curr.startInNew; i < curr.endInNew; i++) {
|
|
1135
|
+
if (!Utils.isTag(wordsForDiffNew[i])) {
|
|
1136
|
+
allTags = false
|
|
1137
|
+
break
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
if (allTags) {
|
|
1141
|
+
yield curr
|
|
1142
|
+
prev = curr
|
|
1143
|
+
curr = next
|
|
1144
|
+
continue
|
|
1145
|
+
}
|
|
1146
|
+
|
|
1023
1147
|
let oldDistanceInChars = 0
|
|
1024
1148
|
for (let i = prev.endInOld; i < next.startInOld; i++) {
|
|
1025
1149
|
oldDistanceInChars += wordsForDiffOld[i].length
|
package/src/ThreeWayDiff.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import Action from './Action'
|
|
2
|
+
import { lcsAlign } from './Alignment'
|
|
2
3
|
import type { AnalyzeResult } from './HtmlDiff'
|
|
3
4
|
import type Operation from './Operation'
|
|
4
5
|
import type { WrapMetadata } from './Utils'
|
|
@@ -183,13 +184,33 @@ function collectInsertionsKeyedByEnd(d: AnalyzeResult): Map<number, string[]> {
|
|
|
183
184
|
}
|
|
184
185
|
|
|
185
186
|
/**
|
|
186
|
-
* Emit any insertions at boundary `b`.
|
|
187
|
-
* the same boundary AND the inserted token sequences are textually
|
|
188
|
-
* identical, the insertion is treated as agreed and emitted unmarked.
|
|
189
|
-
* Otherwise each side's insertion is emitted with author attribution.
|
|
187
|
+
* Emit any insertions at boundary `b`.
|
|
190
188
|
*
|
|
191
|
-
*
|
|
192
|
-
*
|
|
189
|
+
* Reading model: a legal reviewer wants to see CP's INTENT relative
|
|
190
|
+
* to Me's current content. Me's content is the base; CP's deltas are
|
|
191
|
+
* what they need to act on. Under that framing:
|
|
192
|
+
* - tokens both authors inserted at the same boundary → settled
|
|
193
|
+
* - tokens CP inserted that Me doesn't have → ins-cp (CP wants
|
|
194
|
+
* this added)
|
|
195
|
+
* - tokens Me inserted that CP doesn't have → del-cp (CP wants
|
|
196
|
+
* this removed from Me's content)
|
|
197
|
+
*
|
|
198
|
+
* The third case is the load-bearing attribution flip. The
|
|
199
|
+
* genesis-spine view technically labels me-only-at-boundary tokens
|
|
200
|
+
* as "ins-me" (Me added them; CP didn't), but that's confusing to
|
|
201
|
+
* a reviewer: they see "Me added X" alongside "CP added Y" and have
|
|
202
|
+
* to mentally derive "CP wants X gone, replaced with Y". Surfacing
|
|
203
|
+
* me-only tokens as `del-cp` shows CP's intent directly:
|
|
204
|
+
* - "CP accepted Me's text minus `things`": settled bulk + del-cp
|
|
205
|
+
* `things` (no parallel redundant insertions)
|
|
206
|
+
* - "CP wants `cruel` where Me wrote `brave`": ins-cp `cruel` +
|
|
207
|
+
* del-cp `brave` (the substitution intent reads directly)
|
|
208
|
+
* - "CP added extra words": cp-extras stay as ins-cp (same as
|
|
209
|
+
* before; the cp-only direction was always intent-correct)
|
|
210
|
+
*
|
|
211
|
+
* Pure single-side insertions (Me added text CP doesn't engage
|
|
212
|
+
* with at all, or vice versa) keep their genesis-spine attribution
|
|
213
|
+
* — these aren't refinement cases, just Me's own content additions.
|
|
193
214
|
*/
|
|
194
215
|
function emitBoundary(
|
|
195
216
|
b: number,
|
|
@@ -205,14 +226,40 @@ function emitBoundary(
|
|
|
205
226
|
const hasMe = !!meIns && meIns.length > 0
|
|
206
227
|
if (!hasCp && !hasMe) return
|
|
207
228
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
229
|
+
// Only-one-side: emit verbatim with that side's attribution.
|
|
230
|
+
// Genuine single-author additions stay author-attributed.
|
|
231
|
+
if (!hasCp) {
|
|
232
|
+
appendSegment(segments, { kind: 'ins', author: 'me' }, meIns!)
|
|
233
|
+
return
|
|
234
|
+
}
|
|
235
|
+
if (!hasMe) {
|
|
236
|
+
appendSegment(segments, { kind: 'ins', author: 'cp' }, cpIns!)
|
|
211
237
|
return
|
|
212
238
|
}
|
|
213
239
|
|
|
214
|
-
|
|
215
|
-
|
|
240
|
+
// Both sides inserted. Identical → settled. Otherwise LCS-align
|
|
241
|
+
// and apply the asymmetric intent reading.
|
|
242
|
+
if (tokenArraysEqual(cpIns!, meIns!)) {
|
|
243
|
+
appendSegment(segments, { kind: 'equal' }, cpIns!)
|
|
244
|
+
return
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
const alignment = lcsAlign(cpIns! as string[], meIns! as string[])
|
|
248
|
+
for (const a of alignment) {
|
|
249
|
+
if (a.oldIdx !== null && a.newIdx !== null) {
|
|
250
|
+
// Token appears in both insertions → settled.
|
|
251
|
+
appendSegment(segments, { kind: 'equal' }, [cpIns![a.oldIdx]])
|
|
252
|
+
} else if (a.oldIdx !== null) {
|
|
253
|
+
// Token in cp's insertion only → CP wants this added.
|
|
254
|
+
appendSegment(segments, { kind: 'ins', author: 'cp' }, [cpIns![a.oldIdx]])
|
|
255
|
+
} else if (a.newIdx !== null) {
|
|
256
|
+
// Token in me's insertion only → CP wants this removed from
|
|
257
|
+
// Me's content. (Genesis-spine would label this ins-me, but
|
|
258
|
+
// that reading is misleading for a reviewer at this kind of
|
|
259
|
+
// shared boundary — see the function-level comment.)
|
|
260
|
+
appendSegment(segments, { kind: 'del', author: 'cp' }, [meIns![a.newIdx]])
|
|
261
|
+
}
|
|
262
|
+
}
|
|
216
263
|
}
|
|
217
264
|
|
|
218
265
|
function tokenArraysEqual(a: readonly string[], b: readonly string[]): boolean {
|
package/src/ThreeWayTable.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { lcsAlign, textSimilarity } from './Alignment'
|
|
1
|
+
import { type Alignment, lcsAlign, pairSimilarUnmatched, textSimilarity } from './Alignment'
|
|
2
2
|
import { injectClass, parseOpeningTagAt } from './HtmlScanner'
|
|
3
3
|
import {
|
|
4
4
|
type CellRange,
|
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
PLACEHOLDER_SUFFIX,
|
|
9
9
|
type RowRange,
|
|
10
10
|
rowKey,
|
|
11
|
+
rowText,
|
|
11
12
|
sameDimensions,
|
|
12
13
|
spliceString,
|
|
13
14
|
type TableRange,
|
|
@@ -143,8 +144,14 @@ function preprocessByContent(
|
|
|
143
144
|
const cKeys = cTables.map(t => tableKey(cpLatest, t))
|
|
144
145
|
const mKeys = mTables.map(t => tableKey(meCurrent, t))
|
|
145
146
|
|
|
146
|
-
|
|
147
|
-
|
|
147
|
+
// Exact tableKey LCS, then fuzzy-pair unmatched runs by content
|
|
148
|
+
// similarity. Without this, a table whose cells were edited (but
|
|
149
|
+
// not its overall shape) fails the exact tableKey match and the
|
|
150
|
+
// table-level aligner pulls it apart into a whole-table del + a
|
|
151
|
+
// whole-table ins. Same fuzzy pass `TableDiff` uses for the 2-way
|
|
152
|
+
// path — `pairSimilarTablesThreeWay` is defined below.
|
|
153
|
+
const alignCp = pairSimilarTablesThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, gTables, cTables)
|
|
154
|
+
const alignMe = pairSimilarTablesThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, gTables, mTables)
|
|
148
155
|
|
|
149
156
|
// Maps: genesisIdx → matching cpIdx (-1 if none); cpIdx → matching genesisIdx; etc.
|
|
150
157
|
const gToCp = new Array<number>(gTables.length).fill(-1)
|
|
@@ -303,7 +310,21 @@ function preprocessByContent(
|
|
|
303
310
|
return { modifiedGenesis, modifiedCp, modifiedMe, placeholderToDiff }
|
|
304
311
|
}
|
|
305
312
|
|
|
306
|
-
|
|
313
|
+
// Positional pairing is the strict-default for three-way table merge:
|
|
314
|
+
// when all three inputs have the same number of tables in the same
|
|
315
|
+
// order, we pair them by index and let `diffTableThreeWay` handle
|
|
316
|
+
// per-table cell/row level differences. The similarity guard below
|
|
317
|
+
// only kicks in to *reject* positional alignment when a pair is
|
|
318
|
+
// SO dissimilar that it's near-certainly a table reorder/rename
|
|
319
|
+
// where content-LCS pairing would be materially better. The
|
|
320
|
+
// threshold is intentionally low — the 2-way path has no such guard
|
|
321
|
+
// and pairs purely by index (its `diffTable` falls back through
|
|
322
|
+
// same-dimension → equal-row-count → row-LCS → whole-table on its
|
|
323
|
+
// own), so the three-way path was stricter than its sibling and
|
|
324
|
+
// silently dropped to whole-table del+ins for legitimate edits
|
|
325
|
+
// like "rename one column and tweak its values". Aligning the
|
|
326
|
+
// threshold here keeps the two-way and three-way paths in step.
|
|
327
|
+
const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.15
|
|
307
328
|
|
|
308
329
|
function positionallyAligned(
|
|
309
330
|
genesis: string,
|
|
@@ -328,6 +349,79 @@ function tableKey(html: string, table: TableRange): string {
|
|
|
328
349
|
return html.slice(table.tableStart, table.tableEnd).replace(/\s+/g, ' ').trim()
|
|
329
350
|
}
|
|
330
351
|
|
|
352
|
+
/**
|
|
353
|
+
* Character-level similarity above which the three-way aligner treats
|
|
354
|
+
* two rows / tables as "the same logical entry, edited" rather than
|
|
355
|
+
* an unrelated delete + insert. Matched to TableDiff's
|
|
356
|
+
* `ROW_FUZZY_THRESHOLD` / `CELL_FUZZY_THRESHOLD` so 2-way and 3-way
|
|
357
|
+
* agree on which pairings are reachable; if a row's content overlap
|
|
358
|
+
* is enough to fool the 2-way diff into pairing, it should also be
|
|
359
|
+
* enough for 3-way.
|
|
360
|
+
*/
|
|
361
|
+
const THREE_WAY_FUZZY_THRESHOLD = 0.5
|
|
362
|
+
|
|
363
|
+
/**
|
|
364
|
+
* Run the same fuzzy-pairing pass `TableDiff.pairSimilarUnmatchedRows`
|
|
365
|
+
* applies after its exact-LCS, but against one side of the genesis
|
|
366
|
+
* spine (either cp or me). The genesis tables/rows are always the
|
|
367
|
+
* "old" side; `newTable` is the cp or me table being aligned. Returns
|
|
368
|
+
* the enriched alignment with additional paired entries.
|
|
369
|
+
*
|
|
370
|
+
* Cell-count guard: only fuzzy-pair when both rows have the same cell
|
|
371
|
+
* count. Without this guard an asymmetric restructure — e.g. CP and
|
|
372
|
+
* Me both added a different column — leads to ONE side fuzzy-pairing
|
|
373
|
+
* its row with genesis (content overlap above threshold) while the
|
|
374
|
+
* other side falls below threshold. That mismatch routes through
|
|
375
|
+
* `diffTableStructural`'s "Me dropped, CP kept" (or the mirror)
|
|
376
|
+
* branch, which emits CP's row as a Me-attributed deletion. In
|
|
377
|
+
* cp-only mode `stripMeAttributedMarkers` then removes the row
|
|
378
|
+
* entirely and CP's edit vanishes from the view — exactly the
|
|
379
|
+
* content-loss case we're meant to prevent. Restricting fuzzy
|
|
380
|
+
* pairing to same-shape rows preserves the common case (single cell
|
|
381
|
+
* edit, identical row shape) while pushing structural mismatches
|
|
382
|
+
* back to the boundary-insertion path that emits both sides
|
|
383
|
+
* explicitly.
|
|
384
|
+
*/
|
|
385
|
+
function pairSimilarRowsThreeWay(
|
|
386
|
+
alignment: Alignment[],
|
|
387
|
+
genesis: string,
|
|
388
|
+
newHtml: string,
|
|
389
|
+
oldTable: TableRange,
|
|
390
|
+
newTable: TableRange
|
|
391
|
+
): Alignment[] {
|
|
392
|
+
const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
|
|
393
|
+
const newTexts = newTable.rows.map(r => rowText(newHtml, r))
|
|
394
|
+
return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
|
|
395
|
+
// Returning 0 sits below any positive threshold so
|
|
396
|
+
// `pairSimilarUnmatched` won't pair these rows; the guard remains
|
|
397
|
+
// defensive should the threshold ever be lowered to 0.
|
|
398
|
+
if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
|
|
399
|
+
return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
400
|
+
})
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
/**
|
|
404
|
+
* Table-level counterpart: after `lcsAlign(gKeys, otherKeys)` over
|
|
405
|
+
* full table HTML keys, fuzzy-pair unmatched table runs by their
|
|
406
|
+
* row-text-concatenated content. Without this, a table whose body
|
|
407
|
+
* was edited (but not its outer shape) fails the exact-key match
|
|
408
|
+
* and the preprocessing emits whole-table del + whole-table ins
|
|
409
|
+
* instead of recursing into per-cell three-way diffs.
|
|
410
|
+
*/
|
|
411
|
+
function pairSimilarTablesThreeWay(
|
|
412
|
+
alignment: Alignment[],
|
|
413
|
+
oldHtml: string,
|
|
414
|
+
newHtml: string,
|
|
415
|
+
oldTables: TableRange[],
|
|
416
|
+
newTables: TableRange[]
|
|
417
|
+
): Alignment[] {
|
|
418
|
+
const oldTexts = oldTables.map(t => t.rows.map(r => rowText(oldHtml, r)).join(' '))
|
|
419
|
+
const newTexts = newTables.map(t => t.rows.map(r => rowText(newHtml, r)).join(' '))
|
|
420
|
+
return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
421
|
+
textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
422
|
+
)
|
|
423
|
+
}
|
|
424
|
+
|
|
331
425
|
// ────────────────────────────────────────────────────────────────────────────
|
|
332
426
|
// Per-table diff: positional cells or row-level structural change.
|
|
333
427
|
|
|
@@ -412,8 +506,17 @@ function diffTableStructural(
|
|
|
412
506
|
const cKeys = tC.rows.map(r => rowKey(cpLatest, r))
|
|
413
507
|
const mKeys = tM.rows.map(r => rowKey(meCurrent, r))
|
|
414
508
|
|
|
415
|
-
|
|
416
|
-
|
|
509
|
+
// Exact LCS first, then fuzzy-pair remaining unmatched runs. Without
|
|
510
|
+
// the fuzzy pass, a row where CP edited just a single cell's text
|
|
511
|
+
// produces no key match — the row aligner emits the genesis row as
|
|
512
|
+
// CP-deleted AND CP's reshaped row as inserted, when a cell-level
|
|
513
|
+
// diff against the paired row would render the edit far more
|
|
514
|
+
// legibly. The 2-way path (`TableDiff.pairSimilarUnmatchedRows`)
|
|
515
|
+
// has done this since inception; bringing the three-way path in
|
|
516
|
+
// step removes the asymmetry where the cp-only / all-changes view
|
|
517
|
+
// looks markedly worse than plain 2-way for ordinary cell edits.
|
|
518
|
+
const alignCp = pairSimilarRowsThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, tG, tC)
|
|
519
|
+
const alignMe = pairSimilarRowsThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, tG, tM)
|
|
417
520
|
|
|
418
521
|
// genesisIdx → matching cpIdx (-1 if cp deleted this row)
|
|
419
522
|
const gToCp = new Array<number>(tG.rows.length).fill(-1)
|
|
@@ -521,9 +624,40 @@ function emitPreservedRow(
|
|
|
521
624
|
return out.join('')
|
|
522
625
|
}
|
|
523
626
|
// Cell-count mismatch within a preserved row — cell-level structural
|
|
524
|
-
//
|
|
525
|
-
//
|
|
526
|
-
|
|
627
|
+
// alignment is non-trivial (which Me cell maps to which CP cell when
|
|
628
|
+
// the counts diverge?). The previous fallback emitted only
|
|
629
|
+
// genesis-as-del + me-as-ins, which silently destroyed CP's row
|
|
630
|
+
// content whenever CP changed the cell count — a content-loss bug
|
|
631
|
+
// (a row where CP added a column would disappear from the rendered
|
|
632
|
+
// diff entirely). Emit each side's row as a distinct attributed
|
|
633
|
+
// block so neither party's restructure can vanish:
|
|
634
|
+
// - if both restructured (different shapes on both sides) the
|
|
635
|
+
// genesis row is settled-deleted (silent) and we emit cp + me
|
|
636
|
+
// rows side by side, each attributed to its author;
|
|
637
|
+
// - if only one restructured, the genesis row is del-attributed to
|
|
638
|
+
// the restructuring author so the reader sees what was there
|
|
639
|
+
// before, then the new shape ins-attributed to the same author.
|
|
640
|
+
//
|
|
641
|
+
// Content edits inside a side that DID keep the genesis cell count
|
|
642
|
+
// are not surfaced here (no positional path is available across
|
|
643
|
+
// mismatched shapes); the underlying data is still present in the
|
|
644
|
+
// source document but the visual diff doesn't decompose it. That is
|
|
645
|
+
// a degradation of detail, not content loss — symmetric for cp/me.
|
|
646
|
+
const cpRestructured = rC.cells.length !== rG.cells.length
|
|
647
|
+
const meRestructured = rM.cells.length !== rG.cells.length
|
|
648
|
+
const blocks: string[] = []
|
|
649
|
+
if (cpRestructured && meRestructured) {
|
|
650
|
+
// Both sides restructured; genesis shape retained by neither.
|
|
651
|
+
blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
|
|
652
|
+
blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
|
|
653
|
+
} else if (cpRestructured) {
|
|
654
|
+
blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'cp'))
|
|
655
|
+
blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
|
|
656
|
+
} else {
|
|
657
|
+
blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'me'))
|
|
658
|
+
blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
|
|
659
|
+
}
|
|
660
|
+
return blocks.join('')
|
|
527
661
|
}
|
|
528
662
|
|
|
529
663
|
/**
|
package/test/HtmlDiff.spec.ts
CHANGED
|
@@ -48,6 +48,21 @@ describe('HtmlDiff', () => {
|
|
|
48
48
|
'Some formatted text',
|
|
49
49
|
"Some <ins class='mod strong i'>formatted</ins> text",
|
|
50
50
|
],
|
|
51
|
+
// Overlapping formatting wraps — old wraps a word in <strong>, new wraps the same
|
|
52
|
+
// word in <u>. The wraps cross (mod-strong opens before mod-u, but the </strong>
|
|
53
|
+
// closing arrives before </u>), so emission must split the inner wrap to keep
|
|
54
|
+
// HTML well-formed. Regression: previously left mod-strong unclosed and the
|
|
55
|
+
// 3-way path threw on the unbalanced stack.
|
|
56
|
+
[
|
|
57
|
+
'<strong>three</strong>',
|
|
58
|
+
'<u>three</u>',
|
|
59
|
+
"<ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u>",
|
|
60
|
+
],
|
|
61
|
+
[
|
|
62
|
+
'a <strong>three</strong> b',
|
|
63
|
+
'a <u>three</u> b',
|
|
64
|
+
"a <ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u> b",
|
|
65
|
+
],
|
|
51
66
|
[
|
|
52
67
|
'<table><tr><td>col1</td><td>col2</td></tr><tr><td>Data 1</td><td>Data 2</td></tr></table>',
|
|
53
68
|
'<table><tr><td>col1</td><td>col2</td></tr></table>',
|