@createiq/htmldiff 1.2.0-beta.1 → 1.2.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.1",
3
+ "version": "1.2.0-beta.3",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -162,7 +162,22 @@ export default class HtmlDiff {
162
162
  // constructor overload that would re-leak the parameter we just hid.
163
163
  private tablePreprocessDepth = 0
164
164
 
165
- private specialTagDiffStack: string[] = []
165
+ /**
166
+ * Tracks currently-open formatting-tag wraps. Each entry pairs the
167
+ * opening tag (so a later closing tag can find its match) with the
168
+ * styling info needed to RE-OPEN the wrap if an overlapping
169
+ * formatting-tag close forces it to split. Without the styling info,
170
+ * an overlap like `<strong>X</strong>` ↔ `<u>X</u>` produces an
171
+ * unclosable wrap (the closing tag for the outer wrap arrives while
172
+ * an inner wrap is still on the stack); see `insertTag`'s closing
173
+ * handler for the split logic.
174
+ */
175
+ private specialTagDiffStack: Array<{
176
+ tag: string
177
+ styledTagNames: string
178
+ cssClass: string
179
+ metadata: WrapMetadata | undefined
180
+ }> = []
166
181
  private newWords: string[] = []
167
182
  private oldWords: string[] = []
168
183
  /**
@@ -827,8 +842,13 @@ export default class HtmlDiff {
827
842
  // if there are nonTags, the index of the last tag is the index before the first nonTag.
828
843
  const indexLastTagInFirstTagBlock = indexOfFirstNonTag === -1 ? words.length - 1 : indexOfFirstNonTag - 1
829
844
 
830
- let specialCaseTagInjection = ''
831
- let specialCaseTagInjectionIsBefore = false
845
+ // Pre-injection sits BEFORE the extracted tag-block content (used
846
+ // by closing tags so `</ins></strong>` reads left-to-right).
847
+ // Post-injection sits AFTER (used by opening tags so the rendered
848
+ // order is `<strong><ins ...>` and by the overlap-split case so
849
+ // the re-opened `<ins>`s sit AFTER the actual closing tag).
850
+ let preInject = ''
851
+ let postInject = ''
832
852
 
833
853
  // handle opening tag
834
854
  if (HtmlDiff.SpecialCaseOpeningTagRegex.test(words[0])) {
@@ -840,10 +860,11 @@ export default class HtmlDiff {
840
860
  }
841
861
  const styledTagNames = Array.from(tagNames).join(' ')
842
862
 
843
- this.specialTagDiffStack.push(words[0])
844
863
  // Carry the caller's metadata into the formatting-tag wrapper so
845
864
  // a 3-way author tag survives a `<strong>`/`<em>` content edit.
846
- specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
865
+ const styledCssClass = `mod ${styledTagNames}`
866
+ this.specialTagDiffStack.push({ tag: words[0], styledTagNames, cssClass: styledCssClass, metadata })
867
+ postInject = `<ins${Utils.composeTagAttributes(styledCssClass, metadata ?? {})}>`
847
868
  if (tag === HtmlDiff.DelTag) {
848
869
  words.shift()
849
870
 
@@ -855,7 +876,6 @@ export default class HtmlDiff {
855
876
  }
856
877
  // handle closing tag
857
878
  else if (HtmlDiff.SpecialCaseClosingTagsSet.has(words[0].toLowerCase())) {
858
- const openingTag = this.specialTagDiffStack.length === 0 ? null : this.specialTagDiffStack.pop()
859
879
  // For delete operations: when the tag block contains a mix of formatting and
860
880
  // non-formatting closing tags (e.g. </strong></div>), compare against the first
861
881
  // closing tag (the formatting one) rather than the last tag in the block.
@@ -870,19 +890,39 @@ export default class HtmlDiff {
870
890
  tagIndexToCompare = 0
871
891
  }
872
892
  }
873
- const openingAndClosingTagsMatch =
874
- !!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
875
893
 
876
- if (openingTag && openingAndClosingTagsMatch) {
877
- specialCaseTagInjection = '</ins>'
878
- specialCaseTagInjectionIsBefore = true
894
+ // Search the stack for a matching opener (LIFO). When the match
895
+ // is the top entry, this is the normal balanced case and we
896
+ // emit a single `</ins>` before the closing tag. When the match
897
+ // is below an unmatched opener — i.e. another formatting wrap
898
+ // opened after it but hasn't been closed yet — the wraps
899
+ // overlap in source order, which has no valid LIFO HTML
900
+ // expression. Resolve by SPLITTING the wraps: close everything
901
+ // above the match, plus the match itself (one `</ins>` each), then
902
+ // re-open the above wraps with fresh `<ins>` tags AFTER the
903
+ // closing tag emits. The above wraps continue to apply until
904
+ // their own closing tag arrives.
905
+ const closingTagName = Utils.getTagName(words[tagIndexToCompare])
906
+ let matchIdx = -1
907
+ for (let i = this.specialTagDiffStack.length - 1; i >= 0; i--) {
908
+ if (Utils.getTagName(this.specialTagDiffStack[i].tag) === closingTagName) {
909
+ matchIdx = i
910
+ break
911
+ }
879
912
  }
880
913
 
881
- // if the tag has a corresponding opening tag, but they don't match,
882
- // we need to push the opening tag back onto the stack
883
- else if (openingTag) {
884
- this.specialTagDiffStack.push(openingTag)
914
+ if (matchIdx >= 0) {
915
+ const aboveEntries = this.specialTagDiffStack.splice(matchIdx + 1)
916
+ this.specialTagDiffStack.pop() // pop the matched entry
917
+ // One `</ins>` per above entry, then one for the match itself.
918
+ preInject = '</ins>'.repeat(aboveEntries.length + 1)
919
+ for (const entry of aboveEntries) {
920
+ postInject += `<ins${Utils.composeTagAttributes(entry.cssClass, entry.metadata ?? {})}>`
921
+ this.specialTagDiffStack.push(entry) // their wrap continues via the new <ins>
922
+ }
885
923
  }
924
+ // No match in stack — orphan closing tag, drop the `<ins>` work
925
+ // and just let the tag itself flow through extractConsecutiveWords.
886
926
 
887
927
  if (tag === HtmlDiff.DelTag) {
888
928
  words.shift()
@@ -893,7 +933,7 @@ export default class HtmlDiff {
893
933
  }
894
934
  }
895
935
 
896
- if (words.length === 0 && specialCaseTagInjection.length === 0) {
936
+ if (words.length === 0 && preInject.length === 0 && postInject.length === 0) {
897
937
  break
898
938
  }
899
939
 
@@ -909,11 +949,7 @@ export default class HtmlDiff {
909
949
  !HtmlDiff.SpecialCaseClosingTagsSet.has(x.toLowerCase())
910
950
  : Utils.isTag
911
951
 
912
- if (specialCaseTagInjectionIsBefore) {
913
- this.content.push(specialCaseTagInjection + this.extractConsecutiveWords(words, isTagForExtraction).join(''))
914
- } else {
915
- this.content.push(this.extractConsecutiveWords(words, isTagForExtraction).join('') + specialCaseTagInjection)
916
- }
952
+ this.content.push(preInject + this.extractConsecutiveWords(words, isTagForExtraction).join('') + postInject)
917
953
 
918
954
  if (words.length === 0) continue
919
955
 
@@ -303,7 +303,21 @@ function preprocessByContent(
303
303
  return { modifiedGenesis, modifiedCp, modifiedMe, placeholderToDiff }
304
304
  }
305
305
 
306
- const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.5
306
+ // Positional pairing is the strict-default for three-way table merge:
307
+ // when all three inputs have the same number of tables in the same
308
+ // order, we pair them by index and let `diffTableThreeWay` handle
309
+ // per-table cell/row level differences. The similarity guard below
310
+ // only kicks in to *reject* positional alignment when a pair is
311
+ // SO dissimilar that it's near-certainly a table reorder/rename
312
+ // where content-LCS pairing would be materially better. The
313
+ // threshold is intentionally low — the 2-way path has no such guard
314
+ // and pairs purely by index (its `diffTable` falls back through
315
+ // same-dimension → equal-row-count → row-LCS → whole-table on its
316
+ // own), so the three-way path was stricter than its sibling and
317
+ // silently dropped to whole-table del+ins for legitimate edits
318
+ // like "rename one column and tweak its values". Lowering the
319
+ // threshold here keeps the two-way and three-way paths in step.
320
+ const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.15
307
321
 
308
322
  function positionallyAligned(
309
323
  genesis: string,
@@ -48,6 +48,21 @@ describe('HtmlDiff', () => {
48
48
  'Some formatted text',
49
49
  "Some <ins class='mod strong i'>formatted</ins> text",
50
50
  ],
51
+ // Overlapping formatting wraps — old wraps a word in <strong>, new wraps the same
52
+ // word in <u>. The wraps cross (mod-strong opens before mod-u, but the </strong>
53
+ // closing arrives before </u>), so emission must split the inner wrap to keep
54
+ // HTML well-formed. Regression: previously left mod-strong unclosed and the
55
+ // 3-way path threw on the unbalanced stack.
56
+ [
57
+ '<strong>three</strong>',
58
+ '<u>three</u>',
59
+ "<ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u>",
60
+ ],
61
+ [
62
+ 'a <strong>three</strong> b',
63
+ 'a <u>three</u> b',
64
+ "a <ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u> b",
65
+ ],
51
66
  [
52
67
  '<table><tr><td>col1</td><td>col2</td></tr><tr><td>Data 1</td><td>Data 2</td></tr></table>',
53
68
  '<table><tr><td>col1</td><td>col2</td></tr></table>',
@@ -144,6 +144,17 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
144
144
  expect(out).toMatch(/<p>First paragraph.*data-author='cp'.*<\/p>/)
145
145
  expect(out).toMatch(/<p>Second paragraph.*data-author='me'.*<\/p>/)
146
146
  })
147
+
148
+ it('overlapping formatting wraps from each author do not unbalance the emission stack', () => {
149
+ // Genesis: plain "three". CP wrapped it in <strong>, Me in <u>. The
150
+ // mod-strong (cp) and mod-u (me) wraps cross: <strong> opens before
151
+ // <u>, but </strong> arrives before </u>. The emitter must split
152
+ // the inner wrap so the output stays well-formed instead of
153
+ // throwing an unbalanced-stack error.
154
+ expect(HtmlDiff.executeThreeWay('three', '<strong>three</strong>', '<u>three</u>')).toBe(
155
+ "<strong><ins class='mod strong cp' data-author='cp'><u><ins class='mod u me' data-author='me'>three</ins></ins></strong><ins class='mod u me' data-author='me'></ins></u>"
156
+ )
157
+ })
147
158
  })
148
159
 
149
160
  describe('options pass-through', () => {
@@ -298,4 +298,30 @@ describe('HtmlDiff.executeThreeWay (tables, genesis-spine)', () => {
298
298
  expect(HtmlDiff.executeThreeWay('<p>a</p>', '<p>a</p>', '<p>a</p>')).toBe('<p>a</p>')
299
299
  })
300
300
  })
301
+
302
+ describe('positional pairing under moderate dissimilarity', () => {
303
+ it('column rename + value rewrite still routes through cell-level diff (not whole-table del+ins)', () => {
304
+ // Real-world regression: cp renamed a column ("Form/Document/Certificate"
305
+ // → "Extra column") and replaced the values in that column with short
306
+ // tokens. Word-level Jaccard between the genesis table and cp's edited
307
+ // table drops to ~0.38 — under the 0.5 threshold the three-way path
308
+ // used to take, which kicked the diff into multi-table content-LCS
309
+ // and produced whole-table del+ins (the CP-bubble showed the
310
+ // entire old table struck through and the entire new table inserted).
311
+ // 2-way had no such guard and produced a cell-level diff for the same
312
+ // inputs; lowering the 3-way threshold brings the two paths in step.
313
+ const genesis =
314
+ '<table><tr><td>A</td><td>Form/Document/Certificate</td><td>Date</td></tr><tr><td>Party A</td><td>IRS W-8</td><td>On execution</td></tr></table>'
315
+ const cp =
316
+ '<table><tr><td>A</td><td>Extra column</td><td>Date</td></tr><tr><td>Party A</td><td>Yes</td><td>On execution</td></tr></table>'
317
+ const me = genesis
318
+ const out = HtmlDiff.executeThreeWay(genesis, cp, me)
319
+ // Expect cell-level cp attribution INSIDE the table cells, NOT a
320
+ // whole-table del+ins wrapping the entire <table>.
321
+ expect(out).not.toMatch(/<del[^>]*><table/)
322
+ expect(out).toMatch(/data-author='cp'/)
323
+ expect(out).toContain('Extra column')
324
+ expect(out).toContain('Form/Document/Certificate')
325
+ })
326
+ })
301
327
  })