@createiq/htmldiff 1.1.0-beta.0 → 1.2.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -0
- package/dist/HtmlDiff.cjs +1259 -498
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +141 -7
- package/dist/HtmlDiff.d.mts +140 -7
- package/dist/HtmlDiff.mjs +1259 -498
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +7 -7
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +323 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +99 -550
- package/src/ThreeWayDiff.ts +223 -0
- package/src/ThreeWayTable.ts +701 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.matrix.spec.ts +8 -3
- package/test/HtmlDiff.tables.spec.ts +368 -19
- package/test/HtmlDiff.threeWay.spec.ts +175 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +407 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/src/TableDiff.ts
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
import {
|
|
2
|
+
type Alignment,
|
|
3
|
+
findOptimalAlignmentSkips,
|
|
4
|
+
lcsAlign,
|
|
5
|
+
orderAlignmentForEmission,
|
|
6
|
+
pairSimilarUnmatched,
|
|
7
|
+
textSimilarity,
|
|
8
|
+
} from './Alignment'
|
|
9
|
+
import {
|
|
10
|
+
findMatchingClosingTag,
|
|
11
|
+
injectClass,
|
|
12
|
+
matchesClosingTagAt,
|
|
13
|
+
matchesTagAt,
|
|
14
|
+
parseOpeningTagAt,
|
|
15
|
+
} from './HtmlScanner'
|
|
16
|
+
import { wrapText } from './Utils'
|
|
17
|
+
|
|
1
18
|
/**
|
|
2
19
|
* Table-aware preprocessing for HtmlDiff.
|
|
3
20
|
*
|
|
@@ -22,7 +39,7 @@
|
|
|
22
39
|
* by the normal word-level pipeline.
|
|
23
40
|
*/
|
|
24
41
|
|
|
25
|
-
interface CellRange {
|
|
42
|
+
export interface CellRange {
|
|
26
43
|
/** Start index of the cell's opening tag in the original html. */
|
|
27
44
|
cellStart: number
|
|
28
45
|
/** Index just past the cell's closing tag. */
|
|
@@ -32,13 +49,13 @@ interface CellRange {
|
|
|
32
49
|
contentEnd: number
|
|
33
50
|
}
|
|
34
51
|
|
|
35
|
-
interface RowRange {
|
|
52
|
+
export interface RowRange {
|
|
36
53
|
rowStart: number
|
|
37
54
|
rowEnd: number
|
|
38
55
|
cells: CellRange[]
|
|
39
56
|
}
|
|
40
57
|
|
|
41
|
-
interface TableRange {
|
|
58
|
+
export interface TableRange {
|
|
42
59
|
tableStart: number
|
|
43
60
|
tableEnd: number
|
|
44
61
|
rows: RowRange[]
|
|
@@ -73,9 +90,26 @@ const PLACEHOLDER_SUFFIX = '-->'
|
|
|
73
90
|
const MAX_TABLE_ROWS = 1500
|
|
74
91
|
const MAX_TABLE_CELLS_PER_ROW = 200
|
|
75
92
|
|
|
76
|
-
|
|
93
|
+
// Caps for the per-row column-position DP in
|
|
94
|
+
// findBestColumnInsertPositions / findBestColumnDeletePositions.
|
|
95
|
+
// MAX_COLUMN_DELTA is the *semantic* guard: a row with more than 6
|
|
96
|
+
// columns added or deleted is almost always a row rewrite rather than
|
|
97
|
+
// a structural column change, and is better handled by cell-LCS with
|
|
98
|
+
// fuzzy pairing. MAX_COLUMN_SEARCH_WIDTH bounds the per-row DP at
|
|
99
|
+
// O(MAX_COLUMN_SEARCH_WIDTH²) ≈ 40K ops; aligned with
|
|
100
|
+
// MAX_TABLE_CELLS_PER_ROW so any row that survives the table-size cap
|
|
101
|
+
// can still use the DP path.
|
|
102
|
+
const MAX_COLUMN_DELTA = 6
|
|
103
|
+
const MAX_COLUMN_SEARCH_WIDTH = 200
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Generate a placeholder-prefix nonce that doesn't collide with any
|
|
107
|
+
* existing content in the inputs. Variadic so callers with N inputs
|
|
108
|
+
* (e.g. three-way diff with V1/V2/V3) check across all of them.
|
|
109
|
+
*/
|
|
110
|
+
export function makePlaceholderPrefix(...inputs: string[]): string {
|
|
77
111
|
// 4 random bytes → 8 hex chars → 16^8 ≈ 4.3 billion combinations. We
|
|
78
|
-
// also retry if the generated nonce happens to occur in
|
|
112
|
+
// also retry if the generated nonce happens to occur in any input.
|
|
79
113
|
// Using `Math.random` here is fine: we're not defending against a
|
|
80
114
|
// malicious adversary, just avoiding accidental collisions.
|
|
81
115
|
for (let attempt = 0; attempt < 8; attempt++) {
|
|
@@ -83,7 +117,7 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
|
|
|
83
117
|
.toString(16)
|
|
84
118
|
.padStart(8, '0')
|
|
85
119
|
const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
|
|
86
|
-
if (
|
|
120
|
+
if (inputs.every(input => !input.includes(prefix))) {
|
|
87
121
|
return prefix
|
|
88
122
|
}
|
|
89
123
|
}
|
|
@@ -93,6 +127,8 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
|
|
|
93
127
|
return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`
|
|
94
128
|
}
|
|
95
129
|
|
|
130
|
+
export { PLACEHOLDER_SUFFIX }
|
|
131
|
+
|
|
96
132
|
type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
|
|
97
133
|
|
|
98
134
|
/**
|
|
@@ -145,11 +181,11 @@ export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff:
|
|
|
145
181
|
return result
|
|
146
182
|
}
|
|
147
183
|
|
|
148
|
-
function spliceString(s: string, start: number, end: number, replacement: string): string {
|
|
184
|
+
export function spliceString(s: string, start: number, end: number, replacement: string): string {
|
|
149
185
|
return s.slice(0, start) + replacement + s.slice(end)
|
|
150
186
|
}
|
|
151
187
|
|
|
152
|
-
function exceedsSizeLimit(table: TableRange): boolean {
|
|
188
|
+
export function exceedsSizeLimit(table: TableRange): boolean {
|
|
153
189
|
if (table.rows.length > MAX_TABLE_ROWS) return true
|
|
154
190
|
for (const row of table.rows) {
|
|
155
191
|
if (row.cells.length > MAX_TABLE_CELLS_PER_ROW) return true
|
|
@@ -320,7 +356,7 @@ function emitEmptyRow(html: string, row: RowRange): string {
|
|
|
320
356
|
return html.slice(row.rowStart, row.rowEnd)
|
|
321
357
|
}
|
|
322
358
|
|
|
323
|
-
function sameDimensions(a: TableRange, b: TableRange): boolean {
|
|
359
|
+
export function sameDimensions(a: TableRange, b: TableRange): boolean {
|
|
324
360
|
if (a.rows.length !== b.rows.length) return false
|
|
325
361
|
for (let i = 0; i < a.rows.length; i++) {
|
|
326
362
|
if (a.rows[i].cells.length !== b.rows[i].cells.length) return false
|
|
@@ -395,7 +431,7 @@ function diffStructurallyAlignedTable(
|
|
|
395
431
|
// has no rows at all, fall back to a from-scratch reconstruction so
|
|
396
432
|
// we still emit deleted rows.
|
|
397
433
|
if (newTable.rows.length === 0) {
|
|
398
|
-
return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment
|
|
434
|
+
return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment)
|
|
399
435
|
}
|
|
400
436
|
|
|
401
437
|
const out: string[] = []
|
|
@@ -408,97 +444,23 @@ function diffStructurallyAlignedTable(
|
|
|
408
444
|
if (align.oldIdx !== null) {
|
|
409
445
|
out.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[align.oldIdx], newRow, diffCell))
|
|
410
446
|
} else {
|
|
411
|
-
out.push(emitFullRow(newHtml, newRow, 'ins'
|
|
447
|
+
out.push(emitFullRow(newHtml, newRow, 'ins'))
|
|
412
448
|
}
|
|
413
449
|
cursor = newRow.rowEnd
|
|
414
450
|
} else if (align.oldIdx !== null) {
|
|
415
|
-
out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'
|
|
451
|
+
out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
|
|
416
452
|
}
|
|
417
453
|
}
|
|
418
454
|
out.push(newHtml.slice(cursor, newTable.tableEnd))
|
|
419
455
|
return out.join('')
|
|
420
456
|
}
|
|
421
457
|
|
|
422
|
-
/**
|
|
423
|
-
* Reorders the alignment so emission produces rows in the visually-
|
|
424
|
-
* correct order. Each entry is assigned a fractional "position" in
|
|
425
|
-
* new's flow:
|
|
426
|
-
*
|
|
427
|
-
* • Preserved/paired (oldIdx, newIdx): position = newIdx.
|
|
428
|
-
* • Pure insert (null, newIdx): position = newIdx.
|
|
429
|
-
* • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
|
|
430
|
-
* before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
|
|
431
|
-
* they appear in old's row order. The +0.5 places dels BEFORE any
|
|
432
|
-
* insert at the same gap (insert at newIdx N1+1 has position N1+1
|
|
433
|
-
* which is > N1+0.5), giving the natural "delete first, insert
|
|
434
|
-
* second" reading order at a replaced position.
|
|
435
|
-
*
|
|
436
|
-
* This handles the full range:
|
|
437
|
-
* • Run of unpaired dels at the start (no preserved predecessor):
|
|
438
|
-
* position -0.5, sorted by oldIdx.
|
|
439
|
-
* • Dels in the middle: positioned right after their preceding
|
|
440
|
-
* preserved row.
|
|
441
|
-
* • Dels at the end (no preserved successor): positioned after the
|
|
442
|
-
* last preserved row.
|
|
443
|
-
*
|
|
444
|
-
* Without this reordering, a run of unpaired deletes at low alignment
|
|
445
|
-
* indices got emitted at cursor = first-new-row position — putting
|
|
446
|
-
* all deletes before any preserved row in the output, regardless of
|
|
447
|
-
* where they came from in old.
|
|
448
|
-
*/
|
|
449
|
-
function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
|
|
450
|
-
const preserved: Array<{ oldIdx: number; newIdx: number }> = []
|
|
451
|
-
for (const a of alignment) {
|
|
452
|
-
if (a.oldIdx !== null && a.newIdx !== null) {
|
|
453
|
-
preserved.push({ oldIdx: a.oldIdx, newIdx: a.newIdx })
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
preserved.sort((a, b) => a.oldIdx - b.oldIdx)
|
|
457
|
-
|
|
458
|
-
// For a deleted row with oldIdx K, return the newIdx of the preserved
|
|
459
|
-
// entry with the largest oldIdx less than K, or -1 if none.
|
|
460
|
-
function newIdxOfPreservedBefore(oldIdx: number): number {
|
|
461
|
-
let result = -1
|
|
462
|
-
for (const p of preserved) {
|
|
463
|
-
if (p.oldIdx >= oldIdx) break
|
|
464
|
-
result = p.newIdx
|
|
465
|
-
}
|
|
466
|
-
return result
|
|
467
|
-
}
|
|
468
|
-
|
|
469
|
-
// Decorate each alignment with a fractional position. We use
|
|
470
|
-
// (primary, secondary) tuples so dels at the same gap sort by oldIdx
|
|
471
|
-
// (in old's row order) and inserts at the same newIdx stay stable.
|
|
472
|
-
const decorated = alignment.map((a, i) => {
|
|
473
|
-
let primary: number
|
|
474
|
-
let secondary: number
|
|
475
|
-
if (a.newIdx !== null) {
|
|
476
|
-
primary = a.newIdx
|
|
477
|
-
secondary = a.oldIdx === null ? 1 : 0 // preserved before pure-insert at same newIdx (rare)
|
|
478
|
-
} else {
|
|
479
|
-
// Pure delete
|
|
480
|
-
primary = newIdxOfPreservedBefore(a.oldIdx as number) + 0.5
|
|
481
|
-
secondary = a.oldIdx as number
|
|
482
|
-
}
|
|
483
|
-
return { entry: a, primary, secondary, originalIdx: i }
|
|
484
|
-
})
|
|
485
|
-
|
|
486
|
-
decorated.sort((a, b) => {
|
|
487
|
-
if (a.primary !== b.primary) return a.primary - b.primary
|
|
488
|
-
if (a.secondary !== b.secondary) return a.secondary - b.secondary
|
|
489
|
-
return a.originalIdx - b.originalIdx // stable
|
|
490
|
-
})
|
|
491
|
-
|
|
492
|
-
return decorated.map(d => d.entry)
|
|
493
|
-
}
|
|
494
|
-
|
|
495
458
|
function rebuildStructurallyAlignedTable(
|
|
496
459
|
oldHtml: string,
|
|
497
460
|
newHtml: string,
|
|
498
461
|
oldTable: TableRange,
|
|
499
462
|
newTable: TableRange,
|
|
500
|
-
alignment: Alignment[]
|
|
501
|
-
diffCell: DiffCellFn
|
|
463
|
+
alignment: Alignment[]
|
|
502
464
|
): string {
|
|
503
465
|
// Used when new has no rows but old does — we lose the per-row
|
|
504
466
|
// wrappers from new (there are none), so reconstruct from old's frame.
|
|
@@ -506,9 +468,9 @@ function rebuildStructurallyAlignedTable(
|
|
|
506
468
|
out.push(headerSlice(newHtml, newTable, oldHtml, oldTable))
|
|
507
469
|
for (const align of alignment) {
|
|
508
470
|
if (align.oldIdx !== null) {
|
|
509
|
-
out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'
|
|
471
|
+
out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
|
|
510
472
|
} else if (align.newIdx !== null) {
|
|
511
|
-
out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins'
|
|
473
|
+
out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins'))
|
|
512
474
|
}
|
|
513
475
|
}
|
|
514
476
|
out.push('</table>')
|
|
@@ -526,7 +488,7 @@ function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, old
|
|
|
526
488
|
return oldHtml.slice(oldTable.tableStart, oldFirstRow)
|
|
527
489
|
}
|
|
528
490
|
|
|
529
|
-
function rowKey(html: string, row: RowRange): string {
|
|
491
|
+
export function rowKey(html: string, row: RowRange): string {
|
|
530
492
|
// Include cell tag text in the key so column-add doesn't accidentally
|
|
531
493
|
// match a row to one with different cell counts. Whitespace-normalize to
|
|
532
494
|
// tolerate formatting differences.
|
|
@@ -548,59 +510,51 @@ function diffPreservedRow(
|
|
|
548
510
|
// on each affected cell.
|
|
549
511
|
const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
550
512
|
if (colspanAligned !== null) return colspanAligned
|
|
551
|
-
// For a single-column add/delete (cell count differs by exactly 1),
|
|
552
|
-
// detect the position via positional similarity scan and align the
|
|
553
|
-
// remaining cells positionally. This handles the case where a column
|
|
554
|
-
// was added AND a different cell got an unrelated content edit — the
|
|
555
|
-
// edited cell still aligns by position rather than getting orphaned by
|
|
556
|
-
// the cell-LCS exact-match.
|
|
557
|
-
const delta = newRow.cells.length - oldRow.cells.length
|
|
558
513
|
// For column add/delete (cell counts differ), find the best insertion
|
|
559
514
|
// or deletion positions via positional similarity scan and align the
|
|
560
515
|
// remaining cells positionally. This handles content-edit alongside
|
|
561
516
|
// column-add by keeping the edited cell in its column position rather
|
|
562
517
|
// than orphaning it via the cell-LCS exact match.
|
|
563
|
-
// Guardrail:
|
|
564
|
-
//
|
|
565
|
-
//
|
|
518
|
+
// Guardrail: O(M × N) DP scales fine within MAX_COLUMN_SEARCH_WIDTH;
|
|
519
|
+
// wider rows fall through to cell-LCS so we don't run the per-row DP
|
|
520
|
+
// on multi-thousand-cell exotica. MAX_COLUMN_DELTA stays as a
|
|
521
|
+
// semantic guard — a delta > 6 usually means "row rewrite", not
|
|
522
|
+
// "column added", and is better handled by cell-LCS.
|
|
523
|
+
const delta = newRow.cells.length - oldRow.cells.length
|
|
566
524
|
const absDelta = Math.abs(delta)
|
|
567
525
|
if (
|
|
568
526
|
absDelta > 0 &&
|
|
569
527
|
absDelta <= MAX_COLUMN_DELTA &&
|
|
570
528
|
Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
|
|
571
529
|
) {
|
|
572
|
-
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow,
|
|
573
|
-
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow,
|
|
530
|
+
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
531
|
+
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
574
532
|
}
|
|
575
533
|
return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
576
534
|
}
|
|
577
535
|
|
|
578
|
-
const MAX_COLUMN_DELTA = 6
|
|
579
|
-
const MAX_COLUMN_SEARCH_WIDTH = 40
|
|
580
|
-
|
|
581
536
|
/**
|
|
582
|
-
* For a row where new has
|
|
583
|
-
*
|
|
584
|
-
*
|
|
585
|
-
*
|
|
586
|
-
*
|
|
587
|
-
* diff for matched pairs.
|
|
537
|
+
* For a row where new has more cells than old, find the column positions
|
|
538
|
+
* in new where cells were inserted by running a monotonic-alignment DP
|
|
539
|
+
* over the cell texts: pick the skip positions that maximise the sum-of-
|
|
540
|
+
* similarities of the unskipped new cells aligned positionally against
|
|
541
|
+
* the old cells. The inserted cells are emitted with diff markers; the
|
|
542
|
+
* rest are aligned positionally with content diff for matched pairs.
|
|
588
543
|
*/
|
|
589
544
|
function diffMultiColumnAddRow(
|
|
590
545
|
oldHtml: string,
|
|
591
546
|
newHtml: string,
|
|
592
547
|
oldRow: RowRange,
|
|
593
548
|
newRow: RowRange,
|
|
594
|
-
k: number,
|
|
595
549
|
diffCell: DiffCellFn
|
|
596
550
|
): string {
|
|
597
|
-
const insertedPositions = findBestColumnInsertPositions(oldRow, newRow,
|
|
551
|
+
const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, oldHtml, newHtml)
|
|
598
552
|
const inserted = new Set(insertedPositions)
|
|
599
553
|
const out: string[] = [rowHeaderSlice(newHtml, newRow)]
|
|
600
554
|
let oldIdx = 0
|
|
601
555
|
for (let c = 0; c < newRow.cells.length; c++) {
|
|
602
556
|
if (inserted.has(c)) {
|
|
603
|
-
out.push(emitFullCell(newHtml, newRow.cells[c], 'ins'
|
|
557
|
+
out.push(emitFullCell(newHtml, newRow.cells[c], 'ins'))
|
|
604
558
|
} else {
|
|
605
559
|
out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell))
|
|
606
560
|
oldIdx++
|
|
@@ -615,16 +569,15 @@ function diffMultiColumnDeleteRow(
|
|
|
615
569
|
newHtml: string,
|
|
616
570
|
oldRow: RowRange,
|
|
617
571
|
newRow: RowRange,
|
|
618
|
-
k: number,
|
|
619
572
|
diffCell: DiffCellFn
|
|
620
573
|
): string {
|
|
621
|
-
const deletedPositions = findBestColumnDeletePositions(oldRow, newRow,
|
|
574
|
+
const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, oldHtml, newHtml)
|
|
622
575
|
const deleted = new Set(deletedPositions)
|
|
623
576
|
const out: string[] = [rowHeaderSlice(newHtml, newRow)]
|
|
624
577
|
let newIdx = 0
|
|
625
578
|
for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
|
|
626
579
|
if (deleted.has(oldIdx)) {
|
|
627
|
-
out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del'
|
|
580
|
+
out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del'))
|
|
628
581
|
continue
|
|
629
582
|
}
|
|
630
583
|
out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
|
|
@@ -634,74 +587,20 @@ function diffMultiColumnDeleteRow(
|
|
|
634
587
|
return out.join('')
|
|
635
588
|
}
|
|
636
589
|
|
|
637
|
-
function findBestColumnInsertPositions(
|
|
638
|
-
oldRow
|
|
639
|
-
newRow
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
): number[] {
|
|
644
|
-
let bestPositions: number[] = []
|
|
645
|
-
let bestScore = -1
|
|
646
|
-
for (const combo of combinationsOfRange(newRow.cells.length, k)) {
|
|
647
|
-
const inserted = new Set(combo)
|
|
648
|
-
let score = 0
|
|
649
|
-
let oldIdx = 0
|
|
650
|
-
for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
|
|
651
|
-
if (inserted.has(newIdx)) continue
|
|
652
|
-
score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
|
|
653
|
-
oldIdx++
|
|
654
|
-
}
|
|
655
|
-
if (score > bestScore) {
|
|
656
|
-
bestScore = score
|
|
657
|
-
bestPositions = combo
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
return bestPositions
|
|
661
|
-
}
|
|
662
|
-
|
|
663
|
-
function findBestColumnDeletePositions(
|
|
664
|
-
oldRow: RowRange,
|
|
665
|
-
newRow: RowRange,
|
|
666
|
-
k: number,
|
|
667
|
-
oldHtml: string,
|
|
668
|
-
newHtml: string
|
|
669
|
-
): number[] {
|
|
670
|
-
let bestPositions: number[] = []
|
|
671
|
-
let bestScore = -1
|
|
672
|
-
for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
|
|
673
|
-
const deleted = new Set(combo)
|
|
674
|
-
let score = 0
|
|
675
|
-
let newIdx = 0
|
|
676
|
-
for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
|
|
677
|
-
if (deleted.has(oldIdx)) continue
|
|
678
|
-
score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
|
|
679
|
-
newIdx++
|
|
680
|
-
}
|
|
681
|
-
if (score > bestScore) {
|
|
682
|
-
bestScore = score
|
|
683
|
-
bestPositions = combo
|
|
684
|
-
}
|
|
685
|
-
}
|
|
686
|
-
return bestPositions
|
|
590
|
+
function findBestColumnInsertPositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
|
|
591
|
+
const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
|
|
592
|
+
const newTexts = newRow.cells.map(c => cellText(newHtml, c))
|
|
593
|
+
return findOptimalAlignmentSkips(oldTexts, newTexts, (oldIdx, newIdx) =>
|
|
594
|
+
textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
595
|
+
)
|
|
687
596
|
}
|
|
688
597
|
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
if (k === 0 || k > n) return
|
|
696
|
-
const indices = Array.from({ length: k }, (_, i) => i)
|
|
697
|
-
while (true) {
|
|
698
|
-
yield indices.slice()
|
|
699
|
-
let i = k - 1
|
|
700
|
-
while (i >= 0 && indices[i] === n - k + i) i--
|
|
701
|
-
if (i < 0) return
|
|
702
|
-
indices[i]++
|
|
703
|
-
for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1
|
|
704
|
-
}
|
|
598
|
+
function findBestColumnDeletePositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
|
|
599
|
+
const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
|
|
600
|
+
const newTexts = newRow.cells.map(c => cellText(newHtml, c))
|
|
601
|
+
return findOptimalAlignmentSkips(newTexts, oldTexts, (newIdx, oldIdx) =>
|
|
602
|
+
textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
603
|
+
)
|
|
705
604
|
}
|
|
706
605
|
|
|
707
606
|
/**
|
|
@@ -864,9 +763,9 @@ function diffStructurallyAlignedRow(
|
|
|
864
763
|
const newCell = newRow.cells[align.newIdx]
|
|
865
764
|
out.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
|
|
866
765
|
} else if (align.newIdx !== null) {
|
|
867
|
-
out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins'
|
|
766
|
+
out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins'))
|
|
868
767
|
} else if (align.oldIdx !== null) {
|
|
869
|
-
out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del'
|
|
768
|
+
out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del'))
|
|
870
769
|
}
|
|
871
770
|
}
|
|
872
771
|
|
|
@@ -888,7 +787,7 @@ function cellKey(html: string, cell: CellRange): string {
|
|
|
888
787
|
* each `<td>`, with an `<ins>`/`<del>` wrapper around any cell content
|
|
889
788
|
* (empty cells get the class but no wrapper).
|
|
890
789
|
*/
|
|
891
|
-
function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del'
|
|
790
|
+
function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del'): string {
|
|
892
791
|
const cls = kind === 'ins' ? 'diffins' : 'diffdel'
|
|
893
792
|
const trOpening = parseOpeningTagAt(html, row.rowStart)
|
|
894
793
|
if (!trOpening) return html.slice(row.rowStart, row.rowEnd)
|
|
@@ -898,7 +797,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
|
|
|
898
797
|
let cursor = trOpening.end
|
|
899
798
|
for (const cell of row.cells) {
|
|
900
799
|
out.push(html.slice(cursor, cell.cellStart))
|
|
901
|
-
out.push(emitFullCell(html, cell, kind
|
|
800
|
+
out.push(emitFullCell(html, cell, kind))
|
|
902
801
|
cursor = cell.cellEnd
|
|
903
802
|
}
|
|
904
803
|
out.push(html.slice(cursor, row.rowEnd))
|
|
@@ -913,7 +812,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
|
|
|
913
812
|
* the full recursive diff would produce for newly-inserted formatting.
|
|
914
813
|
* Empty cells get the class on the `<td>` but no inner wrapping.
|
|
915
814
|
*/
|
|
916
|
-
function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del'
|
|
815
|
+
function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del'): string {
|
|
917
816
|
const cls = kind === 'ins' ? 'diffins' : 'diffdel'
|
|
918
817
|
const tdOpening = parseOpeningTagAt(html, cell.cellStart)
|
|
919
818
|
if (!tdOpening) return html.slice(cell.cellStart, cell.cellEnd)
|
|
@@ -954,7 +853,7 @@ function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
|
|
|
954
853
|
while (j < content.length && content[j] !== '<') j++
|
|
955
854
|
const text = content.slice(i, j)
|
|
956
855
|
if (text.trim().length > 0) {
|
|
957
|
-
out.push(
|
|
856
|
+
out.push(wrapText(text, tag, cls))
|
|
958
857
|
} else {
|
|
959
858
|
out.push(text)
|
|
960
859
|
}
|
|
@@ -993,11 +892,6 @@ function rowHeaderSlice(html: string, row: RowRange): string {
|
|
|
993
892
|
return html.slice(row.rowStart, row.cells[0].cellStart)
|
|
994
893
|
}
|
|
995
894
|
|
|
996
|
-
interface Alignment {
|
|
997
|
-
oldIdx: number | null
|
|
998
|
-
newIdx: number | null
|
|
999
|
-
}
|
|
1000
|
-
|
|
1001
895
|
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
1002
896
|
const ROW_FUZZY_THRESHOLD = 0.5
|
|
1003
897
|
|
|
@@ -1024,8 +918,13 @@ function pairSimilarUnmatchedRows(
|
|
|
1024
918
|
oldHtml: string,
|
|
1025
919
|
newHtml: string
|
|
1026
920
|
): Alignment[] {
|
|
921
|
+
// Pre-compute row texts once; the similarity callback is invoked
|
|
922
|
+
// O(D × I) times per unmatched run (every del × every ins), and
|
|
923
|
+
// rowText walks every cell.
|
|
924
|
+
const oldTexts = oldTable.rows.map(r => rowText(oldHtml, r))
|
|
925
|
+
const newTexts = newTable.rows.map(r => rowText(newHtml, r))
|
|
1027
926
|
return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
1028
|
-
|
|
927
|
+
textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
1029
928
|
)
|
|
1030
929
|
}
|
|
1031
930
|
|
|
@@ -1036,149 +935,14 @@ function pairSimilarUnmatchedCells(
|
|
|
1036
935
|
oldHtml: string,
|
|
1037
936
|
newHtml: string
|
|
1038
937
|
): Alignment[] {
|
|
938
|
+
const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
|
|
939
|
+
const newTexts = newRow.cells.map(c => cellText(newHtml, c))
|
|
1039
940
|
return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
1040
|
-
|
|
941
|
+
textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
1041
942
|
)
|
|
1042
943
|
}
|
|
1043
944
|
|
|
1044
|
-
|
|
1045
|
-
* Identify pairings inside each unmatched-only run, then build the output
|
|
1046
|
-
* alignment by walking the original and substituting paired entries at
|
|
1047
|
-
* the *ins position* (not the del position). This keeps the result
|
|
1048
|
-
* monotonic in newIdx — critical because the cursor-based emission
|
|
1049
|
-
* downstream walks new's html in order. Emitting at the del position
|
|
1050
|
-
* would be fine when del<ins in the alignment array (the typical case),
|
|
1051
|
-
* but can violate monotonicity when there are mixed unpaired entries in
|
|
1052
|
-
* between (column-add + row-add together, content-edit + column-add,
|
|
1053
|
-
* etc.).
|
|
1054
|
-
*
|
|
1055
|
-
* Generic over what's being paired — works for both rows (by full row
|
|
1056
|
-
* content similarity) and cells (by per-cell content similarity).
|
|
1057
|
-
*/
|
|
1058
|
-
function pairSimilarUnmatched(
|
|
1059
|
-
alignment: Alignment[],
|
|
1060
|
-
threshold: number,
|
|
1061
|
-
similarity: (oldIdx: number, newIdx: number) => number
|
|
1062
|
-
): Alignment[] {
|
|
1063
|
-
const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
|
|
1064
|
-
let i = 0
|
|
1065
|
-
while (i < alignment.length) {
|
|
1066
|
-
if (alignment[i].oldIdx !== null && alignment[i].newIdx !== null) {
|
|
1067
|
-
i++
|
|
1068
|
-
continue
|
|
1069
|
-
}
|
|
1070
|
-
const runStart = i
|
|
1071
|
-
while (i < alignment.length && (alignment[i].oldIdx === null) !== (alignment[i].newIdx === null)) i++
|
|
1072
|
-
const runEnd = i
|
|
1073
|
-
|
|
1074
|
-
const delIndices: number[] = []
|
|
1075
|
-
const insIndices: number[] = []
|
|
1076
|
-
for (let k = runStart; k < runEnd; k++) {
|
|
1077
|
-
if (alignment[k].oldIdx !== null) delIndices.push(k)
|
|
1078
|
-
else insIndices.push(k)
|
|
1079
|
-
}
|
|
1080
|
-
|
|
1081
|
-
const usedIns = new Set<number>()
|
|
1082
|
-
for (const di of delIndices) {
|
|
1083
|
-
let bestIi = -1
|
|
1084
|
-
let bestSim = threshold
|
|
1085
|
-
for (const ii of insIndices) {
|
|
1086
|
-
if (usedIns.has(ii)) continue
|
|
1087
|
-
const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
|
|
1088
|
-
if (sim > bestSim) {
|
|
1089
|
-
bestSim = sim
|
|
1090
|
-
bestIi = ii
|
|
1091
|
-
}
|
|
1092
|
-
}
|
|
1093
|
-
if (bestIi >= 0) {
|
|
1094
|
-
pairs.set(di, bestIi)
|
|
1095
|
-
usedIns.add(bestIi)
|
|
1096
|
-
}
|
|
1097
|
-
}
|
|
1098
|
-
}
|
|
1099
|
-
|
|
1100
|
-
const insToDel = new Map<number, number>() // ins-alignment-idx → del-alignment-idx
|
|
1101
|
-
for (const [delAi, insAi] of pairs) insToDel.set(insAi, delAi)
|
|
1102
|
-
const pairedDels = new Set<number>(pairs.keys())
|
|
1103
|
-
|
|
1104
|
-
const result: Alignment[] = []
|
|
1105
|
-
for (let k = 0; k < alignment.length; k++) {
|
|
1106
|
-
if (pairedDels.has(k)) continue // paired del — emitted when we reach its ins
|
|
1107
|
-
if (insToDel.has(k)) {
|
|
1108
|
-
const delAi = insToDel.get(k) as number
|
|
1109
|
-
result.push({ oldIdx: alignment[delAi].oldIdx, newIdx: alignment[k].newIdx })
|
|
1110
|
-
} else {
|
|
1111
|
-
result.push(alignment[k])
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
return result
|
|
1115
|
-
}
|
|
1116
|
-
|
|
1117
|
-
/**
|
|
1118
|
-
* Combined similarity metric used for both row-level and cell-level
|
|
1119
|
-
* fuzzy pairing. Returns the MAX of two complementary metrics:
|
|
1120
|
-
*
|
|
1121
|
-
* 1. **Character prefix+suffix similarity** — fraction of the longer
|
|
1122
|
-
* string covered by shared prefix + shared suffix. Catches small
|
|
1123
|
-
* edits in the middle of a string (one word changed in a row).
|
|
1124
|
-
* Misses cases where the bulk of common content is in the middle
|
|
1125
|
-
* and the ends differ.
|
|
1126
|
-
*
|
|
1127
|
-
* 2. **Token Jaccard similarity** — intersection-over-union of the
|
|
1128
|
-
* whitespace-split tokens. Catches "most of the content is the
|
|
1129
|
-
* same but bookended by different bits" — e.g. a row whose only
|
|
1130
|
-
* edit is a column added at the start and another at the end,
|
|
1131
|
-
* where the ~50 chars in the middle that DO match would be
|
|
1132
|
-
* invisible to prefix+suffix.
|
|
1133
|
-
*
|
|
1134
|
-
* Either metric exceeding the threshold means pair. Neither alone is
|
|
1135
|
-
* sufficient for the full range of legal-doc edits we see in
|
|
1136
|
-
* production tables.
|
|
1137
|
-
*/
|
|
1138
|
-
function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
|
|
1139
|
-
return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow))
|
|
1140
|
-
}
|
|
1141
|
-
|
|
1142
|
-
function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
|
|
1143
|
-
return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell))
|
|
1144
|
-
}
|
|
1145
|
-
|
|
1146
|
-
function textSimilarity(a: string, b: string): number {
|
|
1147
|
-
if (a === b) return 1
|
|
1148
|
-
if (a.length === 0 || b.length === 0) return 0
|
|
1149
|
-
return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b))
|
|
1150
|
-
}
|
|
1151
|
-
|
|
1152
|
-
function charPrefixSuffixSimilarity(a: string, b: string): number {
|
|
1153
|
-
let prefix = 0
|
|
1154
|
-
const minLen = Math.min(a.length, b.length)
|
|
1155
|
-
while (prefix < minLen && a[prefix] === b[prefix]) prefix++
|
|
1156
|
-
|
|
1157
|
-
let suffix = 0
|
|
1158
|
-
while (
|
|
1159
|
-
suffix < a.length - prefix &&
|
|
1160
|
-
suffix < b.length - prefix &&
|
|
1161
|
-
a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
|
|
1162
|
-
) {
|
|
1163
|
-
suffix++
|
|
1164
|
-
}
|
|
1165
|
-
|
|
1166
|
-
return (prefix + suffix) / Math.max(a.length, b.length)
|
|
1167
|
-
}
|
|
1168
|
-
|
|
1169
|
-
/**
 * Jaccard similarity over the sets of whitespace-separated tokens of the
 * two strings: |A ∩ B| / |A ∪ B|.
 *
 * @param a First string.
 * @param b Second string.
 * @returns A ratio in [0, 1]. Two inputs with no tokens at all score 1.
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokensA = new Set(a.split(/\s+/).filter(Boolean))
  const tokensB = new Set(b.split(/\s+/).filter(Boolean))
  if (tokensA.size === 0 && tokensB.size === 0) return 1

  let intersection = 0
  for (const token of tokensA) {
    if (tokensB.has(token)) intersection++
  }

  // At least one set is non-empty past the guard above, so the union size
  // is always positive and the division is safe.
  const union = tokensA.size + tokensB.size - intersection
  return intersection / union
}
|
|
1180
|
-
|
|
1181
|
-
function rowText(html: string, row: RowRange): string {
|
|
945
|
+
export function rowText(html: string, row: RowRange): string {
|
|
1182
946
|
const parts: string[] = []
|
|
1183
947
|
for (const cell of row.cells) {
|
|
1184
948
|
parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' '))
|
|
@@ -1195,140 +959,13 @@ function cellText(html: string, cell: CellRange): string {
|
|
|
1195
959
|
.toLowerCase()
|
|
1196
960
|
}
|
|
1197
961
|
|
|
1198
|
-
/**
 * Standard LCS alignment of two key sequences.
 *
 * Builds the longest-common-subsequence table, then backtracks from the
 * end to emit one entry per step: `{ oldIdx, newIdx }` with both set for a
 * matching position, or with one side `null` for an entry that exists on
 * the other side only. Equality uses strict `===`.
 *
 * @param oldKeys Keys of the old sequence.
 * @param newKeys Keys of the new sequence.
 * @returns The alignment in forward (start-to-end) order.
 */
function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const m = oldKeys.length
  const n = newKeys.length

  // dp[i][j] = LCS length of oldKeys[0..i) and newKeys[0..j).
  const dp: number[][] = Array.from({ length: m + 1 }, () => new Array<number>(n + 1).fill(0))
  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      if (oldKeys[i - 1] === newKeys[j - 1]) {
        dp[i][j] = dp[i - 1][j - 1] + 1
      } else {
        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1])
      }
    }
  }

  // Backtracking visits entries last-to-first. Collect them with push and
  // reverse once at the end; prepending each entry with unshift would make
  // this loop quadratic in the alignment length.
  const reversed: Alignment[] = []
  let i = m
  let j = n
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && oldKeys[i - 1] === newKeys[j - 1]) {
      reversed.push({ oldIdx: i - 1, newIdx: j - 1 })
      i--
      j--
    } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
      reversed.push({ oldIdx: null, newIdx: j - 1 })
      j--
    } else {
      reversed.push({ oldIdx: i - 1, newIdx: null })
      i--
    }
  }
  return reversed.reverse()
}
|
|
1236
|
-
|
|
1237
|
-
/**
 * Returns the opening tag with the given class token(s) injected.
 *
 * The real `class` attribute is located by attribute-aware walking (NOT a
 * flat regex — that would mis-match inside a foreign attribute value like
 * `title="see class='x'"`). An existing `class` attribute is preserved and
 * only the tokens it is missing are appended, so injecting `mod colspan`
 * into `class="mod"` never produces `class="mod mod colspan"`. An unquoted
 * existing value is wrapped in quotes when tokens are appended, because
 * `class=a b` would turn `b` into a separate bare attribute. When the tag
 * has no `class` attribute, a new single-quoted one is inserted before the
 * closing `>` (or `/>`).
 *
 * @param openingTag The full opening tag text, from `<` through `>`.
 * @param cls One or more whitespace-separated class tokens.
 * @returns The tag with the tokens present; `openingTag` itself when `cls`
 *   has no tokens or all of them are already there.
 */
function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag

    const updatedValue = [...existingTokens, ...missing].join(' ')
    let replacement = updatedValue
    if (!classAttr.quoted) {
      // The value range has no surrounding quotes; add them so the
      // space-separated token list stays a single attribute value.
      const q = updatedValue.includes("'") ? '"' : "'"
      replacement = `${q}${updatedValue}${q}`
    }
    return openingTag.slice(0, classAttr.valueStart) + replacement + openingTag.slice(classAttr.valueEnd)
  }

  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  const head = openingTag.slice(0, insertAt).replace(/\s*$/, '')
  // Insert the normalised token list, matching what the existing-class
  // branch writes, rather than the raw `cls` string.
  return `${head} class='${clsTokens.join(' ')}'${openingTag.slice(insertAt)}`
}

/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute.
 *
 * @param openingTag The full opening tag text, starting with `<`.
 * @returns The value range (start/end of the value content, *excluding*
 *   any surrounding quotes), the value itself, and whether the value was
 *   quoted; or null if no `class` attribute is present.
 */
function findClassAttribute(
  openingTag: string
): { valueStart: number; valueEnd: number; value: string; quoted: boolean } | null {
  const len = openingTag.length
  const whitespace = /\s/
  const tagNameChar = /[A-Za-z0-9_:-]/
  const attrNameEnd = /[\s=>/]/

  // Skip past the tag name. Tag starts with `<`; the first run of
  // [A-Za-z0-9_:-] is the tag name. Anything after is attribute territory.
  let i = 1
  while (i < len && tagNameChar.test(openingTag.charAt(i))) i++

  while (i < len) {
    // Skip whitespace
    while (i < len && whitespace.test(openingTag.charAt(i))) i++
    if (i >= len) break
    const ch = openingTag.charAt(i)
    if (ch === '>') break
    if (ch === '/') {
      // A slash ends the walk only as the self-closing terminator (or at
      // the very end of the text). A stray slash elsewhere is skipped so
      // attributes after it are still seen.
      if (i + 1 >= len || openingTag.charAt(i + 1) === '>') break
      i++
      continue
    }

    // Read attribute name
    const nameStart = i
    while (i < len && !attrNameEnd.test(openingTag.charAt(i))) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value
    while (i < len && whitespace.test(openingTag.charAt(i))) i++
    if (openingTag.charAt(i) !== '=') {
      // Bare attribute (no value) — not class
      continue
    }
    i++ // past '='
    while (i < len && whitespace.test(openingTag.charAt(i))) i++

    // Value: quoted or unquoted
    let valueStart: number
    let valueEnd: number
    const opener = openingTag.charAt(i)
    const quoted = opener === '"' || opener === "'"
    if (quoted) {
      i++
      valueStart = i
      while (i < len && openingTag.charAt(i) !== opener) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      valueStart = i
      while (i < len) {
        const c = openingTag.charAt(i)
        if (c === '>' || whitespace.test(c)) break
        // A slash belongs to an unquoted value (e.g. `href=/docs/a`)
        // unless it is the self-closing terminator at the end of the tag.
        if (c === '/' && (i + 1 >= len || openingTag.charAt(i + 1) === '>')) break
        i++
      }
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd), quoted }
    }
  }

  return null
}
|
|
1324
|
-
|
|
1325
962
|
/**
|
|
1326
963
|
* Walks html and returns ranges for every top-level `<table>...</table>`
|
|
1327
964
|
* block. Nested tables aren't extracted as separate top-level entries —
|
|
1328
965
|
* they're captured inside the parent's content range and handled when the
|
|
1329
966
|
* cell-level diff recurses through them.
|
|
1330
967
|
*/
|
|
1331
|
-
function findTopLevelTables(html: string): TableRange[] {
|
|
968
|
+
export function findTopLevelTables(html: string): TableRange[] {
|
|
1332
969
|
const tables: TableRange[] = []
|
|
1333
970
|
let i = 0
|
|
1334
971
|
while (i < html.length) {
|
|
@@ -1414,91 +1051,3 @@ function findTopLevelCells(html: string, start: number, end: number): CellRange[
|
|
|
1414
1051
|
}
|
|
1415
1052
|
return cells
|
|
1416
1053
|
}
|
|
1417
|
-
|
|
1418
|
-
/**
 * Reports whether an opening tag named `tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on both sides, and the character
 * right after the name must end the tag name: `>`, `/`, or ASCII
 * whitespace (space, tab, LF, CR, form feed). That last check is what
 * keeps `<tablet>` from matching `table`.
 *
 * @param html Text being scanned.
 * @param i Index to test.
 * @param tagName Tag name without angle brackets.
 * @returns True when `html` has `<tagName` at `i` followed by a terminator.
 */
function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false
  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  // Undefined at end of input, which correctly fails every comparison.
  const after = html[nameEnd]
  return (
    after === '>' ||
    after === '/' ||
    after === ' ' ||
    after === '\t' ||
    after === '\n' ||
    after === '\r' ||
    after === '\f'
  )
}
|
|
1425
|
-
|
|
1426
|
-
/**
 * Reports whether a closing tag named `tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on both sides, and the character
 * right after the name must be `>` or ASCII whitespace (space, tab, LF,
 * CR, form feed), so `</tdx>` does not match `td`.
 *
 * @param html Text being scanned.
 * @param i Index to test.
 * @param tagName Tag name without angle brackets or slash.
 * @returns True when `html` has `</tagName` at `i` followed by a terminator.
 */
function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<' || html[i + 1] !== '/') return false
  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  // Undefined at end of input, which correctly fails every comparison.
  const after = html[nameEnd]
  return (
    after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r' || after === '\f'
  )
}
|
|
1433
|
-
|
|
1434
|
-
interface OpeningTag {
  /** Index just past the closing `>` of the opening tag. */
  end: number
}

/**
 * Finds where the markup construct starting at index `i` ends.
 *
 * @param html Text being scanned.
 * @param i Index of the `<` that starts the construct.
 * @returns The index just past the construct's terminator, or null when
 *   the text ends before a terminator is found.
 */
function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  // HTML comments, CDATA and processing instructions need their own
  // terminators — a plain `>`-walker would cut a comment like
  // `<!-- a > b -->` at the first inner `>`, treating the rest as text
  // and corrupting downstream offsets. Word-exported HTML routinely
  // emits comments inside tables (conditional comments, OLE markers) so
  // these have to be handled, not just be theoretical.
  if (html.startsWith('<!--', i)) {
    const close = html.indexOf('-->', i + 4)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<![CDATA[', i)) {
    const close = html.indexOf(']]>', i + 9)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<?', i)) {
    const close = html.indexOf('?>', i + 2)
    return close === -1 ? null : { end: close + 2 }
  }

  // Walk to the next '>' that is not inside a quoted attribute value.
  // A quote only opens a quoted value when it is the first character of a
  // value (right after '=' and optional whitespace). Anywhere else — e.g.
  // the apostrophe in the unquoted value `title=don't` — it is an ordinary
  // character; treating it as an opener would swallow the tag's own '>'.
  let quote: string | null = null
  let awaitingValue = false // saw '=', value has not started yet
  let inUnquotedValue = false
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (quote !== null) {
      if (ch === quote) quote = null
      continue
    }
    if (ch === '>') return { end: j + 1 }

    const isSpace = ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
    if (inUnquotedValue) {
      // An unquoted value runs until whitespace (or the '>' handled above).
      if (isSpace) inUnquotedValue = false
    } else if (awaitingValue) {
      if (isSpace) continue
      awaitingValue = false
      if (ch === '"' || ch === "'") {
        quote = ch
      } else {
        inUnquotedValue = true
      }
    } else if (ch === '=') {
      awaitingValue = true
    }
  }
  return null
}
|
|
1475
|
-
|
|
1476
|
-
/**
 * Locates the `</tagName>` that balances an already-open `tagName`
 * element, counting nested same-name elements along the way.
 *
 * Scanning starts at `from` and examines positions below `limit`.
 * Returns the index just past the matching closing tag, or -1 when the
 * scan runs out without finding one.
 */
function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
  // The caller is already inside one open element.
  let openCount = 1
  let pos = from
  while (pos < limit) {
    const opensHere = matchesTagAt(html, pos, tagName)
    if (!opensHere && !matchesClosingTagAt(html, pos, tagName)) {
      pos++
      continue
    }

    const parsed = parseOpeningTagAt(html, pos)
    if (opensHere) {
      if (!parsed) {
        // Unterminated tag text: step over the '<' and keep scanning.
        pos++
        continue
      }
      // A self-closing same-name tag does not add a nesting level.
      const selfClosing = html.slice(pos, parsed.end).endsWith('/>')
      if (!selfClosing) openCount++
      pos = parsed.end
      continue
    }

    openCount--
    // If the closing tag has no terminating '>', assume the minimal
    // `</tagName>` length.
    const afterClose = parsed?.end ?? pos + `</${tagName}>`.length
    if (openCount === 0) return afterClose
    pos = afterClose
  }
  return -1
}
|