@createiq/htmldiff 1.1.0 → 1.2.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/README.md +40 -0
- package/dist/HtmlDiff.cjs +1255 -493
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +141 -7
- package/dist/HtmlDiff.d.mts +140 -7
- package/dist/HtmlDiff.mjs +1255 -493
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +323 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +67 -522
- package/src/ThreeWayDiff.ts +223 -0
- package/src/ThreeWayTable.ts +701 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.spec.ts +43 -19
- package/test/HtmlDiff.threeWay.spec.ts +175 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +407 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/src/HtmlDiff.ts
CHANGED
|
@@ -3,9 +3,74 @@ import Match from './Match'
|
|
|
3
3
|
import MatchFinder from './MatchFinder'
|
|
4
4
|
import Operation from './Operation'
|
|
5
5
|
import { preprocessTables, restoreTablePlaceholders } from './TableDiff'
|
|
6
|
-
import
|
|
6
|
+
import { buildSegments, type Segment, segmentEmissionShape } from './ThreeWayDiff'
|
|
7
|
+
import { preprocessTablesThreeWay } from './ThreeWayTable'
|
|
8
|
+
import Utils, { type WrapMetadata } from './Utils'
|
|
7
9
|
import WordSplitter from './WordSplitter'
|
|
8
10
|
|
|
11
|
+
/**
 * State threaded into the recursive cell-level diff inside
 * `preprocessTables`. Bundles the nesting depth (for the recursion-cap
 * guard) with the caller-configurable settings that must propagate
 * unchanged to inner instances so cell-level output stays consistent
 * with the top-level call. Internal — never crosses the public API.
 */
interface RecursionContext {
  /**
   * Depth assigned to the inner instance's `tablePreprocessDepth`. The
   * instance only runs table preprocessing while this value is below
   * `HtmlDiff.MaxTablePreprocessDepth`.
   */
  depth: number
  /** Each entry is re-registered on the inner instance via `addBlockExpression`. */
  blockExpressions: readonly RegExp[]
  /** Copied onto the inner instance's field of the same name. */
  repeatingWordsAccuracy: number
  /** Copied onto the inner instance's field of the same name. */
  orphanMatchThreshold: number
  /** Copied onto the inner instance's field of the same name. */
  ignoreWhitespaceDifferences: boolean
}
|
|
25
|
+
|
|
26
|
+
/**
 * Options for the `HtmlDiff.analyze` static helper.
 *
 * `useProjections` controls structural-tag normalisation:
 *   - `undefined` → use the same heuristic as `build()` (per-call decision)
 *   - `true`      → force projection on (skipped if either side has no content)
 *   - `false`     → force projection off (diff runs on raw word arrays)
 * Composers of multiple analyses (e.g. three-way diff) MUST pass the
 * same explicit boolean to all calls so shared inputs tokenise
 * identically across analyses.
 *
 * The remaining options mirror the per-instance fields on `HtmlDiff`
 * itself — they exist on the options bag because `analyze` constructs
 * the inner instance internally. Each of them is applied to that inner
 * instance only when it is provided; an omitted option leaves the
 * instance's own default untouched.
 */
export interface AnalyzeOptions {
  /** Tri-state projection switch; see the interface description. */
  useProjections?: boolean
  /** Registered one by one on the inner instance via `addBlockExpression`. */
  blockExpressions?: readonly RegExp[]
  /** Assigned to the inner instance's `repeatingWordsAccuracy` when defined. */
  repeatingWordsAccuracy?: number
  /** Assigned to the inner instance's `orphanMatchThreshold` when defined. */
  orphanMatchThreshold?: number
  /** Assigned to the inner instance's `ignoreWhitespaceDifferences` when defined. */
  ignoreWhitespaceDifferences?: boolean
}
|
|
48
|
+
|
|
49
|
+
/**
 * Raw building blocks returned by `HtmlDiff.analyze`: the word arrays the
 * diff actually ran against, the operations computed over them, the
 * untouched splitter output, and the index maps that translate a
 * diff-space index back to an original-word index.
 */
export interface AnalyzeResult {
  /** Word array the `operations` index into (projected or raw). */
  readonly oldDiffWords: readonly string[]
  /** New-side counterpart of `oldDiffWords`. */
  readonly newDiffWords: readonly string[]
  /** Operations produced by the inner instance's `operations()` call. */
  readonly operations: readonly Operation[]
  /** Original WordSplitter output, before any projection. */
  readonly oldOriginalWords: readonly string[]
  /** New-side counterpart of `oldOriginalWords`. */
  readonly newOriginalWords: readonly string[]
  /** Diff-index → original-word-index map; null when projections inactive. */
  readonly oldContentToOriginal: readonly number[] | null
  /** New-side counterpart of `oldContentToOriginal`. */
  readonly newContentToOriginal: readonly number[] | null
}
|
|
61
|
+
|
|
62
|
+
/**
 * Options for `HtmlDiff.executeThreeWay`. Same shape as `AnalyzeOptions`
 * — the values flow unchanged into both internal `analyze` calls so V2's
 * tokenisation stays symmetric. Declared as an alias so that, should the
 * two option sets ever need to diverge, there is a single place to
 * change.
 *
 * `useProjections`: when undefined, `executeThreeWay` computes the
 * decision as the conjunction of both pair-wise
 * `evaluateProjectionApplicability` results (V1↔V2 and V2↔V3); an
 * explicit boolean is used as given.
 */
export type ThreeWayOptions = AnalyzeOptions
|
|
73
|
+
|
|
9
74
|
export default class HtmlDiff {
|
|
10
75
|
/**
|
|
11
76
|
* This value defines balance between speed and memory utilization. The higher it is the faster it works and more memory consumes.
|
|
@@ -76,10 +141,26 @@ export default class HtmlDiff {
|
|
|
76
141
|
*/
|
|
77
142
|
private static MaxTablePreprocessDepth = 8
|
|
78
143
|
|
|
144
|
+
/**
|
|
145
|
+
* Mirror cap for the three-way path. The 2-way `MaxTablePreprocessDepth`
|
|
146
|
+
* guards the recursion inside `executeWithContext`; the 3-way path has
|
|
147
|
+
* its own recursion (`executeThreeWay` → `preprocessTablesThreeWay` →
|
|
148
|
+
* `cellDiff` → `executeThreeWay`) which needs its own guard. Once the
|
|
149
|
+
* cap is reached, `executeThreeWay` skips table preprocessing and
|
|
150
|
+
* falls back to the word-level merge — same bail-out semantics as the
|
|
151
|
+
* 2-way path.
|
|
152
|
+
*/
|
|
153
|
+
private static MaxThreeWayDepth = 8
|
|
154
|
+
|
|
79
155
|
private content: string[] = []
|
|
80
156
|
private newText: string
|
|
81
157
|
private oldText: string
|
|
82
|
-
|
|
158
|
+
// Written exactly once, by `executeWithContext` on the inner instance
|
|
159
|
+
// for a recursive cell-diff. Top-level instances stay at 0. Treated as
|
|
160
|
+
// effectively-readonly elsewhere — we dropped the modifier only so
|
|
161
|
+
// `executeWithContext` can populate it without needing a private
|
|
162
|
+
// constructor overload that would re-leak the parameter we just hid.
|
|
163
|
+
private tablePreprocessDepth = 0
|
|
83
164
|
|
|
84
165
|
private specialTagDiffStack: string[] = []
|
|
85
166
|
private newWords: string[] = []
|
|
@@ -147,18 +228,225 @@ export default class HtmlDiff {
|
|
|
147
228
|
* Initializes a new instance of the class.
|
|
148
229
|
* @param oldText The old text.
|
|
149
230
|
* @param newText The new text.
|
|
150
|
-
* @param tablePreprocessDepth Internal: nested-call depth for table
|
|
151
|
-
* preprocessing. Callers should leave at default (0); the recursive
|
|
152
|
-
* `diffCell` callback in TableDiff bumps it.
|
|
153
231
|
*/
|
|
154
|
-
constructor(oldText: string, newText: string
|
|
232
|
+
constructor(oldText: string, newText: string) {
|
|
155
233
|
this.oldText = oldText
|
|
156
234
|
this.newText = newText
|
|
157
|
-
this.tablePreprocessDepth = tablePreprocessDepth
|
|
158
235
|
}
|
|
159
236
|
|
|
160
|
-
static execute(oldText: string, newText: string
|
|
161
|
-
return new HtmlDiff(oldText, newText
|
|
237
|
+
/**
 * Convenience entry point for a two-way diff: constructs an instance for
 * the given pair of inputs and returns whatever its `build()` produces.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @returns The diff markup produced by `build()`.
 */
static execute(oldText: string, newText: string): string {
  const differ = new HtmlDiff(oldText, newText)
  return differ.build()
}
|
|
240
|
+
|
|
241
|
+
/**
 * Runs a two-way diff without rendering it and hands back the pieces a
 * composer needs: the word arrays the diff ran over, the resulting
 * operations, the original (pre-projection) word arrays, and — when
 * structural projection is active — the maps from diff index back to
 * original-word index. `executeThreeWay` consumes this to merge two
 * diffs by walking their operation streams.
 *
 * Callers composing several analyses must keep `useProjections`
 * symmetric: if V1↔V2 projects and V2↔V3 does not, V2's "new" array in
 * the first result will not equal V2's "old" array in the second.
 * `evaluateProjectionApplicability` exposes the heuristic `build()` uses
 * so an orchestrator can take one decision and pass it to every call.
 *
 * Table preprocessing is deliberately not performed here — placeholders
 * rewrite the input in ways that do not compose across two independent
 * analyses, so the three-way orchestrator deals with tables itself
 * before calling this method.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @param options Projection switch plus per-instance settings to apply.
 * @returns The analysis building blocks described above.
 */
static analyze(oldText: string, newText: string, options: AnalyzeOptions = {}): AnalyzeResult {
  const {
    useProjections,
    blockExpressions,
    repeatingWordsAccuracy,
    orphanMatchThreshold,
    ignoreWhitespaceDifferences,
  } = options

  const differ = new HtmlDiff(oldText, newText)
  // Pin the depth at the cap so this instance never preprocesses tables —
  // the caller is responsible for them.
  differ.tablePreprocessDepth = HtmlDiff.MaxTablePreprocessDepth

  // Apply only the settings the caller actually supplied.
  if (blockExpressions) {
    for (const expression of blockExpressions) {
      differ.addBlockExpression(expression)
    }
  }
  if (repeatingWordsAccuracy !== undefined) {
    differ.repeatingWordsAccuracy = repeatingWordsAccuracy
  }
  if (orphanMatchThreshold !== undefined) {
    differ.orphanMatchThreshold = orphanMatchThreshold
  }
  if (ignoreWhitespaceDifferences !== undefined) {
    differ.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences
  }

  differ.splitInputsToWords()

  if (useProjections === undefined) {
    // No explicit decision: defer to the same heuristic build() applies.
    differ.buildContentProjections()
  } else if (useProjections) {
    // Forced on — but projecting a side that has no content would leave
    // an empty diff space, so both sides must yield content words.
    const oldProjection = HtmlDiff.createContentProjection(differ.oldWords)
    const newProjection = HtmlDiff.createContentProjection(differ.newWords)
    const bothSidesHaveContent = oldProjection.contentWords.length > 0 && newProjection.contentWords.length > 0
    if (bothSidesHaveContent) {
      differ.oldContentWords = oldProjection.contentWords
      differ.oldContentToOriginal = oldProjection.contentToOriginal
      differ.newContentWords = newProjection.contentWords
      differ.newContentToOriginal = newProjection.contentToOriginal
    }
  }
  // Forced off: nothing to do, the projection fields stay unset.

  // The diff runs over the projected words when present, raw words otherwise.
  const oldDiffWords = differ.oldContentWords ?? differ.oldWords
  const newDiffWords = differ.newContentWords ?? differ.newWords
  differ.matchGranularity = Math.min(HtmlDiff.MatchGranularityMaximum, oldDiffWords.length, newDiffWords.length)

  const operations = differ.operations()
  return {
    oldDiffWords,
    newDiffWords,
    operations,
    oldOriginalWords: differ.oldWords,
    newOriginalWords: differ.newWords,
    oldContentToOriginal: differ.oldContentToOriginal,
    newContentToOriginal: differ.newContentToOriginal,
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Whether content-projection (structural-tag normalisation) would
 * apply to this pair of inputs under `build()`'s default heuristic.
 * Exposed so composers of multiple analyses can compute a symmetric
 * decision before calling `analyze` — if one pair-wise analysis projects
 * and the other does not, the shared input tokenises differently on each
 * side and the two results cannot be composed.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @param blockExpressions Block expressions handed to the word splitter
 *   for both inputs. Defaults to an empty list, which is what this method
 *   always used before the parameter existed. Callers that pass
 *   `blockExpressions` to `analyze` should pass the same list here so the
 *   heuristic sees the tokens the analysis will see.
 *   NOTE(review): presumably the instance tokenises with its registered
 *   block expressions in `splitInputsToWords` — confirm.
 * @returns `true` when the inputs differ structurally and the projection
 *   heuristic accepts them; `false` otherwise.
 */
static evaluateProjectionApplicability(
  oldText: string,
  newText: string,
  blockExpressions: readonly RegExp[] = []
): boolean {
  // Fresh mutable copies: the splitter's parameter type is not visible
  // here, and the previous implementation also passed a new array each time.
  const oldWords = WordSplitter.convertHtmlToListOfWords(oldText, [...blockExpressions])
  const newWords = WordSplitter.convertHtmlToListOfWords(newText, [...blockExpressions])
  // Without structural differences projection is never applied.
  if (!HtmlDiff.hasStructuralDifferences(oldWords, newWords)) return false
  const oldProj = HtmlDiff.createContentProjection(oldWords)
  const newProj = HtmlDiff.createContentProjection(newWords)
  return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
}
|
|
322
|
+
|
|
323
|
+
/**
 * Three-way HTML diff over V1 (the version Me last sent), V2 (the
 * version CP sent back) and V3 (Me's current draft). The result is one
 * attributed HTML string in which CP's and Me's changes are told apart
 * by `data-author` ('cp' or 'me') together with classes such as
 * `class='diffins cp'` / `class='diffdel me'`. Text CP inserted and Me
 * then deleted ("Me rejected CP's proposal") carries a dedicated marker:
 * `data-rejects='cp'` plus `class='... rejects-cp'`.
 *
 * The projection decision is taken once and shared by both internal
 * `analyze` calls so V2 tokenises identically on either side of the
 * spine. With `useProjections` undefined the decision is the conjunction
 * of the two pair-wise heuristics (project only if both pairs would on
 * their own); pass an explicit boolean to override.
 *
 * @param v1 The version Me last sent.
 * @param v2 The version CP sent back.
 * @param v3 Me's current draft.
 * @param options Settings forwarded to both pair-wise analyses.
 * @returns The merged, author-attributed diff markup.
 */
static executeThreeWay(v1: string, v2: string, v3: string, options: ThreeWayOptions = {}): string {
  // Public calls always start the table-recursion counter at the top level.
  const topLevelDepth = 0
  return HtmlDiff.executeThreeWayWithDepth(v1, v2, v3, options, topLevelDepth)
}
|
|
342
|
+
|
|
343
|
+
/**
 * Depth-tracking implementation behind `executeThreeWay`.
 *
 * Steps:
 *  1. Table preprocessing (while below `MaxThreeWayDepth`): each
 *     V1/V2/V3 table is swapped for a shared-nonce placeholder and its
 *     cells are diffed by recursing into this method with `depth + 1`,
 *     so cell content is itself three-way attributed. At or beyond the
 *     cap the step is skipped and the raw HTML goes to the word-level
 *     merge — the same bail-out the two-way `MaxTablePreprocessDepth`
 *     cap provides.
 *  2. One projection decision is taken for both pair-wise analyses.
 *  3. V1↔V2 and V2↔V3 are analysed; if projection ended up active in
 *     only one of them, both are re-run unprojected (see below).
 *  4. The two operation streams are merged into segments, emitted, and
 *     table placeholders are restored.
 *
 * @param v1 First version (or cell content at `depth > 0`).
 * @param v2 Second version.
 * @param v3 Third version.
 * @param options Settings forwarded to both analyses and to recursive calls.
 * @param depth Current table-recursion depth; 0 for the public entry point.
 * @returns The merged, author-attributed diff markup.
 * @throws Error when V2's diff-space length still differs between the
 *   two analyses after the symmetry fallback.
 */
private static executeThreeWayWithDepth(
  v1: string,
  v2: string,
  v3: string,
  options: ThreeWayOptions,
  depth: number
): string {
  // Depth-cap the recursion: each level goes cellDiff → executeThreeWay,
  // which would otherwise be unbounded on adversarially nested input.
  const tablePreprocess =
    depth < HtmlDiff.MaxThreeWayDepth
      ? preprocessTablesThreeWay(v1, v2, v3, (c1, c2, c3) =>
          HtmlDiff.executeThreeWayWithDepth(c1, c2, c3, options, depth + 1)
        )
      : null
  const inV1 = tablePreprocess?.modifiedV1 ?? v1
  const inV2 = tablePreprocess?.modifiedV2 ?? v2
  const inV3 = tablePreprocess?.modifiedV3 ?? v3

  // NOTE(review): the two-argument heuristic call tokenises with an empty
  // block-expression list, i.e. it does not see `options.blockExpressions`.
  const useProjections =
    options.useProjections ??
    (HtmlDiff.evaluateProjectionApplicability(inV1, inV2) && HtmlDiff.evaluateProjectionApplicability(inV2, inV3))

  // Runs both pair-wise analyses with one shared projection setting.
  const analyzePairs = (project: boolean): [AnalyzeResult, AnalyzeResult] => {
    const analyzeOpts: AnalyzeOptions = {
      useProjections: project,
      blockExpressions: options.blockExpressions,
      repeatingWordsAccuracy: options.repeatingWordsAccuracy,
      orphanMatchThreshold: options.orphanMatchThreshold,
      ignoreWhitespaceDifferences: options.ignoreWhitespaceDifferences,
    }
    return [HtmlDiff.analyze(inV1, inV2, analyzeOpts), HtmlDiff.analyze(inV2, inV3, analyzeOpts)]
  }

  let [d1, d2] = analyzePairs(useProjections)

  // `analyze` silently skips a forced projection for a pair in which
  // either side projects to zero content words. That decision is made
  // per pair, so e.g. an empty V1 leaves V1↔V2 on raw words while V2↔V3
  // is projected, and V2 no longer lines up across the spine. When the
  // two analyses disagree, redo both on raw words — the one setting that
  // is guaranteed to be applied identically to every pair.
  const v2ProjectedInFirst = d1.newContentToOriginal != null
  const v2ProjectedInSecond = d2.oldContentToOriginal != null
  if (v2ProjectedInFirst !== v2ProjectedInSecond) {
    const unprojected = analyzePairs(false)
    d1 = unprojected[0]
    d2 = unprojected[1]
  }

  // Spine sanity check. The symmetric setting should guarantee alignment;
  // if a bug ever lets the two views of V2 diverge, fail loudly rather
  // than silently produce misattributed output.
  if (d1.newDiffWords.length !== d2.oldDiffWords.length) {
    throw new Error(
      'HtmlDiff.executeThreeWay: V2 tokenisation diverged across pair-wise analyses ' +
        `(${d1.newDiffWords.length} vs ${d2.oldDiffWords.length}). ` +
        'This indicates the symmetric-projection coordination has a bug.'
    )
  }

  const segments = buildSegments(d1, d2)
  const merged = HtmlDiff.emitSegments(segments)
  // Placeholders only exist when table preprocessing actually ran.
  return tablePreprocess ? restoreTablePlaceholders(merged, tablePreprocess.placeholderToDiff) : merged
}
|
|
400
|
+
|
|
401
|
+
/**
 * Renders merged segments to HTML using one throw-away `HtmlDiff`
 * instance as the output sink. Equal segments are appended to the
 * instance's `content` buffer verbatim; every other segment goes through
 * `insertTag` with the tag, class and metadata that
 * `segmentEmissionShape` derives from its attribution. Sharing a single
 * instance keeps `specialTagDiffStack` coherent across segments, so a
 * formatting tag opened in one segment and closed in a later one stays
 * balanced.
 *
 * @param segments Segments in output order.
 * @returns The concatenated markup.
 * @throws Error when formatting tags remain on the stack after the last
 *   segment, which would otherwise yield silently malformed HTML.
 */
private static emitSegments(segments: Segment[]): string {
  const sink = new HtmlDiff('', '')

  segments.forEach(segment => {
    if (segment.attr.kind === 'equal') {
      sink.content.push(segment.words.join(''))
    } else {
      const shape = segmentEmissionShape(segment.attr)
      // insertTag consumes the array it is given, so hand it a copy.
      sink.insertTag(shape.tag, shape.baseClass, Array.from(segment.words), shape.metadata)
    }
  })

  // Every special-case opening tag pushed during emission must have met
  // its closing tag by now. Leftovers mean the input had unbalanced
  // formatting tags combined with an awkwardly placed replacement.
  const unclosed = sink.specialTagDiffStack.length
  if (unclosed > 0) {
    throw new Error(
      `HtmlDiff.executeThreeWay: emission left ${unclosed} ` +
        'unclosed formatting tag(s) on the stack — input may have unbalanced ' +
        '<strong>/<em>/etc. or there is a bug in segment emission.'
    )
  }

  return sink.content.join('')
}
|
|
434
|
+
|
|
435
|
+
/**
 * Entry point for the table-cell recursion. Creates the inner
 * `HtmlDiff`, stamps it with the recursion depth and the caller's
 * settings carried in `ctx`, and builds the cell-level diff. Routing the
 * depth through here keeps it out of the public constructor signature
 * while cell-level output still follows the top-level configuration.
 *
 * @param oldText Old cell content.
 * @param newText New cell content.
 * @param ctx Depth and settings to apply to the inner instance.
 * @returns The inner instance's `build()` output.
 */
private static executeWithContext(oldText: string, newText: string, ctx: RecursionContext): string {
  const { depth, blockExpressions, repeatingWordsAccuracy, orphanMatchThreshold, ignoreWhitespaceDifferences } = ctx

  const cellDiffer = new HtmlDiff(oldText, newText)
  cellDiffer.tablePreprocessDepth = depth
  blockExpressions.forEach(expression => {
    cellDiffer.addBlockExpression(expression)
  })
  cellDiffer.repeatingWordsAccuracy = repeatingWordsAccuracy
  cellDiffer.orphanMatchThreshold = orphanMatchThreshold
  cellDiffer.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences

  return cellDiffer.build()
}
|
|
163
451
|
|
|
164
452
|
/**
|
|
@@ -174,26 +462,26 @@ export default class HtmlDiff {
|
|
|
174
462
|
// Table preprocessing: when both sides have matching `<table>` structures,
|
|
175
463
|
// diff cells positionally so cross-cell content shifts produce one
|
|
176
464
|
// independent del/ins per cell rather than cell-misaligned output.
|
|
177
|
-
// Recursion guarded by MaxTablePreprocessDepth
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
:
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
465
|
+
// Recursion is guarded by MaxTablePreprocessDepth — check the cap
|
|
466
|
+
// first so we don't construct a context that will never be used.
|
|
467
|
+
let tablePreprocess: ReturnType<typeof preprocessTables> = null
|
|
468
|
+
if (this.tablePreprocessDepth < HtmlDiff.MaxTablePreprocessDepth) {
|
|
469
|
+
// Caller-configured settings (block expressions, accuracy
|
|
470
|
+
// thresholds) flow to the recursive cell diff via `RecursionContext`
|
|
471
|
+
// so cell-level output is consistent with the top-level
|
|
472
|
+
// configuration. The context is built once here and reused for
|
|
473
|
+
// every cell-diff callback invocation.
|
|
474
|
+
const ctx: RecursionContext = {
|
|
475
|
+
depth: this.tablePreprocessDepth + 1,
|
|
476
|
+
blockExpressions: this.blockExpressions,
|
|
477
|
+
repeatingWordsAccuracy: this.repeatingWordsAccuracy,
|
|
478
|
+
orphanMatchThreshold: this.orphanMatchThreshold,
|
|
479
|
+
ignoreWhitespaceDifferences: this.ignoreWhitespaceDifferences,
|
|
480
|
+
}
|
|
481
|
+
tablePreprocess = preprocessTables(this.oldText, this.newText, (oldCell, newCell) =>
|
|
482
|
+
HtmlDiff.executeWithContext(oldCell, newCell, ctx)
|
|
483
|
+
)
|
|
484
|
+
}
|
|
197
485
|
if (tablePreprocess) {
|
|
198
486
|
this.oldText = tablePreprocess.modifiedOld
|
|
199
487
|
this.newText = tablePreprocess.modifiedNew
|
|
@@ -491,7 +779,7 @@ export default class HtmlDiff {
|
|
|
491
779
|
* @param words
|
|
492
780
|
* @private
|
|
493
781
|
*/
|
|
494
|
-
private insertTag(tag: string, cssClass: string, words: string[]) {
|
|
782
|
+
private insertTag(tag: string, cssClass: string, words: string[], metadata?: WrapMetadata) {
|
|
495
783
|
while (true) {
|
|
496
784
|
if (words.length === 0) {
|
|
497
785
|
break
|
|
@@ -499,7 +787,7 @@ export default class HtmlDiff {
|
|
|
499
787
|
|
|
500
788
|
const allWordsUntilFirstTag = this.extractConsecutiveWords(words, x => !Utils.isTag(x))
|
|
501
789
|
if (allWordsUntilFirstTag.length > 0) {
|
|
502
|
-
const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass)
|
|
790
|
+
const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass, metadata)
|
|
503
791
|
this.content.push(text)
|
|
504
792
|
}
|
|
505
793
|
|
|
@@ -533,7 +821,9 @@ export default class HtmlDiff {
|
|
|
533
821
|
const styledTagNames = Array.from(tagNames).join(' ')
|
|
534
822
|
|
|
535
823
|
this.specialTagDiffStack.push(words[0])
|
|
536
|
-
|
|
824
|
+
// Carry the caller's metadata into the formatting-tag wrapper so
|
|
825
|
+
// a 3-way author tag survives a `<strong>`/`<em>` content edit.
|
|
826
|
+
specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
|
|
537
827
|
if (tag === HtmlDiff.DelTag) {
|
|
538
828
|
words.shift()
|
|
539
829
|
|
|
@@ -608,7 +898,7 @@ export default class HtmlDiff {
|
|
|
608
898
|
if (words.length === 0) continue
|
|
609
899
|
|
|
610
900
|
// if there are still words left, they must start with a nonTag and need to be handled in the next iteration.
|
|
611
|
-
this.insertTag(tag, cssClass, words)
|
|
901
|
+
this.insertTag(tag, cssClass, words, metadata)
|
|
612
902
|
break
|
|
613
903
|
}
|
|
614
904
|
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Low-level HTML tag-parsing primitives shared by the table-aware
|
|
3
|
+
* preprocessing and (potentially) other consumers. These are deliberately
|
|
4
|
+
* generic over the document type — no table-specific assumptions live
|
|
5
|
+
* here.
|
|
6
|
+
*
|
|
7
|
+
* The goal is to walk HTML at the tag boundary level *without* parsing
|
|
8
|
+
* into a full DOM, so we stay fast and never round-trip through
|
|
9
|
+
* htmlparser2/DOMPurify in the diff hot path.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Result of `parseOpeningTagAt`: where the construct that started at the
 * requested index stops. Despite the name it is also returned for
 * comments, CDATA sections and processing instructions, and the scanner
 * reuses it for closing tags.
 */
export interface OpeningTag {
  /** Index just past the closing `>` of the opening tag. */
  end: number
}
|
|
16
|
+
|
|
17
|
+
/**
 * Position and content of a tag's `class` attribute value as reported by
 * `findClassAttribute`. Offsets are relative to the opening-tag string
 * that was scanned. For an unquoted value there are no surrounding
 * quotes; the range then simply covers the bare value.
 */
export interface ClassAttributeLocation {
  /** Index of the value's first character (inside the surrounding quotes). */
  valueStart: number
  /** Index just past the last character of the value (still inside quotes). */
  valueEnd: number
  /** The class attribute's value, with surrounding quotes stripped. */
  value: string
}
|
|
25
|
+
|
|
26
|
+
/**
 * Parses the tag (or comment/CDATA/PI) starting at `i`. Returns the
 * index just past the closing delimiter, or null if the construct is
 * unterminated.
 *
 * HTML comments, CDATA sections and processing instructions are matched
 * against their own terminators (`-->`, `]]>`, `?>`) — a plain
 * `>`-walker would cut a comment like `<!-- a > b -->` at the first
 * inner `>`, treating the rest as text and corrupting downstream
 * offsets. Word-exported HTML routinely emits comments inside tables
 * (conditional comments, OLE markers) so these have to be handled.
 * Everything else, including DOCTYPE and closing tags, goes through the
 * generic walker below.
 *
 * The generic walker stops at the first `>` that is not inside a quoted
 * attribute value. A quote character only opens a quoted value when it
 * is the first non-whitespace character after `=`, which is how the HTML
 * tokenizer treats it and how `findClassAttribute` scans. A stray quote
 * elsewhere (`<a href=it's>`) is ordinary text and does not hide the
 * closing `>`.
 *
 * @param html The markup being scanned.
 * @param i Index of the `<` that starts the construct.
 * @returns `{ end }` with `end` just past the terminator, or null.
 */
export function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  if (html.startsWith('<!--', i)) {
    const close = html.indexOf('-->', i + 4)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<![CDATA[', i)) {
    const close = html.indexOf(']]>', i + 9)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<?', i)) {
    const close = html.indexOf('?>', i + 2)
    return close === -1 ? null : { end: close + 2 }
  }

  // Scanner position relative to attribute values:
  //   quote !== ''  → inside a quoted value, only the same quote ends it
  //   awaitingValue → just saw '=', the value has not started yet
  //   inUnquoted    → inside an unquoted value, ends at whitespace
  let quote = ''
  let awaitingValue = false
  let inUnquoted = false
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (quote !== '') {
      if (ch === quote) quote = ''
      continue
    }
    if (ch === '>') return { end: j + 1 }
    const isSpace = ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
    if (awaitingValue) {
      if (ch === '"' || ch === "'") {
        quote = ch
        awaitingValue = false
      } else if (!isSpace) {
        awaitingValue = false
        inUnquoted = true
      }
    } else if (inUnquoted) {
      if (isSpace) inUnquoted = false
    } else if (ch === '=') {
      awaitingValue = true
    }
  }
  return null
}
|
|
66
|
+
|
|
67
|
+
/**
 * Whether an opening tag named `tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on both sides, so callers may
 * pass `tagName` in any case. The character after the name must end the
 * tag name — `>`, `/`, or HTML whitespace (space, tab, LF, CR, form
 * feed) — so `<td` never matches inside `<tdata>`. A tag cut off right
 * after its name (end of input) does not match.
 *
 * @param html The markup being scanned.
 * @param i Index to test; must point at `<` for a match.
 * @param tagName Tag name to look for, without angle brackets.
 * @returns `true` when an opening `<tagName` starts at `i`.
 */
export function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false
  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return (
    after === '>' ||
    after === '/' ||
    after === ' ' ||
    after === '\t' ||
    after === '\n' ||
    after === '\r' ||
    after === '\f'
  )
}
|
|
74
|
+
|
|
75
|
+
/**
 * Whether a closing tag `</tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on both sides. The character
 * after the name must be `>` or HTML whitespace (space, tab, LF, CR,
 * form feed), so `</td` never matches inside `</tdata>`. A closing tag
 * cut off right after its name (end of input) does not match.
 *
 * @param html The markup being scanned.
 * @param i Index to test; must point at the `<` of `</` for a match.
 * @param tagName Tag name to look for, without `</` or `>`.
 * @returns `true` when a closing `</tagName` starts at `i`.
 */
export function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<' || html[i + 1] !== '/') return false
  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r' || after === '\f'
}
|
|
82
|
+
|
|
83
|
+
/**
 * Returns the index just past the `</tagName>` that closes an element
 * whose opening tag ended at `from`, accounting for nested elements of
 * the same name. Returns -1 if no match starts before `limit`.
 *
 * Comments, CDATA sections and processing instructions are stepped over
 * as a whole, so tag-like text inside them (`<!-- </table> -->`) does
 * not affect the nesting depth. An unterminated one is treated as plain
 * text and scanning continues one character further.
 *
 * @param html The markup being scanned.
 * @param from Index at which to start scanning (just past the opening tag).
 * @param tagName Name of the element to balance.
 * @param limit Scanning stops once the cursor reaches this index;
 *   defaults to the end of `html`. Only the start of a tag is checked
 *   against the limit, so the returned index can lie beyond it.
 * @returns Index just past the matching closing tag, or -1.
 */
export function findMatchingClosingTag(
  html: string,
  from: number,
  tagName: string,
  limit: number = html.length
): number {
  let depth = 1
  let i = from
  while (i < limit) {
    // Nothing below can match unless a '<' starts here.
    if (html[i] !== '<') {
      i++
      continue
    }
    // Non-element constructs: skip their entire content.
    if (html.startsWith('<!--', i) || html.startsWith('<![CDATA[', i) || html.startsWith('<?', i)) {
      const skipped = parseOpeningTagAt(html, i)
      i = skipped ? skipped.end : i + 1
      continue
    }
    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        // Unterminated opening tag: treat the '<' as text.
        i++
        continue
      }
      // A self-closing `<tag/>` opens nothing that needs closing.
      const tagText = html.slice(i, opening.end)
      if (!tagText.endsWith('/>')) depth++
      i = opening.end
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--
      const closing = parseOpeningTagAt(html, i)
      // If the closing tag's '>' is missing, assume the canonical length.
      const closingEnd = closing?.end ?? i + `</${tagName}>`.length
      if (depth === 0) return closingEnd
      i = closingEnd
    } else {
      i++
    }
  }
  return -1
}
|
|
117
|
+
|
|
118
|
+
/**
 * Returns the opening tag with the given class injected. Locates the real
 * `class` attribute via attribute-aware walking (NOT a flat regex — that
 * would mis-match inside a foreign attribute value like
 * `title="see class='x'"`). When the class already partially overlaps with
 * `cls` — e.g. existing `class="mod"` and we're injecting `mod colspan` —
 * only the missing tokens get appended, so we never end up with
 * `class="mod mod colspan"`.
 *
 * An existing unquoted value (`class=foo`) is rewritten as a quoted one.
 * Appending to it unquoted would give `class=foo mod`, in which `mod` is
 * a separate bare attribute rather than a class token.
 *
 * @param openingTag The complete opening tag, from `<` through `>`.
 * @param cls One or more whitespace-separated class tokens to add.
 * @returns The tag with the tokens present; the input itself when `cls`
 *   has no tokens or all of them are already there.
 */
export function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag
    const updatedValue = [...existingTokens, ...missing].join(' ')

    // For a quoted value the character right before it is its opening
    // quote; for an unquoted one it is '=' or whitespace.
    const opener = openingTag[classAttr.valueStart - 1]
    const isQuoted = opener === '"' || opener === "'"
    let replacement = updatedValue
    if (!isQuoted) {
      // Pick a delimiter that cannot collide with the value itself.
      const delimiter = updatedValue.includes('"') ? "'" : '"'
      replacement = `${delimiter}${updatedValue}${delimiter}`
    }
    return openingTag.slice(0, classAttr.valueStart) + replacement + openingTag.slice(classAttr.valueEnd)
  }

  // No class attribute yet: add one right before the closing `>` / `/>`,
  // dropping any whitespace that preceded it.
  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
|
|
145
|
+
|
|
146
|
+
/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute. Returns the value range (start/end of the
 * value content, *excluding* the surrounding quotes) and the value, or
 * null if no `class` attribute is present. The attribute name is matched
 * case-insensitively, and the first `class` attribute wins.
 *
 * Unquoted values run up to the next whitespace or `>`; a `/` inside
 * them (`href=/a/b`) is part of the value and does not stop the scan.
 * Only a `/` immediately followed by `>` is taken as the self-closing
 * marker. A stray `/` between attributes is skipped.
 *
 * @param openingTag The complete opening tag, from `<` through `>`.
 * @returns Location and text of the class value, or null.
 */
export function findClassAttribute(openingTag: string): ClassAttributeLocation | null {
  // Hoisted so the loops below do not rebuild them per character.
  const tagNameChar = /[A-Za-z0-9_:-]/
  const whitespace = /\s/
  const attrNameEnd = /[\s=>/]/
  const len = openingTag.length

  // Skip past the tag name. Tag starts with `<`; the first run of
  // [A-Za-z0-9_:-] is the tag name. Anything after is attribute territory.
  let i = 1
  while (i < len && tagNameChar.test(openingTag[i])) i++

  while (i < len) {
    // Skip whitespace
    while (i < len && whitespace.test(openingTag[i])) i++
    if (i >= len) break
    if (openingTag[i] === '>') break
    if (openingTag[i] === '/') {
      // `/>` (or a trailing `/`) ends the tag; any other solidus is noise.
      if (i + 1 >= len || openingTag[i + 1] === '>') break
      i++
      continue
    }

    // Read attribute name
    const nameStart = i
    while (i < len && !attrNameEnd.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value
    while (i < len && whitespace.test(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value) — not class
      continue
    }
    i++ // past '='
    while (i < len && whitespace.test(openingTag[i])) i++

    // Value: quoted or unquoted
    let valueStart: number
    let valueEnd: number
    if (openingTag[i] === '"' || openingTag[i] === "'") {
      const quote = openingTag[i]
      i++
      valueStart = i
      while (i < len && openingTag[i] !== quote) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      valueStart = i
      while (i < len) {
        const ch = openingTag[i]
        if (ch === '>' || whitespace.test(ch)) break
        // Keep the `/` of a self-closing `/>` out of the value.
        if (ch === '/' && openingTag[i + 1] === '>') break
        i++
      }
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
|