@createiq/htmldiff 1.1.0 → 1.2.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/README.md +67 -0
- package/dist/HtmlDiff.cjs +1192 -456
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +160 -7
- package/dist/HtmlDiff.d.mts +159 -7
- package/dist/HtmlDiff.mjs +1192 -456
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +343 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +67 -522
- package/src/ThreeWayDiff.ts +269 -0
- package/src/ThreeWayTable.ts +625 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.spec.ts +43 -19
- package/test/HtmlDiff.threeWay.spec.ts +173 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +301 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/src/HtmlDiff.ts
CHANGED
|
@@ -3,9 +3,74 @@ import Match from './Match'
|
|
|
3
3
|
import MatchFinder from './MatchFinder'
|
|
4
4
|
import Operation from './Operation'
|
|
5
5
|
import { preprocessTables, restoreTablePlaceholders } from './TableDiff'
|
|
6
|
-
import
|
|
6
|
+
import { buildSegments, type Segment, segmentEmissionShape } from './ThreeWayDiff'
|
|
7
|
+
import { preprocessTablesThreeWay } from './ThreeWayTable'
|
|
8
|
+
import Utils, { type WrapMetadata } from './Utils'
|
|
7
9
|
import WordSplitter from './WordSplitter'
|
|
8
10
|
|
|
11
|
+
/**
|
|
12
|
+
* State threaded into the recursive cell-level diff inside
|
|
13
|
+
* `preprocessTables`. Bundles the nesting depth (for the recursion-cap
|
|
14
|
+
* guard) with the caller-configurable settings that must propagate
|
|
15
|
+
* unchanged to inner instances so cell-level output stays consistent
|
|
16
|
+
* with the top-level call. Internal — never crosses the public API.
|
|
17
|
+
*/
|
|
18
|
+
interface RecursionContext {
|
|
19
|
+
depth: number
|
|
20
|
+
blockExpressions: readonly RegExp[]
|
|
21
|
+
repeatingWordsAccuracy: number
|
|
22
|
+
orphanMatchThreshold: number
|
|
23
|
+
ignoreWhitespaceDifferences: boolean
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Options for the `HtmlDiff.analyze` static helper.
|
|
28
|
+
*
|
|
29
|
+
* `useProjections` controls structural-tag normalisation:
|
|
30
|
+
* - `undefined` → use the same heuristic as `build()` (per-call decision)
|
|
31
|
+
* - `true` → force projection on (skipped if either side has no content)
|
|
32
|
+
* - `false` → force projection off (diff runs on raw word arrays)
|
|
33
|
+
* Composers of multiple analyses (e.g. three-way diff) MUST pass the
|
|
34
|
+
* same explicit boolean to all calls so shared inputs tokenise
|
|
35
|
+
* identically across analyses.
|
|
36
|
+
*
|
|
37
|
+
* The remaining options mirror the per-instance fields on `HtmlDiff`
|
|
38
|
+
* itself — they exist on the options bag because `analyze` constructs
|
|
39
|
+
* the inner instance internally.
|
|
40
|
+
*/
|
|
41
|
+
export interface AnalyzeOptions {
|
|
42
|
+
useProjections?: boolean
|
|
43
|
+
blockExpressions?: readonly RegExp[]
|
|
44
|
+
repeatingWordsAccuracy?: number
|
|
45
|
+
orphanMatchThreshold?: number
|
|
46
|
+
ignoreWhitespaceDifferences?: boolean
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export interface AnalyzeResult {
|
|
50
|
+
/** Word array the `operations` index into (projected or raw). */
|
|
51
|
+
readonly oldDiffWords: readonly string[]
|
|
52
|
+
readonly newDiffWords: readonly string[]
|
|
53
|
+
readonly operations: readonly Operation[]
|
|
54
|
+
/** Original WordSplitter output, before any projection. */
|
|
55
|
+
readonly oldOriginalWords: readonly string[]
|
|
56
|
+
readonly newOriginalWords: readonly string[]
|
|
57
|
+
/** Diff-index → original-word-index map; null when projections inactive. */
|
|
58
|
+
readonly oldContentToOriginal: readonly number[] | null
|
|
59
|
+
readonly newContentToOriginal: readonly number[] | null
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Options for `HtmlDiff.executeThreeWay`. Same shape as `AnalyzeOptions`
|
|
64
|
+
* — the values flow unchanged into both internal `analyze` calls so the genesis
|
|
65
|
+
* tokenisation stays symmetric. Aliased so future divergence in either
|
|
66
|
+
* direction lives in one place.
|
|
67
|
+
*
|
|
68
|
+
* `useProjections`: when undefined, `executeThreeWay` computes the
|
|
69
|
+
* decision as the conjunction of both pair-wise
|
|
70
|
+
* `evaluateProjectionApplicability` results.
|
|
71
|
+
*/
|
|
72
|
+
export type ThreeWayOptions = AnalyzeOptions
|
|
73
|
+
|
|
9
74
|
export default class HtmlDiff {
|
|
10
75
|
/**
|
|
11
76
|
* This value defines balance between speed and memory utilization. The higher it is the faster it works and more memory consumes.
|
|
@@ -76,10 +141,26 @@ export default class HtmlDiff {
|
|
|
76
141
|
*/
|
|
77
142
|
private static MaxTablePreprocessDepth = 8
|
|
78
143
|
|
|
144
|
+
/**
|
|
145
|
+
* Mirror cap for the three-way path. The 2-way `MaxTablePreprocessDepth`
|
|
146
|
+
* guards the recursion inside `executeWithContext`; the 3-way path has
|
|
147
|
+
* its own recursion (`executeThreeWay` → `preprocessTablesThreeWay` →
|
|
148
|
+
* `cellDiff` → `executeThreeWay`) which needs its own guard. Once the
|
|
149
|
+
* cap is reached, `executeThreeWay` skips table preprocessing and
|
|
150
|
+
* falls back to the word-level merge — same bail-out semantics as the
|
|
151
|
+
* 2-way path.
|
|
152
|
+
*/
|
|
153
|
+
private static MaxThreeWayDepth = 8
|
|
154
|
+
|
|
79
155
|
private content: string[] = []
|
|
80
156
|
private newText: string
|
|
81
157
|
private oldText: string
|
|
82
|
-
|
|
158
|
+
// Written at most once per instance: by `executeWithContext` on the inner
|
|
159
|
+
// instance of a recursive cell-diff, or by `analyze` to pin it at the cap. Top-level instances stay at 0. Treated as
|
|
160
|
+
// effectively-readonly elsewhere — we dropped the modifier only so
|
|
161
|
+
// `executeWithContext` can populate it without needing a private
|
|
162
|
+
// constructor overload that would re-leak the parameter we just hid.
|
|
163
|
+
private tablePreprocessDepth = 0
|
|
83
164
|
|
|
84
165
|
private specialTagDiffStack: string[] = []
|
|
85
166
|
private newWords: string[] = []
|
|
@@ -147,18 +228,245 @@ export default class HtmlDiff {
|
|
|
147
228
|
* Initializes a new instance of the class.
|
|
148
229
|
* @param oldText The old text.
|
|
149
230
|
* @param newText The new text.
|
|
150
|
-
* @param tablePreprocessDepth Internal: nested-call depth for table
|
|
151
|
-
* preprocessing. Callers should leave at default (0); the recursive
|
|
152
|
-
* `diffCell` callback in TableDiff bumps it.
|
|
153
231
|
*/
|
|
154
|
-
constructor(oldText: string, newText: string
|
|
232
|
+
constructor(oldText: string, newText: string) {
|
|
155
233
|
this.oldText = oldText
|
|
156
234
|
this.newText = newText
|
|
157
|
-
this.tablePreprocessDepth = tablePreprocessDepth
|
|
158
235
|
}
|
|
159
236
|
|
|
160
|
-
static execute(oldText: string, newText: string
|
|
161
|
-
return new HtmlDiff(oldText, newText
|
|
237
|
+
/**
 * One-shot convenience wrapper: constructs an `HtmlDiff` for the two
 * inputs and returns whatever its `build()` produces.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @returns The string returned by `build()`.
 */
static execute(oldText: string, newText: string): string {
  const differ = new HtmlDiff(oldText, newText)
  return differ.build()
}
|
|
240
|
+
|
|
241
|
+
/**
 * Runs a two-way diff but, instead of rendering HTML, hands back the
 * pieces the diff was built from: the word arrays that were compared,
 * the resulting operations, the original (pre-projection) word arrays,
 * and the diff-index → original-index maps (null when no projection
 * was applied). `executeThreeWay` composes two of these results.
 *
 * Callers composing several analyses must keep `useProjections`
 * consistent between them: if one analysis projects a shared input and
 * another does not, that input is represented by different word arrays
 * in the two results. `evaluateProjectionApplicability` exposes the
 * default heuristic so an orchestrator can decide once and pass the
 * same boolean to every call.
 *
 * Table preprocessing is not performed here; placeholders do not
 * compose across independent analyses, so the three-way orchestrator
 * deals with tables before calling this method.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @param options Projection mode plus the per-instance settings to
 *   apply to the internally constructed `HtmlDiff`.
 * @returns The word arrays, operations and index maps described above.
 */
static analyze(oldText: string, newText: string, options: AnalyzeOptions = {}): AnalyzeResult {
  const {
    useProjections,
    blockExpressions,
    repeatingWordsAccuracy,
    orphanMatchThreshold,
    ignoreWhitespaceDifferences,
  } = options

  const inner = new HtmlDiff(oldText, newText)
  // Pin the depth at the cap so the instance never attempts table
  // preprocessing — the caller is responsible for tables.
  inner.tablePreprocessDepth = HtmlDiff.MaxTablePreprocessDepth

  // Copy over only the settings the caller actually supplied.
  for (const expr of blockExpressions ?? []) inner.addBlockExpression(expr)
  if (repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = repeatingWordsAccuracy
  if (orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = orphanMatchThreshold
  if (ignoreWhitespaceDifferences !== undefined) inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences

  inner.splitInputsToWords()

  if (useProjections === undefined) {
    // No explicit choice: defer to the instance's own projection heuristic.
    inner.buildContentProjections()
  } else if (useProjections) {
    // Forced on — but only when both sides yield content words; an empty
    // projected side would leave nothing to diff against.
    const oldProj = HtmlDiff.createContentProjection(inner.oldWords)
    const newProj = HtmlDiff.createContentProjection(inner.newWords)
    const bothHaveContent = oldProj.contentWords.length > 0 && newProj.contentWords.length > 0
    if (bothHaveContent) {
      inner.oldContentWords = oldProj.contentWords
      inner.oldContentToOriginal = oldProj.contentToOriginal
      inner.newContentWords = newProj.contentWords
      inner.newContentToOriginal = newProj.contentToOriginal
    }
  }
  // Forced off (`false`): nothing to set up, the raw word arrays are used.

  const oldDiffWords = inner.oldContentWords ?? inner.oldWords
  const newDiffWords = inner.newContentWords ?? inner.newWords
  inner.matchGranularity = Math.min(HtmlDiff.MatchGranularityMaximum, oldDiffWords.length, newDiffWords.length)

  return {
    oldDiffWords,
    newDiffWords,
    operations: inner.operations(),
    oldOriginalWords: inner.oldWords,
    newOriginalWords: inner.newWords,
    oldContentToOriginal: inner.oldContentToOriginal,
    newContentToOriginal: inner.newContentToOriginal,
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Reports whether content projection (structural-tag normalisation)
 * would be chosen for this pair of inputs: the pair must show
 * structural differences and then pass `shouldUseContentProjections`.
 * Public so that code composing several `analyze` calls can settle on
 * one projection decision up front.
 *
 * NOTE(review): inputs are tokenised here with an empty
 * block-expression list; confirm that is intended for callers who
 * configure block expressions on the diff itself.
 *
 * @param oldText The old text.
 * @param newText The new text.
 * @returns `true` when the projection heuristic applies to the pair.
 */
static evaluateProjectionApplicability(oldText: string, newText: string): boolean {
  const tokenise = (html: string) => WordSplitter.convertHtmlToListOfWords(html, [])
  const oldWords = tokenise(oldText)
  const newWords = tokenise(newText)

  if (HtmlDiff.hasStructuralDifferences(oldWords, newWords)) {
    const oldProj = HtmlDiff.createContentProjection(oldWords)
    const newProj = HtmlDiff.createContentProjection(newWords)
    return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
  }
  return false
}
|
|
322
|
+
|
|
323
|
+
/**
 * Three-way HTML diff against a shared genesis. Produces attributed
 * HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
 * from Me's accumulated changes (genesis → meCurrent). Use this for
 * blackline UX where the negotiation has gone through multiple turns
 * and the reader wants to see "who proposed what" across the whole
 * history, not just the most recent round.
 *
 * When both parties happen to have made the same change (e.g. CP
 * proposed a wording change in turn N, Me adopted it in turn N+1),
 * the change reads as "settled" and is emitted unmarked — only
 * disagreements and pending proposals carry author attribution.
 *
 * Projection coordination: both internal `analyze` calls use genesis
 * as their old side, so genesis must tokenise identically in each.
 * When `options.useProjections` is left undefined, the decision is the
 * conjunction of both pair-wise heuristics — project iff genesis↔cp
 * and genesis↔me would each project on their own. Pass an explicit
 * boolean to override.
 *
 * NOTE(review): an earlier revision of this comment described the
 * inputs as V1/V2/V3 and listed `data-author` / `data-rejects='cp'`
 * markers. The attributes actually emitted are chosen by
 * `segmentEmissionShape` (ThreeWayDiff) — confirm there before relying
 * on specific marker names.
 *
 * @param genesis   the shared common ancestor (per-user — the FE
 *                  picks between V1.0 and /preview/initialAnswers
 *                  based on `prefillReceiverAnswers`)
 * @param cpLatest  the counterparty's current published version
 * @param meCurrent Me's current draft (the document on screen)
 * @param options   projection mode and diff settings, forwarded to
 *                  both internal analyses and to nested cell diffs
 * @returns the merged, attributed HTML
 */
static executeThreeWay(genesis: string, cpLatest: string, meCurrent: string, options: ThreeWayOptions = {}): string {
  // Depth 0 marks the top-level call; table-cell recursion increments it.
  return HtmlDiff.executeThreeWayWithDepth(genesis, cpLatest, meCurrent, options, 0)
}
|
|
361
|
+
|
|
362
|
+
/**
 * Depth-tracking worker behind `executeThreeWay`.
 *
 * Steps: (1) unless the depth cap has been reached, swap tables for
 * placeholders via `preprocessTablesThreeWay`, diffing cells by
 * recursing into this method with `depth + 1`; (2) settle a single
 * `useProjections` value; (3) analyse genesis↔cp and genesis↔me with
 * identical options; (4) merge the two analyses into segments, emit
 * them, and restore any table placeholders.
 *
 * @throws Error when the two analyses report different genesis word
 *   counts.
 */
private static executeThreeWayWithDepth(
  genesis: string,
  cpLatest: string,
  meCurrent: string,
  options: ThreeWayOptions,
  depth: number
): string {
  // At or past the cap, skip table handling entirely so deeply nested
  // input cannot recurse without bound; the word-level merge still runs.
  let tablePreprocess: ReturnType<typeof preprocessTablesThreeWay> | null = null
  if (depth < HtmlDiff.MaxThreeWayDepth) {
    tablePreprocess = preprocessTablesThreeWay(genesis, cpLatest, meCurrent, (g, c, m) =>
      HtmlDiff.executeThreeWayWithDepth(g, c, m, options, depth + 1)
    )
  }
  const inGenesis = tablePreprocess?.modifiedGenesis ?? genesis
  const inCp = tablePreprocess?.modifiedCp ?? cpLatest
  const inMe = tablePreprocess?.modifiedMe ?? meCurrent

  // Genesis is the old side of both analyses, so one projection decision
  // has to serve both. Without an explicit override, project only when
  // each pair would project on its own.
  const projectsAgainstGenesis = (other: string): boolean =>
    HtmlDiff.evaluateProjectionApplicability(inGenesis, other)
  const useProjections =
    options.useProjections ?? (projectsAgainstGenesis(inCp) && projectsAgainstGenesis(inMe))

  const { blockExpressions, repeatingWordsAccuracy, orphanMatchThreshold, ignoreWhitespaceDifferences } = options
  const analyzeOpts: AnalyzeOptions = {
    useProjections,
    blockExpressions,
    repeatingWordsAccuracy,
    orphanMatchThreshold,
    ignoreWhitespaceDifferences,
  }
  const dCp = HtmlDiff.analyze(inGenesis, inCp, analyzeOpts)
  const dMe = HtmlDiff.analyze(inGenesis, inMe, analyzeOpts)

  // Guard the shared-genesis assumption: a word-count mismatch means the
  // two analyses saw genesis differently, so stop rather than merge.
  const cpGenesisLen = dCp.oldDiffWords.length
  const meGenesisLen = dMe.oldDiffWords.length
  if (cpGenesisLen !== meGenesisLen) {
    throw new Error(
      'HtmlDiff.executeThreeWay: genesis tokenisation diverged across pair-wise analyses ' +
        `(${cpGenesisLen} vs ${meGenesisLen}). ` +
        'This indicates the symmetric-projection coordination has a bug.'
    )
  }

  const merged = HtmlDiff.emitSegments(buildSegments(dCp, dMe))
  if (!tablePreprocess) return merged
  return restoreTablePlaceholders(merged, tablePreprocess.placeholderToDiff)
}
|
|
420
|
+
|
|
421
|
+
/**
 * Renders merged segments to HTML using one throw-away `HtmlDiff`
 * instance. Equal segments are appended to the instance's `content`
 * buffer verbatim; every other segment goes through `insertTag` with
 * the tag, class and metadata chosen by `segmentEmissionShape`. Using
 * one instance for all segments means they share a single
 * `specialTagDiffStack`, so a formatting tag opened in one segment can
 * be closed by a later one.
 *
 * @throws Error when `specialTagDiffStack` is non-empty after the last
 *   segment.
 */
private static emitSegments(segments: Segment[]): string {
  const emitter = new HtmlDiff('', '')

  for (const { attr, words } of segments) {
    if (attr.kind !== 'equal') {
      const shape = segmentEmissionShape(attr)
      // `insertTag` consumes the array it is given, so hand it a copy.
      emitter.insertTag(shape.tag, shape.baseClass, words.slice(), shape.metadata)
    } else {
      emitter.content.push(words.join(''))
    }
  }

  // Anything still on the stack is a formatting tag that was opened but
  // never closed during emission; the output would be malformed, so
  // surface it as an error instead of returning it.
  const unclosed = emitter.specialTagDiffStack.length
  if (unclosed > 0) {
    throw new Error(
      `HtmlDiff.executeThreeWay: emission left ${unclosed} ` +
        'unclosed formatting tag(s) on the stack — input may have unbalanced ' +
        '<strong>/<em>/etc. or there is a bug in segment emission.'
    )
  }

  return emitter.content.join('')
}
|
|
454
|
+
|
|
455
|
+
/**
 * Entry point for the recursive table-cell diff. Builds a fresh
 * `HtmlDiff` for the cell pair, copies the depth and every setting
 * carried by `ctx` onto it, and returns its `build()` output. Keeping
 * this separate leaves the public constructor free of internal
 * parameters.
 *
 * @param oldText Old cell content.
 * @param newText New cell content.
 * @param ctx Recursion depth and settings inherited from the caller.
 * @returns The diffed cell HTML.
 */
private static executeWithContext(oldText: string, newText: string, ctx: RecursionContext): string {
  const { depth, blockExpressions, repeatingWordsAccuracy, orphanMatchThreshold, ignoreWhitespaceDifferences } = ctx

  const inner = new HtmlDiff(oldText, newText)
  inner.tablePreprocessDepth = depth
  blockExpressions.forEach(expr => {
    inner.addBlockExpression(expr)
  })
  inner.repeatingWordsAccuracy = repeatingWordsAccuracy
  inner.orphanMatchThreshold = orphanMatchThreshold
  inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences

  return inner.build()
}
|
|
163
471
|
|
|
164
472
|
/**
|
|
@@ -174,26 +482,26 @@ export default class HtmlDiff {
|
|
|
174
482
|
// Table preprocessing: when both sides have matching `<table>` structures,
|
|
175
483
|
// diff cells positionally so cross-cell content shifts produce one
|
|
176
484
|
// independent del/ins per cell rather than cell-misaligned output.
|
|
177
|
-
// Recursion guarded by MaxTablePreprocessDepth
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
:
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
485
|
+
// Recursion is guarded by MaxTablePreprocessDepth — check the cap
|
|
486
|
+
// first so we don't construct a context that will never be used.
|
|
487
|
+
let tablePreprocess: ReturnType<typeof preprocessTables> = null
|
|
488
|
+
if (this.tablePreprocessDepth < HtmlDiff.MaxTablePreprocessDepth) {
|
|
489
|
+
// Caller-configured settings (block expressions, accuracy
|
|
490
|
+
// thresholds) flow to the recursive cell diff via `RecursionContext`
|
|
491
|
+
// so cell-level output is consistent with the top-level
|
|
492
|
+
// configuration. The context is built once here and reused for
|
|
493
|
+
// every cell-diff callback invocation.
|
|
494
|
+
const ctx: RecursionContext = {
|
|
495
|
+
depth: this.tablePreprocessDepth + 1,
|
|
496
|
+
blockExpressions: this.blockExpressions,
|
|
497
|
+
repeatingWordsAccuracy: this.repeatingWordsAccuracy,
|
|
498
|
+
orphanMatchThreshold: this.orphanMatchThreshold,
|
|
499
|
+
ignoreWhitespaceDifferences: this.ignoreWhitespaceDifferences,
|
|
500
|
+
}
|
|
501
|
+
tablePreprocess = preprocessTables(this.oldText, this.newText, (oldCell, newCell) =>
|
|
502
|
+
HtmlDiff.executeWithContext(oldCell, newCell, ctx)
|
|
503
|
+
)
|
|
504
|
+
}
|
|
197
505
|
if (tablePreprocess) {
|
|
198
506
|
this.oldText = tablePreprocess.modifiedOld
|
|
199
507
|
this.newText = tablePreprocess.modifiedNew
|
|
@@ -491,7 +799,7 @@ export default class HtmlDiff {
|
|
|
491
799
|
* @param words
|
|
492
800
|
* @private
|
|
493
801
|
*/
|
|
494
|
-
private insertTag(tag: string, cssClass: string, words: string[]) {
|
|
802
|
+
private insertTag(tag: string, cssClass: string, words: string[], metadata?: WrapMetadata) {
|
|
495
803
|
while (true) {
|
|
496
804
|
if (words.length === 0) {
|
|
497
805
|
break
|
|
@@ -499,7 +807,7 @@ export default class HtmlDiff {
|
|
|
499
807
|
|
|
500
808
|
const allWordsUntilFirstTag = this.extractConsecutiveWords(words, x => !Utils.isTag(x))
|
|
501
809
|
if (allWordsUntilFirstTag.length > 0) {
|
|
502
|
-
const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass)
|
|
810
|
+
const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass, metadata)
|
|
503
811
|
this.content.push(text)
|
|
504
812
|
}
|
|
505
813
|
|
|
@@ -533,7 +841,9 @@ export default class HtmlDiff {
|
|
|
533
841
|
const styledTagNames = Array.from(tagNames).join(' ')
|
|
534
842
|
|
|
535
843
|
this.specialTagDiffStack.push(words[0])
|
|
536
|
-
|
|
844
|
+
// Carry the caller's metadata into the formatting-tag wrapper so
|
|
845
|
+
// a 3-way author tag survives a `<strong>`/`<em>` content edit.
|
|
846
|
+
specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
|
|
537
847
|
if (tag === HtmlDiff.DelTag) {
|
|
538
848
|
words.shift()
|
|
539
849
|
|
|
@@ -608,7 +918,7 @@ export default class HtmlDiff {
|
|
|
608
918
|
if (words.length === 0) continue
|
|
609
919
|
|
|
610
920
|
// if there are still words left, they must start with a nonTag and need to be handled in the next iteration.
|
|
611
|
-
this.insertTag(tag, cssClass, words)
|
|
921
|
+
this.insertTag(tag, cssClass, words, metadata)
|
|
612
922
|
break
|
|
613
923
|
}
|
|
614
924
|
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Low-level HTML tag-parsing primitives shared by the table-aware
|
|
3
|
+
* preprocessing and (potentially) other consumers. These are deliberately
|
|
4
|
+
* generic over the document type — no table-specific assumptions live
|
|
5
|
+
* here.
|
|
6
|
+
*
|
|
7
|
+
* The goal is to walk HTML at the tag boundary level *without* parsing
|
|
8
|
+
* into a full DOM, so we stay fast and never round-trip through
|
|
9
|
+
* htmlparser2/DOMPurify in the diff hot path.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
export interface OpeningTag {
|
|
13
|
+
/** Index just past the closing `>` of the opening tag. */
|
|
14
|
+
end: number
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export interface ClassAttributeLocation {
|
|
18
|
+
/** Index of the value's first character (inside the surrounding quotes). */
|
|
19
|
+
valueStart: number
|
|
20
|
+
/** Index just past the last character of the value (still inside quotes). */
|
|
21
|
+
valueEnd: number
|
|
22
|
+
/** The class attribute's value, with surrounding quotes stripped. */
|
|
23
|
+
value: string
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
 * Parses the markup construct that starts at `i` and returns the index
 * just past its closing delimiter, or null when it is unterminated.
 *
 * Comments (`<!-- … -->`), CDATA sections (`<![CDATA[ … ]]>`) and
 * processing instructions (`<? … ?>`) are matched against their own
 * terminators — a plain `>`-walker would cut a comment like
 * `<!-- a > b -->` at the first inner `>` and corrupt downstream
 * offsets. Everything else (ordinary tags, closing tags, DOCTYPE) goes
 * through the generic walker, which stops at the first `>` that is not
 * inside a single- or double-quoted run.
 *
 * @param html The full HTML string.
 * @param i Index of the construct's leading `<`.
 * @returns `{ end }` with `end` just past the terminator, or null.
 */
export function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  if (html.startsWith('<!--', i)) {
    // Start the search at i + 2 rather than i + 4 so the terminator may
    // overlap the opener's dashes: `<!-->` and `<!--->` are complete
    // (abruptly closed) empty comments per the HTML tokenizer. Searching
    // from i + 4 would skip their `>` and run on to some later `-->`,
    // swallowing real markup. Normal comments are unaffected.
    const close = html.indexOf('-->', i + 2)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<![CDATA[', i)) {
    const close = html.indexOf(']]>', i + 9)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<?', i)) {
    const close = html.indexOf('?>', i + 2)
    return close === -1 ? null : { end: close + 2 }
  }
  // Walk to the next unquoted '>'. Handles attributes whose values contain
  // a literal '>' inside quotes, which a plain indexOf would mishandle.
  let j = i + 1
  let quote: string | null = null
  while (j < html.length) {
    const ch = html[j]
    if (quote) {
      if (ch === quote) quote = null
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '>') {
      return { end: j + 1 }
    }
    j++
  }
  // Ran off the end (possibly inside an unterminated quote): malformed.
  return null
}
|
|
66
|
+
|
|
67
|
+
/**
 * Tests whether an opening tag named `tagName` starts at index `i`.
 * The name comparison is case-insensitive on both sides, and the name
 * must be followed by a character that ends a tag name — `>`, `/`, or
 * HTML whitespace (space, tab, LF, CR, form feed) — so `td` does not
 * match `<tdx>`. A tag name that runs up to the end of the string does
 * not match.
 *
 * @param html The HTML string to probe.
 * @param i Index expected to hold the tag's `<`.
 * @param tagName Tag name to look for, in any letter case.
 * @returns `true` when `<tagName` followed by a terminator starts at `i`.
 */
export function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false
  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return (
    after === '>' ||
    after === '/' ||
    after === ' ' ||
    after === '\t' ||
    after === '\n' ||
    after === '\r' ||
    after === '\f'
  )
}
|
|
74
|
+
|
|
75
|
+
/**
 * Tests whether a closing tag `</tagName` starts at index `i`. The
 * name comparison is case-insensitive on both sides, and the name must
 * be followed by `>` or HTML whitespace (space, tab, LF, CR, form
 * feed), so `td` does not match `</tdx>`. A name that runs up to the
 * end of the string does not match.
 *
 * @param html The HTML string to probe.
 * @param i Index expected to hold the tag's `<`.
 * @param tagName Tag name to look for, in any letter case.
 * @returns `true` when `</tagName` followed by a terminator starts at `i`.
 */
export function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<' || html[i + 1] !== '/') return false
  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return (
    after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r' || after === '\f'
  )
}
|
|
82
|
+
|
|
83
|
+
/**
 * Returns the index just past the `</tagName>` that closes the element
 * whose content starts at `from`, accounting for nested elements of
 * the same name. Returns -1 if the scan reaches `limit` first.
 *
 * Comments and CDATA sections are stepped over as a unit, so tag-like
 * text inside them (e.g. `<!-- </table> -->`) does not disturb the
 * nesting count. An unterminated comment/CDATA opener is treated as
 * ordinary text.
 *
 * @param html The HTML string.
 * @param from Index at which to start scanning (just past the opening tag).
 * @param tagName Name of the element being closed.
 * @param limit Index at which to stop scanning; defaults to `html.length`.
 * @returns Index just past the matching closing tag, or -1.
 */
export function findMatchingClosingTag(
  html: string,
  from: number,
  tagName: string,
  limit: number = html.length
): number {
  let depth = 1
  let i = from
  while (i < limit) {
    if (html.startsWith('<!--', i) || html.startsWith('<![CDATA[', i)) {
      // Skip the whole comment / CDATA section; fall back to a one-char
      // advance when it never terminates.
      const skipped = parseOpeningTagAt(html, i)
      i = skipped ? skipped.end : i + 1
      continue
    }
    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        i++
        continue
      }
      // A self-closing `<tag … />` opens nothing, so depth is unchanged.
      const tagText = html.slice(i, opening.end)
      if (!tagText.endsWith('/>')) depth++
      i = opening.end
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--
      // Closing tags may carry trailing whitespace; reuse the tag walker
      // to find the real `>`, falling back to the canonical length.
      const closing = parseOpeningTagAt(html, i)
      const closingEnd = closing?.end ?? i + `</${tagName}>`.length
      if (depth === 0) return closingEnd
      i = closingEnd
    } else {
      i++
    }
  }
  return -1
}
|
|
117
|
+
|
|
118
|
+
/**
 * Returns the opening tag with the given class token(s) injected.
 *
 * The real `class` attribute is located with `findClassAttribute`
 * (attribute-aware walking, NOT a flat regex — that would mis-match
 * inside a foreign attribute value like `title="see class='x'"`).
 * When a class attribute exists, only tokens it does not already
 * contain are appended, each at most once, so `class="mod"` plus
 * `mod colspan` becomes `class="mod colspan"`, never
 * `class="mod mod colspan"`. If the existing value was unquoted
 * (`class=foo`), the rewritten value is wrapped in quotes — otherwise
 * the appended tokens would parse as separate attributes.
 *
 * When no class attribute exists, ` class='<cls>'` is inserted before
 * the tag's closing `>` (or `/>`).
 *
 * @param openingTag Text of a single opening tag, from `<` through `>`.
 * @param cls One or more whitespace-separated class tokens.
 * @returns The tag text with the class tokens present.
 */
export function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    // De-duplicate the requested tokens too, so `cls = 'mod mod'` adds one.
    const missing = Array.from(new Set(clsTokens)).filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag
    const updatedValue = [...existingTokens, ...missing].join(' ')

    // A quoted value is immediately preceded by its opening quote. An
    // unquoted one is preceded by `=` or whitespace and must gain quotes
    // now that the value may contain a space.
    const opener = openingTag[classAttr.valueStart - 1]
    const wasQuoted = opener === '"' || opener === "'"
    const quote = wasQuoted ? '' : updatedValue.includes("'") ? '"' : "'"
    return (
      openingTag.slice(0, classAttr.valueStart) + quote + updatedValue + quote + openingTag.slice(classAttr.valueEnd)
    )
  }

  // No class attribute yet: insert one just before `>` or `/>`, dropping
  // any whitespace that preceded the closer.
  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
|
|
145
|
+
|
|
146
|
+
/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute. Returns the value range (start/end of
 * the value content, *excluding* any surrounding quotes) and the value,
 * or null if no `class` attribute with a value is present. The first
 * matching attribute wins; the name comparison is case-insensitive.
 *
 * A `/` only ends the walk when it closes the tag (`/>` or a trailing
 * `/`). Elsewhere it is either part of an unquoted value
 * (`href=/path`, `class=w-1/2`) or a stray character that is skipped,
 * so attributes that follow it are still examined.
 *
 * @param openingTag Text of a single opening tag, starting at `<`.
 * @returns Location and text of the class value, or null.
 */
export function findClassAttribute(openingTag: string): ClassAttributeLocation | null {
  const len = openingTag.length
  const isSpace = (ch: string): boolean => /\s/.test(ch)
  // True at `>` and at a `/` that is the tag's self-closing solidus.
  const isTagCloseAt = (at: number): boolean =>
    openingTag[at] === '>' || (openingTag[at] === '/' && (at + 1 >= len || openingTag[at + 1] === '>'))

  // Skip past the tag name: the tag starts with `<`, and the first run of
  // [A-Za-z0-9_:-] is the name. Anything after is attribute territory.
  let i = 1
  while (i < len && /[A-Za-z0-9_:-]/.test(openingTag[i])) i++

  while (i < len) {
    while (i < len && isSpace(openingTag[i])) i++
    if (i >= len || isTagCloseAt(i)) break
    if (openingTag[i] === '/') {
      // Stray solidus between attributes — ignore it and keep walking.
      i++
      continue
    }

    // Attribute name runs up to whitespace, `=`, `>` or `/`.
    const nameStart = i
    while (i < len && !/[\s=>/]/.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value.
    while (i < len && isSpace(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value) — cannot be a usable class attribute.
      continue
    }
    i++ // past '='
    while (i < len && isSpace(openingTag[i])) i++

    let valueStart: number
    let valueEnd: number
    const quote = openingTag[i]
    if (quote === '"' || quote === "'") {
      i++
      valueStart = i
      while (i < len && openingTag[i] !== quote) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      // Unquoted value: ends at whitespace or where the tag closes.
      valueStart = i
      while (i < len && !isSpace(openingTag[i]) && !isTagCloseAt(i)) i++
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
|