@createiq/htmldiff 1.1.0 → 1.2.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/HtmlDiff.ts CHANGED
@@ -3,9 +3,74 @@ import Match from './Match'
3
3
  import MatchFinder from './MatchFinder'
4
4
  import Operation from './Operation'
5
5
  import { preprocessTables, restoreTablePlaceholders } from './TableDiff'
6
- import Utils from './Utils'
6
+ import { buildSegments, type Segment, segmentEmissionShape } from './ThreeWayDiff'
7
+ import { preprocessTablesThreeWay } from './ThreeWayTable'
8
+ import Utils, { type WrapMetadata } from './Utils'
7
9
  import WordSplitter from './WordSplitter'
8
10
 
11
+ /**
12
+ * State threaded into the recursive cell-level diff inside
13
+ * `preprocessTables`. Bundles the nesting depth (for the recursion-cap
14
+ * guard) with the caller-configurable settings that must propagate
15
+ * unchanged to inner instances so cell-level output stays consistent
16
+ * with the top-level call. Internal — never crosses the public API.
17
+ */
18
+ interface RecursionContext {
19
+ depth: number
20
+ blockExpressions: readonly RegExp[]
21
+ repeatingWordsAccuracy: number
22
+ orphanMatchThreshold: number
23
+ ignoreWhitespaceDifferences: boolean
24
+ }
25
+
26
+ /**
27
+ * Options for the `HtmlDiff.analyze` static helper.
28
+ *
29
+ * `useProjections` controls structural-tag normalisation:
30
+ * - `undefined` → use the same heuristic as `build()` (per-call decision)
31
+ * - `true` → force projection on (skipped if either side has no content)
32
+ * - `false` → force projection off (diff runs on raw word arrays)
33
+ * Composers of multiple analyses (e.g. three-way diff) MUST pass the
34
+ * same explicit boolean to all calls so shared inputs tokenise
35
+ * identically across analyses.
36
+ *
37
+ * The remaining options mirror the per-instance fields on `HtmlDiff`
38
+ * itself — they exist on the options bag because `analyze` constructs
39
+ * the inner instance internally.
40
+ */
41
+ export interface AnalyzeOptions {
42
+ useProjections?: boolean
43
+ blockExpressions?: readonly RegExp[]
44
+ repeatingWordsAccuracy?: number
45
+ orphanMatchThreshold?: number
46
+ ignoreWhitespaceDifferences?: boolean
47
+ }
48
+
49
+ export interface AnalyzeResult {
50
+ /** Word array the `operations` index into (projected or raw). */
51
+ readonly oldDiffWords: readonly string[]
52
+ readonly newDiffWords: readonly string[]
53
+ readonly operations: readonly Operation[]
54
+ /** Original WordSplitter output, before any projection. */
55
+ readonly oldOriginalWords: readonly string[]
56
+ readonly newOriginalWords: readonly string[]
57
+ /** Diff-index → original-word-index map; null when projections inactive. */
58
+ readonly oldContentToOriginal: readonly number[] | null
59
+ readonly newContentToOriginal: readonly number[] | null
60
+ }
61
+
62
+ /**
63
+ * Options for `HtmlDiff.executeThreeWay`. Same shape as `AnalyzeOptions`
64
+ * — the values flow unchanged into both internal `analyze` calls so the
65
+ * shared genesis tokenises identically in each. Aliased so future divergence in either
66
+ * direction lives in one place.
67
+ *
68
+ * `useProjections`: when undefined, `executeThreeWay` computes the
69
+ * decision as the conjunction of both pair-wise
70
+ * `evaluateProjectionApplicability` results.
71
+ */
72
+ export type ThreeWayOptions = AnalyzeOptions
73
+
9
74
  export default class HtmlDiff {
10
75
  /**
11
76
  * This value defines balance between speed and memory utilization. The higher it is the faster it works and more memory consumes.
@@ -76,10 +141,26 @@ export default class HtmlDiff {
76
141
  */
77
142
  private static MaxTablePreprocessDepth = 8
78
143
 
144
+ /**
145
+ * Mirror cap for the three-way path. The 2-way `MaxTablePreprocessDepth`
146
+ * guards the recursion inside `executeWithContext`; the 3-way path has
147
+ * its own recursion (`executeThreeWay` → `preprocessTablesThreeWay` →
148
+ * `cellDiff` → `executeThreeWay`) which needs its own guard. Once the
149
+ * cap is reached, `executeThreeWay` skips table preprocessing and
150
+ * falls back to the word-level merge — same bail-out semantics as the
151
+ * 2-way path.
152
+ */
153
+ private static MaxThreeWayDepth = 8
154
+
79
155
  private content: string[] = []
80
156
  private newText: string
81
157
  private oldText: string
82
- private readonly tablePreprocessDepth: number
158
+ // Written only on freshly constructed inner instances: by
159
+ // `executeWithContext` for a recursive cell-diff, and by `analyze` (set to
160
+ // the cap so table preprocessing is bypassed). Top-level instances stay at 0.
161
+ // Treated as effectively-readonly elsewhere — we dropped the modifier only so
162
+ // these helpers can populate it without a constructor parameter that would re-leak what we just hid.
163
+ private tablePreprocessDepth = 0
83
164
 
84
165
  private specialTagDiffStack: string[] = []
85
166
  private newWords: string[] = []
@@ -147,18 +228,245 @@ export default class HtmlDiff {
147
228
  * Initializes a new instance of the class.
148
229
  * @param oldText The old text.
149
230
  * @param newText The new text.
150
- * @param tablePreprocessDepth Internal: nested-call depth for table
151
- * preprocessing. Callers should leave at default (0); the recursive
152
- * `diffCell` callback in TableDiff bumps it.
153
231
  */
154
- constructor(oldText: string, newText: string, tablePreprocessDepth = 0) {
232
+ constructor(oldText: string, newText: string) {
155
233
  this.oldText = oldText
156
234
  this.newText = newText
157
- this.tablePreprocessDepth = tablePreprocessDepth
158
235
  }
159
236
 
160
- static execute(oldText: string, newText: string, tablePreprocessDepth = 0) {
161
- return new HtmlDiff(oldText, newText, tablePreprocessDepth).build()
237
+ static execute(oldText: string, newText: string): string {
238
+ return new HtmlDiff(oldText, newText).build()
239
+ }
240
+
241
+ /**
242
+ * Analyse a two-way diff and return its raw building blocks: the word
243
+ * arrays the diff ran against, the operations produced, the original
244
+ * (pre-projection) word arrays, and the mappings from diff-index back
245
+ * to original-word index when structural projection is active.
246
+ * Consumed by `executeThreeWay` so it can compose two diffs by walking
247
+ * their Operation streams.
248
+ *
249
+ * The caller is expected to coordinate `useProjections` symmetrically
250
+ * across composed analyses — if V1↔V2 projects but V2↔V3 doesn't,
251
+ * V2's "new" array in the first analysis won't equal V2's "old" array
252
+ * in the second. `evaluateProjectionApplicability` exposes the same
253
+ * heuristic `build()` uses internally, so the orchestrator can compute
254
+ * a single decision and pass it into every `analyze` call.
255
+ *
256
+ * Table preprocessing is skipped here. Placeholders mutate the input
257
+ * in ways that don't compose across two independent analyses; the
258
+ * 3-way orchestrator handles tables explicitly before calling analyze.
259
+ */
260
+ static analyze(oldText: string, newText: string, options: AnalyzeOptions = {}): AnalyzeResult {
261
+ const inner = new HtmlDiff(oldText, newText)
262
+ // Bypass table preprocessing — the caller handles tables.
263
+ inner.tablePreprocessDepth = HtmlDiff.MaxTablePreprocessDepth
264
+ if (options.blockExpressions) {
265
+ for (const expr of options.blockExpressions) inner.addBlockExpression(expr)
266
+ }
267
+ if (options.repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy
268
+ if (options.orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = options.orphanMatchThreshold
269
+ if (options.ignoreWhitespaceDifferences !== undefined) {
270
+ inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
271
+ }
272
+ inner.splitInputsToWords()
273
+ if (options.useProjections === undefined) {
274
+ // Mirror build()'s heuristic — same behaviour as a standalone 2-way diff.
275
+ inner.buildContentProjections()
276
+ } else if (options.useProjections) {
277
+ // Caller forced projections on. Still skip if either side has no
278
+ // structural content, since projecting an empty side produces an
279
+ // empty diff space and the merge degrades.
280
+ const oldProj = HtmlDiff.createContentProjection(inner.oldWords)
281
+ const newProj = HtmlDiff.createContentProjection(inner.newWords)
282
+ if (oldProj.contentWords.length > 0 && newProj.contentWords.length > 0) {
283
+ inner.oldContentWords = oldProj.contentWords
284
+ inner.oldContentToOriginal = oldProj.contentToOriginal
285
+ inner.newContentWords = newProj.contentWords
286
+ inner.newContentToOriginal = newProj.contentToOriginal
287
+ }
288
+ }
289
+ // useProjections === false: leave projections unset, diff runs on raw words.
290
+ const wordsForDiffOld = inner.oldContentWords ?? inner.oldWords
291
+ const wordsForDiffNew = inner.newContentWords ?? inner.newWords
292
+ inner.matchGranularity = Math.min(
293
+ HtmlDiff.MatchGranularityMaximum,
294
+ Math.min(wordsForDiffOld.length, wordsForDiffNew.length)
295
+ )
296
+ return {
297
+ oldDiffWords: wordsForDiffOld,
298
+ newDiffWords: wordsForDiffNew,
299
+ operations: inner.operations(),
300
+ oldOriginalWords: inner.oldWords,
301
+ newOriginalWords: inner.newWords,
302
+ oldContentToOriginal: inner.oldContentToOriginal,
303
+ newContentToOriginal: inner.newContentToOriginal,
304
+ }
305
+ }
306
+
307
+ /**
308
+ * Whether content-projection (structural-tag normalisation) would
309
+ * apply to this pair of inputs under `build()`'s default heuristic.
310
+ * Exposed so composers of multiple analyses can compute a symmetric
311
+ * decision before calling `analyze` — see `analyze`'s docstring for
312
+ * why symmetry matters.
313
+ */
314
+ static evaluateProjectionApplicability(oldText: string, newText: string): boolean {
315
+ const oldWords = WordSplitter.convertHtmlToListOfWords(oldText, [])
316
+ const newWords = WordSplitter.convertHtmlToListOfWords(newText, [])
317
+ if (!HtmlDiff.hasStructuralDifferences(oldWords, newWords)) return false
318
+ const oldProj = HtmlDiff.createContentProjection(oldWords)
319
+ const newProj = HtmlDiff.createContentProjection(newWords)
320
+ return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
321
+ }
322
+
323
+ /**
324
+ * Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
325
+ * version CP sent back), and V3 (Me's current draft), produces a
326
+ * single attributed HTML output where CP's and Me's changes are
327
+ * distinguished by `data-author` ('cp' or 'me') and matching
328
+ * `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
329
+ * CP's proposal" case (Me deleted text CP had inserted) gets a
330
+ * dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
331
+ *
332
+ * Coordinates the symmetric-projection decision (D1) across both
333
+ * internal `analyze` calls so V2 tokenises identically on each side
334
+ * of the spine. When `useProjections` is left undefined, the decision
335
+ * is the conjunction of both pair-wise heuristics — project iff both
336
+ * pairs would project on their own. Pass an explicit boolean to
337
+ * override.
338
+ */
339
+ /**
340
+ * Three-way HTML diff against a shared genesis. Produces attributed
341
+ * HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
342
+ * from Me's accumulated changes (genesis → meCurrent). Use this for
343
+ * blackline UX where the negotiation has gone through multiple turns
344
+ * and the reader wants to see "who proposed what" across the whole
345
+ * history, not just the most recent round.
346
+ *
347
+ * When both parties happen to have made the same change (e.g. CP
348
+ * proposed a wording change in turn N, Me adopted it in turn N+1),
349
+ * the change reads as "settled" and is emitted unmarked — only
350
+ * disagreements and pending proposals carry author attribution.
351
+ *
352
+ * @param genesis the shared common ancestor (per-user — the FE
353
+ * picks between V1.0 and /preview/initialAnswers
354
+ * based on `prefillReceiverAnswers`)
355
+ * @param cpLatest the counterparty's current published version
356
+ * @param meCurrent Me's current draft (the document on screen)
357
+ */
358
+ static executeThreeWay(genesis: string, cpLatest: string, meCurrent: string, options: ThreeWayOptions = {}): string {
359
+ return HtmlDiff.executeThreeWayWithDepth(genesis, cpLatest, meCurrent, options, 0)
360
+ }
361
+
362
+ private static executeThreeWayWithDepth(
363
+ genesis: string,
364
+ cpLatest: string,
365
+ meCurrent: string,
366
+ options: ThreeWayOptions,
367
+ depth: number
368
+ ): string {
369
+ // Table preprocessing first — replaces each genesis/cp/me table with a
370
+ // shared-nonce placeholder, then the word-level merge runs over the
371
+ // table-free inputs. Cells are diffed recursively via executeThreeWay
372
+ // so the cell content is itself three-way attributed.
373
+ //
374
+ // Depth-cap the recursion so adversarially-nested input can't blow
375
+ // stack/memory.
376
+ const tablePreprocess =
377
+ depth < HtmlDiff.MaxThreeWayDepth
378
+ ? preprocessTablesThreeWay(genesis, cpLatest, meCurrent, (g, c, m) =>
379
+ HtmlDiff.executeThreeWayWithDepth(g, c, m, options, depth + 1)
380
+ )
381
+ : null
382
+ const inGenesis = tablePreprocess?.modifiedGenesis ?? genesis
383
+ const inCp = tablePreprocess?.modifiedCp ?? cpLatest
384
+ const inMe = tablePreprocess?.modifiedMe ?? meCurrent
385
+
386
+ // Symmetric projection across both analyses. The genesis-spine
387
+ // algorithm requires `genesis` to tokenise identically on each
388
+ // pair-wise analysis (both have genesis as the OLD side), so the
389
+ // useProjections decision must agree across both calls.
390
+ const useProjections =
391
+ options.useProjections ??
392
+ (HtmlDiff.evaluateProjectionApplicability(inGenesis, inCp) &&
393
+ HtmlDiff.evaluateProjectionApplicability(inGenesis, inMe))
394
+
395
+ const analyzeOpts: AnalyzeOptions = {
396
+ useProjections,
397
+ blockExpressions: options.blockExpressions,
398
+ repeatingWordsAccuracy: options.repeatingWordsAccuracy,
399
+ orphanMatchThreshold: options.orphanMatchThreshold,
400
+ ignoreWhitespaceDifferences: options.ignoreWhitespaceDifferences,
401
+ }
402
+ const dCp = HtmlDiff.analyze(inGenesis, inCp, analyzeOpts)
403
+ const dMe = HtmlDiff.analyze(inGenesis, inMe, analyzeOpts)
404
+
405
+ // Spine sanity check — both analyses must share an identical genesis
406
+ // tokenisation. Symmetric useProjections guarantees this; if it ever
407
+ // diverges, fail loudly rather than silently misattribute.
408
+ if (dCp.oldDiffWords.length !== dMe.oldDiffWords.length) {
409
+ throw new Error(
410
+ 'HtmlDiff.executeThreeWay: genesis tokenisation diverged across pair-wise analyses ' +
411
+ `(${dCp.oldDiffWords.length} vs ${dMe.oldDiffWords.length}). ` +
412
+ 'This indicates the symmetric-projection coordination has a bug.'
413
+ )
414
+ }
415
+
416
+ const segments = buildSegments(dCp, dMe)
417
+ const merged = HtmlDiff.emitSegments(segments)
418
+ return tablePreprocess ? restoreTablePlaceholders(merged, tablePreprocess.placeholderToDiff) : merged
419
+ }
420
+
421
+ /**
422
+ * Drives a fresh `HtmlDiff` instance through `insertTag` for ins/del
423
+ * segments and pushes equal segments straight to its `content`
424
+ * buffer. Reusing the instance keeps the formatting-tag stack
425
+ * (`specialTagDiffStack`) coherent across segments — a `<strong>`
426
+ * opened in one segment and closed in another stays balanced.
427
+ */
428
+ private static emitSegments(segments: Segment[]): string {
429
+ const emitter = new HtmlDiff('', '')
430
+ for (const seg of segments) {
431
+ if (seg.attr.kind === 'equal') {
432
+ emitter.content.push(seg.words.join(''))
433
+ continue
434
+ }
435
+ const { tag, baseClass, metadata } = segmentEmissionShape(seg.attr)
436
+ // insertTag mutates its `words` array; pass a copy.
437
+ emitter.insertTag(tag, baseClass, [...seg.words], metadata)
438
+ }
439
+ // Stack-balance invariant: every special-case opening tag pushed onto
440
+ // `specialTagDiffStack` during emission must have been matched by a
441
+ // closing tag. An unbalanced stack means the input had unbalanced
442
+ // formatting tags AND a Replace at an inconvenient position — the
443
+ // output would be silently malformed (half-closed `<ins>`). Fail
444
+ // loudly so the caller can investigate rather than ship broken HTML.
445
+ if (emitter.specialTagDiffStack.length > 0) {
446
+ throw new Error(
447
+ `HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} ` +
448
+ 'unclosed formatting tag(s) on the stack — input may have unbalanced ' +
449
+ '<strong>/<em>/etc. or there is a bug in segment emission.'
450
+ )
451
+ }
452
+ return emitter.content.join('')
453
+ }
454
+
455
+ /**
456
+ * Internal entry point used by the table-cell recursion. Constructs an
457
+ * inner `HtmlDiff`, applies the caller's settings, and bumps the
458
+ * recursion depth — keeping the public constructor signature clean
459
+ * while still threading the configuration that's required for cell-
460
+ * level output to match the top-level call's behaviour.
461
+ */
462
+ private static executeWithContext(oldText: string, newText: string, ctx: RecursionContext): string {
463
+ const inner = new HtmlDiff(oldText, newText)
464
+ inner.tablePreprocessDepth = ctx.depth
465
+ for (const expr of ctx.blockExpressions) inner.addBlockExpression(expr)
466
+ inner.repeatingWordsAccuracy = ctx.repeatingWordsAccuracy
467
+ inner.orphanMatchThreshold = ctx.orphanMatchThreshold
468
+ inner.ignoreWhitespaceDifferences = ctx.ignoreWhitespaceDifferences
469
+ return inner.build()
162
470
  }
163
471
 
164
472
  /**
@@ -174,26 +482,26 @@ export default class HtmlDiff {
174
482
  // Table preprocessing: when both sides have matching `<table>` structures,
175
483
  // diff cells positionally so cross-cell content shifts produce one
176
484
  // independent del/ins per cell rather than cell-misaligned output.
177
- // Recursion guarded by MaxTablePreprocessDepth to bound work on
178
- // deeply-nested table-in-cell-in-table inputs. Caller-configured
179
- // settings (block expressions, accuracy thresholds) are propagated to
180
- // the recursive cell diff so cell-level output is consistent with the
181
- // top-level configuration.
182
- const blockExpressions = this.blockExpressions
183
- const repeatingWordsAccuracy = this.repeatingWordsAccuracy
184
- const orphanMatchThreshold = this.orphanMatchThreshold
185
- const ignoreWhitespaceDifferences = this.ignoreWhitespaceDifferences
186
- const tablePreprocess =
187
- this.tablePreprocessDepth >= HtmlDiff.MaxTablePreprocessDepth
188
- ? null
189
- : preprocessTables(this.oldText, this.newText, (oldCell, newCell) => {
190
- const inner = new HtmlDiff(oldCell, newCell, this.tablePreprocessDepth + 1)
191
- for (const expr of blockExpressions) inner.addBlockExpression(expr)
192
- inner.repeatingWordsAccuracy = repeatingWordsAccuracy
193
- inner.orphanMatchThreshold = orphanMatchThreshold
194
- inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences
195
- return inner.build()
196
- })
485
+ // Recursion is guarded by MaxTablePreprocessDepth — check the cap
486
+ // first so we don't construct a context that will never be used.
487
+ let tablePreprocess: ReturnType<typeof preprocessTables> = null
488
+ if (this.tablePreprocessDepth < HtmlDiff.MaxTablePreprocessDepth) {
489
+ // Caller-configured settings (block expressions, accuracy
490
+ // thresholds) flow to the recursive cell diff via `RecursionContext`
491
+ // so cell-level output is consistent with the top-level
492
+ // configuration. The context is built once here and reused for
493
+ // every cell-diff callback invocation.
494
+ const ctx: RecursionContext = {
495
+ depth: this.tablePreprocessDepth + 1,
496
+ blockExpressions: this.blockExpressions,
497
+ repeatingWordsAccuracy: this.repeatingWordsAccuracy,
498
+ orphanMatchThreshold: this.orphanMatchThreshold,
499
+ ignoreWhitespaceDifferences: this.ignoreWhitespaceDifferences,
500
+ }
501
+ tablePreprocess = preprocessTables(this.oldText, this.newText, (oldCell, newCell) =>
502
+ HtmlDiff.executeWithContext(oldCell, newCell, ctx)
503
+ )
504
+ }
197
505
  if (tablePreprocess) {
198
506
  this.oldText = tablePreprocess.modifiedOld
199
507
  this.newText = tablePreprocess.modifiedNew
@@ -491,7 +799,7 @@ export default class HtmlDiff {
491
799
  * @param words
492
800
  * @private
493
801
  */
494
- private insertTag(tag: string, cssClass: string, words: string[]) {
802
+ private insertTag(tag: string, cssClass: string, words: string[], metadata?: WrapMetadata) {
495
803
  while (true) {
496
804
  if (words.length === 0) {
497
805
  break
@@ -499,7 +807,7 @@ export default class HtmlDiff {
499
807
 
500
808
  const allWordsUntilFirstTag = this.extractConsecutiveWords(words, x => !Utils.isTag(x))
501
809
  if (allWordsUntilFirstTag.length > 0) {
502
- const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass)
810
+ const text = Utils.wrapText(allWordsUntilFirstTag.join(''), tag, cssClass, metadata)
503
811
  this.content.push(text)
504
812
  }
505
813
 
@@ -533,7 +841,9 @@ export default class HtmlDiff {
533
841
  const styledTagNames = Array.from(tagNames).join(' ')
534
842
 
535
843
  this.specialTagDiffStack.push(words[0])
536
- specialCaseTagInjection = `<ins class='mod ${styledTagNames}'>`
844
+ // Carry the caller's metadata into the formatting-tag wrapper so
845
+ // a 3-way author tag survives a `<strong>`/`<em>` content edit.
846
+ specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
537
847
  if (tag === HtmlDiff.DelTag) {
538
848
  words.shift()
539
849
 
@@ -608,7 +918,7 @@ export default class HtmlDiff {
608
918
  if (words.length === 0) continue
609
919
 
610
920
  // if there are still words left, they must start with a nonTag and need to be handled in the next iteration.
611
- this.insertTag(tag, cssClass, words)
921
+ this.insertTag(tag, cssClass, words, metadata)
612
922
  break
613
923
  }
614
924
  }
@@ -0,0 +1,200 @@
1
+ /**
2
+ * Low-level HTML tag-parsing primitives shared by the table-aware
3
+ * preprocessing and (potentially) other consumers. These are deliberately
4
+ * generic over the document type — no table-specific assumptions live
5
+ * here.
6
+ *
7
+ * The goal is to walk HTML at the tag boundary level *without* parsing
8
+ * into a full DOM, so we stay fast and never round-trip through
9
+ * htmlparser2/DOMPurify in the diff hot path.
10
+ */
11
+
12
export interface OpeningTag {
  /** Index just past the closing `>` of the opening tag. */
  end: number
}

export interface ClassAttributeLocation {
  /** Index of the value's first character (inside the surrounding quotes). */
  valueStart: number
  /** Index just past the last character of the value (still inside quotes). */
  valueEnd: number
  /** The class attribute's value, with surrounding quotes stripped. */
  value: string
}

/**
 * Markup that is delimited by its own opener/terminator pair rather than
 * by the next unquoted `>`: comments, CDATA sections and processing
 * instructions.
 */
const SPECIAL_MARKUP: ReadonlyArray<readonly [open: string, close: string]> = [
  ['<!--', '-->'],
  ['<![CDATA[', ']]>'],
  ['<?', '?>'],
]

/** True when a comment, CDATA section or processing instruction opens at `i`. */
function startsSpecialMarkupAt(html: string, i: number): boolean {
  return SPECIAL_MARKUP.some(([open]) => html.startsWith(open, i))
}

/** Whitespace that may terminate a tag name (space, tab, LF, CR, FF). */
function isTagWhitespace(ch: string | undefined): boolean {
  return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
}

/**
 * Parses the tag (or comment/CDATA/PI) starting at `i`.
 *
 * Comments, CDATA sections and processing instructions are closed by
 * their own terminators (`-->`, `]]>`, `?>`) — a plain `>`-walker would
 * cut a comment like `<!-- a > b -->` at the first inner `>`, treating
 * the rest as text and corrupting downstream offsets. Everything else
 * (regular tags, closing tags, `<!DOCTYPE …>`) is closed by the next `>`
 * that is not inside a quoted attribute value.
 *
 * @param html the markup being scanned
 * @param i    index of the `<` that starts the tag
 * @returns the index just past the closing delimiter, or `null` when the
 *          construct is unterminated
 */
export function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  for (const [open, close] of SPECIAL_MARKUP) {
    if (html.startsWith(open, i)) {
      const closeAt = html.indexOf(close, i + open.length)
      return closeAt === -1 ? null : { end: closeAt + close.length }
    }
  }
  // Walk to the next unquoted '>'. Attribute values may legally contain a
  // literal '>' inside quotes, which a plain indexOf would mishandle.
  let quote: string | null = null
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (quote !== null) {
      if (ch === quote) quote = null
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '>') {
      return { end: j + 1 }
    }
  }
  return null
}

/**
 * Whether an opening `<tagName …>` starts at index `i`. The comparison is
 * case-insensitive on both sides, and the name must be followed by `>`,
 * `/` or whitespace so `<tablet>` is not mistaken for `<table>`.
 */
export function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false
  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return after === '>' || after === '/' || isTagWhitespace(after)
}

/**
 * Whether a closing `</tagName>` starts at index `i`. The comparison is
 * case-insensitive on both sides, and the name must be followed by `>`
 * or whitespace.
 */
export function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<' || html[i + 1] !== '/') return false
  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName.toLowerCase()) return false
  const after = html[nameEnd]
  return after === '>' || isTagWhitespace(after)
}

/**
 * Returns the index just past the `</tagName>` that closes an element
 * whose opening tag has already been consumed, accounting for nested
 * elements of the same name.
 *
 * Comments, CDATA sections and processing instructions are skipped as a
 * unit, so tag-like text inside them (`<!-- </table> -->`) does not
 * affect the nesting depth.
 *
 * @param html    the markup being scanned
 * @param from    index to start scanning at (just past the opening tag)
 * @param tagName element name to balance
 * @param limit   scanning stops at this index (defaults to the end of `html`)
 * @returns the index just past the matching closing tag, or -1 when no
 *          closing tag starting before `limit` balances the element
 */
export function findMatchingClosingTag(
  html: string,
  from: number,
  tagName: string,
  limit: number = html.length
): number {
  let depth = 1
  let i = from
  while (i < limit) {
    if (html[i] !== '<') {
      i++
      continue
    }
    if (startsSpecialMarkupAt(html, i)) {
      // Jump over the whole construct; if it is unterminated, fall back
      // to stepping one character so the scan still makes progress.
      const special = parseOpeningTagAt(html, i)
      i = special ? special.end : i + 1
      continue
    }
    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        i++
        continue
      }
      // A self-closed `<tag/>` opens nothing that needs balancing.
      if (!html.slice(i, opening.end).endsWith('/>')) depth++
      i = opening.end
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--
      const closing = parseOpeningTagAt(html, i)
      // Unterminated closing tag: assume the canonical `</tagName>` width.
      const closingEnd = closing?.end ?? i + `</${tagName}>`.length
      if (depth === 0) return closingEnd
      i = closingEnd
    } else {
      i++
    }
  }
  return -1
}
117
+
118
/** Characters that make up a tag name (letters, digits, `_`, `:`, `-`). */
const TAG_NAME_CHAR = /[A-Za-z0-9_:-]/
/** Characters that end an attribute name. */
const ATTR_NAME_END = /[\s=>/]/
const WHITESPACE_CHAR = /\s/

/** Returns the first index at or after `i` that is not whitespace. */
function skipWhitespace(text: string, i: number): number {
  let j = i
  while (j < text.length && WHITESPACE_CHAR.test(text.charAt(j))) j++
  return j
}

/**
 * Whether the character at `i` ends an unquoted attribute value:
 * whitespace, `>`, or the `/` of a self-closing `/>`. A `/` anywhere else
 * (e.g. inside `href=http://host/path`) is part of the value.
 */
function endsUnquotedValue(text: string, i: number): boolean {
  const ch = text.charAt(i)
  if (ch === '>' || WHITESPACE_CHAR.test(ch)) return true
  return ch === '/' && text.charAt(i + 1) === '>'
}

/**
 * Returns the opening tag with the given class injected. Locates the real
 * `class` attribute via attribute-aware walking (NOT a flat regex — that
 * would mis-match inside a foreign attribute value like
 * `title="see class='x'"`). When the class already partially overlaps with
 * `cls` — e.g. existing `class="mod"` and we're injecting `mod colspan` —
 * only the missing tokens get appended, so we never end up with
 * `class="mod mod colspan"`.
 *
 * An existing unquoted value (`class=foo`) is re-emitted quoted, because
 * appending a second token without quotes would turn it into a separate
 * attribute.
 *
 * @param openingTag the full opening tag text, from `<` through `>`
 * @param cls        one or more whitespace-separated class tokens
 * @returns the tag with the missing tokens added; the input unchanged
 *          when `cls` is blank or every token is already present
 */
export function injectClass(openingTag: string, cls: string): string {
  // De-duplicate so a repeated token in `cls` is never appended twice.
  const clsTokens = [...new Set(cls.split(/\s+/).filter(Boolean))]
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag

    const updatedValue = [...existingTokens, ...missing].join(' ')
    const head = openingTag.slice(0, classAttr.valueStart)
    const tail = openingTag.slice(classAttr.valueEnd)

    // A quoted value is immediately preceded by its opening quote; an
    // unquoted one is preceded by `=` or whitespace.
    const opener = openingTag.charAt(classAttr.valueStart - 1)
    if (opener === '"' || opener === "'") return head + updatedValue + tail

    const quote = updatedValue.includes("'") ? '"' : "'"
    return `${head}${quote}${updatedValue}${quote}${tail}`
  }

  // No class attribute yet: insert one just before the closing `>` / `/>`.
  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  const head = openingTag.slice(0, insertAt).replace(/\s*$/, '')
  return `${head} class='${clsTokens.join(' ')}'${openingTag.slice(insertAt)}`
}

/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute. Returns the value range (start/end of the
 * value content, *excluding* any surrounding quotes) and the value, or
 * null if no `class` attribute is present.
 *
 * Unquoted values run up to the next whitespace, `>` or self-closing
 * `/>`, so a URL such as `href=http://host/path` is consumed whole and
 * attributes after it are still seen.
 */
export function findClassAttribute(openingTag: string): ClassAttributeLocation | null {
  const len = openingTag.length

  // Skip past the tag name. Tag starts with `<`; the first run of
  // [A-Za-z0-9_:-] is the tag name. Anything after is attribute territory.
  let i = 1
  while (i < len && TAG_NAME_CHAR.test(openingTag.charAt(i))) i++

  while (i < len) {
    i = skipWhitespace(openingTag, i)
    if (i >= len) break

    const ch = openingTag.charAt(i)
    if (ch === '>') break
    if (ch === '/') {
      // `/>` (or a trailing `/`) ends the tag; a stray `/` elsewhere is skipped.
      if (i + 1 >= len || openingTag.charAt(i + 1) === '>') break
      i++
      continue
    }

    // Read attribute name
    const nameStart = i
    while (i < len && !ATTR_NAME_END.test(openingTag.charAt(i))) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value
    i = skipWhitespace(openingTag, i)
    if (openingTag.charAt(i) !== '=') {
      // Bare attribute (no value) — not class
      continue
    }
    i = skipWhitespace(openingTag, i + 1)

    // Value: quoted or unquoted
    let valueStart: number
    let valueEnd: number
    const opener = openingTag.charAt(i)
    if (opener === '"' || opener === "'") {
      valueStart = i + 1
      const closeAt = openingTag.indexOf(opener, valueStart)
      // An unterminated quoted value runs to the end of the tag text.
      valueEnd = closeAt === -1 ? len : closeAt
      i = closeAt === -1 ? len : closeAt + 1
    } else {
      valueStart = i
      while (i < len && !endsUnquotedValue(openingTag, i)) i++
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}