npm - @createiq/htmldiff - Versions diffs - 1.0.0 - Mend

@createiq/htmldiff 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/.gitlab-ci.yml +71 -0
package/.node-version +1 -0
package/LICENSE +21 -0
package/README.md +5 -0
package/biome.json +41 -0
package/dist/HtmlDiff.cjs +815 -0
package/dist/HtmlDiff.cjs.map +1 -0
package/dist/HtmlDiff.d.cts +105 -0
package/dist/HtmlDiff.d.ts +105 -0
package/dist/HtmlDiff.js +792 -0
package/dist/HtmlDiff.js.map +1 -0
package/package.json +44 -0
package/renovate.json5 +10 -0
package/src/Action.ts +9 -0
package/src/HtmlDiff.ts +447 -0
package/src/Match.ts +31 -0
package/src/MatchFinder.ts +144 -0
package/src/MatchOptions.ts +10 -0
package/src/Mode.ts +8 -0
package/src/Operation.ts +17 -0
package/src/Utils.ts +96 -0
package/src/WordSplitter.ts +248 -0
package/test/HtmlDiff.bench.ts +126 -0
package/test/HtmlDiff.spec.ts +251 -0
package/test/MatchFinder.spec.ts +77 -0
package/test/Utils.spec.ts +120 -0
package/test/WordSplitter.spec.ts +27 -0
package/tsconfig.json +16 -0
package/tsup.config.ts +14 -0
package/vitest.config.mts +24 -0

package/src/MatchFinder.ts ADDED Viewed

@@ -0,0 +1,144 @@
+import Match from './Match'
+import type MatchOptions from './MatchOptions'
+import Utils from './Utils'
/**
 * Finds the longest run of consecutive words shared by a slice of the old word list and a slice of the
 * new word list.
 *
 * Words are compared in blocks: `options.blockSize` consecutive normalised words are joined into a single
 * key. The new slice is indexed by those keys and the old slice is then scanned against that index.
 */
export default class MatchFinder {
  private readonly oldWords: string[]
  private readonly newWords: string[]
  private readonly startInOld: number
  private readonly endInOld: number
  private readonly startInNew: number
  private readonly endInNew: number
  /**
   * Block key -> positions in `newWords` of the last word of every block that produced that key.
   * A Map is used instead of a plain object so that keys such as `constructor` or `__proto__` can never
   * resolve to inherited `Object.prototype` members.
   */
  private wordIndices: Map<string, number[]> = new Map()
  private readonly options: MatchOptions

  /**
   * @param oldWords full list of old words
   * @param newWords full list of new words
   * @param startInOld first index (inclusive) of the old slice to search
   * @param endInOld end index (exclusive) of the old slice to search
   * @param startInNew first index (inclusive) of the new slice to search
   * @param endInNew end index (exclusive) of the new slice to search
   * @param options block size, repeating-word threshold factor and whitespace handling
   */
  constructor(
    oldWords: string[],
    newWords: string[],
    startInOld: number,
    endInOld: number,
    startInNew: number,
    endInNew: number,
    options: MatchOptions
  ) {
    this.oldWords = oldWords
    this.newWords = newWords
    this.startInOld = startInOld
    this.endInOld = endInOld
    this.startInNew = startInNew
    this.endInNew = endInNew
    this.options = options
  }

  /** Rebuilds `wordIndices` from the new slice. */
  private indexNewWords() {
    this.wordIndices = new Map()
    const block: string[] = []
    for (let i = this.startInNew; i < this.endInNew; i++) {
      // if word is a tag, we should ignore attributes as attribute changes are not supported (yet)
      const word = this.normalizeForIndex(this.newWords[i])
      const key = MatchFinder.putNewWord(block, word, this.options.blockSize)
      if (key === null) {
        continue
      }
      // Append in place: rebuilding the whole index for every word would make indexing quadratic.
      const positions = this.wordIndices.get(key)
      if (positions === undefined) {
        this.wordIndices.set(key, [i])
      } else {
        positions.push(i)
      }
    }
  }

  /**
   * Pushes `word` into the sliding window `block` (mutated in place, capped at `blockSize` entries).
   *
   * @returns the joined window once it holds exactly `blockSize` words, otherwise `null`
   */
  private static putNewWord(block: string[], word: string, blockSize: number): string | null {
    block.push(word)
    if (block.length > blockSize) {
      block.shift()
    }
    if (block.length !== blockSize) {
      return null
    }
    return block.join('')
  }

  /**
   * Normalises a word before it is used as (part of) an index key: tag attributes are stripped and, when
   * `ignoreWhitespaceDifferences` is set, any whitespace word becomes a single space.
   */
  private normalizeForIndex(word: string): string {
    const output = Utils.stripAnyAttributes(word)
    if (this.options.ignoreWhitespaceDifferences && Utils.isWhiteSpace(output)) {
      return ' '
    }
    return output
  }

  /**
   * Searches both slices for the longest common run of words.
   *
   * @returns the match (start in old, start in new, size in words), or `null` when the index is empty or
   * no block of the old slice occurs in it
   */
  findMatch(): Match | null {
    this.indexNewWords()
    this.removeRepeatingWords()
    if (this.wordIndices.size === 0) {
      return null
    }
    const { blockSize } = this.options
    let bestMatchInOld = this.startInOld
    let bestMatchInNew = this.startInNew
    let bestMatchSize = 0
    // Position in newWords -> number of consecutive blocks matched that end at the previous old position.
    let matchLengthAt = new Map<number, number>()
    const block: string[] = []
    for (let indexInOld = this.startInOld; indexInOld < this.endInOld; indexInOld++) {
      const word = this.normalizeForIndex(this.oldWords[indexInOld])
      const key = MatchFinder.putNewWord(block, word, blockSize)
      if (key === null) {
        continue
      }
      const newMatchLengthAt = new Map<number, number>()
      // An unknown key yields no positions, which leaves the new map empty and so breaks every running match.
      for (const indexInNew of this.wordIndices.get(key) ?? []) {
        const newMatchLength = (matchLengthAt.get(indexInNew - 1) ?? 0) + 1
        newMatchLengthAt.set(indexInNew, newMatchLength)
        if (newMatchLength > bestMatchSize) {
          // The run's first block ends newMatchLength - 1 positions back and starts blockSize - 1 before that.
          bestMatchInOld = indexInOld - newMatchLength - blockSize + 2
          bestMatchInNew = indexInNew - newMatchLength - blockSize + 2
          bestMatchSize = newMatchLength
        }
      }
      matchLengthAt = newMatchLengthAt
    }
    // bestMatchSize counts blocks; convert it to a word count.
    return bestMatchSize !== 0 ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize + blockSize - 1) : null
  }

  /**
   * This method removes words that occur too many times. This way it reduces total count of comparison operations
   * and as result the diff algorithm takes less time. But the side effect is that it may detect false differences of
   * the repeating words.
   *
   * A key is dropped when it occurs more than `newWords.length * options.repeatingWordsAccuracy` times.
   * @private
   */
  private removeRepeatingWords() {
    const threshold = this.newWords.length * this.options.repeatingWordsAccuracy
    for (const [key, positions] of this.wordIndices) {
      if (positions.length > threshold) {
        this.wordIndices.delete(key)
      }
    }
  }
}

package/src/MatchOptions.ts ADDED Viewed

@@ -0,0 +1,10 @@
/**
 * Tuning parameters read by MatchFinder.
 */
type MatchOptions = {
  /**
   * Match granularity, defines how many words are joined into single block
   */
  blockSize: number

  /**
   * Factor multiplied by the total number of new words to obtain an occurrence threshold; MatchFinder
   * drops every block that occurs more often than that threshold from its index.
   */
  repeatingWordsAccuracy: number

  /**
   * When true, MatchFinder indexes every whitespace-only word as a single space, so runs of whitespace
   * that differ only in length or kind compare as equal.
   */
  ignoreWhitespaceDifferences: boolean
}

export default MatchOptions

package/src/Mode.ts ADDED Viewed

@@ -0,0 +1,8 @@
/**
 * Scanner state used by WordSplitter to decide how the next character is handled.
 * Members are numbered implicitly from 0 in declaration order, so their order must not change.
 */
enum Mode {
  /** Plain text (value 0). */
  Character,
  /** Inside an HTML tag, between `<` and `>` (value 1). */
  Tag,
  /** Inside a run of whitespace (value 2). */
  Whitespace,
  /** Inside an HTML entity, between `&` and `;` (value 3). */
  Entity,
}

export default Mode

package/src/Operation.ts ADDED Viewed

@@ -0,0 +1,17 @@
+import type Action from './Action'
/**
 * Plain record pairing an action with a half of a range in the old word list and one in the new word list.
 */
export default class Operation {
  /**
   * Every constructor argument is stored unchanged on the public property of the same name.
   *
   * @param action the action this operation represents
   * @param startInOld start index in the old word list
   * @param endInOld end index in the old word list
   * @param startInNew start index in the new word list
   * @param endInNew end index in the new word list
   */
  constructor(
    public action: Action,
    public startInOld: number,
    public endInOld: number,
    public startInNew: number,
    public endInNew: number
  ) {}
}

package/src/Utils.ts ADDED Viewed

@@ -0,0 +1,96 @@
// A whole string that is a single `<...>` token (this also matches closing tags).
const openingTagRegex = /^\s*<[^>]+>\s*$/
// A whole string that is a single `</...>` token.
const closingTagTexRegex = /^\s*<\/[^>]+>\s*$/
// `<name` or `</name`. The name stops at whitespace, `/` or `>` so that the `/` of a self-closing tag
// written without a space (`<br/>`) is not captured as part of the name.
const tagWordRegex = /<\/?[^\s\/>]+/
// Only whitespace characters and/or `&nbsp;` entities.
const whitespaceRegex = /^(\s|&nbsp;)+$/
// Contains at least one ASCII word character, `#` or `@`.
const wordRegex = /[\w#@]+/
// Captures the tag name of an opening or closing tag.
const tagRegex = /<\/?(?<name>[^\s\/>]+)[^>]*>/
// Tags that are deliberately treated as words rather than tags by isTag.
const SpecialCaseWordTags: readonly string[] = ['<img']

/**
 * Reports whether `item` is a single opening or closing tag. Items starting with one of the
 * special-case prefixes (`<img`) are reported as not being tags.
 */
export function isTag(item: string): boolean {
  if (SpecialCaseWordTags.some(re => item?.startsWith(re))) {
    return false
  }
  return isOpeningTag(item) || isClosingTag(item)
}

function isOpeningTag(item: string): boolean {
  return openingTagRegex.test(item)
}

function isClosingTag(item: string): boolean {
  return closingTagTexRegex.test(item)
}

/**
 * Reduces a tag to its bare form: `<p class="x">` becomes `<p>`, `</p>` stays `</p>`, and both
 * `<br/>` and `<br />` become `<br/>`. Input without a tag start is returned unchanged.
 */
export function stripTagAttributes(word: string): string {
  const match = tagWordRegex.exec(word)
  if (match) {
    return `${match[0]}${word.endsWith('/>') ? '/>' : '>'}`
  }
  return word
}

/**
 * Wraps `text` in `<tagName class='cssClass'>...</tagName>`.
 * None of the arguments is escaped.
 */
export function wrapText(text: string, tagName: string, cssClass: string): string {
  return `<${tagName} class='${cssClass}'>${text}</${tagName}>`
}

/** True when `val` is exactly `<`. */
export function isStartOfTag(val: string): boolean {
  return val === '<'
}

/** True when `val` is exactly `>`. */
export function isEndOfTag(val: string): boolean {
  return val === '>'
}

/** True when `val` is exactly `&`. */
export function isStartOfEntity(val: string): boolean {
  return val === '&'
}

/** True when `val` is exactly `;`. */
export function isEndOfEntity(val: string): boolean {
  return val === ';'
}

/** True when `value` is non-empty and consists only of whitespace characters and/or `&nbsp;` entities. */
export function isWhiteSpace(value: string): boolean {
  return whitespaceRegex.test(value)
}

/** Strips the attributes when `word` is a tag (as decided by isTag); otherwise returns `word` unchanged. */
export function stripAnyAttributes(word: string): string {
  if (isTag(word)) {
    return stripTagAttributes(word)
  }
  return word
}

/** True when `text` contains at least one ASCII letter, digit, underscore, `#` or `@`. */
export function isWord(text: string): boolean {
  return wordRegex.test(text)
}

/**
 * Returns the lower-cased name of the first tag found in `word`, or an empty string when `word` is
 * `null` or contains no tag.
 */
export function getTagName(word: string | null): string {
  if (word === null) {
    return ''
  }
  const match = tagRegex.exec(word)
  if (match) {
    return (match.groups?.name ?? match[1]).toLowerCase()
  }
  return ''
}

export default {
  isTag,
  stripTagAttributes,
  wrapText,
  isStartOfTag,
  isEndOfTag,
  isStartOfEntity,
  isEndOfEntity,
  isWhiteSpace,
  stripAnyAttributes,
  isWord,
  getTagName,
}

package/src/WordSplitter.ts ADDED Viewed

@@ -0,0 +1,248 @@
+import Mode from './Mode'
+import Utils from './Utils'
/**
 * Splits HTML text into a list of "words": tags, entities, whitespace runs, runs of word characters and
 * single other characters. Text matched by one of the block expressions is kept together as one word.
 */
export default class WordSplitter {
  private text: string
  /** False when no block expression matched, which lets isGlobbing return immediately. */
  private isBlockCheckRequired: boolean
  private blockLocations: BlockFinderResult
  private mode: Mode
  /** True while the scanner is inside a block that must be kept together. */
  private isGrouping = false
  /** Index at which the current block ends, or NotGlobbing. */
  private globbingUntil: number
  /** Characters of the word that is currently being assembled. */
  private currentWord: string[]
  private words: string[]
  private static NotGlobbing = -1

  private get currentWordHasChars() {
    return this.currentWord.length > 0
  }

  /**
   * @param text the HTML text to split
   * @param blockExpressions expressions whose matches are emitted as single words
   */
  constructor(text: string, blockExpressions: RegExp[]) {
    this.text = text
    this.blockLocations = new BlockFinder(text, blockExpressions).findBlocks()
    this.isBlockCheckRequired = this.blockLocations.hasBlocks
    this.mode = Mode.Character
    this.globbingUntil = WordSplitter.NotGlobbing
    this.currentWord = []
    this.words = []
  }

  /** Scans the text one UTF-16 code unit at a time and returns the collected words. */
  process(): string[] {
    for (let index = 0; index < this.text.length; index++) {
      const character = this.text.charAt(index)
      this.processCharacter(index, character)
    }
    // Flush whatever is still being assembled when the text ends.
    this.appendCurrentWordToWords()
    return this.words
  }

  /** Routes one character to the handler of the current mode, unless it belongs to a block. */
  private processCharacter(index: number, character: string) {
    if (this.isGlobbing(index, character)) {
      return
    }
    switch (this.mode) {
      case Mode.Character:
        this.processTextCharacter(character)
        break
      case Mode.Tag:
        this.processHtmlTagContinuation(character)
        break
      case Mode.Whitespace:
        this.processWhiteSpaceContinuation(character)
        break
      case Mode.Entity:
        this.processEntityContinuation(character)
        break
    }
  }

  /** Handles a character that follows the `&` of an entity. */
  private processEntityContinuation(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Tag
    } else if (character.trim().length === 0) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Whitespace
    } else if (Utils.isEndOfEntity(character)) {
      let switchToNextMode = true
      if (this.currentWordHasChars) {
        this.currentWord.push(character)
        this.words.push(this.currentWord.join(''))
        //join &nbsp; entity with last whitespace
        // Only the last two words are inspected, so two words are enough for the merge to apply.
        const count = this.words.length
        if (
          count >= 2 &&
          Utils.isWhiteSpace(this.words[count - 2]) &&
          Utils.isWhiteSpace(this.words[count - 1])
        ) {
          const merged = this.words.splice(count - 2, 2).join('')
          // Keep assembling: further whitespace is appended to the merged run.
          this.currentWord = merged.split('')
          this.mode = Mode.Whitespace
          switchToNextMode = false
        }
      }
      if (switchToNextMode) {
        this.currentWord = []
        this.mode = Mode.Character
      }
    } else if (Utils.isWord(character)) {
      this.currentWord.push(character)
    } else {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
  }

  /** Handles a character that follows whitespace. */
  private processWhiteSpaceContinuation(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Tag
    } else if (Utils.isStartOfEntity(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Entity
    } else if (Utils.isWhiteSpace(character)) {
      this.currentWord.push(character)
    } else {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
  }

  /** Handles a character inside a tag; the tag is emitted as one word when `>` is reached. */
  private processHtmlTagContinuation(character: string) {
    this.currentWord.push(character)
    if (Utils.isEndOfTag(character)) {
      this.appendCurrentWordToWords()
      // The character is `>` here, so scanning always resumes in text mode.
      this.mode = Mode.Character
    }
  }

  /** Handles a character in plain-text mode. */
  private processTextCharacter(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push('<')
      this.mode = Mode.Tag
    } else if (Utils.isStartOfEntity(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Entity
    } else if (Utils.isWhiteSpace(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Whitespace
    } else if (
      Utils.isWord(character) &&
      (this.currentWord.length === 0 || Utils.isWord(this.currentWord[this.currentWord.length - 1]))
    ) {
      // Word characters extend the current word as long as it consists of word characters.
      this.currentWord.push(character)
    } else {
      // Any other character becomes the start of a new word.
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
    }
  }

  /** Emits the word being assembled (if any) and starts a new, empty one. */
  private appendCurrentWordToWords() {
    if (this.currentWordHasChars) {
      this.words.push(this.currentWord.join(''))
      this.currentWord = []
    }
  }

  /**
   * Tracks block boundaries. While inside a block the character is appended to the current word and the
   * mode is reset to plain text.
   *
   * @returns true when the character was consumed as part of a block
   */
  private isGlobbing(index: number, character: string): boolean {
    if (!this.isBlockCheckRequired) {
      return false
    }
    const isCurrentBlockTerminating = index === this.globbingUntil
    if (isCurrentBlockTerminating) {
      this.globbingUntil = WordSplitter.NotGlobbing
      this.isGrouping = false
      this.appendCurrentWordToWords()
    }
    // A new block may start at the very index where the previous one ended.
    const until = this.blockLocations.isInBlock(index)
    if (until) {
      this.isGrouping = true
      this.globbingUntil = until
    }
    if (this.isGrouping) {
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
    return this.isGrouping
  }

  /** Convenience wrapper: splits `text` with a fresh WordSplitter. */
  static convertHtmlToListOfWords(text: string, blockExpressions: RegExp[]): string[] {
    return new WordSplitter(text, blockExpressions).process()
  }
}
/**
 * Collected block locations, keyed by the index at which each block starts.
 */
class BlockFinderResult {
  /** Block start index -> block end index (exclusive). */
  private blocks: Map<number, number> = new Map()

  /**
   * Registers a block.
   *
   * @throws ArgumentError when another block already starts at `from`. Blocks that overlap without
   * sharing a start index are not detected here.
   */
  addBlock(from: number, to: number) {
    if (this.blocks.has(from)) {
      throw new ArgumentError('One or more block expressions result in a text sequence that overlaps.')
    }
    this.blocks.set(from, to)
  }

  /** Returns the end index of the block that starts exactly at `location`, or `null` if none does. */
  isInBlock(location: number): number | null {
    return this.blocks.get(location) ?? null
  }

  get hasBlocks() {
    return this.blocks.size > 0
  }
}

class ArgumentError extends Error {}

/**
 * Locates every match of the given block expressions in a text.
 */
class BlockFinder {
  private text: string
  private blockExpressions: RegExp[]

  constructor(text: string, blockExpressions: RegExp[]) {
    this.text = text
    this.blockExpressions = blockExpressions
  }

  /**
   * Runs every expression over the text and collects the matched ranges.
   *
   * @throws ArgumentError when two matches start at the same index
   */
  findBlocks(): BlockFinderResult {
    const result = new BlockFinderResult()
    for (const expression of this.blockExpressions) {
      this.processBlockMatcher(expression, result)
    }
    return result
  }

  /** Adds every non-empty match of `exp` to `result`. */
  private processBlockMatcher(exp: RegExp, result: BlockFinderResult) {
    // Iterate a global clone: this finds every match even when the caller's expression lacks the `g`
    // flag, and neither reads nor modifies the caller's `lastIndex`.
    const flags = exp.flags.includes('g') ? exp.flags : `${exp.flags}g`
    const matcher = new RegExp(exp.source, flags)
    for (const match of this.text.matchAll(matcher)) {
      if (match[0].length === 0) {
        // An empty match covers no text, so there is nothing to keep together.
        continue
      }
      this.tryAddBlock(exp, match, result)
    }
  }

  /** Registers one match, reporting the offending expression when its start index is already taken. */
  private tryAddBlock(exp: RegExp, match: RegExpMatchArray, result: BlockFinderResult) {
    try {
      const from = match.index ?? 0
      const to = from + match[0].length
      result.addBlock(from, to)
    } catch {
      throw new ArgumentError(
        `One or more block expressions result in a text sequence that overlaps. Current expression: ${exp}`
      )
    }
  }
}

package/test/HtmlDiff.bench.ts ADDED Viewed

@@ -0,0 +1,126 @@
+import { bench, describe, expect } from 'vitest'
+import HtmlDiff from '../src/HtmlDiff'
describe('HtmlDiff', () => {
  // Paragraph with seven numbered placeholders, {0} to {6}.
  const template =
    'Lorem ipsum dolor sit amet {0}, consectetur adipiscing elit. Nunc sollicitudin mauris eget nibh {1} semper, in bibendum felis rutrum. Aliquam dictum {2} ut ante id dictum. Integer quis tincidunt metus. Maecenas ultricies tristique {3} fringilla. Cras non erat id elit rhoncus accumsan eget quis neque. Fusce accumsan justo mauris, et pulvinar leo lacinia molestie. Nam ullamcorper dapibus velit a pulvinar. Cras a hendrerit neque {4}, sit amet faucibus ante. {5} Nullam in nisl augue. Suspendisse consectetur id ipsum at dignissim. Etiam euismod sollicitudin metus non volutpat,{6}. Nullam non mollis risus, nec consequat ipsum.'

  // Filler words; the first seven words handed to `fill` end up in the placeholders.
  // prettier-ignore
  const words = [
    'Donec', 'condimentum,', 'tellus', 'a', 'aliquam', 'feugiat,', 'dui', 'diam',
    'fringilla', 'massa,', 'sed', 'facilisis', 'risus', 'magna', 'quis', 'augue.',
    'Aenean', 'tempus', 'metus', 'at', 'quam', 'aliquet,', 'ultrices', 'venenatis',
    'nulla', 'faucibus.', 'Maecenas', 'sit', 'amet', 'lobortis', 'tortor.', 'Vestibulum',
    'fringilla', 'fringilla', 'diam,', 'non', 'tempus', 'quam', 'pretium', 'gravida.',
    'In', 'pretium', 'vitae', 'erat', 'sed', 'bibendum.', 'Sed', 'ultrices',
    'risus', 'et', 'aliquet', 'sollicitudin.', 'Fusce', 'ac', 'diam', 'justo.',
    'Morbi', 'lobortis', 'quam', 'vestibulum', 'volutpat', 'cursus.', 'Suspendisse', 'vestibulum',
    'augue', 'et', 'interdum', 'convallis.',
  ] as const

  const placeholderCount = 7

  // Replaces placeholders {0}..{6}, in that order, with the first seven of the given words.
  const fill = (...picked: string[]): string => {
    let paragraph = template
    for (let slot = 0; slot < placeholderCount; slot++) {
      paragraph = paragraph.replace(`{${slot}}`, picked[slot])
    }
    return paragraph
  }

  // The three paragraph flavours the corpora are assembled from.
  const blankParagraph = (): string => fill('', '', '', '', '', '', '')
  const fullParagraph = (): string => fill(...words)
  const shiftedParagraph = (i: number): string => {
    const offset = i % words.length
    return fill(...words.slice(offset), ...words.slice(words.length - offset))
  }

  const iterations = 300

  // Concatenates whatever `paragraphFor` yields for every iteration index.
  const buildCorpus = (paragraphFor: (i: number) => string): string => {
    let corpus = ''
    for (let i = 0; i < iterations; i++) {
      corpus += paragraphFor(i)
    }
    return corpus
  }

  const oldText = buildCorpus(i => {
    if (i % 2 === 0) {
      return blankParagraph()
    }
    if (i % 5 === 0) {
      return shiftedParagraph(i)
    }
    if (i % 7 === 0) {
      return fullParagraph()
    }
    return ''
  })

  const newText = buildCorpus(i => {
    if (i % 3 === 0) {
      return blankParagraph()
    }
    if (i % 2 === 0) {
      return shiftedParagraph(i)
    }
    if (i % 11 === 0) {
      return fullParagraph()
    }
    return ''
  })

  bench('diff', () => {
    expect(HtmlDiff.execute(oldText, newText)).toBeTruthy()
  })
})