@createiq/htmldiff 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ import Match from './Match'
2
+ import type MatchOptions from './MatchOptions'
3
+ import Utils from './Utils'
4
+
5
/**
 * Finds the longest match in given texts. It uses indexing with fixed granularity that is used to compare blocks of text.
 *
 * Only the half-open word ranges `[startInOld, endInOld)` and `[startInNew, endInNew)` are inspected.
 * Words are grouped into sliding blocks of `options.blockSize` consecutive words; every block of the
 * new range is indexed by its joined text and the old range is then scanned for the longest run of
 * consecutive blocks that also appears in the new range.
 */
export default class MatchFinder {
  private oldWords: string[]
  private newWords: string[]
  private startInOld: number
  private endInOld: number
  private startInNew: number
  private endInNew: number
  /**
   * Joined block text -> indices (in `newWords`) of the LAST word of every occurrence of that block.
   * A `Map` is used on purpose: block keys come from document text, and a plain object would resolve
   * keys such as "constructor" or "toString" to inherited `Object.prototype` members.
   */
  private wordIndices: Map<string, number[]> = new Map()
  private options: MatchOptions

  /**
   * @param oldWords - words of the old document
   * @param newWords - words of the new document
   * @param startInOld - first index (inclusive) of the range to inspect in `oldWords`
   * @param endInOld - end index (exclusive) of the range to inspect in `oldWords`
   * @param startInNew - first index (inclusive) of the range to inspect in `newWords`
   * @param endInNew - end index (exclusive) of the range to inspect in `newWords`
   * @param options - block size, repeating-word threshold factor and whitespace handling
   */
  constructor(
    oldWords: string[],
    newWords: string[],
    startInOld: number,
    endInOld: number,
    startInNew: number,
    endInNew: number,
    options: MatchOptions
  ) {
    this.oldWords = oldWords
    this.newWords = newWords
    this.startInOld = startInOld
    this.endInOld = endInOld
    this.startInNew = startInNew
    this.endInNew = endInNew
    this.options = options
  }

  /**
   * Rebuilds `wordIndices` from the new range. Each occurrence is appended in place, so building the
   * index is linear in the number of words.
   */
  private indexNewWords() {
    this.wordIndices = new Map()
    const block: string[] = []
    for (let i = this.startInNew; i < this.endInNew; i++) {
      // if word is a tag, we should ignore attributes as attribute changes are not supported (yet)
      const word = this.normalizeForIndex(this.newWords[i])
      const key = MatchFinder.putNewWord(block, word, this.options.blockSize)

      if (key === null) {
        continue
      }

      const positions = this.wordIndices.get(key)
      if (positions === undefined) {
        this.wordIndices.set(key, [i])
      } else {
        positions.push(i)
      }
    }
  }

  /**
   * Pushes `word` into the sliding window `block` (mutating it) and keeps at most `blockSize` words.
   *
   * @returns the joined window text once the window holds exactly `blockSize` words, otherwise `null`
   */
  private static putNewWord(block: string[], word: string, blockSize: number): string | null {
    block.push(word)

    if (block.length > blockSize) {
      block.shift()
    }

    if (block.length !== blockSize) {
      return null
    }

    return block.join('')
  }

  /**
   * Normalises a word before it is used as (part of) an index key: attributes are stripped via
   * `Utils.stripAnyAttributes` and, when `ignoreWhitespaceDifferences` is set, any whitespace word
   * collapses to a single space.
   */
  private normalizeForIndex(word: string): string {
    const output = Utils.stripAnyAttributes(word)
    if (this.options.ignoreWhitespaceDifferences && Utils.isWhiteSpace(output)) {
      return ' '
    }

    return output
  }

  /**
   * Searches the configured ranges for the longest common run of words.
   *
   * @returns a `Match` built from (start in old, start in new, size in words), or `null` when the
   * ranges share no block
   */
  findMatch(): Match | null {
    this.indexNewWords()
    this.removeRepeatingWords()

    if (this.wordIndices.size === 0) {
      return null
    }

    let bestMatchInOld = this.startInOld
    let bestMatchInNew = this.startInNew
    let bestMatchSize = 0

    // For the previous old position: new-word index -> number of consecutive blocks matched so far.
    let matchLengthAt: Map<number, number> = new Map()
    const block: string[] = []

    for (let indexInOld = this.startInOld; indexInOld < this.endInOld; indexInOld++) {
      const word = this.normalizeForIndex(this.oldWords[indexInOld])
      const key = MatchFinder.putNewWord(block, word, this.options.blockSize)

      if (key === null) {
        continue
      }

      const newMatchLengthAt: Map<number, number> = new Map()
      const positions = this.wordIndices.get(key)

      if (positions !== undefined) {
        for (const indexInNew of positions) {
          // Extend the run that ended one word earlier in the new text, or start a new run.
          const newMatchLength = (matchLengthAt.get(indexInNew - 1) ?? 0) + 1
          newMatchLengthAt.set(indexInNew, newMatchLength)

          if (newMatchLength > bestMatchSize) {
            // A run of k blocks spans k + blockSize - 1 words and ends at the current index.
            bestMatchInOld = indexInOld - newMatchLength - this.options.blockSize + 2
            bestMatchInNew = indexInNew - newMatchLength - this.options.blockSize + 2
            bestMatchSize = newMatchLength
          }
        }
      }

      // An unknown block leaves this map empty, which breaks every running match.
      matchLengthAt = newMatchLengthAt
    }

    return bestMatchSize !== 0
      ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize + this.options.blockSize - 1)
      : null
  }

  /**
   * This method removes words that occur too many times. This way it reduces total count of comparison operations
   * and as result the diff algorithm takes less time. But the side effect is that it may detect false differences of
   * the repeating words.
   *
   * A block is dropped when it occurs more than `newWords.length * options.repeatingWordsAccuracy` times.
   * @private
   */
  private removeRepeatingWords() {
    const threshold = this.newWords.length * this.options.repeatingWordsAccuracy

    // Deleting the current entry while iterating a Map is well-defined.
    for (const [key, positions] of this.wordIndices) {
      if (positions.length > threshold) {
        this.wordIndices.delete(key)
      }
    }
  }
}
@@ -0,0 +1,10 @@
1
/**
 * Tuning knobs read by `MatchFinder` while it searches for the longest common run of words.
 */
type MatchOptions = {
  /**
   * Match granularity, defines how many words are joined into single block
   */
  blockSize: number

  /**
   * Factor applied to the number of new words to obtain the repetition threshold: a block that occurs
   * more often than `newWords.length * repeatingWordsAccuracy` is dropped from the index.
   */
  repeatingWordsAccuracy: number

  /**
   * When `true`, every whitespace word is indexed as a single space, so runs of different whitespace
   * compare as equal.
   */
  ignoreWhitespaceDifferences: boolean
}

export default MatchOptions
package/src/Mode.ts ADDED
@@ -0,0 +1,8 @@
1
/**
 * Scanner states used while splitting text into words. Members are numbered implicitly from 0 in
 * declaration order (Character = 0, Tag = 1, Whitespace = 2, Entity = 3), so the order must not change.
 */
enum Mode {
  /** Inside ordinary text. */
  Character,
  /** Inside a tag, i.e. after `<` and before the closing `>`. */
  Tag,
  /** Inside a run of whitespace. */
  Whitespace,
  /** Inside a character entity, i.e. after `&` and before `;`. */
  Entity,
}

export default Mode
@@ -0,0 +1,17 @@
1
+ import type Action from './Action'
2
+
3
/**
 * Plain record pairing an action with a position range in the old word list and one in the new word list.
 *
 * NOTE(review): the ranges are presumably half-open word indices like the ones `MatchFinder` works
 * with — confirm against the code that creates operations.
 */
export default class Operation {
  /**
   * @param action - what to do with the two ranges
   * @param startInOld - start of the range in the old word list
   * @param endInOld - end of the range in the old word list
   * @param startInNew - start of the range in the new word list
   * @param endInNew - end of the range in the new word list
   */
  constructor(
    public action: Action,
    public startInOld: number,
    public endInOld: number,
    public startInNew: number,
    public endInNew: number
  ) {}
}
package/src/Utils.ts ADDED
@@ -0,0 +1,96 @@
1
// A word that consists of exactly one tag (optionally surrounded by whitespace). Closing tags match too.
const openingTagRegex = /^\s*<[^>]+>\s*$/
// A word that consists of exactly one closing tag (optionally surrounded by whitespace).
const closingTagTexRegex = /^\s*<\/[^>]+>\s*$/
// "<" followed by everything up to the first whitespace or ">", i.e. the tag without its attributes.
const tagWordRegex = /<[^\s>]+/
// One or more whitespace characters and/or "&nbsp;" entities, nothing else.
const whitespaceRegex = /^(\s|&nbsp;)+$/
// Unanchored: true for any text that contains at least one word character, "#" or "@".
// NOTE(review): without the `u`/`v` flag `\w` is ASCII-only in JavaScript, so accented or non-Latin
// letters do not count as word characters here.
const wordRegex = /[\w#@]+/
// Captures the tag name of an opening or closing tag.
const tagRegex = /<\/?(?<name>[^\s\/>]+)[^>]*>/

// Tags that are deliberately treated as words rather than tags.
const SpecialCaseWordTags: readonly string[] = ['<img']

/**
 * Tells whether `item` is a single opening or closing tag. Items starting with one of the
 * special-case prefixes (currently `<img`) are reported as non-tags.
 */
export function isTag(item: string): boolean {
  if (SpecialCaseWordTags.some(re => item?.startsWith(re))) {
    return false
  }

  return isOpeningTag(item) || isClosingTag(item)
}

function isOpeningTag(item: string): boolean {
  return openingTagRegex.test(item)
}

function isClosingTag(item: string): boolean {
  return closingTagTexRegex.test(item)
}

/**
 * Reduces a tag to its name, dropping all attributes: `<p class="a">` becomes `<p>` and
 * `<br class="x" />` becomes `<br/>`. Text that contains no `<name` sequence is returned unchanged.
 *
 * A self-closing tag written without a space (`<br/>`) is normalised to `<br/>` as well, so that it
 * compares equal to the same tag written with a space or with attributes.
 */
export function stripTagAttributes(word: string): string {
  const match = tagWordRegex.exec(word)
  if (!match) {
    return word
  }

  const selfClosing = word.endsWith('/>')
  // For "<br/>" the name pattern also swallows the "/" — remove it before "/>" is appended again.
  const tag = selfClosing && match[0].endsWith('/') ? match[0].slice(0, -1) : match[0]

  return `${tag}${selfClosing ? '/>' : '>'}`
}

/**
 * Wraps `text` in `<tagName class='cssClass'>…</tagName>`. None of the arguments are escaped.
 */
export function wrapText(text: string, tagName: string, cssClass: string): string {
  return `<${tagName} class='${cssClass}'>${text}</${tagName}>`
}

/** True when `val` is the character that opens a tag. */
export function isStartOfTag(val: string): boolean {
  return val === '<'
}

/** True when `val` is the character that closes a tag. */
export function isEndOfTag(val: string): boolean {
  return val === '>'
}

/** True when `val` is the character that opens a character entity. */
export function isStartOfEntity(val: string): boolean {
  return val === '&'
}

/** True when `val` is the character that terminates a character entity. */
export function isEndOfEntity(val: string): boolean {
  return val === ';'
}

/** True when `value` is non-empty and made up only of whitespace characters and `&nbsp;` entities. */
export function isWhiteSpace(value: string): boolean {
  return whitespaceRegex.test(value)
}

/** Strips the attributes when `word` is a tag (see `isTag`); any other word is returned unchanged. */
export function stripAnyAttributes(word: string): string {
  if (isTag(word)) {
    return stripTagAttributes(word)
  }

  return word
}

/** True when `text` contains at least one word character (`\w`), `#` or `@`. */
export function isWord(text: string): boolean {
  return wordRegex.test(text)
}

/**
 * Extracts the lower-cased tag name from the first opening or closing tag found in `word`.
 *
 * @returns the tag name, or an empty string when `word` is `null` or contains no tag
 */
export function getTagName(word: string | null): string {
  if (word === null) {
    return ''
  }

  const match = tagRegex.exec(word)
  if (match) {
    return match.groups?.name.toLowerCase() ?? match[1].toLowerCase()
  }

  return ''
}

export default {
  isTag,
  stripTagAttributes,
  wrapText,
  isStartOfTag,
  isEndOfTag,
  isStartOfEntity,
  isEndOfEntity,
  isWhiteSpace,
  stripAnyAttributes,
  isWord,
  getTagName,
}
@@ -0,0 +1,248 @@
1
+ import Mode from './Mode'
2
+ import Utils from './Utils'
3
+
4
/**
 * Splits an HTML string into a flat list of "words": whole tags, character entities, runs of
 * whitespace, runs of word characters and single other characters.
 *
 * Text ranges located by the block expressions are accumulated into the current word character by
 * character, bypassing the normal splitting rules for the duration of the block.
 */
export default class WordSplitter {
  private text: string
  private isBlockCheckRequired: boolean
  private blockLocations: BlockFinderResult
  private mode: Mode
  private isGrouping = false
  private globbingUntil: number
  private currentWord: string[]
  private words: string[]
  private static NotGlobbing = -1

  private get currentWordHasChars() {
    return this.currentWord.length > 0
  }

  /**
   * @param text - the HTML to split
   * @param blockExpressions - expressions whose matches must be kept together
   */
  constructor(text: string, blockExpressions: RegExp[]) {
    this.text = text
    this.blockLocations = new BlockFinder(text, blockExpressions).findBlocks()
    this.isBlockCheckRequired = this.blockLocations.hasBlocks
    this.mode = Mode.Character
    this.globbingUntil = WordSplitter.NotGlobbing
    this.currentWord = []
    this.words = []
  }

  /**
   * Runs the splitter over the whole text.
   *
   * @returns the accumulated list of words
   */
  process(): string[] {
    for (let index = 0; index < this.text.length; index++) {
      this.processCharacter(index, this.text.charAt(index))
    }

    // Flush whatever is still pending after the last character.
    this.appendCurrentWordToWords()
    return this.words
  }

  /** Feeds one character to the block grouping logic or, failing that, to the handler of the current mode. */
  private processCharacter(index: number, character: string) {
    if (this.isGlobbing(index, character)) {
      return
    }

    switch (this.mode) {
      case Mode.Character:
        this.processTextCharacter(character)
        break
      case Mode.Tag:
        this.processHtmlTagContinuation(character)
        break
      case Mode.Whitespace:
        this.processWhiteSpaceContinuation(character)
        break
      case Mode.Entity:
        this.processEntityContinuation(character)
        break
    }
  }

  /** Handles a character while inside an entity (after `&`, before `;`). */
  private processEntityContinuation(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Tag
    } else if (character.trim().length === 0) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Whitespace
    } else if (Utils.isEndOfEntity(character)) {
      let switchToNextMode = true
      if (this.currentWordHasChars) {
        this.currentWord.push(character)
        this.words.push(this.currentWord.join(''))

        //join &nbsp; entity with last whitespace
        // `>= 2`: two words are enough to merge, so whitespace at the very start of the text is
        // merged with a following &nbsp; exactly like whitespace anywhere else.
        const count = this.words.length
        if (
          count >= 2 &&
          Utils.isWhiteSpace(this.words[count - 2]) &&
          Utils.isWhiteSpace(this.words[count - 1])
        ) {
          const w1 = this.words[count - 2]
          const w2 = this.words[count - 1]
          this.words.splice(count - 2, 2)
          // Re-open the merged whitespace as the current word so further whitespace can extend it.
          this.currentWord = `${w1}${w2}`.split('')
          this.mode = Mode.Whitespace
          switchToNextMode = false
        }
      }

      if (switchToNextMode) {
        this.currentWord = []
        this.mode = Mode.Character
      }
    } else if (Utils.isWord(character)) {
      this.currentWord.push(character)
    } else {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
  }

  /** Handles a character while inside a run of whitespace. */
  private processWhiteSpaceContinuation(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Tag
    } else if (Utils.isStartOfEntity(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Entity
    } else if (Utils.isWhiteSpace(character)) {
      this.currentWord.push(character)
    } else {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
  }

  /** Handles a character while inside a tag; the closing `>` completes the tag word. */
  private processHtmlTagContinuation(character: string) {
    this.currentWord.push(character)

    if (Utils.isEndOfTag(character)) {
      this.appendCurrentWordToWords()
      // The character is always ">" here, which is never whitespace, so text mode follows.
      this.mode = Mode.Character
    }
  }

  /** Handles a character while inside ordinary text. */
  private processTextCharacter(character: string) {
    if (Utils.isStartOfTag(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push('<')
      this.mode = Mode.Tag
    } else if (Utils.isStartOfEntity(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Entity
    } else if (Utils.isWhiteSpace(character)) {
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
      this.mode = Mode.Whitespace
    } else if (
      Utils.isWord(character) &&
      (this.currentWord.length === 0 || Utils.isWord(this.currentWord[this.currentWord.length - 1]))
    ) {
      // Word characters keep extending the current word as long as it ends in a word character.
      this.currentWord.push(character)
    } else {
      // Anything else (punctuation etc.) starts a word of its own.
      this.appendCurrentWordToWords()
      this.currentWord.push(character)
    }
  }

  /** Moves the pending word, if any, to the result list. */
  private appendCurrentWordToWords() {
    if (this.currentWordHasChars) {
      this.words.push(this.currentWord.join(''))
      this.currentWord = []
    }
  }

  /**
   * Block grouping. Returns `true` when the character was consumed as part of a block.
   *
   * A block that ends at `index` is flushed first; a block that starts at `index` switches grouping
   * on. While grouping, characters are appended to the current word without any further analysis.
   */
  private isGlobbing(index: number, character: string): boolean {
    if (!this.isBlockCheckRequired) {
      return false
    }
    const isCurrentBlockTerminating = index === this.globbingUntil
    if (isCurrentBlockTerminating) {
      this.globbingUntil = WordSplitter.NotGlobbing
      this.isGrouping = false
      this.appendCurrentWordToWords()
    }

    const until = this.blockLocations.isInBlock(index)
    if (until) {
      this.isGrouping = true
      this.globbingUntil = until
    }
    if (this.isGrouping) {
      this.currentWord.push(character)
      this.mode = Mode.Character
    }
    return this.isGrouping
  }

  /** Convenience wrapper: builds a splitter for `text` and returns its words. */
  static convertHtmlToListOfWords(text: string, blockExpressions: RegExp[]): string[] {
    return new WordSplitter(text, blockExpressions).process()
  }
}
189
+
190
/**
 * Start offset -> end offset (exclusive) of every text range that must be kept together as one word.
 */
class BlockFinderResult {
  private blocks: Map<number, number> = new Map()

  /**
   * Registers the range `[from, to)`.
   *
   * @throws ArgumentError when a block starting at `from` has already been registered
   */
  addBlock(from: number, to: number) {
    if (this.blocks.has(from)) {
      throw new ArgumentError('One or more block expressions result in a text sequence that overlaps.')
    }

    this.blocks.set(from, to)
  }

  /**
   * @returns the end offset of the block that STARTS at `location`, or `null` when no block starts there
   */
  isInBlock(location: number): number | null {
    return this.blocks.get(location) ?? null
  }

  /** True once at least one block has been registered. */
  get hasBlocks() {
    return this.blocks.size > 0
  }
}

/** Raised when the configured block expressions produce conflicting blocks. */
class ArgumentError extends Error {}

/**
 * Locates every range of `text` matched by one of the block expressions.
 */
class BlockFinder {
  private text: string
  private blockExpressions: RegExp[]

  constructor(text: string, blockExpressions: RegExp[]) {
    this.text = text
    this.blockExpressions = blockExpressions
  }

  /**
   * Collects the matches of all block expressions.
   *
   * @throws ArgumentError when two matches start at the same offset
   */
  findBlocks(): BlockFinderResult {
    const result = new BlockFinderResult()
    for (const expression of this.blockExpressions) {
      this.processBlockMatcher(expression, result)
    }
    return result
  }

  /**
   * Registers every match of `exp`.
   *
   * The expression is scanned through a global copy: a non-global expression would otherwise return
   * its first match forever, and the caller's own `lastIndex` state is neither read nor modified.
   * Empty matches are skipped because an empty range groups nothing.
   */
  private processBlockMatcher(exp: RegExp, result: BlockFinderResult) {
    const flags = exp.flags.includes('g') ? exp.flags : `${exp.flags}g`
    const scanner = new RegExp(exp.source, flags)

    for (const match of this.text.matchAll(scanner)) {
      if (match[0].length === 0) {
        continue
      }
      this.tryAddBlock(exp, match, result)
    }
  }

  /** Registers one match, re-throwing failures with the offending expression in the message. */
  private tryAddBlock(exp: RegExp, match: RegExpMatchArray, result: BlockFinderResult) {
    try {
      const from = match.index ?? 0
      const to = from + match[0].length
      result.addBlock(from, to)
    } catch {
      throw new ArgumentError(
        `One or more block expressions result in a text sequence that overlaps. Current expression: ${exp}`
      )
    }
  }
}
@@ -0,0 +1,126 @@
1
+ import { bench, describe, expect } from 'vitest'
2
+ import HtmlDiff from '../src/HtmlDiff'
3
+
4
// Benchmark: diffs two large documents assembled from one paragraph template whose seven
// placeholders ({0}..{6}) are filled with varying filler words.
describe('HtmlDiff', () => {
  const template =
    'Lorem ipsum dolor sit amet {0}, consectetur adipiscing elit. Nunc sollicitudin mauris eget nibh {1} semper, in bibendum felis rutrum. Aliquam dictum {2} ut ante id dictum. Integer quis tincidunt metus. Maecenas ultricies tristique {3} fringilla. Cras non erat id elit rhoncus accumsan eget quis neque. Fusce accumsan justo mauris, et pulvinar leo lacinia molestie. Nam ullamcorper dapibus velit a pulvinar. Cras a hendrerit neque {4}, sit amet faucibus ante. {5} Nullam in nisl augue. Suspendisse consectetur id ipsum at dignissim. Etiam euismod sollicitudin metus non volutpat,{6}. Nullam non mollis risus, nec consequat ipsum.'

  // prettier-ignore
  const words = [
    'Donec', 'condimentum,', 'tellus', 'a', 'aliquam', 'feugiat,', 'dui', 'diam',
    'fringilla', 'massa,', 'sed', 'facilisis', 'risus', 'magna', 'quis', 'augue.',
    'Aenean', 'tempus', 'metus', 'at', 'quam', 'aliquet,', 'ultrices', 'venenatis',
    'nulla', 'faucibus.', 'Maecenas', 'sit', 'amet', 'lobortis', 'tortor.', 'Vestibulum',
    'fringilla', 'fringilla', 'diam,', 'non', 'tempus', 'quam', 'pretium', 'gravida.',
    'In', 'pretium', 'vitae', 'erat', 'sed', 'bibendum.', 'Sed', 'ultrices',
    'risus', 'et', 'aliquet', 'sollicitudin.', 'Fusce', 'ac', 'diam', 'justo.',
    'Morbi', 'lobortis', 'quam', 'vestibulum', 'volutpat', 'cursus.', 'Suspendisse', 'vestibulum',
    'augue', 'et', 'interdum', 'convallis.',
  ] as const

  // Fills placeholders {0}..{6}, in that order, with the first seven arguments.
  const text = (...fillers: string[]) =>
    [0, 1, 2, 3, 4, 5, 6].reduce((filled, slot) => filled.replace(`{${slot}}`, fillers[slot]), template)

  // Paragraph with every placeholder left empty.
  const blankParagraph = () => text('', '', '', '', '', '', '')

  // Paragraph filled from the word list starting at offset `i % words.length`.
  const shiftedParagraph = (i: number) => {
    const offset = i % words.length
    return text(...words.slice(offset), ...words.slice(words.length - offset))
  }

  // Paragraph filled from the start of the word list.
  const plainParagraph = () => text(...words)

  const iterations = 300

  // Concatenates the paragraph chosen for each iteration index ('' when none applies).
  const build = (paragraphFor: (i: number) => string): string =>
    Array.from({ length: iterations }, (_, i) => paragraphFor(i)).join('')

  const oldText = build(i => {
    if (i % 2 === 0) {
      return blankParagraph()
    }
    if (i % 5 === 0) {
      return shiftedParagraph(i)
    }
    return i % 7 === 0 ? plainParagraph() : ''
  })

  const newText = build(i => {
    if (i % 3 === 0) {
      return blankParagraph()
    }
    if (i % 2 === 0) {
      return shiftedParagraph(i)
    }
    return i % 11 === 0 ? plainParagraph() : ''
  })

  bench('diff', () => {
    expect(HtmlDiff.execute(oldText, newText)).toBeTruthy()
  })
})