@createiq/htmldiff 1.1.0 → 1.2.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/README.md +67 -0
- package/dist/HtmlDiff.cjs +1192 -456
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +160 -7
- package/dist/HtmlDiff.d.mts +159 -7
- package/dist/HtmlDiff.mjs +1192 -456
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +343 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +67 -522
- package/src/ThreeWayDiff.ts +269 -0
- package/src/ThreeWayTable.ts +625 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.spec.ts +43 -19
- package/test/HtmlDiff.threeWay.spec.ts +173 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +301 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import Action from './Action'
|
|
2
|
+
import type { AnalyzeResult } from './HtmlDiff'
|
|
3
|
+
import type Operation from './Operation'
|
|
4
|
+
import type { WrapMetadata } from './Utils'
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Composes diff(genesis → cp-latest) (CP's accumulated changes from the
|
|
8
|
+
* common ancestor) and diff(genesis → me-current) (Me's accumulated
|
|
9
|
+
* changes from the common ancestor) into a single attributed segment
|
|
10
|
+
* stream. The output is consumed by `HtmlDiff.executeThreeWay` for
|
|
11
|
+
* emission.
|
|
12
|
+
*
|
|
13
|
+
* Genesis is the structural spine. Both pair-wise analyses must
|
|
14
|
+
* tokenise genesis identically (`HtmlDiff.executeThreeWay` enforces
|
|
15
|
+
* this via the symmetric-projection decision), so genesis-diff indices
|
|
16
|
+
* are stable across the two streams.
|
|
17
|
+
*
|
|
18
|
+
* Per genesis token: classify by what each side did to it
|
|
19
|
+
* (kept / deleted) and emit accordingly. Per genesis boundary: collect
|
|
20
|
+
* each side's insertions and check for agreement — when both sides
|
|
21
|
+
* inserted identical content, the insertion is treated as "settled"
|
|
22
|
+
* and emitted unmarked (the reader sees the agreed-on text without
|
|
23
|
+
* authorship markup, matching Word-style track-changes conventions
|
|
24
|
+
* where both authors agreeing is silent).
|
|
25
|
+
*
|
|
26
|
+
* The emission order at a boundary mirrors the 2-way del-then-ins
|
|
27
|
+
* convention: a Replace (genesis token deleted + a paired insertion)
|
|
28
|
+
* reads as `<del>old</del><ins>new</ins>`. Pure insertions are
|
|
29
|
+
* positioned at their natural boundary.
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
/** The two sides whose changes are attributed in the three-way output. */
export type Author = 'cp' | 'me'

/**
 * How an output segment is attributed.
 *
 * An `equal` segment is emitted without markup and is produced in three
 * situations:
 *  - a genesis token that both authors kept (emitted as the genesis word),
 *  - an insertion that both authors made identically (emitted plain),
 *  - structural tags surrounding a token both authors deleted (emitted so
 *    layout survives while the content token itself is dropped).
 *
 * `ins` and `del` segments additionally name the author responsible.
 */
export type Attribution =
  | { kind: 'equal' }
  | { kind: 'ins'; author: Author }
  | { kind: 'del'; author: Author }

/** One run of consecutive tokens sharing a single attribution. */
export interface Segment {
  /** Attribution applied to every token in `words`. */
  attr: Attribution
  /**
   * Tokens to emit. Equal segments hold original genesis words (structural
   * tags included); ins/del segments hold diff-space tokens.
   */
  words: string[]
}
|
|
51
|
+
|
|
52
|
+
/**
 * Produces the attributed segment stream for a three-way diff by walking
 * the genesis tokens once and interleaving each side's insertions at the
 * boundaries between them.
 *
 * Both analyses are expected to share the same `oldDiffWords` (the genesis
 * tokens); the caller is responsible for that by feeding the same genesis
 * input and the same `useProjections` decision to both `HtmlDiff.analyze`
 * calls. Genesis-side data (`oldDiffWords`, `oldOriginalWords`,
 * `oldContentToOriginal`) is read from `dCp` only.
 *
 * @param dCp analysis of diff(genesis → cp-latest)
 * @param dMe analysis of diff(genesis → me-current)
 * @returns segments in output order; adjacent runs with the same
 *   attribution are coalesced by `appendSegment`
 */
export function buildSegments(dCp: AnalyzeResult, dMe: AnalyzeResult): Segment[] {
  const genesisWords = dCp.oldOriginalWords
  const tokenCount = dCp.oldDiffWords.length

  // What each author did to every genesis token (kept vs. deleted).
  const fateCp = buildFateFromGenesis(dCp.operations, tokenCount)
  const fateMe = buildFateFromGenesis(dMe.operations, tokenCount)

  // Insertions per boundary, keyed by `endInOld`: a Replace's new text lands
  // AFTER the genesis token it replaces (del-then-ins reading order), while a
  // pure Insert (endInOld == startInOld) stays at its between-tokens position.
  const insertsCp = collectInsertionsKeyedByEnd(dCp)
  const insertsMe = collectInsertionsKeyedByEnd(dMe)

  // genesis diff index → genesis original index; identity when the analysis
  // carries no projection map. Lets Equal segments be sliced out of the
  // original words so structural tags pass through verbatim.
  const toOriginal: readonly number[] = dCp.oldContentToOriginal ?? Array.from({ length: tokenCount }, (_, k) => k)

  const out: Segment[] = []
  const flushBoundary = (boundary: number): void => {
    emitBoundary(boundary, insertsCp, insertsMe, dCp.newDiffWords, dMe.newDiffWords, out)
  }

  // Boundary 0: insertions placed before the first genesis token.
  flushBoundary(0)

  let consumed = 0
  for (let tok = 0; tok < tokenCount; tok++) {
    // Take everything from the cursor up to and including this token's
    // original position, so structural tags that precede the token travel
    // with it.
    const upTo = toOriginal[tok] + 1
    const run = genesisWords.slice(consumed, upTo)
    consumed = upTo

    const goneForCp = fateCp[tok] === 'deleted'
    const goneForMe = fateMe[tok] === 'deleted'

    if (goneForCp && goneForMe) {
      // Both deleted: the content token (last element of the run) is
      // silenced; any leading structural tags stay as plain output.
      if (run.length > 1) appendSegment(out, { kind: 'equal' }, run.slice(0, -1))
    } else if (goneForCp || goneForMe) {
      // Exactly one side deleted: mark the run as that author's deletion.
      appendSegment(out, { kind: 'del', author: goneForCp ? 'cp' : 'me' }, run)
    } else {
      // Kept by both.
      appendSegment(out, { kind: 'equal' }, run)
    }

    // Boundary tok+1: pure insertions after this token, plus any
    // replace-insertions paired with the token just handled.
    flushBoundary(tok + 1)
  }

  // Whatever is left in the original words after the last content token
  // (e.g. trailing structural closing tags) is emitted unmarked.
  if (consumed < genesisWords.length) {
    appendSegment(out, { kind: 'equal' }, genesisWords.slice(consumed))
  }

  return out
}
|
|
139
|
+
|
|
140
|
+
// ────────────────────────────────────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
type GenesisFate = 'kept' | 'deleted'

/**
 * Computes, for every genesis diff index, whether one side kept or removed
 * that token.
 *
 * Delete and Replace operations both remove their old range from the side's
 * output, so every index they cover becomes `'deleted'`. All other
 * operations leave the default `'kept'` in place (an Insert has an empty old
 * range and therefore nothing to mark).
 *
 * @param ops the side's operations against genesis
 * @param genesisLen number of genesis diff tokens; also the length of the result
 * @returns one fate per genesis diff index
 */
function buildFateFromGenesis(ops: readonly Operation[], genesisLen: number): GenesisFate[] {
  const fates = new Array<GenesisFate>(genesisLen).fill('kept')
  for (const { action, startInOld, endInOld } of ops) {
    const removesGenesis = action === Action.Delete || action === Action.Replace
    if (!removesGenesis) continue
    // Clamp the old range to the valid index window so out-of-range
    // positions are ignored rather than written.
    const first = Math.max(startInOld, 0)
    const stop = Math.min(endInOld, genesisLen)
    for (let idx = first; idx < stop; idx++) fates[idx] = 'deleted'
  }
  return fates
}
|
|
161
|
+
|
|
162
|
+
/**
 * Groups the tokens one side inserted by the genesis boundary they belong to.
 *
 * The key is the operation's `endInOld`. For a Replace covering
 * genesis[k..k+1] that places the new text at boundary k+1 (after the
 * removed token) instead of k, which yields del-then-ins reading order. For
 * a pure Insert the old range is empty (endInOld == startInOld), so the key
 * is simply the between-tokens position.
 *
 * Only Insert and Replace operations contribute; operations whose new range
 * is empty are skipped. Several operations that share a key are concatenated
 * in operation order.
 *
 * @param d the side's analysis; `operations` and `newDiffWords` are read
 * @returns boundary index → inserted tokens (arrays are fresh copies, never
 *   aliases of `d.newDiffWords`)
 */
function collectInsertionsKeyedByEnd(d: AnalyzeResult): Map<number, string[]> {
  const out = new Map<number, string[]>()
  for (const op of d.operations) {
    if (op.action !== Action.Insert && op.action !== Action.Replace) continue
    const words = d.newDiffWords.slice(op.startInNew, op.endInNew)
    if (words.length === 0) continue

    const existing = out.get(op.endInOld)
    if (existing === undefined) {
      // `slice` already produced a private copy, so it can be stored as-is.
      out.set(op.endInOld, words)
      continue
    }
    // Append one token at a time. Spreading `words` into `push(...)` turns
    // every token into a call argument, and a very large insertion then
    // exceeds the engine's argument limit and throws a RangeError.
    for (const word of words) existing.push(word)
  }
  return out
}
|
|
184
|
+
|
|
185
|
+
/**
 * Appends whatever was inserted at genesis boundary `b`.
 *
 * If both authors have an insertion at this boundary and the two token
 * sequences are identical, the text counts as agreed and is appended once
 * as an `equal` (unmarked) segment. In every other case each non-empty
 * insertion is appended as an `ins` segment carrying its author — CP first,
 * then Me.
 *
 * @param b genesis boundary index to look up
 * @param cpInsAt CP's insertions keyed by boundary
 * @param meInsAt Me's insertions keyed by boundary
 * @param _cpDiffWords not read by this function
 * @param _meDiffWords not read by this function
 * @param segments output list, extended in place
 */
function emitBoundary(
  b: number,
  cpInsAt: Map<number, string[]>,
  meInsAt: Map<number, string[]>,
  _cpDiffWords: readonly string[],
  _meDiffWords: readonly string[],
  segments: Segment[]
) {
  // A missing entry and an empty entry are treated alike: nothing inserted.
  const fromCp = cpInsAt.get(b) ?? []
  const fromMe = meInsAt.get(b) ?? []
  const cpInserted = fromCp.length > 0
  const meInserted = fromMe.length > 0

  if (cpInserted && meInserted && tokenArraysEqual(fromCp, fromMe)) {
    // Same content from both sides — settled, so no authorship markup.
    appendSegment(segments, { kind: 'equal' }, fromCp)
    return
  }

  if (cpInserted) {
    appendSegment(segments, { kind: 'ins', author: 'cp' }, fromCp)
  }
  if (meInserted) {
    appendSegment(segments, { kind: 'ins', author: 'me' }, fromMe)
  }
}
|
|
217
|
+
|
|
218
|
+
/**
 * Reports whether two token sequences have the same length and strictly
 * equal (`===`) elements at every position.
 */
function tokenArraysEqual(a: readonly string[], b: readonly string[]): boolean {
  let idx = a.length
  if (idx !== b.length) return false
  // Scan from the tail; any mismatch settles the answer.
  while (idx-- > 0) {
    if (a[idx] !== b[idx]) return false
  }
  return true
}
|
|
223
|
+
|
|
224
|
+
/**
 * Adds `words` to the segment list, coalescing with the final segment when
 * that segment carries the same attribution.
 *
 * - Empty `words` is a no-op.
 * - When a new segment is started, `words` is copied, so later changes to
 *   the caller's array do not affect the stored segment.
 *
 * @param segments output list, mutated in place
 * @param attr attribution for the appended tokens
 * @param words tokens to append
 */
function appendSegment(segments: Segment[], attr: Attribution, words: readonly string[]) {
  if (words.length === 0) return

  const last = segments[segments.length - 1]
  if (last === undefined || !sameAttribution(last.attr, attr)) {
    segments.push({ attr, words: [...words] })
    return
  }

  // Extend the existing run token by token. `push(...words)` would pass each
  // token as a separate call argument, which throws a RangeError once
  // `words` is larger than the engine's argument limit.
  for (const word of words) last.words.push(word)
}
|
|
233
|
+
|
|
234
|
+
/**
 * Reports whether two attributions are interchangeable for the purpose of
 * merging adjacent segments: both `equal`, or the same `ins`/`del` kind from
 * the same author.
 */
function sameAttribution(a: Attribution, b: Attribution): boolean {
  switch (a.kind) {
    case 'equal':
      return b.kind === 'equal'
    case 'ins':
      return b.kind === 'ins' && a.author === b.author
    case 'del':
      return b.kind === 'del' && a.author === b.author
    default:
      return false
  }
}
|
|
240
|
+
|
|
241
|
+
/**
 * Produces the `WrapMetadata` that marks output as belonging to `author`:
 * the author name is used both as the extra CSS class and as the value of
 * the `author` data attribute.
 *
 * This is intended as the single place that defines the author-marker shape,
 * so that the word-level, table-level (full-row/cell) and multi-table
 * (whole-table pre-wrap) emission paths stay consistent with one another.
 *
 * @param author the side to attribute the markup to
 * @returns a new metadata object on every call
 */
export function authorAttribution(author: Author): WrapMetadata {
  const metadata: WrapMetadata = {
    extraClasses: author,
    dataAttrs: { author },
  }
  return metadata
}
|
|
251
|
+
|
|
252
|
+
/**
 * Translates a non-equal attribution into the pieces needed to wrap its
 * tokens: the wrapper tag (the attribution kind itself), the base CSS class
 * (`diffins` for insertions, `diffdel` otherwise) and the author
 * `WrapMetadata` from `authorAttribution`.
 *
 * `equal` attributions are excluded by the parameter type — those segments
 * are emitted without a wrapper.
 *
 * @param attr an `ins` or `del` attribution
 * @returns tag name, base class and author metadata for the wrapper
 */
export function segmentEmissionShape(attr: Exclude<Attribution, { kind: 'equal' }>): {
  tag: 'ins' | 'del'
  baseClass: 'diffins' | 'diffdel'
  metadata: WrapMetadata
} {
  const { kind, author } = attr
  const baseClass = kind === 'ins' ? 'diffins' : 'diffdel'
  return { tag: kind, baseClass, metadata: authorAttribution(author) }
}
|