@createiq/htmldiff 1.2.0-beta.0 → 1.2.0-beta.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,95 +1,138 @@
1
1
  import Action from './Action'
2
+ import { lcsAlign } from './Alignment'
2
3
  import type { AnalyzeResult } from './HtmlDiff'
3
4
  import type Operation from './Operation'
4
5
  import type { WrapMetadata } from './Utils'
5
6
 
6
7
  /**
7
- * Composes diff(V1, V2) (CP's changes) and diff(V2, V3) (Me's changes)
8
- * into a single attributed segment stream. The output is consumed by
9
- * `HtmlDiff.executeThreeWay` for emission.
8
+ * Composes diff(genesis → cp-latest) (CP's accumulated changes from the
9
+ * common ancestor) and diff(genesis → me-current) (Me's accumulated
10
+ * changes from the common ancestor) into a single attributed segment
11
+ * stream. The output is consumed by `HtmlDiff.executeThreeWay` for
12
+ * emission.
10
13
  *
11
- * V2 is the structural spine. Both pair-wise analyses must tokenise V2
12
- * identically (`HtmlDiff.executeThreeWay` enforces this via the
13
- * symmetric-projection decision), so V2-diff indices are stable across
14
- * the two streams and we can fold them into a single per-V2-token
15
- * attribution view, interleaved with off-spine CP-deletions (V1-side)
16
- * and Me-insertions (V3-side).
14
+ * Genesis is the structural spine. Both pair-wise analyses must
15
+ * tokenise genesis identically (`HtmlDiff.executeThreeWay` enforces
16
+ * this via the symmetric-projection decision), so genesis-diff indices
17
+ * are stable across the two streams.
18
+ *
19
+ * Per genesis token: classify by what each side did to it
20
+ * (kept / deleted) and emit accordingly. Per genesis boundary: collect
21
+ * each side's insertions and check for agreement — when both sides
22
+ * inserted identical content, the insertion is treated as "settled"
23
+ * and emitted unmarked (the reader sees the agreed-on text without
24
+ * authorship markup, matching Word-style track-changes conventions
25
+ * where both authors agreeing is silent).
26
+ *
27
+ * The emission order at a boundary mirrors the 2-way del-then-ins
28
+ * convention: a Replace (genesis token deleted + a paired insertion)
29
+ * reads as `<del>old</del><ins>new</ins>`. Pure insertions are
30
+ * positioned at their natural boundary.
17
31
  */
18
32
 
19
33
  export type Author = 'cp' | 'me'
20
34
 
21
35
  /**
22
- * Attribution assigned to each output segment. `reject` is its own kind
23
- * (rather than a flavour of `del`) so exhaustive switching is safe — no
24
- * property-presence narrowing required at use sites.
36
+ * Attribution assigned to each output segment.
37
+ *
38
+ * `equal` covers three cases: tokens both authors kept (rendered as the
39
+ * genesis word), insertion spans both authors made identically (rendered
40
+ * plain), and structural tags around both-deleted tokens (rendered to
41
+ * keep layout intact while the content token itself is dropped).
42
+ * Equal segments carry no markup.
25
43
  */
26
- export type Attribution =
27
- | { kind: 'equal' }
28
- | { kind: 'ins'; author: Author }
29
- | { kind: 'del'; author: Author }
30
- // Me deleting tokens that CP inserted = rejecting CP's proposal.
31
- | { kind: 'reject'; by: 'me'; rejected: 'cp' }
44
+ export type Attribution = { kind: 'equal' } | { kind: 'ins'; author: Author } | { kind: 'del'; author: Author }
32
45
 
33
46
  export interface Segment {
34
47
  attr: Attribution
35
- /** Tokens to emit. For Equal segments these are original V2 words
48
+ /** Tokens to emit. For Equal segments these are original genesis words
36
49
  * (including structural tags); for ins/del they are diff-space tokens. */
37
50
  words: string[]
38
51
  }
39
52
 
40
- export function buildSegments(d1: AnalyzeResult, d2: AnalyzeResult): Segment[] {
41
- const v2DiffLen = d1.newDiffWords.length
42
- const fromV1 = buildOriginMap(d1.operations, v2DiffLen)
43
- const toV3 = buildFateMap(d2.operations, v2DiffLen)
44
- const cpDeletionsAt = collectDeletionsAtBoundary(d1)
45
- const meInsertionsAt = collectInsertionsAtBoundary(d2)
53
+ /**
54
+ * Builds the attributed segment stream for a three-way diff.
55
+ *
56
+ * @param dCp analysis of diff(genesis → cp-latest)
57
+ * @param dMe analysis of diff(genesis → me-current)
58
+ *
59
+ * Both analyses must share the same `oldDiffWords` (the genesis tokens)
60
+ * — the caller guarantees this by passing the same genesis input and
61
+ * the same `useProjections` decision to both `HtmlDiff.analyze` calls.
62
+ */
63
+ export function buildSegments(dCp: AnalyzeResult, dMe: AnalyzeResult): Segment[] {
64
+ const genesisLen = dCp.oldDiffWords.length
65
+
66
+ // Per genesis token: did each author keep it or delete it?
67
+ const cpFate = buildFateFromGenesis(dCp.operations, genesisLen)
68
+ const meFate = buildFateFromGenesis(dMe.operations, genesisLen)
69
+
70
+ // Per boundary: tokens each author inserted at that boundary. Keyed by
71
+ // `endInOld` so a Replace's insertion sits AFTER the deleted genesis
72
+ // token (visual del-then-ins). Pure Insert ops have endInOld ==
73
+ // startInOld so they land at their natural between-tokens boundary.
74
+ const cpInsAt = collectInsertionsKeyedByEnd(dCp)
75
+ const meInsAt = collectInsertionsKeyedByEnd(dMe)
46
76
 
47
- // Inverse map V2-diff-index → V2-original-index. Identity when no projection.
48
- const diffToOriginal: readonly number[] = d1.newContentToOriginal ?? Array.from({ length: v2DiffLen }, (_, i) => i)
49
- const v2OriginalLen = d1.newOriginalWords.length
77
+ // Inverse map genesis-diff-index → genesis-original-index. Identity when
78
+ // no projection. Used to slice the original genesis words for Equal
79
+ // segments so structural tags pass through verbatim.
80
+ const diffToOriginal: readonly number[] = dCp.oldContentToOriginal ?? Array.from({ length: genesisLen }, (_, i) => i)
81
+ const genesisOriginalLen = dCp.oldOriginalWords.length
50
82
 
51
83
  const segments: Segment[] = []
52
84
  let originalCursor = 0
53
85
 
54
- for (let i = 0; i < v2DiffLen; i++) {
55
- // CP-deletions from V1 land BEFORE the V2 token at this boundary —
56
- // they conceptually "preceded" V2[i] in V1's stream.
57
- const cpDel = cpDeletionsAt.get(i)
58
- if (cpDel?.length) appendSegment(segments, { kind: 'del', author: 'cp' }, cpDel)
86
+ // Boundary 0 — pure insertions BEFORE genesis[0].
87
+ emitBoundary(0, cpInsAt, meInsAt, dCp.newDiffWords, dMe.newDiffWords, segments)
59
88
 
60
- const attr = combine(fromV1[i], toV3[i])
89
+ for (let i = 0; i < genesisLen; i++) {
90
+ const cpDel = cpFate[i] === 'deleted'
91
+ const meDel = meFate[i] === 'deleted'
92
+
93
+ // Pick up structural tags from cursor through to this genesis token's
94
+ // original index. Same cursor-based slicing as the 2-way path so a
95
+ // `<p>` opening tag preceding a content token gets attributed with
96
+ // that token's segment.
61
97
  const origIdx = diffToOriginal[i]
62
- const slice = d1.newOriginalWords.slice(originalCursor, origIdx + 1)
98
+ const slice = dCp.oldOriginalWords.slice(originalCursor, origIdx + 1)
63
99
  originalCursor = origIdx + 1
64
100
 
65
- // Me-insertions at this boundary go BEFORE V2[i] for pure
66
- // insertions, but AFTER V2[i] when V2[i] is itself a Me-deletion
67
- // (i.e. a Me Replace). This mirrors the 2-way del-then-ins
68
- // convention so a Replace reads as `<del>X</del><ins>Y</ins>`.
69
- const meIns = meInsertionsAt.get(i)
70
- const meInsAfterV2 = meIns?.length && isDeletion(attr)
71
-
72
- if (meIns?.length && !meInsAfterV2) {
73
- appendSegment(segments, { kind: 'ins', author: 'me' }, meIns)
74
- }
75
- appendSegment(segments, attr, slice)
76
- if (meInsAfterV2) {
77
- appendSegment(segments, { kind: 'ins', author: 'me' }, meIns)
101
+ if (!cpDel && !meDel) {
102
+ // Kept by both → equal. Emit the original-word slice (includes
103
+ // any leading structural tags).
104
+ appendSegment(segments, { kind: 'equal' }, slice)
105
+ } else if (cpDel && meDel) {
106
+ // Both deleted → settled. Filter at emission time; pass the
107
+ // structural-tag-bearing slice through as equal so layout
108
+ // survives. The content token itself is the LAST element of the
109
+ // slice (since slice ends at origIdx+1); drop only that.
110
+ // If slice has multiple elements (leading structural tags), they
111
+ // belong to the surrounding flow and should remain.
112
+ if (slice.length > 1) {
113
+ appendSegment(segments, { kind: 'equal' }, slice.slice(0, slice.length - 1))
114
+ }
115
+ // The content token itself is silenced.
116
+ } else if (cpDel) {
117
+ // CP deleted, Me kept → render as <del cp>. Me's keeping means the
118
+ // token is still in V_me; the markup tells the reader "CP wanted
119
+ // this gone, you've kept it."
120
+ appendSegment(segments, { kind: 'del', author: 'cp' }, slice)
121
+ } else {
122
+ // Me deleted, CP kept → render as <del me>.
123
+ appendSegment(segments, { kind: 'del', author: 'me' }, slice)
78
124
  }
125
+
126
+ // Boundary i+1 — pure insertions between genesis[i] and genesis[i+1],
127
+ // AND replace-insertions paired with genesis[i] (which we just
128
+ // emitted as a deletion).
129
+ emitBoundary(i + 1, cpInsAt, meInsAt, dCp.newDiffWords, dMe.newDiffWords, segments)
79
130
  }
80
- // Tail-end interleavings (CP-del / Me-ins at boundary v2DiffLen — i.e.
81
- // after every V2 token). Ordering doesn't matter since there's no
82
- // V2 token to anchor around.
83
- const tailCpDel = cpDeletionsAt.get(v2DiffLen)
84
- if (tailCpDel?.length) appendSegment(segments, { kind: 'del', author: 'cp' }, tailCpDel)
85
- const tailMeIns = meInsertionsAt.get(v2DiffLen)
86
- if (tailMeIns?.length) appendSegment(segments, { kind: 'ins', author: 'me' }, tailMeIns)
87
-
88
- // Trailing V2-original tokens (structural closing tags after the last
89
- // content word). Emit as equal — there's no following segment to claim
90
- // them, and attributing them to either author would be arbitrary.
91
- if (originalCursor < v2OriginalLen) {
92
- appendSegment(segments, { kind: 'equal' }, d1.newOriginalWords.slice(originalCursor))
131
+
132
+ // Trailing original tokens (structural closing tags after the last
133
+ // content word).
134
+ if (originalCursor < genesisOriginalLen) {
135
+ appendSegment(segments, { kind: 'equal' }, dCp.oldOriginalWords.slice(originalCursor))
93
136
  }
94
137
 
95
138
  return segments
@@ -97,80 +140,135 @@ export function buildSegments(d1: AnalyzeResult, d2: AnalyzeResult): Segment[] {
97
140
 
98
141
  // ────────────────────────────────────────────────────────────────────────────
99
142
 
100
- type V2Origin = 'preserved-from-v1' | 'inserted-by-cp' | 'replaced-into-by-cp'
101
- type V2Fate = 'preserved-to-v3' | 'deleted-by-me' | 'replaced-out-by-me'
143
+ type GenesisFate = 'kept' | 'deleted'
102
144
 
103
- function buildOriginMap(ops: readonly Operation[], v2Len: number): V2Origin[] {
104
- const out: V2Origin[] = new Array(v2Len).fill('preserved-from-v1')
105
- for (const op of ops) {
106
- const origin =
107
- op.action === Action.Insert ? 'inserted-by-cp' : op.action === Action.Replace ? 'replaced-into-by-cp' : null
108
- if (origin === null) continue
109
- for (let i = op.startInNew; i < op.endInNew; i++) {
110
- if (i >= 0 && i < v2Len) out[i] = origin
111
- }
112
- }
113
- return out
114
- }
115
-
116
- function buildFateMap(ops: readonly Operation[], v2Len: number): V2Fate[] {
117
- const out: V2Fate[] = new Array(v2Len).fill('preserved-to-v3')
145
+ /**
146
+ * Per genesis-diff-index, what did this side do to that token? Both
147
+ * Delete and Replace ops remove the token from the side's output, so
148
+ * both contribute `'deleted'`. Equal ops contribute `'kept'`. Insert
149
+ * ops have an empty old range, so they don't touch the genesis fate
150
+ * map.
151
+ */
152
+ function buildFateFromGenesis(ops: readonly Operation[], genesisLen: number): GenesisFate[] {
153
+ const out: GenesisFate[] = new Array(genesisLen).fill('kept')
118
154
  for (const op of ops) {
119
- const fate =
120
- op.action === Action.Delete ? 'deleted-by-me' : op.action === Action.Replace ? 'replaced-out-by-me' : null
121
- if (fate === null) continue
155
+ if (op.action !== Action.Delete && op.action !== Action.Replace) continue
122
156
  for (let i = op.startInOld; i < op.endInOld; i++) {
123
- if (i >= 0 && i < v2Len) out[i] = fate
157
+ if (i >= 0 && i < genesisLen) out[i] = 'deleted'
124
158
  }
125
159
  }
126
160
  return out
127
161
  }
128
162
 
129
- function isDeletion(attr: Attribution): boolean {
130
- return attr.kind === 'del' || attr.kind === 'reject'
131
- }
132
-
133
- function combine(origin: V2Origin, fate: V2Fate): Attribution {
134
- const cpInserted = origin === 'inserted-by-cp' || origin === 'replaced-into-by-cp'
135
- const meDeleted = fate === 'deleted-by-me' || fate === 'replaced-out-by-me'
136
- if (!cpInserted && !meDeleted) return { kind: 'equal' }
137
- if (cpInserted && !meDeleted) return { kind: 'ins', author: 'cp' }
138
- if (!cpInserted && meDeleted) return { kind: 'del', author: 'me' }
139
- return { kind: 'reject', by: 'me', rejected: 'cp' }
140
- }
141
-
142
163
  /**
143
- * Map V2-diff-boundary CP-deleted V1 tokens at that boundary. Includes
144
- * both pure Delete ops and the V1-side of Replace ops (semantically a
145
- * Delete+Insert; the Insert half is picked up by the V2-token walk).
164
+ * Per genesis boundary `b`, collect tokens this side inserted at that
165
+ * boundary. Keyed by `endInOld` so a Replace at genesis[k..k+1] has its
166
+ * insertion at boundary k+1 (after the deleted token) rather than k
167
+ * (before) — that produces the del-then-ins visual order.
168
+ *
169
+ * For pure Insert ops the old range is empty (endInOld == startInOld),
170
+ * so the key is the same as the semantic between-tokens position.
146
171
  */
147
- function collectDeletionsAtBoundary(d: AnalyzeResult): Map<number, string[]> {
172
+ function collectInsertionsKeyedByEnd(d: AnalyzeResult): Map<number, string[]> {
148
173
  const out = new Map<number, string[]>()
149
174
  for (const op of d.operations) {
150
- if (op.action !== Action.Delete && op.action !== Action.Replace) continue
151
- const words = d.oldDiffWords.slice(op.startInOld, op.endInOld)
175
+ if (op.action !== Action.Insert && op.action !== Action.Replace) continue
176
+ const words = d.newDiffWords.slice(op.startInNew, op.endInNew)
152
177
  if (words.length === 0) continue
153
- const existing = out.get(op.startInNew) ?? []
178
+ const key = op.endInOld
179
+ const existing = out.get(key) ?? []
154
180
  existing.push(...words)
155
- out.set(op.startInNew, existing)
181
+ out.set(key, existing)
156
182
  }
157
183
  return out
158
184
  }
159
185
 
160
- function collectInsertionsAtBoundary(d: AnalyzeResult): Map<number, string[]> {
161
- const out = new Map<number, string[]>()
162
- for (const op of d.operations) {
163
- if (op.action !== Action.Insert && op.action !== Action.Replace) continue
164
- const words = d.newDiffWords.slice(op.startInNew, op.endInNew)
165
- if (words.length === 0) continue
166
- const existing = out.get(op.startInOld) ?? []
167
- existing.push(...words)
168
- out.set(op.startInOld, existing)
186
+ /**
187
+ * Emit any insertions at boundary `b`.
188
+ *
189
+ * Reading model: a legal reviewer wants to see CP's INTENT relative
190
+ * to Me's current content. Me's content is the base; CP's deltas are
191
+ * what they need to act on. Under that framing:
192
+ * - tokens both authors inserted at the same boundary → settled
193
+ * - tokens CP inserted that Me doesn't have → ins-cp (CP wants
194
+ * this added)
195
+ * - tokens Me inserted that CP doesn't have → del-cp (CP wants
196
+ * this removed from Me's content)
197
+ *
198
+ * The third case is the load-bearing attribution flip. The
199
+ * genesis-spine view technically labels me-only-at-boundary tokens
200
+ * as "ins-me" (Me added them; CP didn't), but that's confusing to
201
+ * a reviewer: they see "Me added X" alongside "CP added Y" and have
202
+ * to mentally derive "CP wants X gone, replaced with Y". Surfacing
203
+ * me-only tokens as `del-cp` shows CP's intent directly:
204
+ * - "CP accepted Me's text minus `things`": settled bulk + del-cp
205
+ * `things` (no parallel redundant insertions)
206
+ * - "CP wants `cruel` where Me wrote `brave`": ins-cp `cruel` +
207
+ * del-cp `brave` (the substitution intent reads directly)
208
+ * - "CP added extra words": cp-extras stay as ins-cp (same as
209
+ * before; the cp-only direction was always intent-correct)
210
+ *
211
+ * Pure single-side insertions (Me added text CP doesn't engage
212
+ * with at all, or vice versa) keep their genesis-spine attribution
213
+ * — these aren't refinement cases, just Me's own content additions.
214
+ */
215
+ function emitBoundary(
216
+ b: number,
217
+ cpInsAt: Map<number, string[]>,
218
+ meInsAt: Map<number, string[]>,
219
+ _cpDiffWords: readonly string[],
220
+ _meDiffWords: readonly string[],
221
+ segments: Segment[]
222
+ ) {
223
+ const cpIns = cpInsAt.get(b)
224
+ const meIns = meInsAt.get(b)
225
+ const hasCp = !!cpIns && cpIns.length > 0
226
+ const hasMe = !!meIns && meIns.length > 0
227
+ if (!hasCp && !hasMe) return
228
+
229
+ // Only-one-side: emit verbatim with that side's attribution.
230
+ // Genuine single-author additions stay author-attributed.
231
+ if (!hasCp) {
232
+ appendSegment(segments, { kind: 'ins', author: 'me' }, meIns!)
233
+ return
234
+ }
235
+ if (!hasMe) {
236
+ appendSegment(segments, { kind: 'ins', author: 'cp' }, cpIns!)
237
+ return
238
+ }
239
+
240
+ // Both sides inserted. Identical → settled. Otherwise LCS-align
241
+ // and apply the asymmetric intent reading.
242
+ if (tokenArraysEqual(cpIns!, meIns!)) {
243
+ appendSegment(segments, { kind: 'equal' }, cpIns!)
244
+ return
245
+ }
246
+
247
+ const alignment = lcsAlign(cpIns! as string[], meIns! as string[])
248
+ for (const a of alignment) {
249
+ if (a.oldIdx !== null && a.newIdx !== null) {
250
+ // Token appears in both insertions → settled.
251
+ appendSegment(segments, { kind: 'equal' }, [cpIns![a.oldIdx]])
252
+ } else if (a.oldIdx !== null) {
253
+ // Token in cp's insertion only → CP wants this added.
254
+ appendSegment(segments, { kind: 'ins', author: 'cp' }, [cpIns![a.oldIdx]])
255
+ } else if (a.newIdx !== null) {
256
+ // Token in me's insertion only → CP wants this removed from
257
+ // Me's content. (Genesis-spine would label this ins-me, but
258
+ // that reading is misleading for a reviewer at this kind of
259
+ // shared boundary — see the function-level comment.)
260
+ appendSegment(segments, { kind: 'del', author: 'cp' }, [meIns![a.newIdx]])
261
+ }
169
262
  }
170
- return out
171
263
  }
172
264
 
173
- function appendSegment(segments: Segment[], attr: Attribution, words: string[]) {
265
+ function tokenArraysEqual(a: readonly string[], b: readonly string[]): boolean {
266
+ if (a.length !== b.length) return false
267
+ for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false
268
+ return true
269
+ }
270
+
271
+ function appendSegment(segments: Segment[], attr: Attribution, words: readonly string[]) {
174
272
  if (words.length === 0) return
175
273
  const last = segments[segments.length - 1]
176
274
  if (last && sameAttribution(last.attr, attr)) {
@@ -184,7 +282,6 @@ function sameAttribution(a: Attribution, b: Attribution): boolean {
184
282
  if (a.kind === 'equal' && b.kind === 'equal') return true
185
283
  if (a.kind === 'ins' && b.kind === 'ins') return a.author === b.author
186
284
  if (a.kind === 'del' && b.kind === 'del') return a.author === b.author
187
- if (a.kind === 'reject' && b.kind === 'reject') return true
188
285
  return false
189
286
  }
190
287
 
@@ -195,29 +292,25 @@ function sameAttribution(a: Attribution, b: Attribution): boolean {
195
292
  * pre-wrap) stay consistent. A change here propagates to every author
196
293
  * marker in the output.
197
294
  */
198
- export function authorAttribution(author: Author, rejects?: Author): WrapMetadata {
199
- const dataAttrs: Record<string, string> = { author }
200
- if (rejects !== undefined) dataAttrs.rejects = rejects
201
- const extraClasses = rejects !== undefined ? `${author} rejects-${rejects}` : author
202
- return { extraClasses, dataAttrs }
295
+ export function authorAttribution(author: Author): WrapMetadata {
296
+ return { extraClasses: author, dataAttrs: { author } }
203
297
  }
204
298
 
205
299
  /**
206
300
  * Resolve a segment's attribution into the wrapper-tag, base CSS class,
207
301
  * and `WrapMetadata` consumed by `Utils.wrapText` / `insertTag`. The
208
302
  * caller is `HtmlDiff.executeThreeWay`'s emission loop.
303
+ *
304
+ * `equal` segments don't go through this — they're emitted unmarked.
209
305
  */
210
306
  export function segmentEmissionShape(attr: Exclude<Attribution, { kind: 'equal' }>): {
211
307
  tag: 'ins' | 'del'
212
308
  baseClass: 'diffins' | 'diffdel'
213
309
  metadata: WrapMetadata
214
310
  } {
215
- switch (attr.kind) {
216
- case 'ins':
217
- return { tag: 'ins', baseClass: 'diffins', metadata: authorAttribution(attr.author) }
218
- case 'del':
219
- return { tag: 'del', baseClass: 'diffdel', metadata: authorAttribution(attr.author) }
220
- case 'reject':
221
- return { tag: 'del', baseClass: 'diffdel', metadata: authorAttribution(attr.by, attr.rejected) }
311
+ return {
312
+ tag: attr.kind,
313
+ baseClass: attr.kind === 'ins' ? 'diffins' : 'diffdel',
314
+ metadata: authorAttribution(attr.author),
222
315
  }
223
316
  }