@usejunior/docx-core 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +86 -28
- package/dist/.tsbuildinfo +1 -0
- package/dist/atomizer.d.ts +218 -0
- package/dist/atomizer.d.ts.map +1 -0
- package/dist/atomizer.js +856 -0
- package/dist/atomizer.js.map +1 -0
- package/dist/baselines/atomizer/atomLcs.d.ts +96 -0
- package/dist/baselines/atomizer/atomLcs.d.ts.map +1 -0
- package/dist/baselines/atomizer/atomLcs.js +347 -0
- package/dist/baselines/atomizer/atomLcs.js.map +1 -0
- package/dist/baselines/atomizer/debug.d.ts +41 -0
- package/dist/baselines/atomizer/debug.d.ts.map +1 -0
- package/dist/baselines/atomizer/debug.js +85 -0
- package/dist/baselines/atomizer/debug.js.map +1 -0
- package/dist/baselines/atomizer/documentReconstructor.d.ts +64 -0
- package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -0
- package/dist/baselines/atomizer/documentReconstructor.js +939 -0
- package/dist/baselines/atomizer/documentReconstructor.js.map +1 -0
- package/dist/baselines/atomizer/hierarchicalLcs.d.ts +111 -0
- package/dist/baselines/atomizer/hierarchicalLcs.d.ts.map +1 -0
- package/dist/baselines/atomizer/hierarchicalLcs.js +469 -0
- package/dist/baselines/atomizer/hierarchicalLcs.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts +183 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.js +1600 -0
- package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -0
- package/dist/baselines/atomizer/numberingIntegration.d.ts +59 -0
- package/dist/baselines/atomizer/numberingIntegration.d.ts.map +1 -0
- package/dist/baselines/atomizer/numberingIntegration.js +209 -0
- package/dist/baselines/atomizer/numberingIntegration.js.map +1 -0
- package/dist/baselines/atomizer/pipeline.d.ts +65 -0
- package/dist/baselines/atomizer/pipeline.d.ts.map +1 -0
- package/dist/baselines/atomizer/pipeline.js +510 -0
- package/dist/baselines/atomizer/pipeline.js.map +1 -0
- package/dist/baselines/atomizer/premergeRuns.d.ts +26 -0
- package/dist/baselines/atomizer/premergeRuns.d.ts.map +1 -0
- package/dist/baselines/atomizer/premergeRuns.js +150 -0
- package/dist/baselines/atomizer/premergeRuns.js.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.d.ts +63 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.d.ts.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.js +254 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.js.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts +64 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js +586 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -0
- package/dist/baselines/atomizer/xmlToWmlElement.d.ts +65 -0
- package/dist/baselines/atomizer/xmlToWmlElement.d.ts.map +1 -0
- package/dist/baselines/atomizer/xmlToWmlElement.js +95 -0
- package/dist/baselines/atomizer/xmlToWmlElement.js.map +1 -0
- package/dist/baselines/diffmatch/documentBuilder.d.ts +44 -0
- package/dist/baselines/diffmatch/documentBuilder.d.ts.map +1 -0
- package/dist/baselines/diffmatch/documentBuilder.js +227 -0
- package/dist/baselines/diffmatch/documentBuilder.js.map +1 -0
- package/dist/baselines/diffmatch/paragraphAlignment.d.ts +75 -0
- package/dist/baselines/diffmatch/paragraphAlignment.d.ts.map +1 -0
- package/dist/baselines/diffmatch/paragraphAlignment.js +206 -0
- package/dist/baselines/diffmatch/paragraphAlignment.js.map +1 -0
- package/dist/baselines/diffmatch/pipeline.d.ts +33 -0
- package/dist/baselines/diffmatch/pipeline.d.ts.map +1 -0
- package/dist/baselines/diffmatch/pipeline.js +84 -0
- package/dist/baselines/diffmatch/pipeline.js.map +1 -0
- package/dist/baselines/diffmatch/runDiff.d.ts +53 -0
- package/dist/baselines/diffmatch/runDiff.d.ts.map +1 -0
- package/dist/baselines/diffmatch/runDiff.js +253 -0
- package/dist/baselines/diffmatch/runDiff.js.map +1 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.d.ts +64 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.d.ts.map +1 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.js +178 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.js.map +1 -0
- package/dist/baselines/diffmatch/xmlParser.d.ts +45 -0
- package/dist/baselines/diffmatch/xmlParser.d.ts.map +1 -0
- package/dist/baselines/diffmatch/xmlParser.js +344 -0
- package/dist/baselines/diffmatch/xmlParser.js.map +1 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts +51 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts.map +1 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.js +83 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.js.map +1 -0
- package/dist/baselines/wmlcomparer/DotnetCli.d.ts +40 -0
- package/dist/baselines/wmlcomparer/DotnetCli.d.ts.map +1 -0
- package/dist/baselines/wmlcomparer/DotnetCli.js +135 -0
- package/dist/baselines/wmlcomparer/DotnetCli.js.map +1 -0
- package/dist/benchmark/metrics.d.ts +72 -0
- package/dist/benchmark/metrics.d.ts.map +1 -0
- package/dist/benchmark/metrics.js +45 -0
- package/dist/benchmark/metrics.js.map +1 -0
- package/dist/benchmark/reporter.d.ts +23 -0
- package/dist/benchmark/reporter.d.ts.map +1 -0
- package/dist/benchmark/reporter.js +147 -0
- package/dist/benchmark/reporter.js.map +1 -0
- package/dist/benchmark/runner.d.ts +30 -0
- package/dist/benchmark/runner.d.ts.map +1 -0
- package/dist/benchmark/runner.js +233 -0
- package/dist/benchmark/runner.js.map +1 -0
- package/dist/cli/compare-two.d.ts +28 -0
- package/dist/cli/compare-two.d.ts.map +1 -0
- package/dist/cli/compare-two.js +110 -0
- package/dist/cli/compare-two.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +21 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core-types.d.ts +296 -0
- package/dist/core-types.d.ts.map +1 -0
- package/dist/core-types.js +122 -0
- package/dist/core-types.js.map +1 -0
- package/dist/footnotes.d.ts +144 -0
- package/dist/footnotes.d.ts.map +1 -0
- package/dist/footnotes.js +291 -0
- package/dist/footnotes.js.map +1 -0
- package/dist/format-detection.d.ts +120 -0
- package/dist/format-detection.d.ts.map +1 -0
- package/dist/format-detection.js +338 -0
- package/dist/format-detection.js.map +1 -0
- package/dist/index.d.ts +177 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +55 -0
- package/dist/index.js.map +1 -0
- package/dist/integration/output-artifacts.d.ts +6 -0
- package/dist/integration/output-artifacts.d.ts.map +1 -0
- package/dist/integration/output-artifacts.js +30 -0
- package/dist/integration/output-artifacts.js.map +1 -0
- package/dist/move-detection.d.ts +211 -0
- package/dist/move-detection.d.ts.map +1 -0
- package/dist/move-detection.js +391 -0
- package/dist/move-detection.js.map +1 -0
- package/dist/numbering.d.ts +136 -0
- package/dist/numbering.d.ts.map +1 -0
- package/dist/numbering.js +446 -0
- package/dist/numbering.js.map +1 -0
- package/dist/primitives/accept_changes.d.ts +30 -0
- package/dist/primitives/accept_changes.d.ts.map +1 -0
- package/dist/primitives/accept_changes.js +241 -0
- package/dist/primitives/accept_changes.js.map +1 -0
- package/dist/primitives/bookmarks.d.ts +12 -0
- package/dist/primitives/bookmarks.d.ts.map +1 -0
- package/dist/primitives/bookmarks.js +248 -0
- package/dist/primitives/bookmarks.js.map +1 -0
- package/dist/primitives/comments.d.ts +88 -0
- package/dist/primitives/comments.d.ts.map +1 -0
- package/dist/primitives/comments.js +703 -0
- package/dist/primitives/comments.js.map +1 -0
- package/dist/primitives/document.d.ts +168 -0
- package/dist/primitives/document.d.ts.map +1 -0
- package/dist/primitives/document.js +532 -0
- package/dist/primitives/document.js.map +1 -0
- package/dist/primitives/document_view.d.ts +93 -0
- package/dist/primitives/document_view.d.ts.map +1 -0
- package/dist/primitives/document_view.js +722 -0
- package/dist/primitives/document_view.js.map +1 -0
- package/dist/primitives/dom-helpers.d.ts +94 -0
- package/dist/primitives/dom-helpers.d.ts.map +1 -0
- package/dist/primitives/dom-helpers.js +219 -0
- package/dist/primitives/dom-helpers.js.map +1 -0
- package/dist/primitives/errors.d.ts +7 -0
- package/dist/primitives/errors.d.ts.map +1 -0
- package/dist/primitives/errors.js +10 -0
- package/dist/primitives/errors.js.map +1 -0
- package/dist/primitives/extract_revisions.d.ts +50 -0
- package/dist/primitives/extract_revisions.d.ts.map +1 -0
- package/dist/primitives/extract_revisions.js +340 -0
- package/dist/primitives/extract_revisions.js.map +1 -0
- package/dist/primitives/footnotes.d.ts +37 -0
- package/dist/primitives/footnotes.d.ts.map +1 -0
- package/dist/primitives/footnotes.js +552 -0
- package/dist/primitives/footnotes.js.map +1 -0
- package/dist/primitives/formatting_tags.d.ts +30 -0
- package/dist/primitives/formatting_tags.d.ts.map +1 -0
- package/dist/primitives/formatting_tags.js +217 -0
- package/dist/primitives/formatting_tags.js.map +1 -0
- package/dist/primitives/index.d.ts +26 -0
- package/dist/primitives/index.d.ts.map +1 -0
- package/dist/primitives/index.js +26 -0
- package/dist/primitives/index.js.map +1 -0
- package/dist/primitives/layout.d.ts +53 -0
- package/dist/primitives/layout.d.ts.map +1 -0
- package/dist/primitives/layout.js +178 -0
- package/dist/primitives/layout.js.map +1 -0
- package/dist/primitives/list_labels.d.ts +19 -0
- package/dist/primitives/list_labels.d.ts.map +1 -0
- package/dist/primitives/list_labels.js +57 -0
- package/dist/primitives/list_labels.js.map +1 -0
- package/dist/primitives/matching.d.ts +17 -0
- package/dist/primitives/matching.d.ts.map +1 -0
- package/dist/primitives/matching.js +144 -0
- package/dist/primitives/matching.js.map +1 -0
- package/dist/primitives/merge_runs.d.ts +23 -0
- package/dist/primitives/merge_runs.d.ts.map +1 -0
- package/dist/primitives/merge_runs.js +195 -0
- package/dist/primitives/merge_runs.js.map +1 -0
- package/dist/primitives/namespaces.d.ts +90 -0
- package/dist/primitives/namespaces.d.ts.map +1 -0
- package/dist/primitives/namespaces.js +107 -0
- package/dist/primitives/namespaces.js.map +1 -0
- package/dist/primitives/numbering.d.ts +27 -0
- package/dist/primitives/numbering.d.ts.map +1 -0
- package/dist/primitives/numbering.js +182 -0
- package/dist/primitives/numbering.js.map +1 -0
- package/dist/primitives/prevent_double_elevation.d.ts +18 -0
- package/dist/primitives/prevent_double_elevation.d.ts.map +1 -0
- package/dist/primitives/prevent_double_elevation.js +190 -0
- package/dist/primitives/prevent_double_elevation.js.map +1 -0
- package/dist/primitives/reject_changes.d.ts +27 -0
- package/dist/primitives/reject_changes.d.ts.map +1 -0
- package/dist/primitives/reject_changes.js +371 -0
- package/dist/primitives/reject_changes.js.map +1 -0
- package/dist/primitives/relationships.d.ts +7 -0
- package/dist/primitives/relationships.d.ts.map +1 -0
- package/dist/primitives/relationships.js +24 -0
- package/dist/primitives/relationships.js.map +1 -0
- package/dist/primitives/semantic_tags.d.ts +32 -0
- package/dist/primitives/semantic_tags.d.ts.map +1 -0
- package/dist/primitives/semantic_tags.js +139 -0
- package/dist/primitives/semantic_tags.js.map +1 -0
- package/dist/primitives/simplify_redlines.d.ts +19 -0
- package/dist/primitives/simplify_redlines.d.ts.map +1 -0
- package/dist/primitives/simplify_redlines.js +94 -0
- package/dist/primitives/simplify_redlines.js.map +1 -0
- package/dist/primitives/styles.d.ts +36 -0
- package/dist/primitives/styles.d.ts.map +1 -0
- package/dist/primitives/styles.js +190 -0
- package/dist/primitives/styles.js.map +1 -0
- package/dist/primitives/text.d.ts +27 -0
- package/dist/primitives/text.d.ts.map +1 -0
- package/dist/primitives/text.js +416 -0
- package/dist/primitives/text.js.map +1 -0
- package/dist/primitives/validate_document.d.ts +24 -0
- package/dist/primitives/validate_document.d.ts.map +1 -0
- package/dist/primitives/validate_document.js +147 -0
- package/dist/primitives/validate_document.js.map +1 -0
- package/dist/primitives/xml.d.ts +5 -0
- package/dist/primitives/xml.d.ts.map +1 -0
- package/dist/primitives/xml.js +19 -0
- package/dist/primitives/xml.js.map +1 -0
- package/dist/primitives/zip.d.ts +25 -0
- package/dist/primitives/zip.d.ts.map +1 -0
- package/dist/primitives/zip.js +78 -0
- package/dist/primitives/zip.js.map +1 -0
- package/dist/shared/docx/DocxArchive.d.ts +94 -0
- package/dist/shared/docx/DocxArchive.d.ts.map +1 -0
- package/dist/shared/docx/DocxArchive.js +169 -0
- package/dist/shared/docx/DocxArchive.js.map +1 -0
- package/dist/shared/ooxml/namespaces.d.ts +149 -0
- package/dist/shared/ooxml/namespaces.d.ts.map +1 -0
- package/dist/shared/ooxml/namespaces.js +224 -0
- package/dist/shared/ooxml/namespaces.js.map +1 -0
- package/dist/shared/ooxml/types.d.ts +136 -0
- package/dist/shared/ooxml/types.d.ts.map +1 -0
- package/dist/shared/ooxml/types.js +7 -0
- package/dist/shared/ooxml/types.js.map +1 -0
- package/package.json +63 -6
package/dist/atomizer.js
ADDED
|
@@ -0,0 +1,856 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Atomizer Module
|
|
3
|
+
*
|
|
4
|
+
* Provides factory functions for creating ComparisonUnitAtom instances.
|
|
5
|
+
* Implements the core atomization logic from WmlComparer.
|
|
6
|
+
*
|
|
7
|
+
* @see WmlComparer.cs ComparisonUnitAtom constructor (lines 2314-2343)
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from 'crypto';
|
|
10
|
+
import { DOMParser } from '@xmldom/xmldom';
|
|
11
|
+
import { CorrelationStatus, } from './core-types.js';
|
|
12
|
+
import { getLeafText, setLeafText, childElements, findChildByTagName, } from './primitives/index.js';
|
|
13
|
+
// =============================================================================
|
|
14
|
+
// Shared synthetic document for creating virtual elements
|
|
15
|
+
// =============================================================================
|
|
16
|
+
/**
|
|
17
|
+
* A shared document used to create synthetic/virtual DOM elements.
|
|
18
|
+
* These elements are not part of any real parsed document.
|
|
19
|
+
*/
|
|
20
|
+
// Parsed once when this module is evaluated from a bare `<root/>` document.
// createEmptyParagraphAtomWithContext() calls SYNTHETIC_DOC.createElement() on it
// to build placeholder elements.
const SYNTHETIC_DOC = new DOMParser().parseFromString('<root/>', 'application/xml');
|
|
21
|
+
// =============================================================================
|
|
22
|
+
// SHA1 Hashing
|
|
23
|
+
// =============================================================================
|
|
24
|
+
/**
 * Compute the SHA-1 digest of a string.
 *
 * The digest is used as a quick equality key for comparison units.
 *
 * @param content - Text to hash; it is fed to the hash as UTF-8
 * @returns The digest as a hexadecimal string
 */
export function sha1(content) {
    const hasher = createHash('sha1');
    hasher.update(content, 'utf8');
    return hasher.digest('hex');
}
|
|
35
|
+
/**
|
|
36
|
+
* Attributes that should be excluded from hashing for certain elements.
|
|
37
|
+
*
|
|
38
|
+
* - xml:space: A whitespace preservation hint that doesn't affect content.
|
|
39
|
+
* Documents may have this attribute present on some w:t elements and absent
|
|
40
|
+
* on others with identical text, causing spurious hash mismatches.
|
|
41
|
+
*/
|
|
42
|
+
// hashElement() looks attributes up here by their full `attr.name`
// (prefix included), so entries must be spelled exactly as they appear in the XML.
const IGNORED_HASH_ATTRIBUTES = new Set(['xml:space']);
|
|
43
|
+
/**
 * Compute a SHA-1 key for an element.
 *
 * The hashed string is the tag name, then one `name=value` entry per
 * attribute (ordered by attribute name), then the element's leaf text when
 * `getLeafText` returns one, all joined with `|`. Attributes listed in
 * IGNORED_HASH_ATTRIBUTES are left out.
 *
 * @param element - The element to hash
 * @returns Hexadecimal SHA-1 hash string
 */
export function hashElement(element) {
    // Gather [name, value] pairs, dropping attributes that must not influence the hash.
    const hashedAttributes = Array.from(element.attributes, (attr) => [attr.name, attr.value])
        .filter(([name]) => !IGNORED_HASH_ATTRIBUTES.has(name));
    // Order by name so the order in which attributes were written is irrelevant.
    hashedAttributes.sort(([left], [right]) => left.localeCompare(right));
    const segments = [
        element.tagName,
        ...hashedAttributes.map(([name, value]) => `${name}=${value}`),
    ];
    const text = getLeafText(element);
    if (text !== undefined) {
        segments.push(text);
    }
    return sha1(segments.join('|'));
}
|
|
72
|
+
// =============================================================================
|
|
73
|
+
// Revision Tracking Detection
|
|
74
|
+
// =============================================================================
|
|
75
|
+
/**
|
|
76
|
+
* Revision tracking element tag names.
|
|
77
|
+
*/
|
|
78
|
+
// Matched against `element.tagName` by findRevisionTrackingElement(); each entry
// has a corresponding case in getStatusFromRevisionTracking().
const REVISION_TRACKING_TAGS = new Set(['w:ins', 'w:del', 'w:moveFrom', 'w:moveTo']);
|
|
79
|
+
/**
 * Locate the closest revision-tracking ancestor.
 *
 * Walks the ancestor list backwards (nearest ancestor first) and returns the
 * first element whose tag name is in REVISION_TRACKING_TAGS
 * (w:ins, w:del, w:moveFrom, w:moveTo).
 *
 * @param ancestors - Ancestor elements ordered from root to parent
 * @returns The nearest revision tracking element, or undefined when there is none
 */
export function findRevisionTrackingElement(ancestors) {
    let index = ancestors.length;
    while (index-- > 0) {
        const candidate = ancestors[index];
        // Missing entries are skipped rather than dereferenced.
        if (candidate && REVISION_TRACKING_TAGS.has(candidate.tagName)) {
            return candidate;
        }
    }
    return undefined;
}
|
|
97
|
+
/**
 * Map a revision tracking element to the correlation status an atom starts with.
 *
 * w:ins -> Inserted, w:del -> Deleted, w:moveFrom -> MovedSource,
 * w:moveTo -> MovedDestination. A missing element or any other tag name
 * yields Unknown.
 *
 * @param revTrackElement - The revision tracking element (if any)
 * @returns Initial correlation status
 */
export function getStatusFromRevisionTracking(revTrackElement) {
    const tagName = revTrackElement?.tagName;
    if (tagName === 'w:ins') {
        return CorrelationStatus.Inserted;
    }
    if (tagName === 'w:del') {
        return CorrelationStatus.Deleted;
    }
    if (tagName === 'w:moveFrom') {
        return CorrelationStatus.MovedSource;
    }
    if (tagName === 'w:moveTo') {
        return CorrelationStatus.MovedDestination;
    }
    return CorrelationStatus.Unknown;
}
|
|
120
|
+
// =============================================================================
|
|
121
|
+
// Ancestor Unid Extraction
|
|
122
|
+
// =============================================================================
|
|
123
|
+
/**
 * Collect the `w:Unid` attribute values carried by a chain of ancestors.
 *
 * WmlComparer uses w:Unid attributes to correlate elements between documents.
 * Ancestors without the attribute (or with an empty value) contribute nothing,
 * so the result can be shorter than the input.
 *
 * @param ancestors - Ancestor elements from root to parent
 * @returns Unid values in the same order as the ancestors that carry them
 */
export function extractAncestorUnids(ancestors) {
    return Array.from(ancestors, (ancestor) => ancestor.getAttribute('w:Unid'))
        .filter((unid) => Boolean(unid));
}
|
|
141
|
+
// =============================================================================
|
|
142
|
+
// Leaf Node Detection
|
|
143
|
+
// =============================================================================
|
|
144
|
+
/**
 * Element names the atomizer treats as indivisible.
 *
 * isLeafNode() tests an element's tagName against this set; an element that
 * matches becomes a single atom and the tree walk does not descend into it.
 */
const LEAF_NODE_TAGS = new Set([
    // Text and inline characters
    'w:t',
    'w:br',
    'w:cr',
    'w:tab',
    'w:sym',
    'w:softHyphen',
    'w:noBreakHyphen',
    // Field characters, field instruction text, deleted text
    'w:fldChar',
    'w:instrText',
    'w:delText',
    // Date field parts (short/long day, month, year)
    'w:dayShort',
    'w:dayLong',
    'w:monthShort',
    'w:monthLong',
    'w:yearShort',
    'w:yearLong',
    // Annotation, footnote and endnote markers/references
    'w:annotationRef',
    'w:footnoteRef',
    'w:endnoteRef',
    'w:footnoteReference',
    'w:endnoteReference',
    // Separators and page number
    'w:separator',
    'w:continuationSeparator',
    'w:pgNum',
    // Graphics, embedded objects and alternate content: compared as a whole
    'w:drawing',
    'w:pict',
    'w:object',
    'mc:AlternateContent',
]);
|
|
177
|
+
/**
|
|
178
|
+
* Special tag name for empty paragraph boundary atoms.
|
|
179
|
+
* These atoms are created for paragraphs that have no content (only w:pPr).
|
|
180
|
+
*/
|
|
181
|
+
// Tag name given to the synthetic content element that
// createEmptyParagraphAtomWithContext() creates via SYNTHETIC_DOC.createElement().
export const EMPTY_PARAGRAPH_TAG = '__emptyParagraph__';
|
|
182
|
+
/**
 * Tell whether an element is a leaf for atomization purposes.
 *
 * Leaf nodes are the smallest units that can be compared; membership is
 * decided purely by the element's tag name (see LEAF_NODE_TAGS).
 *
 * @param element - The element to check
 * @returns True when the element's tag name is in LEAF_NODE_TAGS
 */
export function isLeafNode(element) {
    const { tagName } = element;
    return LEAF_NODE_TAGS.has(tagName);
}
|
|
193
|
+
/**
 * Build a ComparisonUnitAtom for a leaf element.
 *
 * Mirrors the C# ComparisonUnitAtom constructor:
 * 1. looks for a revision tracking element among the ancestors,
 * 2. derives the initial correlation status from it,
 * 3. collects the ancestors' Unids,
 * 4. hashes the content element,
 * and additionally stores a deep clone of the run properties returned by
 * getRunProperties() (null when none are found).
 *
 * @param options - `contentElement`, `ancestors` (root to parent) and `part`
 * @returns A new ComparisonUnitAtom
 *
 * @see WmlComparer.cs lines 2314-2343
 */
export function createComparisonUnitAtom(options) {
    const { contentElement: leaf, ancestors: chain, part } = options;
    const revTrackElement = findRevisionTrackingElement(chain);
    const correlationStatus = getStatusFromRevisionTracking(revTrackElement);
    const ancestorUnids = extractAncestorUnids(chain);
    const sha1Hash = hashElement(leaf);
    // Clone the run properties so the atom owns its own copy.
    const sourceRunProps = getRunProperties({ ancestorElements: chain });
    let rPr = null;
    if (sourceRunProps) {
        rPr = sourceRunProps.cloneNode(true);
    }
    return {
        contentElement: leaf,
        // The atom gets its own array so later changes to the caller's list do not leak in.
        ancestorElements: Array.from(chain),
        ancestorUnids,
        part,
        revTrackElement,
        sha1Hash,
        correlationStatus,
        rPr,
    };
}
|
|
231
|
+
// =============================================================================
|
|
232
|
+
// Tree Atomization
|
|
233
|
+
// =============================================================================
|
|
234
|
+
/**
 * Tell whether a node is a paragraph without content.
 *
 * A `w:p` counts as empty when it has no child elements, or when every child
 * element is `w:pPr`. Any other child (a run, or anything else) makes it
 * non-empty. Nodes that are not `w:p` are never empty paragraphs.
 */
function isEmptyParagraph(node) {
    const isParagraph = node.tagName === 'w:p';
    // every() is true for an empty list, which covers the "no children at all" case.
    return isParagraph && [...childElements(node)].every((child) => child.tagName === 'w:pPr');
}
|
|
253
|
+
/**
 * Build the boundary atom that stands in for an empty paragraph.
 *
 * The atom's content element is a synthetic EMPTY_PARAGRAPH_TAG element, and
 * the paragraph itself is appended to the atom's ancestor list. The hash is
 * derived from three things so that empty paragraphs only match when they sit
 * at the same logical position:
 *   - the hash of the content that precedes the paragraph
 *     (`state.lastContentHash`, or 'document-start' when nothing precedes it),
 *   - `state.emptyParagraphCount`, which tells consecutive empty paragraphs apart,
 *   - the hash of the paragraph's w:pPr (or 'no-pPr').
 *
 * @param paragraphElement - The w:p element
 * @param ancestors - Ancestor elements from root to parent
 * @param part - The OPC part
 * @param state - Atomization state (`lastContentHash`, `emptyParagraphCount`); only read here
 */
function createEmptyParagraphAtomWithContext(paragraphElement, ancestors, part, state) {
    const revTrackElement = findRevisionTrackingElement(ancestors);
    const paragraphProps = findChildByTagName(paragraphElement, 'w:pPr');
    const propsKey = paragraphProps ? hashElement(paragraphProps) : 'no-pPr';
    const precedingKey = state.lastContentHash || 'document-start';
    const positionKey = `empty-paragraph:${precedingKey}:${state.emptyParagraphCount}:${propsKey}`;
    return {
        // Placeholder element; the real paragraph is reachable through ancestorElements.
        contentElement: SYNTHETIC_DOC.createElement(EMPTY_PARAGRAPH_TAG),
        ancestorElements: [...ancestors, paragraphElement],
        ancestorUnids: extractAncestorUnids(ancestors),
        part,
        revTrackElement,
        sha1Hash: sha1(positionKey),
        correlationStatus: getStatusFromRevisionTracking(revTrackElement),
        isEmptyParagraph: true,
        rPr: null, // no runs, hence no run formatting
    };
}
|
|
295
|
+
/**
 * Depth-first atomization of a subtree, threading shared state through the walk.
 *
 * For each visited node, in document order:
 *   - a leaf node (see isLeafNode) becomes one atom, and its hash is recorded
 *     in `state.lastContentHash`;
 *   - an empty paragraph becomes one boundary atom, and
 *     `state.emptyParagraphCount` is incremented afterwards;
 *   - any other node is descended into, with the node appended to the
 *     ancestor chain handed to its children.
 *
 * All atoms are appended to a single result array. Returning an array per
 * level and merging with `push(...childAtoms)` would pass a whole subtree's
 * atoms as call arguments, which throws a RangeError once a subtree holds
 * more atoms than the engine accepts as arguments.
 *
 * @param node - Root of the subtree to atomize
 * @param ancestors - Ancestor elements of `node`, from root to parent
 * @param part - The OPC part the subtree belongs to
 * @param state - Mutable walk state (`lastContentHash`, `emptyParagraphCount`)
 * @param options - Normalized options; only `cloneLeafNodes` is read here
 * @returns Atoms of the subtree in document order
 */
function atomizeTreeInternal(node, ancestors, part, state, options) {
    const atoms = [];
    const visit = (current, currentAncestors) => {
        if (isLeafNode(current)) {
            const atom = createComparisonUnitAtom({
                contentElement: options.cloneLeafNodes ? current.cloneNode(true) : current,
                ancestors: currentAncestors,
                part,
            });
            atoms.push(atom);
            // Remember the latest content hash for context-aware empty paragraph matching.
            state.lastContentHash = atom.sha1Hash;
            return;
        }
        if (isEmptyParagraph(current)) {
            // The atom must be built before the counter moves: its hash embeds the current count.
            atoms.push(createEmptyParagraphAtomWithContext(current, currentAncestors, part, state));
            state.emptyParagraphCount++;
            return;
        }
        // Built once per node and shared by all children; the atom factories
        // copy the chain before storing it.
        const childAncestors = [...currentAncestors, current];
        for (const child of childElements(current)) {
            visit(child, childAncestors);
        }
    };
    visit(node, ancestors);
    return atoms;
}
|
|
322
|
+
/**
 * Atomize a document tree into a flat list of ComparisonUnitAtoms.
 *
 * Walks the tree to create one atom per leaf node plus one boundary atom per
 * empty paragraph, then post-processes the list in four steps: field
 * sequences are collapsed, contiguous text atoms are merged, text is split
 * into words (optional), and punctuation atoms are merged.
 *
 * @param node - The current node in the tree
 * @param ancestors - Ancestor elements from root to parent of node
 * @param part - The OPC part this tree belongs to
 * @param options - Optional settings:
 *   - `cloneLeafNodes` (default false): store a deep clone of each leaf element in its atom
 *   - `mergeAcrossRuns` (default true): forwarded to mergeContiguousTextAtoms
 *   - `mergePunctuationAcrossRuns` (default true): forwarded to mergePunctuationAtoms
 *   - `splitTextIntoWords` (default true): run the word-splitting step
 *   - `debug` (default false): print a one-line summary of the atom counts
 * @returns `{ atoms, emptyParagraphCount }` - the final atom list and the
 *   number of empty paragraphs encountered during the walk
 */
export function atomizeTree(node, ancestors, part, options = {}) {
    const normalizedOptions = {
        cloneLeafNodes: options.cloneLeafNodes ?? false,
        mergeAcrossRuns: options.mergeAcrossRuns ?? true,
        mergePunctuationAcrossRuns: options.mergePunctuationAcrossRuns ?? true,
        splitTextIntoWords: options.splitTextIntoWords ?? true,
    };
    const state = {
        emptyParagraphCount: 0,
        lastContentHash: '',
    };
    const rawAtoms = atomizeTreeInternal(node, ancestors, part, state, normalizedOptions);
    // Step 1: Collapse field sequences into single atoms based on visible text.
    // This allows matching between hardcoded text and field references.
    const fieldCollapsedAtoms = collapseFieldSequences(rawAtoms);
    // Step 2: Merge contiguous text atoms with same formatting.
    // This normalizes different w:t split boundaries.
    const mergedAtoms = mergeContiguousTextAtoms(fieldCollapsedAtoms, normalizedOptions);
    // Step 3: Split merged atoms at word boundaries for finer-grained comparison.
    const wordSplitAtoms = normalizedOptions.splitTextIntoWords
        ? splitAtomsIntoWords(mergedAtoms)
        : mergedAtoms;
    // Step 4: Merge punctuation-only atoms with preceding text
    // ("Conduct" + "," vs "Conduct,"). Must run AFTER the word split, since
    // that is when punctuation becomes separate atoms.
    const atoms = mergePunctuationAtoms(wordSplitAtoms, normalizedOptions);
    // Diagnostics are opt-in: a library must not write to stdout on every call,
    // because hosts may reserve stdout for their own output.
    if (options.debug === true) {
        console.log(`[DEBUG] atomizeTree: created ${rawAtoms.length} atoms, field-collapsed to ${fieldCollapsedAtoms.length}, merged to ${mergedAtoms.length}, word-split to ${wordSplitAtoms.length}, punct-merged to ${atoms.length}, ${state.emptyParagraphCount} empty paragraphs`);
    }
    return { atoms, emptyParagraphCount: state.emptyParagraphCount };
}
|
|
363
|
+
/**
 * List the element ancestors of a node by following `parentNode` links.
 *
 * The walk stops at the first parent that is missing or is not an element
 * (nodeType 1), so a containing document node is not included.
 *
 * @param node - The node to get ancestors for
 * @returns Ancestors ordered from the outermost element down to the immediate parent
 */
export function getAncestors(node) {
    const nearestFirst = [];
    for (let parent = node.parentNode; parent && parent.nodeType === 1 /* ELEMENT_NODE */; parent = parent.parentNode) {
        nearestFirst.push(parent);
    }
    // Collected parent-first; callers expect root-first.
    return nearestFirst.reverse();
}
|
|
378
|
+
/**
 * Stamp each atom with the index of the paragraph it belongs to.
 *
 * Paragraphs are numbered from 0 in order of first appearance in `atoms`.
 * An atom's paragraph is the first `w:p` found in its `ancestorElements`;
 * atoms that share that element share an index. Atoms with no `w:p`
 * ancestor are left untouched.
 *
 * This enables paragraph grouping in the document reconstructor when
 * merging atoms from different source trees (original vs revised).
 *
 * @param atoms - Atoms to annotate; `paragraphIndex` is written onto them in place
 */
export function assignParagraphIndices(atoms) {
    const indexByParagraph = new Map();
    for (const atom of atoms) {
        const paragraph = atom.ancestorElements.find((ancestor) => ancestor.tagName === 'w:p');
        if (!paragraph) {
            continue;
        }
        if (!indexByParagraph.has(paragraph)) {
            // Indices are handed out sequentially, so the map size is the next free index.
            indexByParagraph.set(paragraph, indexByParagraph.size);
        }
        atom.paragraphIndex = indexByParagraph.get(paragraph);
    }
}
|
|
403
|
+
// =============================================================================
|
|
404
|
+
// Field Sequence Collapsing
|
|
405
|
+
// =============================================================================
|
|
406
|
+
/**
|
|
407
|
+
* Special tag name for collapsed field atoms.
|
|
408
|
+
* These represent Word field codes (REF, PAGEREF, etc.) collapsed to their visible result.
|
|
409
|
+
*/
|
|
410
|
+
// NOTE(review): no use of this constant is visible in this part of the file;
// collapseFieldSequences builds its synthetic element with the tag 'w:t' — confirm where this is consumed.
export const COLLAPSED_FIELD_TAG = '__collapsedField__';
|
|
411
|
+
/**
 * Tell whether an atom is the opening marker of a Word field.
 *
 * @param atom - Atom to inspect
 * @returns True when the content element is a w:fldChar whose w:fldCharType is "begin"
 */
function isFieldBegin(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName !== 'w:fldChar') {
        return false;
    }
    return contentElement.getAttribute('w:fldCharType') === 'begin';
}
|
|
418
|
+
/**
 * Tell whether an atom is the separator marker of a Word field.
 *
 * @param atom - Atom to inspect
 * @returns True when the content element is a w:fldChar whose w:fldCharType is "separate"
 */
function isFieldSeparate(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName !== 'w:fldChar') {
        return false;
    }
    return contentElement.getAttribute('w:fldCharType') === 'separate';
}
|
|
425
|
+
/**
 * Tell whether an atom is the closing marker of a Word field.
 *
 * @param atom - Atom to inspect
 * @returns True when the content element is a w:fldChar whose w:fldCharType is "end"
 */
function isFieldEnd(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName !== 'w:fldChar') {
        return false;
    }
    return contentElement.getAttribute('w:fldCharType') === 'end';
}
|
|
432
|
+
/**
 * Concatenate the text of every w:t atom in a sequence.
 *
 * Atoms whose content element is anything other than w:t (field markers,
 * instructions, ...) contribute nothing. A w:t with no leaf text counts as ''.
 *
 * @param atoms - Atoms to read, in document order
 * @returns The joined text of the w:t atoms
 */
function extractVisibleText(atoms) {
    let visible = '';
    for (const { contentElement } of atoms) {
        if (contentElement.tagName === 'w:t') {
            visible += getLeafText(contentElement) ?? '';
        }
    }
    return visible;
}
|
|
442
|
+
/**
 * Decide whether the atoms of a field belong to more than one paragraph.
 *
 * Each atom's paragraph is the first w:p in its `ancestorElements`; atoms
 * without one are ignored. Multi-paragraph fields (like TOC) should not be
 * collapsed.
 *
 * @param fieldAtoms - Atoms making up one field sequence
 * @returns True as soon as two distinct w:p elements have been seen, otherwise false
 */
function fieldSpansMultipleParagraphs(fieldAtoms) {
    let firstParagraph;
    for (const atom of fieldAtoms) {
        const paragraph = atom.ancestorElements.find((el) => el.tagName === 'w:p');
        if (!paragraph) {
            continue;
        }
        if (firstParagraph === undefined) {
            firstParagraph = paragraph;
        }
        else if (paragraph !== firstParagraph) {
            return true;
        }
    }
    return false;
}
|
|
459
|
+
/**
|
|
460
|
+
* Collapse field sequences into single atoms based on visible text.
|
|
461
|
+
*
|
|
462
|
+
* Word fields consist of:
|
|
463
|
+
* - w:fldChar[begin] - field start
|
|
464
|
+
* - w:instrText - field instruction (e.g., "REF _Ref123 \h")
|
|
465
|
+
* - w:fldChar[separate] - separates instruction from result
|
|
466
|
+
* - w:t (one or more) - visible result text
|
|
467
|
+
* - w:fldChar[end] - field end
|
|
468
|
+
*
|
|
469
|
+
* This function collapses each field sequence into a single atom whose hash
|
|
470
|
+
* is based only on the visible text. This allows matching between:
|
|
471
|
+
* - Hardcoded text: "2.6"
|
|
472
|
+
* - Field reference: [REF field]2.6[/field]
|
|
473
|
+
*
|
|
474
|
+
* Both will produce atoms with the same hash if the visible text matches.
|
|
475
|
+
*
|
|
476
|
+
* NOTE: Multi-paragraph fields (like TOC, INDEX) are NOT collapsed because
|
|
477
|
+
* they would lose paragraph structure information.
|
|
478
|
+
*
|
|
479
|
+
* @param atoms - Array of atoms from atomization
|
|
480
|
+
* @returns Array with field sequences collapsed to single atoms
|
|
481
|
+
*/
|
|
482
|
+
export function collapseFieldSequences(atoms) {
|
|
483
|
+
if (atoms.length === 0)
|
|
484
|
+
return atoms;
|
|
485
|
+
const result = [];
|
|
486
|
+
let i = 0;
|
|
487
|
+
while (i < atoms.length) {
|
|
488
|
+
const atom = atoms[i];
|
|
489
|
+
if (isFieldBegin(atom)) {
|
|
490
|
+
// Found field start - collect until matching end
|
|
491
|
+
const fieldAtoms = [atom];
|
|
492
|
+
let depth = 1;
|
|
493
|
+
let separatorIndex = -1;
|
|
494
|
+
i++;
|
|
495
|
+
while (i < atoms.length && depth > 0) {
|
|
496
|
+
const current = atoms[i];
|
|
497
|
+
fieldAtoms.push(current);
|
|
498
|
+
if (isFieldBegin(current)) {
|
|
499
|
+
depth++;
|
|
500
|
+
}
|
|
501
|
+
else if (isFieldEnd(current)) {
|
|
502
|
+
depth--;
|
|
503
|
+
}
|
|
504
|
+
else if (isFieldSeparate(current) && depth === 1) {
|
|
505
|
+
// Track separator position for the outermost field
|
|
506
|
+
separatorIndex = fieldAtoms.length - 1;
|
|
507
|
+
}
|
|
508
|
+
i++;
|
|
509
|
+
}
|
|
510
|
+
// Check if field spans multiple paragraphs (like TOC, INDEX)
|
|
511
|
+
// If so, don't collapse - preserve paragraph structure
|
|
512
|
+
if (fieldSpansMultipleParagraphs(fieldAtoms)) {
|
|
513
|
+
// Pass through all field atoms unchanged
|
|
514
|
+
result.push(...fieldAtoms);
|
|
515
|
+
continue;
|
|
516
|
+
}
|
|
517
|
+
// Extract visible text from the field result (after separator)
|
|
518
|
+
let visibleText;
|
|
519
|
+
if (separatorIndex >= 0) {
|
|
520
|
+
// Get text between separator and end (exclusive of markers)
|
|
521
|
+
const resultAtoms = fieldAtoms.slice(separatorIndex + 1, -1);
|
|
522
|
+
visibleText = extractVisibleText(resultAtoms);
|
|
523
|
+
}
|
|
524
|
+
else {
|
|
525
|
+
// No separator - might be a field with no result yet, use instruction
|
|
526
|
+
visibleText = extractVisibleText(fieldAtoms);
|
|
527
|
+
}
|
|
528
|
+
// Create a collapsed field atom with the visible text
|
|
529
|
+
const firstAtom = fieldAtoms[0];
|
|
530
|
+
// Use w:t so it can merge with adjacent text
|
|
531
|
+
const virtualElement = SYNTHETIC_DOC.createElement('w:t');
|
|
532
|
+
setLeafText(virtualElement, visibleText);
|
|
533
|
+
const collapsedAtom = {
|
|
534
|
+
contentElement: virtualElement,
|
|
535
|
+
ancestorElements: [...firstAtom.ancestorElements],
|
|
536
|
+
ancestorUnids: firstAtom.ancestorUnids,
|
|
537
|
+
part: firstAtom.part,
|
|
538
|
+
revTrackElement: firstAtom.revTrackElement,
|
|
539
|
+
sha1Hash: hashElement(virtualElement),
|
|
540
|
+
correlationStatus: firstAtom.correlationStatus,
|
|
541
|
+
// Store original atoms for document reconstruction
|
|
542
|
+
collapsedFieldAtoms: fieldAtoms,
|
|
543
|
+
// Inherit rPr from first atom in the field sequence
|
|
544
|
+
rPr: firstAtom.rPr,
|
|
545
|
+
};
|
|
546
|
+
result.push(collapsedAtom);
|
|
547
|
+
}
|
|
548
|
+
else {
|
|
549
|
+
// Not a field - pass through unchanged
|
|
550
|
+
result.push(atom);
|
|
551
|
+
i++;
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
return result;
|
|
555
|
+
}
|
|
556
|
+
// =============================================================================
|
|
557
|
+
// Word-Level Splitting
|
|
558
|
+
// =============================================================================
|
|
559
|
+
/**
 * Break a w:t atom into alternating word and whitespace atoms.
 *
 * This enables finer-grained comparison when text is stored in single w:t elements.
 * For example, "Hello World" becomes ["Hello", " ", "World"]. Whitespace runs
 * are kept as their own atoms so spacing is preserved.
 *
 * The atom is returned unchanged (as a one-element array) when it is not a w:t,
 * is a collapsed field, is at most one character long, or contains no whitespace.
 * "Whitespace" is whatever the regex class \s matches (spaces, tabs, newlines,
 * non-breaking spaces, ...) - the same class used to cut the text, so a given
 * substring is split the same way regardless of what surrounds it.
 *
 * Each produced atom gets a fresh synthetic w:t element carrying a copy of the
 * original element's attributes, shares the original's ancestor/revision
 * metadata, and points back to the original through `splitFromAtom`.
 *
 * @param atom - A w:t atom to split
 * @returns Array of word-level atoms (or the original atom alone if not split)
 */
function splitAtomIntoWords(atom) {
    const source = atom.contentElement;
    // Only plain w:t atoms are split; collapsed fields stay as-is
    if (source.tagName !== 'w:t' || atom.collapsedFieldAtoms) {
        return [atom];
    }
    const text = getLeafText(source) ?? '';
    // Nothing to split: too short, or no whitespace of any kind
    if (text.length <= 1 || !/\s/.test(text)) {
        return [atom];
    }
    const result = [];
    // The capturing group keeps the whitespace runs in the output
    for (const part of text.split(/(\s+)/)) {
        // Leading/trailing whitespace yields empty edge pieces
        if (part === '')
            continue;
        const wordElement = SYNTHETIC_DOC.createElement('w:t');
        // Copy attributes from the original content element
        for (let i = 0; i < source.attributes.length; i++) {
            const attr = source.attributes[i];
            wordElement.setAttribute(attr.name, attr.value);
        }
        setLeafText(wordElement, part);
        result.push({
            contentElement: wordElement,
            ancestorElements: atom.ancestorElements,
            ancestorUnids: atom.ancestorUnids,
            part: atom.part,
            revTrackElement: atom.revTrackElement,
            sha1Hash: hashElement(wordElement),
            correlationStatus: atom.correlationStatus,
            paragraphIndex: atom.paragraphIndex,
            // Track that this came from a split atom for potential later merge
            splitFromAtom: atom,
            // Share rPr reference (read-only after atomization)
            rPr: atom.rPr,
        });
    }
    return result;
}
|
|
621
|
+
/**
 * Apply word-level splitting to every atom of a sequence.
 *
 * Each atom is handed to splitAtomIntoWords and the resulting pieces are
 * appended in order, so the output preserves document order.
 *
 * @param atoms - Array of atoms
 * @returns New flat array with w:t atoms split into words
 */
export function splitAtomsIntoWords(atoms) {
    const pieces = [];
    for (let idx = 0; idx < atoms.length; idx += 1) {
        for (const piece of splitAtomIntoWords(atoms[idx])) {
            pieces.push(piece);
        }
    }
    return pieces;
}
|
|
634
|
+
// =============================================================================
|
|
635
|
+
// Atom Boundary Normalization
|
|
636
|
+
// =============================================================================
|
|
637
|
+
/**
 * Look up the run properties (w:rPr) that apply to an atom.
 *
 * Uses the first w:r found in the atom's `ancestorElements` and asks
 * findChildByTagName for its w:rPr child.
 *
 * @param atom - Atom to inspect
 * @returns The w:rPr element, or undefined when there is no run ancestor or no w:rPr child
 */
function getRunProperties(atom) {
    const enclosingRun = atom.ancestorElements.find((el) => el.tagName === 'w:r');
    return enclosingRun
        ? (findChildByTagName(enclosingRun, 'w:rPr') ?? undefined)
        : undefined;
}
|
|
646
|
+
/**
 * Hash an element together with its whole subtree.
 *
 * The digest input is the tag name, the attributes as `name=value` sorted by
 * name, the leaf text (when getLeafText yields one), and the deep hash of
 * every child element, all joined with '|' and passed to sha1.
 *
 * @param element - Element to hash
 * @returns The sha1 result for the element and its descendants
 */
function hashElementDeep(element) {
    // Gather attributes, then order them by name so declaration order does not matter
    const attributePairs = Array.from({ length: element.attributes.length }, (_, idx) => {
        const { name, value } = element.attributes[idx];
        return [name, value];
    });
    attributePairs.sort(([left], [right]) => left.localeCompare(right));
    const segments = [
        element.tagName,
        ...attributePairs.map(([name, value]) => `${name}=${value}`),
    ];
    const text = getLeafText(element);
    if (text !== undefined) {
        segments.push(text);
    }
    // Children contribute their own deep hashes, in document order
    for (const child of childElements(element)) {
        segments.push(hashElementDeep(child));
    }
    return sha1(segments.join('|'));
}
|
|
671
|
+
/**
 * Decide whether two w:rPr elements describe the same formatting.
 *
 * Two missing values are equal, a missing and a present value are not, and
 * two present elements are equal when their deep hashes (which include
 * child elements) match.
 *
 * @param a - First w:rPr element, or undefined
 * @param b - Second w:rPr element, or undefined
 * @returns True when the formatting is considered equivalent
 */
function runPropertiesEqual(a, b) {
    const missingA = !a;
    const missingB = !b;
    if (missingA || missingB) {
        // Equal only when neither side carries formatting
        return missingA && missingB;
    }
    return hashElementDeep(a) === hashElementDeep(b);
}
|
|
685
|
+
/**
 * Decide whether atom `b` may be folded into the atom `a` that precedes it.
 *
 * All of the following must hold:
 * - both are w:t (text) atoms
 * - neither is a collapsed field (fields stay separate for finer-grained diff)
 * - both resolve to the same w:p ancestor
 * - their revision-tracking elements have the same tag name (or both have none)
 * - they share the same w:r ancestor, OR `options.mergeAcrossRuns` is set and
 *   their run properties (w:rPr) are equivalent
 *
 * @param a - First atom
 * @param b - Second atom (immediately following a)
 * @param options - Merge options; `mergeAcrossRuns` enables merging across runs
 * @returns True if atoms can be merged
 */
function canMergeAtoms(a, b, options) {
    const isText = (atom) => atom.contentElement.tagName === 'w:t';
    if (!isText(a) || !isText(b)) {
        return false;
    }
    if (a.collapsedFieldAtoms || b.collapsedFieldAtoms) {
        return false;
    }
    const ancestorTagged = (atom, tag) => atom.ancestorElements.find((el) => el.tagName === tag);
    if (ancestorTagged(a, 'w:p') !== ancestorTagged(b, 'w:p')) {
        return false;
    }
    if (a.revTrackElement?.tagName !== b.revTrackElement?.tagName) {
        return false;
    }
    // Same run: formatting is shared by construction, no comparison needed
    if (ancestorTagged(a, 'w:r') === ancestorTagged(b, 'w:r')) {
        return true;
    }
    // Different runs: only mergeable when enabled and the formatting matches.
    // (In inplace mode this is disabled so each atom stays anchored to a real run.)
    return options.mergeAcrossRuns
        ? runPropertiesEqual(getRunProperties(a), getRunProperties(b))
        : false;
}
|
|
732
|
+
/**
 * Append the source atom's text to the target atom.
 *
 * The target's content element is updated in place with the concatenated
 * text (missing text counts as '') and the target's sha1Hash is recomputed
 * from it. The source atom is not modified.
 *
 * @param target - Atom to merge into (mutated)
 * @param source - Atom to merge from
 */
function mergeIntoAtom(target, source) {
    const head = getLeafText(target.contentElement) ?? '';
    const tail = getLeafText(source.contentElement) ?? '';
    setLeafText(target.contentElement, head + tail);
    // The hash depends on the content element, so it must follow the text change
    target.sha1Hash = hashElement(target.contentElement);
}
|
|
748
|
+
/**
 * Tell whether an atom is a w:t whose text consists solely of trailing-style
 * punctuation: one or more of , . : ; ! ? ' " ) ] } >
 *
 * @param atom - Atom to inspect
 * @returns True for a non-empty, punctuation-only w:t atom; false otherwise
 */
function isPunctuationOnlyAtom(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName === 'w:t') {
        const content = getLeafText(contentElement) ?? '';
        // Match common punctuation that should attach to adjacent words
        return /^[,.:;!?'")\]}>]+$/.test(content);
    }
    return false;
}
|
|
758
|
+
/**
 * Check if two atoms can be merged for punctuation normalization.
 *
 * More permissive than canMergeAtoms: punctuation may attach to the preceding
 * text even across runs with different formatting. All of the following must hold:
 * - both are w:t atoms and neither is a collapsed field
 * - `b` is punctuation-only
 * - both resolve to the same w:p ancestor
 * - their revision-tracking elements have the same tag name (or both have none)
 * - `a` ends with a word character: a letter, combining mark, or number in
 *   any script, or an underscore (so "café" + "," merges just like "Conduct" + ",")
 * - they share the same w:r ancestor, unless `options.mergePunctuationAcrossRuns` is set
 *
 * @param a - Preceding atom
 * @param b - Candidate punctuation atom (immediately following a)
 * @param options - Merge options; `mergePunctuationAcrossRuns` lifts the same-run requirement
 * @returns True if `b` can be merged into `a`
 */
function canMergePunctuation(a, b, options) {
    // Only merge w:t elements
    if (a.contentElement.tagName !== 'w:t')
        return false;
    if (b.contentElement.tagName !== 'w:t')
        return false;
    // B must be punctuation-only
    if (!isPunctuationOnlyAtom(b))
        return false;
    // Never merge collapsed fields
    if (a.collapsedFieldAtoms || b.collapsedFieldAtoms)
        return false;
    // Must be in the same paragraph
    const aPara = a.ancestorElements.find((e) => e.tagName === 'w:p');
    const bPara = b.ancestorElements.find((e) => e.tagName === 'w:p');
    if (aPara !== bPara)
        return false;
    // Must have same revision tracking status
    const aRevTag = a.revTrackElement?.tagName;
    const bRevTag = b.revTrackElement?.tagName;
    if (aRevTag !== bRevTag)
        return false;
    // A must end with a word character (not whitespace or punctuation).
    // \w only covers ASCII [A-Za-z0-9_]; Unicode property escapes make the
    // test work for accented, non-Latin and decomposed (base + mark) text too.
    const aText = getLeafText(a.contentElement) ?? '';
    if (!/[\p{L}\p{M}\p{N}_]$/u.test(aText))
        return false;
    // If cross-run punctuation merge is disabled, require same run.
    if (!options.mergePunctuationAcrossRuns) {
        const aRun = a.ancestorElements.find((e) => e.tagName === 'w:r');
        const bRun = b.ancestorElements.find((e) => e.tagName === 'w:r');
        if (aRun !== bRun)
            return false;
    }
    return true;
}
|
|
800
|
+
/**
 * Fold punctuation-only atoms into the text atom that precedes them.
 *
 * This evens out documents whose w:t boundaries differ around punctuation
 * (e.g., "Conduct" + "," vs "Conduct,"). Whether a given pair is merged is
 * decided by canMergePunctuation; merging appends to the kept atom in place
 * via mergeIntoAtom, so atoms from the input array may be mutated.
 *
 * @param atoms - Array of atoms
 * @param options - Merge options; defaults to allowing punctuation to merge across runs
 * @returns New array with punctuation merged into preceding text (the input array itself when it is empty)
 */
export function mergePunctuationAtoms(atoms, options = { mergePunctuationAcrossRuns: true }) {
    if (atoms.length === 0) {
        return atoms;
    }
    const kept = [];
    let last;
    for (let idx = 0; idx < atoms.length; idx += 1) {
        const candidate = atoms[idx];
        if (last && canMergePunctuation(last, candidate, options)) {
            // Absorbed into the previously kept atom; nothing new is emitted
            mergeIntoAtom(last, candidate);
            continue;
        }
        kept.push(candidate);
        last = candidate;
    }
    return kept;
}
|
|
826
|
+
/**
 * Merge runs of adjacent, compatible w:t atoms into single atoms.
 *
 * This normalization ensures that identical text split differently across
 * w:t elements in original vs revised documents will produce matching hashes.
 *
 * Example:
 *   Before: ["Def", "initions"] (2 atoms)
 *   After:  ["Definitions"] (1 atom)
 *
 * Compatibility of a pair is decided by canMergeAtoms (same run, or across
 * runs when `options.mergeAcrossRuns` allows it). Merging appends to the kept
 * atom in place via mergeIntoAtom, so atoms from the input array may be mutated.
 *
 * @param atoms - Array of atoms from atomization
 * @param options - Merge options; defaults to allowing merges across runs
 * @returns New array with contiguous text atoms merged (the input array itself when it is empty)
 */
export function mergeContiguousTextAtoms(atoms, options = { mergeAcrossRuns: true }) {
    if (atoms.length === 0) {
        return atoms;
    }
    const kept = [];
    let last;
    for (let idx = 0; idx < atoms.length; idx += 1) {
        const candidate = atoms[idx];
        if (last && canMergeAtoms(last, candidate, options)) {
            // Absorbed into the previously kept atom; nothing new is emitted
            mergeIntoAtom(last, candidate);
            continue;
        }
        kept.push(candidate);
        last = candidate;
    }
    return kept;
}
|
|
856
|
+
//# sourceMappingURL=atomizer.js.map
|