@usejunior/docx-core 0.0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +86 -28
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/atomizer.d.ts +218 -0
  5. package/dist/atomizer.d.ts.map +1 -0
  6. package/dist/atomizer.js +856 -0
  7. package/dist/atomizer.js.map +1 -0
  8. package/dist/baselines/atomizer/atomLcs.d.ts +96 -0
  9. package/dist/baselines/atomizer/atomLcs.d.ts.map +1 -0
  10. package/dist/baselines/atomizer/atomLcs.js +347 -0
  11. package/dist/baselines/atomizer/atomLcs.js.map +1 -0
  12. package/dist/baselines/atomizer/debug.d.ts +41 -0
  13. package/dist/baselines/atomizer/debug.d.ts.map +1 -0
  14. package/dist/baselines/atomizer/debug.js +85 -0
  15. package/dist/baselines/atomizer/debug.js.map +1 -0
  16. package/dist/baselines/atomizer/documentReconstructor.d.ts +64 -0
  17. package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -0
  18. package/dist/baselines/atomizer/documentReconstructor.js +939 -0
  19. package/dist/baselines/atomizer/documentReconstructor.js.map +1 -0
  20. package/dist/baselines/atomizer/hierarchicalLcs.d.ts +111 -0
  21. package/dist/baselines/atomizer/hierarchicalLcs.d.ts.map +1 -0
  22. package/dist/baselines/atomizer/hierarchicalLcs.js +469 -0
  23. package/dist/baselines/atomizer/hierarchicalLcs.js.map +1 -0
  24. package/dist/baselines/atomizer/inPlaceModifier.d.ts +183 -0
  25. package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -0
  26. package/dist/baselines/atomizer/inPlaceModifier.js +1600 -0
  27. package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -0
  28. package/dist/baselines/atomizer/numberingIntegration.d.ts +59 -0
  29. package/dist/baselines/atomizer/numberingIntegration.d.ts.map +1 -0
  30. package/dist/baselines/atomizer/numberingIntegration.js +209 -0
  31. package/dist/baselines/atomizer/numberingIntegration.js.map +1 -0
  32. package/dist/baselines/atomizer/pipeline.d.ts +65 -0
  33. package/dist/baselines/atomizer/pipeline.d.ts.map +1 -0
  34. package/dist/baselines/atomizer/pipeline.js +510 -0
  35. package/dist/baselines/atomizer/pipeline.js.map +1 -0
  36. package/dist/baselines/atomizer/premergeRuns.d.ts +26 -0
  37. package/dist/baselines/atomizer/premergeRuns.d.ts.map +1 -0
  38. package/dist/baselines/atomizer/premergeRuns.js +150 -0
  39. package/dist/baselines/atomizer/premergeRuns.js.map +1 -0
  40. package/dist/baselines/atomizer/trackChangesAcceptor.d.ts +63 -0
  41. package/dist/baselines/atomizer/trackChangesAcceptor.d.ts.map +1 -0
  42. package/dist/baselines/atomizer/trackChangesAcceptor.js +254 -0
  43. package/dist/baselines/atomizer/trackChangesAcceptor.js.map +1 -0
  44. package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts +64 -0
  45. package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -0
  46. package/dist/baselines/atomizer/trackChangesAcceptorAst.js +586 -0
  47. package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -0
  48. package/dist/baselines/atomizer/xmlToWmlElement.d.ts +65 -0
  49. package/dist/baselines/atomizer/xmlToWmlElement.d.ts.map +1 -0
  50. package/dist/baselines/atomizer/xmlToWmlElement.js +95 -0
  51. package/dist/baselines/atomizer/xmlToWmlElement.js.map +1 -0
  52. package/dist/baselines/diffmatch/documentBuilder.d.ts +44 -0
  53. package/dist/baselines/diffmatch/documentBuilder.d.ts.map +1 -0
  54. package/dist/baselines/diffmatch/documentBuilder.js +227 -0
  55. package/dist/baselines/diffmatch/documentBuilder.js.map +1 -0
  56. package/dist/baselines/diffmatch/paragraphAlignment.d.ts +75 -0
  57. package/dist/baselines/diffmatch/paragraphAlignment.d.ts.map +1 -0
  58. package/dist/baselines/diffmatch/paragraphAlignment.js +206 -0
  59. package/dist/baselines/diffmatch/paragraphAlignment.js.map +1 -0
  60. package/dist/baselines/diffmatch/pipeline.d.ts +33 -0
  61. package/dist/baselines/diffmatch/pipeline.d.ts.map +1 -0
  62. package/dist/baselines/diffmatch/pipeline.js +84 -0
  63. package/dist/baselines/diffmatch/pipeline.js.map +1 -0
  64. package/dist/baselines/diffmatch/runDiff.d.ts +53 -0
  65. package/dist/baselines/diffmatch/runDiff.d.ts.map +1 -0
  66. package/dist/baselines/diffmatch/runDiff.js +253 -0
  67. package/dist/baselines/diffmatch/runDiff.js.map +1 -0
  68. package/dist/baselines/diffmatch/trackChangesRenderer.d.ts +64 -0
  69. package/dist/baselines/diffmatch/trackChangesRenderer.d.ts.map +1 -0
  70. package/dist/baselines/diffmatch/trackChangesRenderer.js +178 -0
  71. package/dist/baselines/diffmatch/trackChangesRenderer.js.map +1 -0
  72. package/dist/baselines/diffmatch/xmlParser.d.ts +45 -0
  73. package/dist/baselines/diffmatch/xmlParser.d.ts.map +1 -0
  74. package/dist/baselines/diffmatch/xmlParser.js +344 -0
  75. package/dist/baselines/diffmatch/xmlParser.js.map +1 -0
  76. package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts +51 -0
  77. package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts.map +1 -0
  78. package/dist/baselines/wmlcomparer/DocxodusWasm.js +83 -0
  79. package/dist/baselines/wmlcomparer/DocxodusWasm.js.map +1 -0
  80. package/dist/baselines/wmlcomparer/DotnetCli.d.ts +40 -0
  81. package/dist/baselines/wmlcomparer/DotnetCli.d.ts.map +1 -0
  82. package/dist/baselines/wmlcomparer/DotnetCli.js +135 -0
  83. package/dist/baselines/wmlcomparer/DotnetCli.js.map +1 -0
  84. package/dist/benchmark/metrics.d.ts +72 -0
  85. package/dist/benchmark/metrics.d.ts.map +1 -0
  86. package/dist/benchmark/metrics.js +45 -0
  87. package/dist/benchmark/metrics.js.map +1 -0
  88. package/dist/benchmark/reporter.d.ts +23 -0
  89. package/dist/benchmark/reporter.d.ts.map +1 -0
  90. package/dist/benchmark/reporter.js +147 -0
  91. package/dist/benchmark/reporter.js.map +1 -0
  92. package/dist/benchmark/runner.d.ts +30 -0
  93. package/dist/benchmark/runner.d.ts.map +1 -0
  94. package/dist/benchmark/runner.js +233 -0
  95. package/dist/benchmark/runner.js.map +1 -0
  96. package/dist/cli/compare-two.d.ts +28 -0
  97. package/dist/cli/compare-two.d.ts.map +1 -0
  98. package/dist/cli/compare-two.js +110 -0
  99. package/dist/cli/compare-two.js.map +1 -0
  100. package/dist/cli/index.d.ts +3 -0
  101. package/dist/cli/index.d.ts.map +1 -0
  102. package/dist/cli/index.js +21 -0
  103. package/dist/cli/index.js.map +1 -0
  104. package/dist/core-types.d.ts +296 -0
  105. package/dist/core-types.d.ts.map +1 -0
  106. package/dist/core-types.js +122 -0
  107. package/dist/core-types.js.map +1 -0
  108. package/dist/footnotes.d.ts +144 -0
  109. package/dist/footnotes.d.ts.map +1 -0
  110. package/dist/footnotes.js +291 -0
  111. package/dist/footnotes.js.map +1 -0
  112. package/dist/format-detection.d.ts +120 -0
  113. package/dist/format-detection.d.ts.map +1 -0
  114. package/dist/format-detection.js +338 -0
  115. package/dist/format-detection.js.map +1 -0
  116. package/dist/index.d.ts +177 -0
  117. package/dist/index.d.ts.map +1 -0
  118. package/dist/index.js +55 -0
  119. package/dist/index.js.map +1 -0
  120. package/dist/integration/output-artifacts.d.ts +6 -0
  121. package/dist/integration/output-artifacts.d.ts.map +1 -0
  122. package/dist/integration/output-artifacts.js +30 -0
  123. package/dist/integration/output-artifacts.js.map +1 -0
  124. package/dist/move-detection.d.ts +211 -0
  125. package/dist/move-detection.d.ts.map +1 -0
  126. package/dist/move-detection.js +391 -0
  127. package/dist/move-detection.js.map +1 -0
  128. package/dist/numbering.d.ts +136 -0
  129. package/dist/numbering.d.ts.map +1 -0
  130. package/dist/numbering.js +446 -0
  131. package/dist/numbering.js.map +1 -0
  132. package/dist/primitives/accept_changes.d.ts +30 -0
  133. package/dist/primitives/accept_changes.d.ts.map +1 -0
  134. package/dist/primitives/accept_changes.js +241 -0
  135. package/dist/primitives/accept_changes.js.map +1 -0
  136. package/dist/primitives/bookmarks.d.ts +12 -0
  137. package/dist/primitives/bookmarks.d.ts.map +1 -0
  138. package/dist/primitives/bookmarks.js +248 -0
  139. package/dist/primitives/bookmarks.js.map +1 -0
  140. package/dist/primitives/comments.d.ts +88 -0
  141. package/dist/primitives/comments.d.ts.map +1 -0
  142. package/dist/primitives/comments.js +703 -0
  143. package/dist/primitives/comments.js.map +1 -0
  144. package/dist/primitives/document.d.ts +168 -0
  145. package/dist/primitives/document.d.ts.map +1 -0
  146. package/dist/primitives/document.js +532 -0
  147. package/dist/primitives/document.js.map +1 -0
  148. package/dist/primitives/document_view.d.ts +93 -0
  149. package/dist/primitives/document_view.d.ts.map +1 -0
  150. package/dist/primitives/document_view.js +722 -0
  151. package/dist/primitives/document_view.js.map +1 -0
  152. package/dist/primitives/dom-helpers.d.ts +94 -0
  153. package/dist/primitives/dom-helpers.d.ts.map +1 -0
  154. package/dist/primitives/dom-helpers.js +219 -0
  155. package/dist/primitives/dom-helpers.js.map +1 -0
  156. package/dist/primitives/errors.d.ts +7 -0
  157. package/dist/primitives/errors.d.ts.map +1 -0
  158. package/dist/primitives/errors.js +10 -0
  159. package/dist/primitives/errors.js.map +1 -0
  160. package/dist/primitives/extract_revisions.d.ts +50 -0
  161. package/dist/primitives/extract_revisions.d.ts.map +1 -0
  162. package/dist/primitives/extract_revisions.js +340 -0
  163. package/dist/primitives/extract_revisions.js.map +1 -0
  164. package/dist/primitives/footnotes.d.ts +37 -0
  165. package/dist/primitives/footnotes.d.ts.map +1 -0
  166. package/dist/primitives/footnotes.js +552 -0
  167. package/dist/primitives/footnotes.js.map +1 -0
  168. package/dist/primitives/formatting_tags.d.ts +30 -0
  169. package/dist/primitives/formatting_tags.d.ts.map +1 -0
  170. package/dist/primitives/formatting_tags.js +217 -0
  171. package/dist/primitives/formatting_tags.js.map +1 -0
  172. package/dist/primitives/index.d.ts +26 -0
  173. package/dist/primitives/index.d.ts.map +1 -0
  174. package/dist/primitives/index.js +26 -0
  175. package/dist/primitives/index.js.map +1 -0
  176. package/dist/primitives/layout.d.ts +53 -0
  177. package/dist/primitives/layout.d.ts.map +1 -0
  178. package/dist/primitives/layout.js +178 -0
  179. package/dist/primitives/layout.js.map +1 -0
  180. package/dist/primitives/list_labels.d.ts +19 -0
  181. package/dist/primitives/list_labels.d.ts.map +1 -0
  182. package/dist/primitives/list_labels.js +57 -0
  183. package/dist/primitives/list_labels.js.map +1 -0
  184. package/dist/primitives/matching.d.ts +17 -0
  185. package/dist/primitives/matching.d.ts.map +1 -0
  186. package/dist/primitives/matching.js +144 -0
  187. package/dist/primitives/matching.js.map +1 -0
  188. package/dist/primitives/merge_runs.d.ts +23 -0
  189. package/dist/primitives/merge_runs.d.ts.map +1 -0
  190. package/dist/primitives/merge_runs.js +195 -0
  191. package/dist/primitives/merge_runs.js.map +1 -0
  192. package/dist/primitives/namespaces.d.ts +90 -0
  193. package/dist/primitives/namespaces.d.ts.map +1 -0
  194. package/dist/primitives/namespaces.js +107 -0
  195. package/dist/primitives/namespaces.js.map +1 -0
  196. package/dist/primitives/numbering.d.ts +27 -0
  197. package/dist/primitives/numbering.d.ts.map +1 -0
  198. package/dist/primitives/numbering.js +182 -0
  199. package/dist/primitives/numbering.js.map +1 -0
  200. package/dist/primitives/prevent_double_elevation.d.ts +18 -0
  201. package/dist/primitives/prevent_double_elevation.d.ts.map +1 -0
  202. package/dist/primitives/prevent_double_elevation.js +190 -0
  203. package/dist/primitives/prevent_double_elevation.js.map +1 -0
  204. package/dist/primitives/reject_changes.d.ts +27 -0
  205. package/dist/primitives/reject_changes.d.ts.map +1 -0
  206. package/dist/primitives/reject_changes.js +371 -0
  207. package/dist/primitives/reject_changes.js.map +1 -0
  208. package/dist/primitives/relationships.d.ts +7 -0
  209. package/dist/primitives/relationships.d.ts.map +1 -0
  210. package/dist/primitives/relationships.js +24 -0
  211. package/dist/primitives/relationships.js.map +1 -0
  212. package/dist/primitives/semantic_tags.d.ts +32 -0
  213. package/dist/primitives/semantic_tags.d.ts.map +1 -0
  214. package/dist/primitives/semantic_tags.js +139 -0
  215. package/dist/primitives/semantic_tags.js.map +1 -0
  216. package/dist/primitives/simplify_redlines.d.ts +19 -0
  217. package/dist/primitives/simplify_redlines.d.ts.map +1 -0
  218. package/dist/primitives/simplify_redlines.js +94 -0
  219. package/dist/primitives/simplify_redlines.js.map +1 -0
  220. package/dist/primitives/styles.d.ts +36 -0
  221. package/dist/primitives/styles.d.ts.map +1 -0
  222. package/dist/primitives/styles.js +190 -0
  223. package/dist/primitives/styles.js.map +1 -0
  224. package/dist/primitives/text.d.ts +27 -0
  225. package/dist/primitives/text.d.ts.map +1 -0
  226. package/dist/primitives/text.js +416 -0
  227. package/dist/primitives/text.js.map +1 -0
  228. package/dist/primitives/validate_document.d.ts +24 -0
  229. package/dist/primitives/validate_document.d.ts.map +1 -0
  230. package/dist/primitives/validate_document.js +147 -0
  231. package/dist/primitives/validate_document.js.map +1 -0
  232. package/dist/primitives/xml.d.ts +5 -0
  233. package/dist/primitives/xml.d.ts.map +1 -0
  234. package/dist/primitives/xml.js +19 -0
  235. package/dist/primitives/xml.js.map +1 -0
  236. package/dist/primitives/zip.d.ts +25 -0
  237. package/dist/primitives/zip.d.ts.map +1 -0
  238. package/dist/primitives/zip.js +78 -0
  239. package/dist/primitives/zip.js.map +1 -0
  240. package/dist/shared/docx/DocxArchive.d.ts +94 -0
  241. package/dist/shared/docx/DocxArchive.d.ts.map +1 -0
  242. package/dist/shared/docx/DocxArchive.js +169 -0
  243. package/dist/shared/docx/DocxArchive.js.map +1 -0
  244. package/dist/shared/ooxml/namespaces.d.ts +149 -0
  245. package/dist/shared/ooxml/namespaces.d.ts.map +1 -0
  246. package/dist/shared/ooxml/namespaces.js +224 -0
  247. package/dist/shared/ooxml/namespaces.js.map +1 -0
  248. package/dist/shared/ooxml/types.d.ts +136 -0
  249. package/dist/shared/ooxml/types.d.ts.map +1 -0
  250. package/dist/shared/ooxml/types.js +7 -0
  251. package/dist/shared/ooxml/types.js.map +1 -0
  252. package/package.json +63 -6
@@ -0,0 +1,856 @@
1
+ /**
2
+ * Atomizer Module
3
+ *
4
+ * Provides factory functions for creating ComparisonUnitAtom instances.
5
+ * Implements the core atomization logic from WmlComparer.
6
+ *
7
+ * @see WmlComparer.cs ComparisonUnitAtom constructor (lines 2314-2343)
8
+ */
9
+ import { createHash } from 'crypto';
10
+ import { DOMParser } from '@xmldom/xmldom';
11
+ import { CorrelationStatus, } from './core-types.js';
12
+ import { getLeafText, setLeafText, childElements, findChildByTagName, } from './primitives/index.js';
13
+ // =============================================================================
14
+ // Shared synthetic document for creating virtual elements
15
+ // =============================================================================
16
+ /**
17
+ * A shared document used to create synthetic/virtual DOM elements.
18
+ * These elements are not part of any real parsed document.
19
+ */
20
+ const SYNTHETIC_DOC = new DOMParser().parseFromString('<root/>', 'application/xml');
21
+ // =============================================================================
22
+ // SHA1 Hashing
23
+ // =============================================================================
/**
 * Compute the SHA-1 digest of a string.
 *
 * The digest serves as a cheap fingerprint when checking comparison units
 * for equality.
 *
 * @param content - Text to hash; it is fed to the hasher as UTF-8
 * @returns The digest rendered as a hexadecimal string
 */
export function sha1(content) {
    const hasher = createHash('sha1');
    hasher.update(content, 'utf8');
    return hasher.digest('hex');
}
/**
 * Attribute names that are left out of element hashes.
 *
 * - xml:space: a whitespace-preservation hint that does not change content.
 *   It may be present on some w:t elements and missing on others that carry
 *   identical text, which would otherwise produce spurious hash mismatches.
 */
const IGNORED_HASH_ATTRIBUTES = new Set(['xml:space']);
/**
 * Calculate the SHA1 hash for an element.
 *
 * The hashed string is the tag name, followed by every non-ignored attribute
 * as `name=value` in sorted order, followed by the leaf text (when
 * `getLeafText` returns something other than undefined), joined with `|`.
 *
 * Attribute names are ordered by plain code-unit comparison rather than
 * `localeCompare`: locale-aware collation depends on the runtime's locale/ICU
 * data and may report distinct names as equal, so it cannot guarantee the same
 * ordering (and therefore the same hash) everywhere.
 *
 * @param element - The element to hash
 * @returns Hexadecimal SHA1 hash string
 */
export function hashElement(element) {
    // Collect the attributes that take part in the hash.
    const attrPairs = [];
    for (let i = 0; i < element.attributes.length; i++) {
        const { name, value } = element.attributes[i];
        if (!IGNORED_HASH_ATTRIBUTES.has(name)) {
            attrPairs.push([name, value]);
        }
    }
    // Locale-independent ordering so the hash does not depend on the order in
    // which the attributes were written or on the host's collation rules.
    attrPairs.sort(([a], [b]) => {
        if (a < b) {
            return -1;
        }
        return a > b ? 1 : 0;
    });
    const parts = [element.tagName];
    for (const [name, value] of attrPairs) {
        parts.push(`${name}=${value}`);
    }
    const leafText = getLeafText(element);
    if (leafText !== undefined) {
        parts.push(leafText);
    }
    return sha1(parts.join('|'));
}
72
+ // =============================================================================
73
+ // Revision Tracking Detection
74
+ // =============================================================================
/**
 * Tag names of the elements that mark tracked revisions.
 */
const REVISION_TRACKING_TAGS = new Set(['w:ins', 'w:del', 'w:moveFrom', 'w:moveTo']);
/**
 * Locate the closest revision tracking ancestor.
 *
 * The ancestor list is walked backwards (nearest ancestor first) and the
 * first w:ins, w:del, w:moveFrom, or w:moveTo element encountered wins.
 * Missing (falsy) entries in the list are skipped.
 *
 * @param ancestors - Ancestor elements ordered from root to parent
 * @returns The nearest revision tracking element, or undefined when none exists
 */
export function findRevisionTrackingElement(ancestors) {
    const nearestFirst = [...ancestors].reverse();
    for (const candidate of nearestFirst) {
        if (!candidate) {
            continue;
        }
        if (REVISION_TRACKING_TAGS.has(candidate.tagName)) {
            return candidate;
        }
    }
    return undefined;
}
/**
 * Map a revision tracking element to the correlation status an atom starts with.
 *
 * w:ins -> Inserted, w:del -> Deleted, w:moveFrom -> MovedSource,
 * w:moveTo -> MovedDestination. A missing element or any other tag yields
 * Unknown.
 *
 * @param revTrackElement - The revision tracking element, if one was found
 * @returns The initial correlation status
 */
export function getStatusFromRevisionTracking(revTrackElement) {
    const tag = revTrackElement ? revTrackElement.tagName : undefined;
    if (tag === 'w:ins') {
        return CorrelationStatus.Inserted;
    }
    if (tag === 'w:del') {
        return CorrelationStatus.Deleted;
    }
    if (tag === 'w:moveFrom') {
        return CorrelationStatus.MovedSource;
    }
    if (tag === 'w:moveTo') {
        return CorrelationStatus.MovedDestination;
    }
    return CorrelationStatus.Unknown;
}
120
+ // =============================================================================
121
+ // Ancestor Unid Extraction
122
+ // =============================================================================
/**
 * Collect the w:Unid attribute values carried by ancestor elements.
 *
 * WmlComparer relies on w:Unid attributes to correlate elements between
 * documents. Ancestors without the attribute (or with an empty value) are
 * left out; the remaining values keep the order of the ancestor list.
 *
 * @param ancestors - Ancestor elements ordered from root to parent
 * @returns The Unid values found, in ancestor order
 */
export function extractAncestorUnids(ancestors) {
    const rawValues = Array.from(ancestors, (element) => element.getAttribute('w:Unid'));
    return rawValues.filter(Boolean);
}
141
+ // =============================================================================
142
+ // Leaf Node Detection
143
+ // =============================================================================
/**
 * Tag names treated as leaf nodes of the atomization tree.
 */
const LEAF_NODE_TAGS = new Set([
    // --- Text and in-run characters ---
    'w:t', // text
    'w:br', // break
    'w:cr', // carriage return
    'w:tab', // tab character
    'w:sym', // symbol
    'w:softHyphen', // soft hyphen
    'w:noBreakHyphen', // non-breaking hyphen
    // --- Fields and deleted text ---
    'w:fldChar', // field character
    'w:instrText', // field instruction text
    'w:delText', // deleted text
    // --- Date field parts ---
    'w:dayShort', // short day
    'w:dayLong', // long day
    'w:monthShort', // short month
    'w:monthLong', // long month
    'w:yearShort', // short year
    'w:yearLong', // long year
    // --- Annotation, footnote and endnote markers ---
    'w:annotationRef', // annotation reference
    'w:footnoteRef', // footnote reference marker
    'w:endnoteRef', // endnote reference marker
    'w:footnoteReference', // footnote reference
    'w:endnoteReference', // endnote reference
    'w:separator', // separator
    'w:continuationSeparator', // continuation separator
    'w:pgNum', // page number
    // --- Content handled as one atomic unit ---
    'w:drawing', // drawing
    'w:pict', // picture (VML)
    'w:object', // embedded object
    'mc:AlternateContent', // alternate content
]);
/**
 * Tag name given to the synthetic atoms that stand in for empty paragraphs,
 * i.e. paragraphs with no content (only w:pPr).
 */
export const EMPTY_PARAGRAPH_TAG = '__emptyParagraph__';
/**
 * Tell whether an element counts as a leaf node during atomization.
 *
 * Leaf nodes are the smallest units that get compared.
 *
 * @param element - The element to test
 * @returns True when the element's tag name is one of the leaf tags
 */
export function isLeafNode(element) {
    const { tagName } = element;
    return LEAF_NODE_TAGS.has(tagName);
}
/**
 * Build a ComparisonUnitAtom for a leaf element.
 *
 * Mirrors the C# ComparisonUnitAtom constructor:
 * 1. look up the nearest revision tracking ancestor,
 * 2. derive the initial correlation status from it,
 * 3. gather the ancestor Unids used for correlation,
 * 4. compute the SHA1 hash used for equality checks.
 * It additionally stores a deep clone of the run properties (or null when
 * there are none) so the atom carries its own rPr.
 *
 * @param options - Object holding `contentElement`, `ancestors`, and `part`
 * @returns The newly created ComparisonUnitAtom
 *
 * @see WmlComparer.cs lines 2314-2343
 */
export function createComparisonUnitAtom(options) {
    const contentElement = options.contentElement;
    const ancestors = options.ancestors;
    const part = options.part;
    const revTrackElement = findRevisionTrackingElement(ancestors);
    const correlationStatus = getStatusFromRevisionTracking(revTrackElement);
    const ancestorUnids = extractAncestorUnids(ancestors);
    const sha1Hash = hashElement(contentElement);
    // Clone the run properties so the atom does not share them with the tree.
    const sourceRunProps = getRunProperties({ ancestorElements: ancestors });
    let rPr = null;
    if (sourceRunProps) {
        rPr = sourceRunProps.cloneNode(true);
    }
    // The atom gets its own ancestor array so later changes to the caller's
    // array cannot leak into it.
    const ancestorElements = Array.from(ancestors);
    return {
        contentElement,
        ancestorElements,
        ancestorUnids,
        part,
        revTrackElement,
        sha1Hash,
        correlationStatus,
        rPr,
    };
}
231
+ // =============================================================================
232
+ // Tree Atomization
233
+ // =============================================================================
/**
 * Tell whether a node is a w:p paragraph without content.
 *
 * A paragraph counts as empty when it has no child elements at all, or when
 * every child element is a w:pPr (properties only, no runs).
 */
function isEmptyParagraph(node) {
    if (node.tagName !== 'w:p') {
        return false;
    }
    // `every` is true for an empty list, which covers the "no children" case.
    return childElements(node).every((kid) => kid.tagName === 'w:pPr');
}
/**
 * Build the boundary atom that represents an empty paragraph.
 *
 * Empty paragraphs contain no leaf content, so a synthetic element stands in
 * for them; this keeps them available for document reconstruction.
 *
 * The atom's hash combines three things so that empty paragraphs only match
 * when they sit at the same logical position:
 *   1. the hash of the last content atom seen before it (or 'document-start'),
 *   2. the running count of empty paragraphs recorded in `state`,
 *   3. the hash of the paragraph's w:pPr (or 'no-pPr').
 *
 * @param paragraphElement - The w:p element
 * @param ancestors - Ancestor elements ordered from root to parent
 * @param part - The OPC part
 * @param state - Atomization state (`lastContentHash`, `emptyParagraphCount`)
 */
function createEmptyParagraphAtomWithContext(paragraphElement, ancestors, part, state) {
    // Synthetic stand-in element; it is not attached to any parsed document.
    const placeholder = SYNTHETIC_DOC.createElement(EMPTY_PARAGRAPH_TAG);
    const revTrackElement = findRevisionTrackingElement(ancestors);
    const correlationStatus = getStatusFromRevisionTracking(revTrackElement);
    const paragraphProps = findChildByTagName(paragraphElement, 'w:pPr');
    let pPrHash = 'no-pPr';
    if (paragraphProps) {
        pPrHash = hashElement(paragraphProps);
    }
    const contextHash = state.lastContentHash || 'document-start';
    const hashContent = `empty-paragraph:${contextHash}:${state.emptyParagraphCount}:${pPrHash}`;
    return {
        contentElement: placeholder,
        // The paragraph itself is recorded as the innermost ancestor.
        ancestorElements: [...ancestors, paragraphElement],
        ancestorUnids: extractAncestorUnids(ancestors),
        part,
        revTrackElement,
        sha1Hash: sha1(hashContent),
        correlationStatus,
        isEmptyParagraph: true, // flags the atom as an empty-paragraph marker
        rPr: null, // no runs, hence no run formatting
    };
}
/**
 * Internal recursive atomization with state tracking.
 *
 * Walks the tree depth-first in document order and returns the atoms found:
 * - a leaf node yields one atom and updates `state.lastContentHash`,
 * - an empty paragraph yields one boundary atom and increments
 *   `state.emptyParagraphCount`,
 * - any other node is descended into.
 *
 * Atoms are appended to a single accumulator instead of being merged with
 * `push(...childAtoms)` at every level. Spreading a large array into call
 * arguments can exceed the engine's argument limit (RangeError) on big
 * documents, and it re-copies every atom once per tree level.
 *
 * @param node - The current node
 * @param ancestors - Ancestor elements ordered from root to parent of `node`
 * @param part - The OPC part the tree belongs to
 * @param state - Mutable atomization state shared across the walk
 * @param options - Normalized options; only `cloneLeafNodes` is read here
 * @returns Atoms of the subtree rooted at `node`, in document order
 */
function atomizeTreeInternal(node, ancestors, part, state, options) {
    const atoms = [];
    collectAtomsInto(atoms, node, ancestors, part, state, options);
    return atoms;
}
/**
 * Append the atoms of the subtree rooted at `node` to `out`.
 */
function collectAtomsInto(out, node, ancestors, part, state, options) {
    if (isLeafNode(node)) {
        const atom = createComparisonUnitAtom({
            contentElement: options.cloneLeafNodes ? node.cloneNode(true) : node,
            ancestors,
            part,
        });
        out.push(atom);
        // Remember the last content hash for context-aware empty paragraph matching.
        state.lastContentHash = atom.sha1Hash;
        return;
    }
    if (isEmptyParagraph(node)) {
        // The count is bumped after the atom is built so the atom's hash sees
        // the number of empty paragraphs that preceded it.
        out.push(createEmptyParagraphAtomWithContext(node, ancestors, part, state));
        state.emptyParagraphCount++;
        return;
    }
    for (const child of childElements(node)) {
        collectAtomsInto(out, child, [...ancestors, node], part, state, options);
    }
}
/**
 * Atomize a document tree into a flat list of ComparisonUnitAtoms.
 *
 * Recursively traverses the tree, creating atoms for each leaf node and
 * special atoms for empty paragraphs, then post-processes the raw atoms:
 * field collapsing, merging of contiguous text atoms, optional word
 * splitting, and punctuation merging.
 *
 * @param node - The current node in the tree
 * @param ancestors - Ancestor elements from root to parent of node
 * @param part - The OPC part this tree belongs to
 * @param options - Optional settings. Defaults applied when a value is
 *   null/undefined: `cloneLeafNodes` false, `mergeAcrossRuns` true,
 *   `mergePunctuationAcrossRuns` true, `splitTextIntoWords` true. The
 *   normalized options are forwarded to the merge steps. `debug` (default
 *   false) enables a one-line summary of the atom counts on the console.
 * @returns An object `{ atoms, emptyParagraphCount }`: the final atom list
 *   and the number of empty paragraphs encountered
 */
export function atomizeTree(node, ancestors, part, options = {}) {
    const normalizedOptions = {
        cloneLeafNodes: options.cloneLeafNodes ?? false,
        mergeAcrossRuns: options.mergeAcrossRuns ?? true,
        mergePunctuationAcrossRuns: options.mergePunctuationAcrossRuns ?? true,
        splitTextIntoWords: options.splitTextIntoWords ?? true,
    };
    const state = {
        emptyParagraphCount: 0,
        lastContentHash: '',
    };
    const rawAtoms = atomizeTreeInternal(node, ancestors, part, state, normalizedOptions);
    // Step 1: Collapse field sequences into single atoms based on visible text
    // This allows matching between hardcoded text and field references
    const fieldCollapsedAtoms = collapseFieldSequences(rawAtoms);
    // Step 2: Merge contiguous text atoms with same formatting
    // This normalizes different w:t split boundaries
    const mergedAtoms = mergeContiguousTextAtoms(fieldCollapsedAtoms, normalizedOptions);
    // Step 3: Split merged atoms at word boundaries for finer-grained comparison
    // This enables word-level diffing within paragraphs
    const wordSplitAtoms = normalizedOptions.splitTextIntoWords
        ? splitAtomsIntoWords(mergedAtoms)
        : mergedAtoms;
    // Step 4: Merge punctuation-only atoms with preceding text
    // This handles "Conduct" + "," vs "Conduct," split differences
    // Must run AFTER word split since that's when punctuation becomes separate atoms
    const atoms = mergePunctuationAtoms(wordSplitAtoms, normalizedOptions);
    // Diagnostics are opt-in: a library must not write to the host's stdout
    // on every call.
    if (options.debug === true) {
        console.log(`[DEBUG] atomizeTree: created ${rawAtoms.length} atoms, field-collapsed to ${fieldCollapsedAtoms.length}, merged to ${mergedAtoms.length}, word-split to ${wordSplitAtoms.length}, punct-merged to ${atoms.length}, ${state.emptyParagraphCount} empty paragraphs`);
    }
    return { atoms, emptyParagraphCount: state.emptyParagraphCount };
}
/**
 * Collect the element ancestors of a node by walking `parentNode` links.
 *
 * The walk stops at the first parent that is missing or is not an element
 * (nodeType 1).
 *
 * @param node - The node whose ancestors are wanted
 * @returns Ancestors ordered from the outermost one down to the immediate parent
 */
export function getAncestors(node) {
    const nearestFirst = [];
    for (let cursor = node.parentNode; cursor && cursor.nodeType === 1 /* ELEMENT_NODE */; cursor = cursor.parentNode) {
        nearestFirst.push(cursor);
    }
    // Collected parent-first; flip to root-first order.
    return nearestFirst.reverse();
}
/**
 * Stamp each atom with the index of the paragraph it belongs to.
 *
 * Paragraphs are numbered from 0 in the order they are first encountered
 * while iterating the atoms; an atom's paragraph is the first w:p found in
 * its ancestor list. Atoms with no w:p ancestor are left untouched.
 *
 * The indices allow the document reconstructor to group atoms by paragraph
 * when merging atoms that come from different source trees (original vs
 * revised).
 *
 * @param atoms - Atoms to annotate in place (sets `paragraphIndex`)
 */
export function assignParagraphIndices(atoms) {
    const indexByParagraph = new Map();
    for (const atom of atoms) {
        const paragraph = atom.ancestorElements.find((el) => el.tagName === 'w:p');
        if (!paragraph) {
            continue;
        }
        if (!indexByParagraph.has(paragraph)) {
            // The map grows by one per new paragraph, so its size is the next free index.
            indexByParagraph.set(paragraph, indexByParagraph.size);
        }
        atom.paragraphIndex = indexByParagraph.get(paragraph);
    }
}
403
+ // =============================================================================
404
+ // Field Sequence Collapsing
405
+ // =============================================================================
/**
 * Tag name used for collapsed field atoms, i.e. Word field codes
 * (REF, PAGEREF, etc.) reduced to their visible result.
 */
export const COLLAPSED_FIELD_TAG = '__collapsedField__';
/**
 * Tell whether an atom is the marker that opens a field
 * (a w:fldChar whose w:fldCharType is "begin").
 */
function isFieldBegin(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName !== 'w:fldChar') {
        return false;
    }
    return contentElement.getAttribute('w:fldCharType') === 'begin';
}
418
/**
 * Tell whether an atom is the separator between a field's instruction and
 * its result.
 *
 * @param atom - Atom to inspect
 * @returns True when the content element is a `w:fldChar` whose
 *   `w:fldCharType` attribute is `separate`
 */
function isFieldSeparate(atom) {
    const { contentElement: element } = atom;
    if (element.tagName !== 'w:fldChar') {
        return false;
    }
    return element.getAttribute('w:fldCharType') === 'separate';
}
425
/**
 * Tell whether an atom marks the end of a field.
 *
 * @param atom - Atom to inspect
 * @returns True when the content element is a `w:fldChar` whose
 *   `w:fldCharType` attribute is `end`
 */
function isFieldEnd(atom) {
    const { contentElement: element } = atom;
    if (element.tagName !== 'w:fldChar') {
        return false;
    }
    return element.getAttribute('w:fldCharType') === 'end';
}
432
/**
 * Concatenate the text of every `w:t` atom in a sequence, in order.
 *
 * Atoms whose content element is anything other than `w:t` (field markers,
 * `w:instrText`, ...) contribute nothing. A `w:t` with no leaf text counts
 * as the empty string.
 *
 * @param atoms - Atoms to read text from
 * @returns The joined visible text ('' when there is none)
 */
function extractVisibleText(atoms) {
    let visible = '';
    for (const { contentElement } of atoms) {
        if (contentElement.tagName === 'w:t') {
            visible += getLeafText(contentElement) ?? '';
        }
    }
    return visible;
}
442
/**
 * Detect whether the atoms of a field belong to more than one paragraph.
 *
 * Each atom's paragraph is the first `w:p` entry in its `ancestorElements`;
 * atoms without one are ignored. Multi-paragraph fields (like TOC) are the
 * ones that should not be collapsed.
 *
 * @param fieldAtoms - Atoms making up one field sequence
 * @returns True as soon as two distinct `w:p` elements are seen, else false
 */
function fieldSpansMultipleParagraphs(fieldAtoms) {
    let firstParagraph;
    for (const atom of fieldAtoms) {
        const paragraph = atom.ancestorElements.find((el) => el.tagName === 'w:p');
        if (!paragraph) {
            continue;
        }
        if (firstParagraph === undefined) {
            firstParagraph = paragraph;
        }
        else if (paragraph !== firstParagraph) {
            return true;
        }
    }
    return false;
}
459
/**
 * Collapse field sequences into single atoms based on visible text.
 *
 * Word fields consist of:
 * - w:fldChar[begin] - field start
 * - w:instrText - field instruction (e.g., "REF _Ref123 \h")
 * - w:fldChar[separate] - separates instruction from result
 * - w:t (one or more) - visible result text
 * - w:fldChar[end] - field end
 *
 * Each complete field is replaced by one atom whose content element is a
 * synthetic `w:t` holding only the visible result text, and whose hash is
 * computed from that element. This allows matching between:
 * - Hardcoded text: "2.6"
 * - Field reference: [REF field]2.6[/field]
 *
 * The original atoms are kept on the collapsed atom as `collapsedFieldAtoms`;
 * the remaining metadata is inherited from the field's first atom.
 *
 * Sequences that are passed through unchanged instead of being collapsed:
 * - Multi-paragraph fields (like TOC, INDEX), which would otherwise lose
 *   paragraph structure information.
 * - Unterminated fields, i.e. a begin marker whose matching end marker never
 *   appears. Their extent is unknown, and collapsing them would swallow every
 *   remaining atom and misread the last one as the end marker.
 *
 * Nested fields are tracked by depth; only the outermost field's separator
 * decides where the visible result starts.
 *
 * @param atoms - Array of atoms from atomization
 * @returns Array with field sequences collapsed to single atoms (the input
 *   array itself when it is empty)
 */
export function collapseFieldSequences(atoms) {
    if (atoms.length === 0)
        return atoms;
    const result = [];
    let i = 0;
    while (i < atoms.length) {
        const atom = atoms[i];
        if (!isFieldBegin(atom)) {
            // Not a field - pass through unchanged
            result.push(atom);
            i++;
            continue;
        }
        // Found field start - collect up to and including the matching end
        const fieldAtoms = [atom];
        let depth = 1;
        let separatorIndex = -1;
        i++;
        while (i < atoms.length && depth > 0) {
            const current = atoms[i];
            fieldAtoms.push(current);
            if (isFieldBegin(current)) {
                depth++;
            }
            else if (isFieldEnd(current)) {
                depth--;
            }
            else if (isFieldSeparate(current) && depth === 1) {
                // Track separator position for the outermost field
                separatorIndex = fieldAtoms.length - 1;
            }
            i++;
        }
        // depth > 0 here means the atoms ran out before the matching end marker.
        const isUnterminated = depth > 0;
        if (isUnterminated || fieldSpansMultipleParagraphs(fieldAtoms)) {
            // Pass through all field atoms unchanged. A loop is used rather
            // than push(...fieldAtoms) so very long sequences cannot exceed
            // the engine's argument-count limit.
            for (const fieldAtom of fieldAtoms) {
                result.push(fieldAtom);
            }
            continue;
        }
        // With a separator, the result is everything between it and the end
        // marker (both exclusive). Without one there is no distinct result
        // region, so any w:t inside the field is used; instruction text is
        // not w:t and therefore contributes nothing.
        const resultAtoms = separatorIndex >= 0 ? fieldAtoms.slice(separatorIndex + 1, -1) : fieldAtoms;
        const visibleText = extractVisibleText(resultAtoms);
        const [firstAtom] = fieldAtoms;
        // Use w:t so the collapsed atom looks like ordinary text to later stages
        const virtualElement = SYNTHETIC_DOC.createElement('w:t');
        setLeafText(virtualElement, visibleText);
        result.push({
            contentElement: virtualElement,
            ancestorElements: [...firstAtom.ancestorElements],
            ancestorUnids: firstAtom.ancestorUnids,
            part: firstAtom.part,
            revTrackElement: firstAtom.revTrackElement,
            sha1Hash: hashElement(virtualElement),
            correlationStatus: firstAtom.correlationStatus,
            // Store original atoms for document reconstruction
            collapsedFieldAtoms: fieldAtoms,
            // Inherit rPr from first atom in the field sequence
            rPr: firstAtom.rPr,
        });
    }
    return result;
}
556
+ // =============================================================================
557
+ // Word-Level Splitting
558
+ // =============================================================================
559
/**
 * Split a w:t atom into word-level atoms.
 *
 * This enables finer-grained comparison when text is stored in single w:t
 * elements. For example, "Hello World" becomes ["Hello", " ", "World"].
 * Whitespace runs are kept as separate atoms so spacing is preserved.
 *
 * The atom is returned unchanged (as a one-element array) when:
 * - its content element is not `w:t`,
 * - it is a collapsed field (`collapsedFieldAtoms` is set), or
 * - its text yields at most one non-empty part, i.e. it is empty, a single
 *   word, or whitespace only.
 *
 * Any whitespace matched by `\s` (space, tab, newline, non-breaking space,
 * ...) separates words, so the decision to split agrees with the splitter.
 *
 * Each produced atom gets a fresh synthetic `w:t` element carrying a copy of
 * the original element's attributes, a hash computed from that element, and a
 * `splitFromAtom` back-reference. Other metadata is shared by reference with
 * the original atom.
 *
 * @param atom - A w:t atom to split
 * @returns Array of word-level atoms (or `[atom]` when nothing is split)
 */
function splitAtomIntoWords(atom) {
    // Only split w:t elements
    if (atom.contentElement.tagName !== 'w:t') {
        return [atom];
    }
    // Don't split collapsed fields - they should stay as-is
    if (atom.collapsedFieldAtoms) {
        return [atom];
    }
    const text = getLeafText(atom.contentElement) ?? '';
    // The capture group makes split() keep the whitespace runs; leading or
    // trailing whitespace produces empty strings, which are discarded.
    const parts = text.split(/(\s+)/).filter((part) => part !== '');
    if (parts.length <= 1) {
        return [atom];
    }
    const sourceAttributes = atom.contentElement.attributes;
    return parts.map((part) => {
        // Create a new element for this word/whitespace
        const wordElement = SYNTHETIC_DOC.createElement('w:t');
        // Copy attributes from the original content element
        for (let i = 0; i < sourceAttributes.length; i++) {
            const attr = sourceAttributes[i];
            wordElement.setAttribute(attr.name, attr.value);
        }
        setLeafText(wordElement, part);
        return {
            contentElement: wordElement,
            ancestorElements: atom.ancestorElements,
            ancestorUnids: atom.ancestorUnids,
            part: atom.part,
            revTrackElement: atom.revTrackElement,
            sha1Hash: hashElement(wordElement),
            correlationStatus: atom.correlationStatus,
            paragraphIndex: atom.paragraphIndex,
            // Track that this came from a split atom for potential later merge
            splitFromAtom: atom,
            // Share rPr reference (read-only after atomization)
            rPr: atom.rPr,
        };
    });
}
621
/**
 * Apply word-level splitting to every atom in a list.
 *
 * Each atom is handed to splitAtomIntoWords and the resulting pieces are
 * concatenated in order.
 *
 * @param atoms - Array of atoms
 * @returns New array with every atom replaced by its split pieces
 */
export function splitAtomsIntoWords(atoms) {
    return atoms.flatMap((atom) => splitAtomIntoWords(atom));
}
634
+ // =============================================================================
635
+ // Atom Boundary Normalization
636
+ // =============================================================================
637
/**
 * Look up the run properties (w:rPr) that apply to an atom.
 *
 * The run is the first `w:r` entry in the atom's `ancestorElements`; its
 * `w:rPr` child is located with findChildByTagName.
 *
 * @param atom - Atom whose run formatting is wanted
 * @returns The `w:rPr` element, or undefined when the atom has no `w:r`
 *   ancestor or the run has no `w:rPr` child
 */
function getRunProperties(atom) {
    const enclosingRun = atom.ancestorElements.find(({ tagName }) => tagName === 'w:r');
    if (enclosingRun) {
        // Normalize a null "not found" result to undefined.
        return findChildByTagName(enclosingRun, 'w:rPr') ?? undefined;
    }
    return undefined;
}
646
/**
 * Compute a hash of an element that also covers its whole subtree.
 *
 * The hashed string is built from, in order: the tag name, every attribute
 * as `name=value` (sorted by name so attribute order does not matter), the
 * element's leaf text when getLeafText returns something other than
 * undefined, and the deep hash of each child element. The pieces are joined
 * with '|' and passed to sha1.
 *
 * @param element - Element to hash
 * @returns The sha1 digest of the element and its descendants
 */
function hashElementDeep(element) {
    const attributePairs = [];
    for (let idx = 0; idx < element.attributes.length; idx++) {
        const { name, value } = element.attributes[idx];
        attributePairs.push([name, value]);
    }
    attributePairs.sort(([left], [right]) => left.localeCompare(right));
    const text = getLeafText(element);
    const segments = [
        element.tagName,
        ...attributePairs.map(([name, value]) => `${name}=${value}`),
        ...(text !== undefined ? [text] : []),
        // Recurse so nested property elements take part in the hash.
        ...Array.from(childElements(element), (child) => hashElementDeep(child)),
    ];
    return sha1(segments.join('|'));
}
671
/**
 * Decide whether two w:rPr elements describe the same formatting.
 *
 * Two missing values count as equal (no formatting on either side); one
 * missing value never equals a present one. Otherwise the elements are
 * compared through hashElementDeep, so their children are taken into account.
 *
 * @param a - First w:rPr element, or undefined
 * @param b - Second w:rPr element, or undefined
 * @returns True when both are absent or both hash identically
 */
function runPropertiesEqual(a, b) {
    if (a && b) {
        return hashElementDeep(a) === hashElementDeep(b);
    }
    // At least one side is missing: equal only if both are.
    return !a && !b;
}
685
/**
 * Decide whether atom `b` may be folded into the atom `a` that precedes it.
 *
 * All of the following must hold:
 * - both content elements are `w:t`
 * - neither atom is a collapsed field (fields stay separate for finer diff)
 * - both resolve to the same `w:p` ancestor
 * - their revision-tracking elements have the same tag name (or both have none)
 * - they resolve to the same `w:r` ancestor, or `options.mergeAcrossRuns`
 *   is set and the two runs have equivalent `w:rPr`
 *
 * Ancestors are resolved as the first entry with the given tag name in
 * `ancestorElements`.
 *
 * @param a - First atom
 * @param b - Second atom (immediately following a)
 * @param options - Merge options; `mergeAcrossRuns` enables the cross-run case
 * @returns True if atoms can be merged
 */
function canMergeAtoms(a, b, options) {
    const isTextAtom = (atom) => atom.contentElement.tagName === 'w:t';
    if (!isTextAtom(a) || !isTextAtom(b)) {
        return false;
    }
    if (a.collapsedFieldAtoms || b.collapsedFieldAtoms) {
        return false;
    }
    const firstAncestor = (atom, tag) => atom.ancestorElements.find((el) => el.tagName === tag);
    if (firstAncestor(a, 'w:p') !== firstAncestor(b, 'w:p')) {
        return false;
    }
    if (a.revTrackElement?.tagName !== b.revTrackElement?.tagName) {
        return false;
    }
    // Same run: formatting is shared by construction, no comparison needed.
    if (firstAncestor(a, 'w:r') === firstAncestor(b, 'w:r')) {
        return true;
    }
    // Different runs: only mergeable when the caller opted in (in-place mode
    // turns this off so each atom stays anchored to a real run) and the two
    // runs are formatted identically.
    return options.mergeAcrossRuns
        ? runPropertiesEqual(getRunProperties(a), getRunProperties(b))
        : false;
}
732
/**
 * Append the source atom's text to the target atom.
 *
 * The target's content element is rewritten in place with the concatenated
 * text (missing text counts as ''), and `target.sha1Hash` is recomputed from
 * the updated element. The source atom is only read.
 *
 * @param target - Atom to merge into (mutated)
 * @param source - Atom to merge from
 */
function mergeIntoAtom(target, source) {
    const { contentElement: targetElement } = target;
    const head = getLeafText(targetElement) ?? '';
    const tail = getLeafText(source.contentElement) ?? '';
    setLeafText(targetElement, head + tail);
    // The hash depends on the element's text, so it must follow the change.
    target.sha1Hash = hashElement(targetElement);
}
748
/**
 * Tell whether an atom is a `w:t` whose text consists solely of trailing-style
 * punctuation.
 *
 * The accepted characters are , . : ; ! ? ' " ) ] } > and the text must
 * contain at least one of them and nothing else.
 *
 * @param atom - Atom to inspect
 * @returns True for a punctuation-only `w:t` atom, false otherwise
 */
function isPunctuationOnlyAtom(atom) {
    const { contentElement } = atom;
    if (contentElement.tagName === 'w:t') {
        // Punctuation that should attach to the word before it
        return /^[,.:;!?'")\]}>]+$/.test(getLeafText(contentElement) ?? '');
    }
    return false;
}
758
/**
 * Check if two atoms can be merged for punctuation normalization.
 *
 * More permissive than canMergeAtoms about runs: run formatting is never
 * compared, and when `options.mergePunctuationAcrossRuns` is set the atoms
 * may sit in different runs. Required in all cases:
 * - both content elements are `w:t`
 * - `b` is punctuation-only (see isPunctuationOnlyAtom)
 * - neither atom is a collapsed field
 * - both resolve to the same `w:p` ancestor
 * - their revision-tracking elements have the same tag name (or both have none)
 * - `a`'s text ends with a word character
 *
 * "Word character" is Unicode-aware: any letter, combining mark, number or
 * underscore. An ASCII-only `\w` would refuse to attach punctuation to words
 * such as "café" or to non-Latin scripts.
 *
 * @param a - Preceding atom that would receive the punctuation
 * @param b - Candidate punctuation atom (immediately following a)
 * @param options - Merge options; `mergePunctuationAcrossRuns` lifts the
 *   same-run requirement
 * @returns True if `b` can be merged into `a`
 */
function canMergePunctuation(a, b, options) {
    // Only merge w:t elements
    if (a.contentElement.tagName !== 'w:t')
        return false;
    if (b.contentElement.tagName !== 'w:t')
        return false;
    // B must be punctuation-only
    if (!isPunctuationOnlyAtom(b))
        return false;
    // Never merge collapsed fields
    if (a.collapsedFieldAtoms || b.collapsedFieldAtoms)
        return false;
    // Must be in the same paragraph
    const aPara = a.ancestorElements.find((e) => e.tagName === 'w:p');
    const bPara = b.ancestorElements.find((e) => e.tagName === 'w:p');
    if (aPara !== bPara)
        return false;
    // Must have same revision tracking status
    const aRevTag = a.revTrackElement?.tagName;
    const bRevTag = b.revTrackElement?.tagName;
    if (aRevTag !== bRevTag)
        return false;
    // A must end with a word character (not whitespace or punctuation).
    // \p{L}/\p{M}/\p{N} cover letters, combining marks and digits of every
    // script; "_" keeps parity with what \w accepted.
    const aText = getLeafText(a.contentElement) ?? '';
    if (!/[\p{L}\p{M}\p{N}_]$/u.test(aText))
        return false;
    // If cross-run punctuation merge is disabled, require same run.
    if (!options.mergePunctuationAcrossRuns) {
        const aRun = a.ancestorElements.find((e) => e.tagName === 'w:r');
        const bRun = b.ancestorElements.find((e) => e.tagName === 'w:r');
        if (aRun !== bRun)
            return false;
    }
    return true;
}
800
+ /**
801
+ * Merge punctuation-only atoms with preceding text.
802
+ *
803
+ * This handles cases where documents have different w:t boundaries around
804
+ * punctuation (e.g., "Conduct" + "," vs "Conduct,"). Punctuation is merged
805
+ * with the preceding word regardless of run formatting differences.
806
+ *
807
+ * @param atoms - Array of atoms
808
+ * @returns Atoms with punctuation merged into preceding text
809
+ */
810
+ export function mergePunctuationAtoms(atoms, options = { mergePunctuationAcrossRuns: true }) {
811
+ if (atoms.length === 0)
812
+ return atoms;
813
+ const result = [];
814
+ for (const atom of atoms) {
815
+ const prev = result[result.length - 1];
816
+ if (prev && canMergePunctuation(prev, atom, options)) {
817
+ // Merge punctuation into previous atom
818
+ mergeIntoAtom(prev, atom);
819
+ }
820
+ else {
821
+ result.push(atom);
822
+ }
823
+ }
824
+ return result;
825
+ }
826
/**
 * Merge runs of adjacent, compatible w:t atoms into single atoms.
 *
 * This normalization ensures that identical text split differently across
 * w:t elements in original vs revised documents will produce matching hashes.
 *
 * Example:
 * Before: ["Def", "initions"] (2 atoms)
 * After: ["Definitions"] (1 atom)
 *
 * Compatibility of each adjacent pair is decided by canMergeAtoms: atoms in
 * the same run merge, and with the default `mergeAcrossRuns: true` atoms in
 * different runs merge too when their run formatting is equivalent.
 *
 * Merging rewrites the receiving atom in place (via mergeIntoAtom), so atoms
 * from the input array that absorb a neighbour are modified.
 *
 * @param atoms - Array of atoms from atomization
 * @param options - `mergeAcrossRuns` (default true) allows merges across
 *   run boundaries
 * @returns Normalized array with contiguous text atoms merged (the input
 *   array itself when it is empty)
 */
export function mergeContiguousTextAtoms(atoms, options = { mergeAcrossRuns: true }) {
    if (atoms.length === 0) {
        return atoms;
    }
    const merged = [];
    let tail; // last atom placed in `merged`, i.e. the current merge target
    for (let idx = 0; idx < atoms.length; idx++) {
        const candidate = atoms[idx];
        if (tail && canMergeAtoms(tail, candidate, options)) {
            mergeIntoAtom(tail, candidate);
            continue;
        }
        merged.push(candidate);
        tail = candidate;
    }
    return merged;
}
856
+ //# sourceMappingURL=atomizer.js.map