docxmlater 10.1.3 → 10.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (371)
  1. package/README.md +759 -754
  2. package/dist/constants/legacyCompatFlags.js +1 -1
  3. package/dist/constants/legacyCompatFlags.js.map +1 -1
  4. package/dist/constants/limits.js.map +1 -1
  5. package/dist/core/Document.d.ts +50 -50
  6. package/dist/core/Document.d.ts.map +1 -1
  7. package/dist/core/Document.js +483 -471
  8. package/dist/core/Document.js.map +1 -1
  9. package/dist/core/DocumentContent.d.ts +9 -9
  10. package/dist/core/DocumentContent.d.ts.map +1 -1
  11. package/dist/core/DocumentContent.js +1 -1
  12. package/dist/core/DocumentContent.js.map +1 -1
  13. package/dist/core/DocumentGenerator.d.ts +11 -11
  14. package/dist/core/DocumentGenerator.d.ts.map +1 -1
  15. package/dist/core/DocumentGenerator.js +251 -251
  16. package/dist/core/DocumentGenerator.js.map +1 -1
  17. package/dist/core/DocumentIdManager.js.map +1 -1
  18. package/dist/core/DocumentParser.d.ts +15 -15
  19. package/dist/core/DocumentParser.d.ts.map +1 -1
  20. package/dist/core/DocumentParser.js +2123 -2155
  21. package/dist/core/DocumentParser.js.map +1 -1
  22. package/dist/core/DocumentValidator.d.ts.map +1 -1
  23. package/dist/core/DocumentValidator.js +2 -5
  24. package/dist/core/DocumentValidator.js.map +1 -1
  25. package/dist/core/Relationship.js.map +1 -1
  26. package/dist/core/RelationshipManager.d.ts.map +1 -1
  27. package/dist/core/RelationshipManager.js +3 -3
  28. package/dist/core/RelationshipManager.js.map +1 -1
  29. package/dist/elements/AlternateContent.js.map +1 -1
  30. package/dist/elements/Bookmark.d.ts.map +1 -1
  31. package/dist/elements/Bookmark.js +3 -1
  32. package/dist/elements/Bookmark.js.map +1 -1
  33. package/dist/elements/BookmarkManager.d.ts.map +1 -1
  34. package/dist/elements/BookmarkManager.js.map +1 -1
  35. package/dist/elements/Comment.d.ts.map +1 -1
  36. package/dist/elements/Comment.js +9 -6
  37. package/dist/elements/Comment.js.map +1 -1
  38. package/dist/elements/CommentManager.d.ts.map +1 -1
  39. package/dist/elements/CommentManager.js +18 -17
  40. package/dist/elements/CommentManager.js.map +1 -1
  41. package/dist/elements/CommonTypes.d.ts +21 -21
  42. package/dist/elements/CommonTypes.d.ts.map +1 -1
  43. package/dist/elements/CommonTypes.js +56 -56
  44. package/dist/elements/CommonTypes.js.map +1 -1
  45. package/dist/elements/CustomXml.js.map +1 -1
  46. package/dist/elements/Endnote.d.ts.map +1 -1
  47. package/dist/elements/Endnote.js +6 -6
  48. package/dist/elements/Endnote.js.map +1 -1
  49. package/dist/elements/EndnoteManager.d.ts.map +1 -1
  50. package/dist/elements/EndnoteManager.js +6 -7
  51. package/dist/elements/EndnoteManager.js.map +1 -1
  52. package/dist/elements/Field.d.ts.map +1 -1
  53. package/dist/elements/Field.js +82 -25
  54. package/dist/elements/Field.js.map +1 -1
  55. package/dist/elements/FieldHelpers.d.ts.map +1 -1
  56. package/dist/elements/FieldHelpers.js.map +1 -1
  57. package/dist/elements/FontManager.d.ts.map +1 -1
  58. package/dist/elements/FontManager.js +1 -1
  59. package/dist/elements/FontManager.js.map +1 -1
  60. package/dist/elements/Footer.js +2 -2
  61. package/dist/elements/Footer.js.map +1 -1
  62. package/dist/elements/Footnote.d.ts.map +1 -1
  63. package/dist/elements/Footnote.js +6 -6
  64. package/dist/elements/Footnote.js.map +1 -1
  65. package/dist/elements/FootnoteManager.d.ts.map +1 -1
  66. package/dist/elements/FootnoteManager.js +6 -7
  67. package/dist/elements/FootnoteManager.js.map +1 -1
  68. package/dist/elements/Header.js +2 -2
  69. package/dist/elements/Header.js.map +1 -1
  70. package/dist/elements/HeaderFooterManager.js.map +1 -1
  71. package/dist/elements/Hyperlink.d.ts +5 -3
  72. package/dist/elements/Hyperlink.d.ts.map +1 -1
  73. package/dist/elements/Hyperlink.js +134 -76
  74. package/dist/elements/Hyperlink.js.map +1 -1
  75. package/dist/elements/Image.d.ts.map +1 -1
  76. package/dist/elements/Image.js +238 -106
  77. package/dist/elements/Image.js.map +1 -1
  78. package/dist/elements/ImageManager.d.ts.map +1 -1
  79. package/dist/elements/ImageManager.js +1 -1
  80. package/dist/elements/ImageManager.js.map +1 -1
  81. package/dist/elements/ImageRun.js +1 -1
  82. package/dist/elements/ImageRun.js.map +1 -1
  83. package/dist/elements/MathElement.js.map +1 -1
  84. package/dist/elements/Paragraph.d.ts +24 -24
  85. package/dist/elements/Paragraph.d.ts.map +1 -1
  86. package/dist/elements/Paragraph.js +181 -188
  87. package/dist/elements/Paragraph.js.map +1 -1
  88. package/dist/elements/PreservedElement.js.map +1 -1
  89. package/dist/elements/PropertyChangeTypes.d.ts.map +1 -1
  90. package/dist/elements/PropertyChangeTypes.js +6 -6
  91. package/dist/elements/PropertyChangeTypes.js.map +1 -1
  92. package/dist/elements/RangeMarker.d.ts.map +1 -1
  93. package/dist/elements/RangeMarker.js.map +1 -1
  94. package/dist/elements/Revision.d.ts.map +1 -1
  95. package/dist/elements/Revision.js +4 -5
  96. package/dist/elements/Revision.js.map +1 -1
  97. package/dist/elements/RevisionContent.js.map +1 -1
  98. package/dist/elements/RevisionManager.d.ts.map +1 -1
  99. package/dist/elements/RevisionManager.js +40 -48
  100. package/dist/elements/RevisionManager.js.map +1 -1
  101. package/dist/elements/Run.d.ts +16 -16
  102. package/dist/elements/Run.d.ts.map +1 -1
  103. package/dist/elements/Run.js +256 -238
  104. package/dist/elements/Run.js.map +1 -1
  105. package/dist/elements/Section.d.ts.map +1 -1
  106. package/dist/elements/Section.js +36 -11
  107. package/dist/elements/Section.js.map +1 -1
  108. package/dist/elements/Shape.d.ts.map +1 -1
  109. package/dist/elements/Shape.js.map +1 -1
  110. package/dist/elements/StructuredDocumentTag.d.ts +6 -6
  111. package/dist/elements/StructuredDocumentTag.d.ts.map +1 -1
  112. package/dist/elements/StructuredDocumentTag.js +99 -104
  113. package/dist/elements/StructuredDocumentTag.js.map +1 -1
  114. package/dist/elements/Table.d.ts +11 -11
  115. package/dist/elements/Table.d.ts.map +1 -1
  116. package/dist/elements/Table.js +102 -107
  117. package/dist/elements/Table.js.map +1 -1
  118. package/dist/elements/TableCell.d.ts +10 -10
  119. package/dist/elements/TableCell.d.ts.map +1 -1
  120. package/dist/elements/TableCell.js +105 -106
  121. package/dist/elements/TableCell.js.map +1 -1
  122. package/dist/elements/TableGridChange.d.ts.map +1 -1
  123. package/dist/elements/TableGridChange.js.map +1 -1
  124. package/dist/elements/TableOfContents.d.ts.map +1 -1
  125. package/dist/elements/TableOfContents.js +4 -4
  126. package/dist/elements/TableOfContents.js.map +1 -1
  127. package/dist/elements/TableOfContentsElement.js.map +1 -1
  128. package/dist/elements/TableRow.d.ts.map +1 -1
  129. package/dist/elements/TableRow.js +13 -6
  130. package/dist/elements/TableRow.js.map +1 -1
  131. package/dist/elements/TextBox.d.ts.map +1 -1
  132. package/dist/elements/TextBox.js +3 -5
  133. package/dist/elements/TextBox.js.map +1 -1
  134. package/dist/formatting/AbstractNumbering.d.ts +4 -4
  135. package/dist/formatting/AbstractNumbering.d.ts.map +1 -1
  136. package/dist/formatting/AbstractNumbering.js +54 -49
  137. package/dist/formatting/AbstractNumbering.js.map +1 -1
  138. package/dist/formatting/NumberingInstance.d.ts.map +1 -1
  139. package/dist/formatting/NumberingInstance.js +1 -3
  140. package/dist/formatting/NumberingInstance.js.map +1 -1
  141. package/dist/formatting/NumberingLevel.d.ts +5 -5
  142. package/dist/formatting/NumberingLevel.d.ts.map +1 -1
  143. package/dist/formatting/NumberingLevel.js +119 -125
  144. package/dist/formatting/NumberingLevel.js.map +1 -1
  145. package/dist/formatting/NumberingManager.d.ts.map +1 -1
  146. package/dist/formatting/NumberingManager.js +9 -9
  147. package/dist/formatting/NumberingManager.js.map +1 -1
  148. package/dist/formatting/Style.d.ts +11 -11
  149. package/dist/formatting/Style.d.ts.map +1 -1
  150. package/dist/formatting/Style.js +219 -247
  151. package/dist/formatting/Style.js.map +1 -1
  152. package/dist/formatting/StylesManager.d.ts +2 -2
  153. package/dist/formatting/StylesManager.d.ts.map +1 -1
  154. package/dist/formatting/StylesManager.js +96 -102
  155. package/dist/formatting/StylesManager.js.map +1 -1
  156. package/dist/helpers/CleanupHelper.d.ts +1 -1
  157. package/dist/helpers/CleanupHelper.d.ts.map +1 -1
  158. package/dist/helpers/CleanupHelper.js +6 -6
  159. package/dist/helpers/CleanupHelper.js.map +1 -1
  160. package/dist/images/ImageOptimizer.js +7 -7
  161. package/dist/images/ImageOptimizer.js.map +1 -1
  162. package/dist/index.d.ts +9 -9
  163. package/dist/index.d.ts.map +1 -1
  164. package/dist/index.js.map +1 -1
  165. package/dist/managers/DrawingManager.js.map +1 -1
  166. package/dist/tracking/DocumentTrackingContext.d.ts.map +1 -1
  167. package/dist/tracking/DocumentTrackingContext.js +23 -7
  168. package/dist/tracking/DocumentTrackingContext.js.map +1 -1
  169. package/dist/tracking/TrackingContext.d.ts.map +1 -1
  170. package/dist/tracking/TrackingContext.js.map +1 -1
  171. package/dist/types/compatibility-types.js.map +1 -1
  172. package/dist/types/formatting.js.map +1 -1
  173. package/dist/types/list-types.d.ts +6 -6
  174. package/dist/types/list-types.js.map +1 -1
  175. package/dist/types/settings-types.js.map +1 -1
  176. package/dist/types/styleConfig.d.ts +2 -2
  177. package/dist/types/styleConfig.js.map +1 -1
  178. package/dist/utils/ChangelogGenerator.d.ts.map +1 -1
  179. package/dist/utils/ChangelogGenerator.js +97 -101
  180. package/dist/utils/ChangelogGenerator.js.map +1 -1
  181. package/dist/utils/CompatibilityUpgrader.d.ts.map +1 -1
  182. package/dist/utils/CompatibilityUpgrader.js +1 -1
  183. package/dist/utils/CompatibilityUpgrader.js.map +1 -1
  184. package/dist/utils/InMemoryRevisionAcceptor.d.ts.map +1 -1
  185. package/dist/utils/InMemoryRevisionAcceptor.js +1 -6
  186. package/dist/utils/InMemoryRevisionAcceptor.js.map +1 -1
  187. package/dist/utils/MoveOperationHelper.d.ts.map +1 -1
  188. package/dist/utils/MoveOperationHelper.js +1 -1
  189. package/dist/utils/MoveOperationHelper.js.map +1 -1
  190. package/dist/utils/RevisionAwareProcessor.d.ts.map +1 -1
  191. package/dist/utils/RevisionAwareProcessor.js +2 -4
  192. package/dist/utils/RevisionAwareProcessor.js.map +1 -1
  193. package/dist/utils/RevisionWalker.d.ts.map +1 -1
  194. package/dist/utils/RevisionWalker.js +4 -12
  195. package/dist/utils/RevisionWalker.js.map +1 -1
  196. package/dist/utils/SelectiveRevisionAcceptor.d.ts.map +1 -1
  197. package/dist/utils/SelectiveRevisionAcceptor.js +2 -6
  198. package/dist/utils/SelectiveRevisionAcceptor.js.map +1 -1
  199. package/dist/utils/ShadingResolver.d.ts.map +1 -1
  200. package/dist/utils/ShadingResolver.js +1 -1
  201. package/dist/utils/ShadingResolver.js.map +1 -1
  202. package/dist/utils/acceptRevisions.d.ts.map +1 -1
  203. package/dist/utils/acceptRevisions.js +23 -12
  204. package/dist/utils/acceptRevisions.js.map +1 -1
  205. package/dist/utils/cnfStyleDecoder.d.ts +1 -1
  206. package/dist/utils/cnfStyleDecoder.d.ts.map +1 -1
  207. package/dist/utils/cnfStyleDecoder.js +40 -40
  208. package/dist/utils/cnfStyleDecoder.js.map +1 -1
  209. package/dist/utils/corruptionDetection.d.ts.map +1 -1
  210. package/dist/utils/corruptionDetection.js.map +1 -1
  211. package/dist/utils/dateFormatting.js.map +1 -1
  212. package/dist/utils/deepClone.js +1 -1
  213. package/dist/utils/deepClone.js.map +1 -1
  214. package/dist/utils/diagnostics.d.ts.map +1 -1
  215. package/dist/utils/diagnostics.js +1 -1
  216. package/dist/utils/diagnostics.js.map +1 -1
  217. package/dist/utils/errorHandling.js.map +1 -1
  218. package/dist/utils/formatting.d.ts.map +1 -1
  219. package/dist/utils/formatting.js +10 -2
  220. package/dist/utils/formatting.js.map +1 -1
  221. package/dist/utils/list-detection.d.ts +2 -2
  222. package/dist/utils/list-detection.d.ts.map +1 -1
  223. package/dist/utils/list-detection.js +21 -23
  224. package/dist/utils/list-detection.js.map +1 -1
  225. package/dist/utils/logger.d.ts.map +1 -1
  226. package/dist/utils/logger.js +12 -7
  227. package/dist/utils/logger.js.map +1 -1
  228. package/dist/utils/parsingHelpers.js.map +1 -1
  229. package/dist/utils/stripTrackedChanges.d.ts.map +1 -1
  230. package/dist/utils/stripTrackedChanges.js +3 -3
  231. package/dist/utils/stripTrackedChanges.js.map +1 -1
  232. package/dist/utils/textDiff.d.ts +1 -1
  233. package/dist/utils/textDiff.js +8 -8
  234. package/dist/utils/textDiff.js.map +1 -1
  235. package/dist/utils/units.js.map +1 -1
  236. package/dist/utils/validation.d.ts.map +1 -1
  237. package/dist/utils/validation.js +24 -7
  238. package/dist/utils/validation.js.map +1 -1
  239. package/dist/utils/xmlSanitization.d.ts.map +1 -1
  240. package/dist/utils/xmlSanitization.js +3 -3
  241. package/dist/utils/xmlSanitization.js.map +1 -1
  242. package/dist/validation/RevisionAutoFixer.d.ts.map +1 -1
  243. package/dist/validation/RevisionAutoFixer.js +5 -5
  244. package/dist/validation/RevisionAutoFixer.js.map +1 -1
  245. package/dist/validation/RevisionValidator.d.ts.map +1 -1
  246. package/dist/validation/RevisionValidator.js +7 -9
  247. package/dist/validation/RevisionValidator.js.map +1 -1
  248. package/dist/validation/ValidationRules.js +3 -3
  249. package/dist/validation/ValidationRules.js.map +1 -1
  250. package/dist/validation/index.js.map +1 -1
  251. package/dist/xml/XMLBuilder.d.ts +1 -1
  252. package/dist/xml/XMLBuilder.d.ts.map +1 -1
  253. package/dist/xml/XMLBuilder.js +98 -100
  254. package/dist/xml/XMLBuilder.js.map +1 -1
  255. package/dist/xml/XMLParser.d.ts.map +1 -1
  256. package/dist/xml/XMLParser.js +61 -66
  257. package/dist/xml/XMLParser.js.map +1 -1
  258. package/dist/zip/ZipHandler.d.ts.map +1 -1
  259. package/dist/zip/ZipHandler.js.map +1 -1
  260. package/dist/zip/ZipReader.d.ts.map +1 -1
  261. package/dist/zip/ZipReader.js +1 -3
  262. package/dist/zip/ZipReader.js.map +1 -1
  263. package/dist/zip/ZipWriter.d.ts +1 -1
  264. package/dist/zip/ZipWriter.d.ts.map +1 -1
  265. package/dist/zip/ZipWriter.js +28 -36
  266. package/dist/zip/ZipWriter.js.map +1 -1
  267. package/dist/zip/types.js +1 -1
  268. package/dist/zip/types.js.map +1 -1
  269. package/package.json +92 -92
  270. package/src/__tests__/helper-methods.test.ts +512 -512
  271. package/src/constants/legacyCompatFlags.ts +138 -138
  272. package/src/constants/limits.ts +50 -50
  273. package/src/core/Document.ts +985 -1145
  274. package/src/core/DocumentContent.ts +461 -467
  275. package/src/core/DocumentGenerator.ts +1133 -1104
  276. package/src/core/DocumentIdManager.ts +158 -158
  277. package/src/core/DocumentParser.ts +2347 -2716
  278. package/src/core/DocumentValidator.ts +363 -372
  279. package/src/core/Relationship.ts +367 -367
  280. package/src/core/RelationshipManager.ts +429 -428
  281. package/src/elements/AlternateContent.ts +42 -42
  282. package/src/elements/Bookmark.ts +212 -210
  283. package/src/elements/BookmarkManager.ts +247 -250
  284. package/src/elements/Comment.ts +356 -359
  285. package/src/elements/CommentManager.ts +499 -502
  286. package/src/elements/CommonTypes.ts +524 -549
  287. package/src/elements/CustomXml.ts +36 -36
  288. package/src/elements/Endnote.ts +221 -217
  289. package/src/elements/EndnoteManager.ts +246 -249
  290. package/src/elements/Field.ts +1292 -1233
  291. package/src/elements/FieldHelpers.ts +329 -333
  292. package/src/elements/FontManager.ts +336 -339
  293. package/src/elements/Footer.ts +269 -269
  294. package/src/elements/Footnote.ts +221 -217
  295. package/src/elements/FootnoteManager.ts +246 -249
  296. package/src/elements/Header.ts +269 -269
  297. package/src/elements/HeaderFooterManager.ts +219 -219
  298. package/src/elements/Hyperlink.ts +1288 -1193
  299. package/src/elements/Image.ts +1982 -1756
  300. package/src/elements/ImageManager.ts +437 -432
  301. package/src/elements/ImageRun.ts +59 -59
  302. package/src/elements/MathElement.ts +65 -65
  303. package/src/elements/Paragraph.ts +4347 -4287
  304. package/src/elements/PreservedElement.ts +53 -53
  305. package/src/elements/PropertyChangeTypes.ts +458 -442
  306. package/src/elements/RangeMarker.ts +382 -400
  307. package/src/elements/Revision.ts +1198 -1217
  308. package/src/elements/RevisionContent.ts +73 -73
  309. package/src/elements/RevisionManager.ts +1070 -1070
  310. package/src/elements/Run.ts +3103 -3073
  311. package/src/elements/Section.ts +1521 -1421
  312. package/src/elements/Shape.ts +884 -873
  313. package/src/elements/StructuredDocumentTag.ts +1176 -1207
  314. package/src/elements/Table.ts +2468 -2524
  315. package/src/elements/TableCell.ts +1617 -1621
  316. package/src/elements/TableGridChange.ts +149 -151
  317. package/src/elements/TableOfContents.ts +701 -691
  318. package/src/elements/TableOfContentsElement.ts +89 -89
  319. package/src/elements/TableRow.ts +960 -929
  320. package/src/elements/TextBox.ts +766 -768
  321. package/src/formatting/AbstractNumbering.ts +580 -579
  322. package/src/formatting/NumberingInstance.ts +295 -299
  323. package/src/formatting/NumberingLevel.ts +981 -1040
  324. package/src/formatting/NumberingManager.ts +833 -827
  325. package/src/formatting/Style.ts +1785 -1879
  326. package/src/formatting/StylesManager.ts +1090 -1130
  327. package/src/helpers/CleanupHelper.ts +524 -524
  328. package/src/images/ImageOptimizer.ts +274 -274
  329. package/src/index.ts +559 -554
  330. package/src/managers/DrawingManager.ts +319 -319
  331. package/src/tracking/DocumentTrackingContext.ts +687 -674
  332. package/src/tracking/TrackingContext.ts +175 -173
  333. package/src/types/compatibility-types.ts +49 -49
  334. package/src/types/formatting.ts +210 -210
  335. package/src/types/list-types.ts +14 -14
  336. package/src/types/settings-types.ts +59 -59
  337. package/src/types/styleConfig.ts +189 -189
  338. package/src/utils/ChangelogGenerator.ts +1583 -1581
  339. package/src/utils/CompatibilityUpgrader.ts +235 -237
  340. package/src/utils/InMemoryRevisionAcceptor.ts +691 -696
  341. package/src/utils/MoveOperationHelper.ts +233 -238
  342. package/src/utils/RevisionAwareProcessor.ts +518 -526
  343. package/src/utils/RevisionWalker.ts +427 -457
  344. package/src/utils/SelectiveRevisionAcceptor.ts +662 -683
  345. package/src/utils/ShadingResolver.ts +105 -107
  346. package/src/utils/acceptRevisions.ts +723 -714
  347. package/src/utils/cnfStyleDecoder.ts +212 -217
  348. package/src/utils/corruptionDetection.ts +346 -345
  349. package/src/utils/dateFormatting.ts +20 -20
  350. package/src/utils/deepClone.ts +77 -78
  351. package/src/utils/diagnostics.ts +125 -129
  352. package/src/utils/errorHandling.ts +80 -80
  353. package/src/utils/formatting.ts +220 -213
  354. package/src/utils/list-detection.ts +32 -42
  355. package/src/utils/logger.ts +412 -404
  356. package/src/utils/parsingHelpers.ts +190 -190
  357. package/src/utils/stripTrackedChanges.ts +356 -353
  358. package/src/utils/textDiff.ts +100 -100
  359. package/src/utils/units.ts +421 -421
  360. package/src/utils/validation.ts +553 -542
  361. package/src/utils/xmlSanitization.ts +179 -182
  362. package/src/validation/RevisionAutoFixer.ts +541 -542
  363. package/src/validation/RevisionValidator.ts +470 -460
  364. package/src/validation/ValidationRules.ts +338 -338
  365. package/src/validation/index.ts +30 -30
  366. package/src/xml/XMLBuilder.ts +857 -871
  367. package/src/xml/XMLParser.ts +877 -919
  368. package/src/zip/ZipHandler.ts +629 -637
  369. package/src/zip/ZipReader.ts +295 -299
  370. package/src/zip/ZipWriter.ts +374 -390
  371. package/src/zip/types.ts +116 -116
@@ -1,919 +1,877 @@
1
- /**
2
- * XMLParser - Simple position-based XML parser
3
- * Avoids regex backtracking issues that can cause ReDoS attacks
4
- * Completes the DocXML framework (XMLBuilder + XMLParser)
5
- */
6
-
7
- import { getGlobalLogger, createScopedLogger, ILogger } from "../utils/logger";
8
- import { XMLBuilder } from "./XMLBuilder";
9
-
10
- // Create scoped logger for XMLParser operations
11
- function getLogger(): ILogger {
12
- return createScopedLogger(getGlobalLogger(), 'XMLParser');
13
- }
14
-
15
- /**
16
- * Default maximum nesting depth for XML parsing.
17
- * Prevents stack overflow on deeply nested documents.
18
- */
19
- export const DEFAULT_MAX_NESTING_DEPTH = 256;
20
-
21
- /**
22
- * Options for XML-to-object parsing
23
- */
24
- export interface ParseToObjectOptions {
25
- /** Ignore attributes (default: false) */
26
- ignoreAttributes?: boolean;
27
-
28
- /** Attribute name prefix (default: '@_') */
29
- attributeNamePrefix?: string;
30
-
31
- /** Text node property name (default: '#text') */
32
- textNodeName?: string;
33
-
34
- /** Remove namespace prefixes from element names (default: false) */
35
- ignoreNamespace?: boolean;
36
-
37
- /** Parse numeric attribute values (default: true) */
38
- parseAttributeValue?: boolean;
39
-
40
- /** Trim whitespace from text values (default: true) */
41
- trimValues?: boolean;
42
-
43
- /** Always return arrays for elements (default: false) */
44
- alwaysArray?: boolean;
45
-
46
- /** Maximum nesting depth (default: 256). Prevents stack overflow on deeply nested documents. */
47
- maxNestingDepth?: number;
48
- }
49
-
50
- /**
51
- * Parsed XML object structure
52
- * Can be a string, object, array, or nested structure
53
- */
54
- export type ParsedXMLValue =
55
- | string
56
- | number
57
- | boolean
58
- | ParsedXMLObject
59
- | ParsedXMLObject[]
60
- | null
61
- | undefined;
62
-
63
- /**
64
- * Parsed XML object with dynamic keys
65
- */
66
- export interface ParsedXMLObject {
67
- [key: string]: ParsedXMLValue;
68
- }
69
-
70
- /**
71
- * Internal structure for tracking parsed elements during parsing
72
- */
73
- interface ParsedElement {
74
- name: string;
75
- value: ParsedXMLValue;
76
- }
77
-
78
- /**
79
- * Simple XML parser using position-based parsing instead of regex
80
- * Prevents catastrophic backtracking (ReDoS attacks) by avoiding nested regex patterns
81
- */
82
- export class XMLParser {
83
- /**
84
- * Extracts the body content from a Word document XML
85
- * @param docXml - The complete document.xml content
86
- * @returns The body content, or empty string if not found
87
- */
88
- static extractBody(docXml: string): string {
89
- const startTag = "<w:body";
90
- const endTag = "</w:body>";
91
-
92
- const startIdx = docXml.indexOf(startTag);
93
- if (startIdx === -1) return "";
94
-
95
- // Find the closing > of opening tag
96
- const openEnd = docXml.indexOf(">", startIdx);
97
- if (openEnd === -1) return "";
98
-
99
- // Find matching closing tag
100
- const endIdx = docXml.indexOf(endTag, openEnd);
101
- if (endIdx === -1) return "";
102
-
103
- return docXml.substring(openEnd + 1, endIdx);
104
- }
105
-
106
- /**
107
- * Extracts all elements of a given type using position-based parsing
108
- * Handles nested tags correctly by tracking depth
109
- * @param xml - XML content to parse
110
- * @param tagName - Tag name to extract (e.g., 'w:p', 'w:r')
111
- * @returns Array of XML strings for each element
112
- */
113
- static extractElements(xml: string, tagName: string): string[] {
114
- const elements: string[] = [];
115
- const openTag = `<${tagName}`;
116
- const closeTag = `</${tagName}>`;
117
- const selfClosingEnd = "/>";
118
-
119
- let pos = 0;
120
- while (pos < xml.length) {
121
- const startIdx = xml.indexOf(openTag, pos);
122
- if (startIdx === -1) break;
123
-
124
- // Verify this is the exact tag (not a prefix match like <w:p matching <w:pPr>)
125
- // The character after the tag name must be either '>', '/', whitespace, or '=' (for attributes)
126
- const charAfterTag = xml[startIdx + openTag.length];
127
- if (
128
- charAfterTag &&
129
- charAfterTag !== ">" &&
130
- charAfterTag !== "/" &&
131
- charAfterTag !== " " &&
132
- charAfterTag !== "\t" &&
133
- charAfterTag !== "\n" &&
134
- charAfterTag !== "\r" &&
135
- charAfterTag !== "="
136
- ) {
137
- // This is a prefix match (e.g., <w:pPr> when looking for <w:p>), skip it (Issue #5)
138
- pos = startIdx + openTag.length;
139
- continue;
140
- }
141
-
142
- // Find the end of opening tag
143
- const openEnd = xml.indexOf(">", startIdx);
144
- if (openEnd === -1) break;
145
-
146
- // Check if self-closing
147
- if (xml.substring(openEnd - 1, openEnd + 1) === selfClosingEnd) {
148
- elements.push(xml.substring(startIdx, openEnd + 1));
149
- pos = openEnd + 1;
150
- continue;
151
- }
152
-
153
- // Find matching closing tag (handle nesting)
154
- let depth = 1;
155
- let searchPos = openEnd + 1;
156
-
157
- while (depth > 0 && searchPos < xml.length) {
158
- // Find next potential opening tag
159
- let nextOpen = -1;
160
- let openSearchPos = searchPos;
161
- while (true) {
162
- const candidateOpen = xml.indexOf(openTag, openSearchPos);
163
- if (candidateOpen === -1) {
164
- break;
165
- }
166
- // Verify it's an exact match (not a prefix)
167
- const charAfter = xml[candidateOpen + openTag.length];
168
- if (
169
- charAfter &&
170
- charAfter !== ">" &&
171
- charAfter !== "/" &&
172
- charAfter !== " " &&
173
- charAfter !== "\t" &&
174
- charAfter !== "\n" &&
175
- charAfter !== "\r"
176
- ) {
177
- // Prefix match, keep searching
178
- openSearchPos = candidateOpen + openTag.length;
179
- continue;
180
- }
181
- nextOpen = candidateOpen;
182
- break;
183
- }
184
-
185
- const nextClose = xml.indexOf(closeTag, searchPos);
186
-
187
- if (nextClose === -1) break;
188
-
189
- if (nextOpen !== -1 && nextOpen < nextClose) {
190
- depth++;
191
- searchPos = nextOpen + openTag.length;
192
- } else {
193
- depth--;
194
- if (depth === 0) {
195
- elements.push(xml.substring(startIdx, nextClose + closeTag.length));
196
- pos = nextClose + closeTag.length;
197
- } else {
198
- searchPos = nextClose + closeTag.length;
199
- }
200
- }
201
- }
202
-
203
- if (depth > 0) {
204
- // Unclosed tag - skip it
205
- pos = startIdx + openTag.length;
206
- }
207
- }
208
-
209
- return elements;
210
- }
211
-
212
- /**
213
- * Extracts attribute value from an XML string
214
- * @param xml - XML content
215
- * @param attributeName - Attribute name (e.g., 'w:val')
216
- * @returns Attribute value or undefined
217
- */
218
- static extractAttribute(
219
- xml: string,
220
- attributeName: string
221
- ): string | undefined {
222
- // Use simple indexOf for bounded string search (safe)
223
- const attrPattern = `${attributeName}="`;
224
- const startIdx = xml.indexOf(attrPattern);
225
- if (startIdx === -1) return undefined;
226
-
227
- const valueStart = startIdx + attrPattern.length;
228
- const valueEnd = xml.indexOf('"', valueStart);
229
- if (valueEnd === -1) return undefined;
230
-
231
- const rawValue = xml.substring(valueStart, valueEnd);
232
- // Unescape XML entities to get the actual value
233
- // This prevents double-escaping when the value is later re-serialized
234
- return XMLBuilder.unescapeXml(rawValue);
235
- }
236
-
237
- /**
238
- * Checks if an XML string contains a self-closing tag
239
- * @param xml - XML content
240
- * @param tagName - Tag name to check
241
- * @returns True if the tag exists as self-closing
242
- */
243
- static hasSelfClosingTag(xml: string, tagName: string): boolean {
244
- return xml.includes(`<${tagName}/>`) || xml.includes(`<${tagName} `);
245
- }
246
-
247
- /**
248
- * Checks if a boolean property tag is enabled (w:val="1" or w:val="true")
249
- * Per ECMA-376, boolean properties can be:
250
- * - Present with w:val="1" or w:val="true" (enabled)
251
- * - Present with w:val="0" or w:val="false" (explicitly disabled)
252
- * - Absent (disabled by default)
253
- *
254
- * @param xml - XML content to search
255
- * @param tagName - Tag name (e.g., 'w:keepNext')
256
- * @returns True if tag exists with w:val="1" or w:val="true", false otherwise
257
- *
258
- * @example
259
- * hasBooleanProperty('<w:pPr><w:keepNext w:val="1"/></w:pPr>', 'w:keepNext'); // true
260
- * hasBooleanProperty('<w:pPr><w:keepNext w:val="0"/></w:pPr>', 'w:keepNext'); // false
261
- * hasBooleanProperty('<w:pPr><w:spacing/></w:pPr>', 'w:keepNext'); // false
262
- */
263
- static hasBooleanProperty(xml: string, tagName: string): boolean {
264
- // Check for tag with w:val="1" or w:val="true"
265
- if (
266
- xml.includes(`<${tagName} w:val="1"`) ||
267
- xml.includes(`<${tagName} w:val="true"`)
268
- ) {
269
- return true;
270
- }
271
-
272
- // Check for self-closing tag without w:val attribute (means true per ECMA-376)
273
- // Example: <w:b/> means bold=true
274
- if (xml.includes(`<${tagName}/>`)) {
275
- return true;
276
- }
277
-
278
- return false;
279
- }
280
-
281
- /**
282
- * Extracts text content from within tags
283
- * Finds all <w:t>...</w:t> tags and extracts their text
284
- * @param xml - XML content
285
- * @returns Combined text content
286
- */
287
- static extractText(xml: string): string {
288
- const texts: string[] = [];
289
- const openTag = "<w:t";
290
- const closeTag = "</w:t>";
291
-
292
- let pos = 0;
293
- while (pos < xml.length) {
294
- const startIdx = xml.indexOf(openTag, pos);
295
- if (startIdx === -1) break;
296
-
297
- // Find the end of opening tag
298
- const openEnd = xml.indexOf(">", startIdx);
299
- if (openEnd === -1) break;
300
-
301
- // Find closing tag
302
- const closeIdx = xml.indexOf(closeTag, openEnd);
303
- if (closeIdx === -1) break;
304
-
305
- // Extract text between tags
306
- const text = xml.substring(openEnd + 1, closeIdx);
307
- texts.push(text);
308
-
309
- pos = closeIdx + closeTag.length;
310
- }
311
-
312
- return texts.join("");
313
- }
314
-
315
- /**
316
- * Validates input size to prevent excessive memory usage
317
- * @param xml - XML content
318
- * @param maxSize - Maximum size in bytes (default: 10MB)
319
- * @throws Error if XML exceeds max size
320
- */
321
- static validateSize(xml: string, maxSize: number = 10 * 1024 * 1024): void {
322
- if (xml.length > maxSize) {
323
- throw new Error(
324
- `XML content too large for parsing (${(
325
- xml.length /
326
- 1024 /
327
- 1024
328
- ).toFixed(1)}MB). ` +
329
- `Maximum allowed: ${(maxSize / 1024 / 1024).toFixed(0)}MB`
330
- );
331
- }
332
- }
333
-
334
- /**
335
- * Extracts content between two specific tags
336
- * More efficient than regex for large documents
337
- * @param xml - XML content
338
- * @param startTag - Opening tag (e.g., '<w:pPr')
339
- * @param endTag - Closing tag (e.g., '</w:pPr>')
340
- * @returns Content between tags, or undefined if not found
341
- */
342
- static extractBetweenTags(
343
- xml: string,
344
- startTag: string,
345
- endTag: string
346
- ): string | undefined {
347
- const startIdx = xml.indexOf(startTag);
348
- if (startIdx === -1) return undefined;
349
-
350
- // Find the end of the opening tag
351
- const openEnd = xml.indexOf(">", startIdx);
352
- if (openEnd === -1) return undefined;
353
-
354
- // Find the closing tag
355
- const endIdx = xml.indexOf(endTag, openEnd);
356
- if (endIdx === -1) return undefined;
357
-
358
- return xml.substring(openEnd + 1, endIdx);
359
- }
360
-
361
/**
 * Extracts the attribute portion of the first tag whose name is exactly
 * `tagName`. Tags that merely start with the name are skipped
 * (e.g., <w:szCs .../> when looking for "w:sz").
 *
 * The end of the tag is the first '>' after the tag name. Earlier code looked
 * for the next "/>" anywhere in the string, which for a non-self-closing tag
 * returned everything up to a later element's "/>".
 *
 * @param xml - XML string to search
 * @param tagName - Tag name to find (e.g., "w:color", "w:sz")
 * @returns The text between the tag name and the closing "/>" or ">"
 *          (attributes portion), or undefined if the tag is not found or
 *          is not terminated
 *
 * @example
 * const xml = '<w:sz w:val="36"/><w:color w:val="FF0000"/>';
 * const colorTag = XMLParser.extractSelfClosingTag(xml, 'w:color');
 * // Returns: ' w:val="FF0000"'
 */
static extractSelfClosingTag(
  xml: string,
  tagName: string
): string | undefined {
  const startPattern = `<${tagName}`;
  let searchPos = 0;

  while (true) {
    const startIdx = xml.indexOf(startPattern, searchPos);
    if (startIdx === -1) return undefined;

    const attrsStart = startIdx + startPattern.length;
    const charAfterTag = xml[attrsStart];

    // The name is complete only if followed by whitespace, '/' or '>'.
    if (
      charAfterTag === ' ' ||
      charAfterTag === '/' ||
      charAfterTag === '>' ||
      charAfterTag === '\t' ||
      charAfterTag === '\n' ||
      charAfterTag === '\r'
    ) {
      const tagEnd = xml.indexOf('>', attrsStart);
      if (tagEnd === -1) return undefined;

      // Drop the '/' of a self-closing tag; a plain opening tag ends at '>'.
      const isSelfClosing = tagEnd > attrsStart && xml[tagEnd - 1] === '/';
      return xml.substring(attrsStart, isSelfClosing ? tagEnd - 1 : tagEnd);
    }

    // Prefix match only (e.g., found "w:szCs" when looking for "w:sz").
    searchPos = startIdx + 1;
  }
}
411
-
412
/**
 * Converts an XML string into a nested JavaScript object.
 * The output shape follows the fast-xml-parser conventions
 * (prefixed attribute keys, repeated elements collected into arrays).
 *
 * @param xml - XML string to parse
 * @param options - Parsing options; unset (or null) fields fall back to defaults
 * @returns Parsed object keyed by the root element name, or {} for empty input
 * @throws Error if the input exceeds the default size limit or the nesting limit
 *
 * @example
 * const xml = '<Relationships><Relationship Id="rId1" Target="https://example.com"/></Relationships>';
 * const obj = XMLParser.parseToObject(xml);
 * // Returns: { Relationships: { Relationship: { '@_Id': 'rId1', '@_Target': 'https://example.com' } } }
 *
 * @example
 * // Multiple elements become arrays
 * const xml = '<Items><Item id="1"/><Item id="2"/></Items>';
 * const obj = XMLParser.parseToObject(xml);
 * // Returns: { Items: { Item: [{ '@_id': '1' }, { '@_id': '2' }] } }
 */
static parseToObject(
  xml: string,
  options?: ParseToObjectOptions
): ParsedXMLObject {
  const log = getLogger();
  log.debug('Parsing XML to object', { xmlSize: xml.length });

  const requested = options ?? {};
  const resolved: Required<ParseToObjectOptions> = {
    ignoreAttributes: requested.ignoreAttributes ?? false,
    attributeNamePrefix: requested.attributeNamePrefix ?? '@_',
    textNodeName: requested.textNodeName ?? '#text',
    ignoreNamespace: requested.ignoreNamespace ?? false,
    parseAttributeValue: requested.parseAttributeValue ?? true,
    trimValues: requested.trimValues ?? true,
    alwaysArray: requested.alwaysArray ?? false,
    maxNestingDepth: requested.maxNestingDepth ?? DEFAULT_MAX_NESTING_DEPTH,
  };

  // The size check runs on the raw input, before the declaration is stripped.
  XMLParser.validateSize(xml);

  const body = xml.replace(/<\?xml[^>]*\?>\s*/g, '').trim();
  if (body.length === 0) {
    return {};
  }

  // Root element is parsed at depth 0.
  const { value } = XMLParser.parseElementToObject(body, 0, resolved, 0);
  log.debug('XML parsed to object');
  return value as ParsedXMLObject;
}
465
-
466
/**
 * Parses the first element found at or after `startPos` and returns it as
 * `{ [elementName]: value }` together with the position just past it.
 *
 * Value shape:
 * - self-closing or unterminated element: object holding its attributes
 * - text only, no attributes: the (optionally trimmed) text itself
 * - text plus attributes: attributes plus `options.textNodeName`
 * - children: attributes merged with the coalesced children. Text is kept
 *   only if the element also has attributes; text of an attribute-less
 *   element with children is discarded.
 *
 * Comments are skipped. Anything starting with '<' that is not followed by a
 * name character yields an empty value and advances one character.
 *
 * @throws Error when `depth` exceeds `options.maxNestingDepth`
 * @private
 */
private static parseElementToObject(
  xml: string,
  startPos: number,
  options: Required<ParseToObjectOptions>,
  depth: number
): { value: ParsedXMLValue; endPos: number } {
  if (depth > options.maxNestingDepth) {
    throw new Error(
      `XML nesting depth exceeds maximum of ${options.maxNestingDepth}. ` +
        `This may indicate malformed XML or an attack attempt. ` +
        `Use the maxNestingDepth option to increase the limit if needed.`
    );
  }

  // Result used whenever the rest of the input cannot be parsed.
  const consumedAll = (): { value: ParsedXMLValue; endPos: number } => ({
    value: {},
    endPos: xml.length,
  });

  const ltPos = xml.indexOf('<', startPos);
  if (ltPos === -1) {
    return consumedAll();
  }

  // Comment: continue with whatever follows it, at the same depth.
  if (xml.startsWith('<!--', ltPos)) {
    const commentClose = xml.indexOf('-->', ltPos + 4);
    return commentClose === -1
      ? consumedAll()
      : XMLParser.parseElementToObject(xml, commentClose + 3, options, depth);
  }

  const nameMatch = /^([a-zA-Z0-9:_-]+)/.exec(xml.substring(ltPos + 1));
  if (!nameMatch) {
    return { value: {}, endPos: ltPos + 1 };
  }

  // The qualified name drives all offset maths and closing-tag matching;
  // the output key may have its namespace prefix removed.
  const qualifiedName: string = nameMatch[1] || '';
  const headerClose = xml.indexOf('>', ltPos);
  if (headerClose === -1) {
    return consumedAll();
  }

  let outputName: string = qualifiedName;
  if (options.ignoreNamespace && qualifiedName.includes(':')) {
    outputName = qualifiedName.split(':')[1] || qualifiedName;
  }

  const header = xml.substring(ltPos + 1 + qualifiedName.length, headerClose);
  const attributes = XMLParser.extractAttributesFromTag(header, options);

  const selfClosing = header.trim().endsWith('/') || xml[headerClose - 1] === '/';
  if (selfClosing) {
    const attributesOnly: ParsedXMLObject = { ...attributes };
    return { value: { [outputName]: attributesOnly }, endPos: headerClose + 1 };
  }

  const bodyStart = headerClose + 1;
  const closePos = XMLParser.findClosingTag(xml, qualifiedName, bodyStart);
  if (closePos === -1) {
    // No closing tag: treat like a self-closing element.
    return { value: { [outputName]: { ...attributes } }, endPos: headerClose + 1 };
  }
  const closeTagLength = `</${qualifiedName}>`.length;

  const body = xml.substring(bodyStart, closePos);
  const children: ParsedElement[] = [];
  let textContent = '';

  // Keeps a text chunk unless trimming is on and it is whitespace-only;
  // XML entities are decoded as the chunk is collected.
  const collectText = (chunk: string): void => {
    if (chunk.length > 0 && (!options.trimValues || chunk.trim())) {
      textContent += XMLBuilder.unescapeXml(chunk);
    }
  };

  let cursor = 0;
  while (cursor < body.length) {
    const nextLt = body.indexOf('<', cursor);

    if (nextLt === -1) {
      collectText(body.substring(cursor));
      break;
    }
    if (nextLt > cursor) {
      collectText(body.substring(cursor, nextLt));
    }

    const parsedChild = XMLParser.parseElementToObject(body, nextLt, options, depth + 1);
    const childObject = parsedChild.value as ParsedXMLObject;
    const [childName] = Object.keys(childObject);
    if (childName) {
      children.push({ name: childName, value: childObject[childName] });
    }

    cursor = parsedChild.endPos;
  }

  // Start from the attributes (if any), then layer text and children on top.
  const base: ParsedXMLObject =
    !options.ignoreAttributes && Object.keys(attributes).length > 0 ? { ...attributes } : {};
  let elementValue: ParsedXMLValue = base;

  if (textContent.length > 0 && (!options.trimValues || textContent.trim())) {
    const text = options.trimValues ? textContent.trim() : textContent;
    if (Object.keys(base).length === 0) {
      elementValue = text;
    } else {
      base[options.textNodeName] = text;
    }
  }

  if (children.length > 0) {
    const grouped = XMLParser.coalesceChildren(children, options);
    // A bare text value is replaced by the children; an object is merged.
    elementValue = typeof elementValue === 'string' ? grouped : { ...base, ...grouped };
  }

  return {
    value: { [outputName]: elementValue },
    endPos: closePos + closeTagLength,
  };
}
660
-
661
/**
 * Reads `name="value"` / `name='value'` pairs from the part of a tag that
 * follows the element name.
 *
 * Parsing stops at a '/' or at the first character that cannot start an
 * attribute name. Values are taken verbatim (XML entities are not decoded
 * here) and, when `parseAttributeValue` is set, passed through `parseValue`.
 * An attribute without a quoted value is stored as an empty string.
 *
 * @returns Map of prefixed attribute names to values; empty when
 *          `ignoreAttributes` is set
 * @private
 */
private static extractAttributesFromTag(
  tagHeader: string,
  options: Required<ParseToObjectOptions>
): Record<string, string | number | boolean> {
  const attributes: Record<string, string | number | boolean> = {};
  if (options.ignoreAttributes) {
    return attributes;
  }

  const WHITESPACE = /\s/;
  const NAME_CHAR = /[a-zA-Z0-9:_-]/;
  const SEPARATOR = /[\s=]/;
  const length = tagHeader.length;

  // Returns the first index at or after `from` whose character fails `pattern`.
  const advanceWhile = (from: number, pattern: RegExp): number => {
    let index = from;
    while (index < length && pattern.test(tagHeader.charAt(index))) {
      index++;
    }
    return index;
  };

  let cursor = 0;
  while (cursor < length) {
    cursor = advanceWhile(cursor, WHITESPACE);
    if (cursor >= length || tagHeader[cursor] === '/') {
      break;
    }

    const nameStart = cursor;
    cursor = advanceWhile(cursor, NAME_CHAR);
    if (cursor === nameStart) {
      break; // not a name character: give up on the rest of the header
    }
    const rawName = tagHeader.substring(nameStart, cursor);

    // Skip '=' together with any whitespace around it.
    cursor = advanceWhile(cursor, SEPARATOR);

    let rawValue = '';
    const quote = tagHeader[cursor];
    if (cursor < length && (quote === '"' || quote === "'")) {
      const closing = tagHeader.indexOf(quote, cursor + 1);
      // An unterminated value runs to the end of the header.
      const valueEnd = closing === -1 ? length : closing;
      rawValue = tagHeader.substring(cursor + 1, valueEnd);
      cursor = valueEnd + 1;
    }

    const localName =
      options.ignoreNamespace && rawName.includes(':')
        ? (rawName.split(':')[1] || rawName)
        : rawName;

    attributes[options.attributeNamePrefix + localName] = options.parseAttributeValue
      ? XMLParser.parseValue(rawValue)
      : rawValue;
  }

  return attributes;
}
753
-
754
/**
 * Locates the closing tag that matches an already-opened element, counting
 * nested elements of the same name.
 *
 * Prefix matches (<w:pPrChange for <w:pPr) and self-closing tags of the same
 * name do not affect the nesting count.
 *
 * @param xml - XML being scanned
 * @param elementName - Qualified element name
 * @param startPos - Position just after the element's opening tag
 * @returns Index of the matching `</elementName>`, or -1 if there is none
 * @private
 */
private static findClosingTag(
  xml: string,
  elementName: string,
  startPos: number
): number {
  const opener = `<${elementName}`;
  const closer = `</${elementName}>`;

  // True when `ch` can legally follow a complete tag name.
  const endsTagName = (ch: string | undefined): boolean =>
    ch === '>' || ch === ' ' || ch === '/' || ch === '\t' || ch === '\n' || ch === '\r';

  // First genuine, non-self-closing opening tag inside [from, limit), or -1.
  const firstNestedOpen = (from: number, limit: number): number => {
    let scan = from;
    while (scan < limit) {
      const hit = xml.indexOf(opener, scan);
      if (hit === -1 || hit >= limit) {
        return -1;
      }
      if (!endsTagName(xml[hit + opener.length])) {
        scan = hit + opener.length; // longer name sharing the prefix
        continue;
      }
      const gt = xml.indexOf('>', hit);
      if (gt !== -1 && xml[gt - 1] === '/') {
        scan = gt + 1; // self-closing: no matching close to wait for
        continue;
      }
      return hit;
    }
    return -1;
  };

  let openCount = 1;
  let cursor = startPos;

  while (openCount > 0 && cursor < xml.length) {
    const closeAt = xml.indexOf(closer, cursor);
    if (closeAt === -1) {
      return -1;
    }

    const nestedAt = firstNestedOpen(cursor, closeAt);
    if (nestedAt !== -1) {
      // A nested element opens before this close; it needs its own close.
      openCount += 1;
      cursor = nestedAt + opener.length;
      continue;
    }

    openCount -= 1;
    if (openCount === 0) {
      return closeAt;
    }
    cursor = closeAt + closer.length;
  }

  return -1;
}
828
-
829
/**
 * Builds the object for an element's children: a name that occurs once maps
 * to its value, a repeated name (or every name when `alwaysArray` is set)
 * maps to an array of values in document order.
 *
 * `_orderedChildren` records the original sequence as `{ type, index }`
 * pairs, where `index` is the position within that name's occurrences, so
 * mixed content such as text -> tab -> text can be replayed in order.
 *
 * Bookkeeping uses Maps rather than plain objects so element names that
 * coincide with Object.prototype members ("constructor", "toString", ...)
 * are counted and stored like any other name.
 *
 * @private
 */
private static coalesceChildren(
  children: ParsedElement[],
  options: Required<ParseToObjectOptions>
): ParsedXMLObject {
  const result: ParsedXMLObject = {};

  // Total occurrences per name decide whether a name becomes an array.
  const totals = new Map<string, number>();
  for (const { name } of children) {
    totals.set(name, (totals.get(name) ?? 0) + 1);
  }

  // Always creates an own property, even for "__proto__".
  const assign = (key: string, value: ParsedXMLValue): void => {
    if (key === '__proto__') {
      Object.defineProperty(result, key, {
        value,
        enumerable: true,
        writable: true,
        configurable: true,
      });
    } else {
      result[key] = value;
    }
  };

  const buckets = new Map<string, ParsedXMLValue[]>();
  const seen = new Map<string, number>();
  const orderedChildren: { type: string; index: number }[] = [];

  for (const child of children) {
    const index = seen.get(child.name) ?? 0;
    seen.set(child.name, index + 1);
    orderedChildren.push({ type: child.name, index });

    const asArray = options.alwaysArray || (totals.get(child.name) ?? 0) > 1;
    if (!asArray) {
      assign(child.name, child.value);
      continue;
    }

    let bucket = buckets.get(child.name);
    if (bucket === undefined) {
      bucket = [];
      buckets.set(child.name, bucket);
      assign(child.name, bucket as ParsedXMLValue);
    }
    bucket.push(child.value);
  }

  // Used by DocumentParser to restore run content order.
  if (orderedChildren.length > 0) {
    result._orderedChildren = orderedChildren;
  }

  return result;
}
877
-
878
/**
 * Converts an attribute string to a boolean or number when it looks like
 * one; otherwise returns a string.
 *
 * Rules, applied in this order:
 * 1. "true" / "false" become booleans
 * 2. exactly six hex digits (colour codes, including all-digit ones such as
 *    "000000") stay a string, upper-cased
 * 3. seven or more digits stay a string so leading zeros survive
 *    (e.g. cnfStyle bit strings)
 * 4. integers and simple decimals become numbers
 * 5. three hex digits containing at least one letter stay a string, upper-cased
 * 6. anything else is returned unchanged
 *
 * @private
 */
private static parseValue(value: string): string | number | boolean {
  switch (value) {
    case 'true':
      return true;
    case 'false':
      return false;
    default:
      break;
  }

  const HEX_6 = /^[0-9A-Fa-f]{6}$/;
  const LONG_DIGITS = /^\d{7,}$/;
  const INTEGER = /^-?\d+$/;
  const DECIMAL = /^-?\d+\.\d+$/;
  const HEX_3 = /^[0-9A-Fa-f]{3}$/;
  const HEX_LETTER = /[A-Fa-f]/;

  if (HEX_6.test(value)) {
    return value.toUpperCase();
  }

  if (LONG_DIGITS.test(value)) {
    return value;
  }

  if (INTEGER.test(value)) {
    const asInt = parseInt(value, 10);
    if (!isNaN(asInt)) {
      return asInt;
    }
  }

  if (DECIMAL.test(value)) {
    const asFloat = parseFloat(value);
    if (!isNaN(asFloat)) {
      return asFloat;
    }
  }

  // All-digit three-character values were already returned as numbers above.
  const isShortHex = HEX_3.test(value) && HEX_LETTER.test(value);
  return isShortHex ? value.toUpperCase() : value;
}
919
- }
1
+ /**
2
+ * XMLParser - Simple position-based XML parser
3
+ * Avoids regex backtracking issues that can cause ReDoS attacks
4
+ * Completes the DocXML framework (XMLBuilder + XMLParser)
5
+ */
6
+
7
+ import { getGlobalLogger, createScopedLogger, ILogger } from '../utils/logger';
8
+ import { XMLBuilder } from './XMLBuilder';
9
+
10
// Returns a logger derived from the global logger and scoped to 'XMLParser'.
function getLogger(): ILogger {
  const root = getGlobalLogger();
  return createScopedLogger(root, 'XMLParser');
}
14
+
15
/**
 * Nesting limit used when `ParseToObjectOptions.maxNestingDepth` is not given.
 * Bounding the depth keeps the recursive parser from overflowing the call
 * stack on deeply nested documents.
 */
export const DEFAULT_MAX_NESTING_DEPTH = 256;
20
+
21
/**
 * Settings accepted by `XMLParser.parseToObject`.
 * Every field is optional; the default noted on each field applies when it is omitted.
 */
export interface ParseToObjectOptions {
  /** Skip attributes entirely. Default: false. */
  ignoreAttributes?: boolean;

  /** Text prepended to attribute names in the output. Default: '@_'. */
  attributeNamePrefix?: string;

  /** Key under which an element's text is stored when it also has attributes. Default: '#text'. */
  textNodeName?: string;

  /** Strip namespace prefixes (the part before ':') from names. Default: false. */
  ignoreNamespace?: boolean;

  /** Convert attribute values that look like numbers or booleans. Default: true. */
  parseAttributeValue?: boolean;

  /** Trim text values and drop whitespace-only text. Default: true. */
  trimValues?: boolean;

  /** Wrap child elements in arrays even when a name occurs once. Default: false. */
  alwaysArray?: boolean;

  /** Deepest element nesting accepted before parsing throws. Default: 256. */
  maxNestingDepth?: number;
}
49
+
50
/**
 * Any value that can appear in parsed output: a scalar, a nested object,
 * an array of nested objects, or an absent value.
 */
export type ParsedXMLValue =
  | ParsedXMLObject
  | ParsedXMLObject[]
  | string
  | number
  | boolean
  | null
  | undefined;

/**
 * Parsed element content keyed by child element name, prefixed attribute
 * name, or text-node name.
 */
export interface ParsedXMLObject {
  [key: string]: ParsedXMLValue;
}

/**
 * Name/value pair for one child element, held while its parent is being assembled.
 */
interface ParsedElement {
  name: string;
  value: ParsedXMLValue;
}
77
+
78
+ /**
79
+ * Simple XML parser using position-based parsing instead of regex
80
+ * Prevents catastrophic backtracking (ReDoS attacks) by avoiding nested regex patterns
81
+ */
82
+ export class XMLParser {
83
/**
 * Returns everything between the first `<w:body ...>` opening tag and the
 * first `</w:body>` that follows it.
 *
 * @param docXml - The complete document.xml content
 * @returns The body's inner XML, or an empty string if either tag is missing
 */
static extractBody(docXml: string): string {
  const bodyOpen = docXml.indexOf('<w:body');
  // Each lookup is attempted only when the previous one found something.
  const headerEnd = bodyOpen < 0 ? -1 : docXml.indexOf('>', bodyOpen);
  const bodyClose = headerEnd < 0 ? -1 : docXml.indexOf('</w:body>', headerEnd);

  return bodyClose < 0 ? '' : docXml.substring(headerEnd + 1, bodyClose);
}
105
+
106
/**
 * Extracts every outermost element named `tagName`, in document order,
 * using position-based scanning.
 *
 * Nesting of same-named elements is tracked by depth. Tags that only share
 * the prefix (<w:pPr> when looking for <w:p>) are ignored, and nested
 * self-closing tags of the same name (e.g. the <w:rPr/> inside
 * <w:rPrChange>) do not raise the depth because they have no closing tag.
 * An element whose closing tag cannot be found is skipped.
 *
 * @param xml - XML content to parse
 * @param tagName - Tag name to extract (e.g., 'w:p', 'w:r')
 * @returns Array of XML strings, one per matched element
 */
static extractElements(xml: string, tagName: string): string[] {
  const elements: string[] = [];
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;

  // True when `ch` can follow a complete tag name.
  const endsTagName = (ch: string | undefined): boolean =>
    ch === '>' || ch === '/' || ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r';

  let pos = 0;
  while (pos < xml.length) {
    const startIdx = xml.indexOf(openTag, pos);
    if (startIdx === -1) break;

    // Skip prefix matches such as <w:pPr> when looking for <w:p> (Issue #5).
    // '=' is tolerated here to keep the long-standing top-level behaviour.
    const charAfterTag = xml[startIdx + openTag.length];
    if (charAfterTag && charAfterTag !== '=' && !endsTagName(charAfterTag)) {
      pos = startIdx + openTag.length;
      continue;
    }

    const openEnd = xml.indexOf('>', startIdx);
    if (openEnd === -1) break;

    // Self-closing element: the tag itself is the whole element.
    if (xml[openEnd - 1] === '/') {
      elements.push(xml.substring(startIdx, openEnd + 1));
      pos = openEnd + 1;
      continue;
    }

    // Walk forward until the closing tag that balances this opening tag.
    let depth = 1;
    let searchPos = openEnd + 1;

    while (depth > 0 && searchPos < xml.length) {
      const nextClose = xml.indexOf(closeTag, searchPos);
      if (nextClose === -1) break;

      // Look for a real nested opening tag before that closing tag.
      let nextOpen = -1;
      let scan = searchPos;
      while (scan < nextClose) {
        const candidate = xml.indexOf(openTag, scan);
        if (candidate === -1 || candidate >= nextClose) break;

        if (!endsTagName(xml[candidate + openTag.length])) {
          scan = candidate + openTag.length; // longer name sharing the prefix
          continue;
        }

        const candidateEnd = xml.indexOf('>', candidate);
        if (candidateEnd !== -1 && xml[candidateEnd - 1] === '/') {
          scan = candidateEnd + 1; // self-closing: nothing to balance
          continue;
        }

        nextOpen = candidate;
        break;
      }

      if (nextOpen !== -1) {
        depth++;
        searchPos = nextOpen + openTag.length;
      } else {
        depth--;
        if (depth === 0) {
          elements.push(xml.substring(startIdx, nextClose + closeTag.length));
          pos = nextClose + closeTag.length;
        } else {
          searchPos = nextClose + closeTag.length;
        }
      }
    }

    if (depth > 0) {
      // Unclosed tag - skip it and keep scanning after its name.
      pos = startIdx + openTag.length;
    }
  }

  return elements;
}
211
+
212
/**
 * Returns the value of the first attribute named exactly `attributeName`.
 *
 * A match only counts when the character before it is not an XML name
 * character, so looking up 'id' does not pick up 'r:id' or 'w:rsid'.
 * Both double- and single-quoted values are accepted.
 *
 * @param xml - XML content
 * @param attributeName - Attribute name (e.g., 'w:val')
 * @returns Attribute value with XML entities decoded, or undefined if the
 *          attribute is absent or its value is not terminated
 */
static extractAttribute(xml: string, attributeName: string): string | undefined {
  const needle = `${attributeName}=`;
  const NAME_CHAR = /[A-Za-z0-9_:.-]/;

  let from = 0;
  while (true) {
    const nameIdx = xml.indexOf(needle, from);
    if (nameIdx === -1) return undefined;
    from = nameIdx + 1;

    // Reject matches that are only the tail of a longer attribute name.
    if (nameIdx > 0 && NAME_CHAR.test(xml.charAt(nameIdx - 1))) {
      continue;
    }

    const quote = xml[nameIdx + needle.length];
    if (quote !== '"' && quote !== "'") {
      continue;
    }

    const valueStart = nameIdx + needle.length + 1;
    const valueEnd = xml.indexOf(quote, valueStart);
    if (valueEnd === -1) return undefined;

    // Decode entities so the value is not double-escaped when re-serialized.
    return XMLBuilder.unescapeXml(xml.substring(valueStart, valueEnd));
  }
}
233
+
234
/**
 * Checks whether the XML contains a self-closing tag named exactly `tagName`
 * (with or without attributes).
 *
 * A tag counts only when its header ends in "/>". Tags with attributes that
 * are closed separately (<w:b w:val="1">...</w:b>) and longer names sharing
 * the prefix (<w:bCs/>) do not count.
 *
 * @param xml - XML content
 * @param tagName - Tag name to check
 * @returns True if the tag exists as self-closing
 */
static hasSelfClosingTag(xml: string, tagName: string): boolean {
  const openTag = `<${tagName}`;

  let from = 0;
  while (true) {
    const start = xml.indexOf(openTag, from);
    if (start === -1) return false;

    const afterName = start + openTag.length;
    from = afterName;

    // A '>' directly after the name means a plain opening tag; any other
    // non-separator character means a longer tag name.
    const next = xml[afterName];
    const mayBeSelfClosing =
      next === '/' || next === ' ' || next === '\t' || next === '\n' || next === '\r';
    if (!mayBeSelfClosing) continue;

    const headerEnd = xml.indexOf('>', afterName);
    if (headerEnd === -1) return false;
    if (xml[headerEnd - 1] === '/') return true;
  }
}
243
+
244
/**
 * Checks whether an on/off property element is present and enabled.
 *
 * Per ECMA-376, a boolean property is:
 * - enabled when the element is present without a w:val attribute (<w:b/>)
 * - enabled when w:val is "1", "true" or "on"
 * - disabled when w:val is "0", "false" or "off"
 * - disabled when the element is absent
 *
 * The w:val attribute may appear anywhere in the tag and may use either
 * quote style. Every exact occurrence of the tag is inspected; the result
 * is true as soon as one of them is enabled.
 *
 * @param xml - XML content to search
 * @param tagName - Tag name (e.g., 'w:keepNext')
 * @returns True if an enabled occurrence of the tag exists, false otherwise
 *
 * @example
 * hasBooleanProperty('<w:pPr><w:keepNext w:val="1"/></w:pPr>', 'w:keepNext'); // true
 * hasBooleanProperty('<w:pPr><w:keepNext w:val="0"/></w:pPr>', 'w:keepNext'); // false
 * hasBooleanProperty('<w:pPr><w:spacing/></w:pPr>', 'w:keepNext'); // false
 */
static hasBooleanProperty(xml: string, tagName: string): boolean {
  const openTag = `<${tagName}`;
  // Linear pattern applied to a single tag header only.
  const VAL_ATTRIBUTE = /\sw:val\s*=\s*(?:"([^"]*)"|'([^']*)')/;

  let from = 0;
  while (true) {
    const start = xml.indexOf(openTag, from);
    if (start === -1) return false;

    const afterName = start + openTag.length;
    from = afterName;

    // Ignore longer tag names that merely share the prefix.
    const next = xml[afterName];
    const isExactTag =
      next === '>' ||
      next === '/' ||
      next === ' ' ||
      next === '\t' ||
      next === '\n' ||
      next === '\r';
    if (!isExactTag) continue;

    const headerEnd = xml.indexOf('>', afterName);
    if (headerEnd === -1) return false;

    const match = VAL_ATTRIBUTE.exec(xml.substring(afterName, headerEnd));
    if (!match) {
      return true; // present without w:val means enabled
    }

    const value = match[1] ?? match[2] ?? '';
    if (value === '1' || value === 'true' || value === 'on') {
      return true;
    }
  }
}
274
+
275
/**
 * Concatenates the content of every <w:t>...</w:t> element, in document order.
 *
 * Only tags named exactly `w:t` are considered; elements whose names merely
 * start with it (<w:tab/>, <w:tbl>, <w:tc>, <w:tr>, ...) are skipped, and an
 * empty <w:t/> contributes nothing. The text is returned as it appears in
 * the XML (entities are not decoded).
 *
 * @param xml - XML content
 * @returns Combined text content
 */
static extractText(xml: string): string {
  const texts: string[] = [];
  const openTag = '<w:t';
  const closeTag = '</w:t>';

  let pos = 0;
  while (pos < xml.length) {
    const startIdx = xml.indexOf(openTag, pos);
    if (startIdx === -1) break;

    // The name must end here; otherwise this is a different element.
    const afterName = startIdx + openTag.length;
    const next = xml[afterName];
    const isTextTag =
      next === '>' ||
      next === '/' ||
      next === ' ' ||
      next === '\t' ||
      next === '\n' ||
      next === '\r';
    if (!isTextTag) {
      pos = afterName;
      continue;
    }

    const openEnd = xml.indexOf('>', afterName);
    if (openEnd === -1) break;

    // <w:t/> has no content and no closing tag to look for.
    if (xml[openEnd - 1] === '/') {
      pos = openEnd + 1;
      continue;
    }

    const closeIdx = xml.indexOf(closeTag, openEnd);
    if (closeIdx === -1) break;

    texts.push(xml.substring(openEnd + 1, closeIdx));
    pos = closeIdx + closeTag.length;
  }

  return texts.join('');
}
308
+
309
/**
 * Throws when the XML is longer than the permitted maximum.
 *
 * Length is measured with `xml.length` (UTF-16 code units), which only
 * approximates the encoded size of the content.
 *
 * @param xml - XML content
 * @param maxSize - Maximum accepted length (default: 10 * 1024 * 1024)
 * @throws Error if `xml.length` exceeds `maxSize`
 */
static validateSize(xml: string, maxSize: number = 10 * 1024 * 1024): void {
  if (xml.length > maxSize) {
    // Formats a length as megabytes with the requested number of decimals.
    const describe = (length: number, digits: number): string =>
      `${(length / 1024 / 1024).toFixed(digits)}MB`;

    const message = [
      `XML content too large for parsing (${describe(xml.length, 1)}). `,
      `Maximum allowed: ${describe(maxSize, 0)}`,
    ].join('');

    throw new Error(message);
  }
}
323
+
324
/**
 * Extracts the content between the first `startTag` and the first `endTag`
 * found after it, using `indexOf` scans only.
 *
 * `startTag` is matched as a raw prefix ('<w:pPr' also matches
 * '<w:pPrChange') and nested elements of the same name are not balanced.
 *
 * @param xml - XML content
 * @param startTag - Opening tag prefix (e.g., '<w:pPr')
 * @param endTag - Closing tag (e.g., '</w:pPr>')
 * @returns Content between tags, or undefined if not found
 */
static extractBetweenTags(xml: string, startTag: string, endTag: string): string | undefined {
  const from = xml.indexOf(startTag);
  if (from >= 0) {
    // '>' that ends the opening tag, then the closing tag after it.
    const headerEnd = xml.indexOf('>', from);
    if (headerEnd >= 0) {
      const to = xml.indexOf(endTag, headerEnd);
      if (to >= 0) {
        return xml.substring(headerEnd + 1, to);
      }
    }
  }
  return undefined;
}
346
+
347
/**
 * Extracts the attribute portion of the first tag named exactly `tagName`.
 * Similar tags are told apart (e.g., <w:sz.../> and <w:szCs.../>).
 *
 * The tag ends at the first '>' after its name. Searching for the next "/>"
 * instead would, for a tag that is not self-closing, return text belonging
 * to a later element.
 *
 * @param xml - XML string to search
 * @param tagName - Tag name to find (e.g., "w:color", "w:sz")
 * @returns The text between the tag name and the closing "/>" or ">"
 *          (attributes portion), or undefined if the tag is not found or
 *          is not terminated
 *
 * @example
 * const xml = '<w:sz w:val="36"/><w:color w:val="FF0000"/>';
 * const colorTag = XMLParser.extractSelfClosingTag(xml, 'w:color');
 * // Returns: ' w:val="FF0000"'
 */
static extractSelfClosingTag(xml: string, tagName: string): string | undefined {
  const startPattern = `<${tagName}`;
  let searchPos = 0;

  // Search for the exact tag (not tags that start with this pattern)
  while (true) {
    const startIdx = xml.indexOf(startPattern, searchPos);
    if (startIdx === -1) return undefined;

    const attrsStart = startIdx + startPattern.length;
    const charAfterTag = xml[attrsStart];

    // Valid separators after the tag name: whitespace, '/', or '>'
    const isExactTag =
      charAfterTag === ' ' ||
      charAfterTag === '/' ||
      charAfterTag === '>' ||
      charAfterTag === '\t' ||
      charAfterTag === '\n' ||
      charAfterTag === '\r';

    if (isExactTag) {
      const tagEnd = xml.indexOf('>', attrsStart);
      if (tagEnd === -1) return undefined;

      // Exclude the '/' of "/>" from the returned attributes.
      const isSelfClosing = tagEnd > attrsStart && xml[tagEnd - 1] === '/';
      return xml.substring(attrsStart, isSelfClosing ? tagEnd - 1 : tagEnd);
    }

    // Not the exact tag (e.g., found "w:szCs" when looking for "w:sz")
    searchPos = startIdx + 1;
  }
}
394
+
395
/**
 * Parses an XML string into a JavaScript object whose layout follows the
 * fast-xml-parser output format.
 *
 * Steps: resolve option defaults, check the input size, strip any XML
 * declaration, then parse the root element.
 *
 * @param xml - XML string to parse
 * @param options - Parsing options
 * @returns Parsed JavaScript object, or {} when nothing remains after the
 *          declaration is removed
 * @throws Error if the input exceeds the default size limit or the nesting limit
 *
 * @example
 * const xml = '<Relationships><Relationship Id="rId1" Target="https://example.com"/></Relationships>';
 * const obj = XMLParser.parseToObject(xml);
 * // Returns: { Relationships: { Relationship: { '@_Id': 'rId1', '@_Target': 'https://example.com' } } }
 *
 * @example
 * // Multiple elements become arrays
 * const xml = '<Items><Item id="1"/><Item id="2"/></Items>';
 * const obj = XMLParser.parseToObject(xml);
 * // Returns: { Items: { Item: [{ '@_id': '1' }, { '@_id': '2' }] } }
 */
static parseToObject(xml: string, options?: ParseToObjectOptions): ParsedXMLObject {
  const logger = getLogger();
  logger.debug('Parsing XML to object', { xmlSize: xml.length });

  // Fill in a default for every option the caller left unset.
  const opts: Required<ParseToObjectOptions> = {
    ignoreAttributes: options?.ignoreAttributes ?? false,
    attributeNamePrefix: options?.attributeNamePrefix ?? '@_',
    textNodeName: options?.textNodeName ?? '#text',
    ignoreNamespace: options?.ignoreNamespace ?? false,
    parseAttributeValue: options?.parseAttributeValue ?? true,
    trimValues: options?.trimValues ?? true,
    alwaysArray: options?.alwaysArray ?? false,
    maxNestingDepth: options?.maxNestingDepth ?? DEFAULT_MAX_NESTING_DEPTH,
  };

  XMLParser.validateSize(xml);

  const withoutDeclaration = xml.replace(/<\?xml[^>]*\?>\s*/g, '').trim();
  if (withoutDeclaration === '') {
    return {};
  }

  // Root element starts at offset 0 and nesting depth 0.
  const root = XMLParser.parseElementToObject(withoutDeclaration, 0, opts, 0);
  logger.debug('XML parsed to object');
  return root.value as ParsedXMLObject;
}
445
+
446
/**
 * Parses the first XML element found at or after `startPos` into an object of
 * the form `{ [elementName]: value }`.
 *
 * `value` is the attribute map for self-closing or unclosed elements, a plain
 * string for text-only elements without attributes, or an object that merges
 * attributes, text (under `options.textNodeName`) and coalesced children.
 *
 * NOTE: when an element has child elements but no attributes, its own text
 * content is replaced by the children object (mixed-content text is dropped).
 *
 * @param xml - String being parsed
 * @param startPos - Index in `xml` from which to look for the opening `<`
 * @param options - Fully resolved parsing options
 * @param depth - Current nesting depth (0 for the root element)
 * @returns The parsed value plus the index just past the consumed element
 * @throws Error when `depth` exceeds `options.maxNestingDepth`
 * @private
 */
private static parseElementToObject(
  xml: string,
  startPos: number,
  options: Required<ParseToObjectOptions>,
  depth: number
): { value: ParsedXMLValue; endPos: number } {
  // Check nesting depth to prevent stack overflow
  if (depth > options.maxNestingDepth) {
    throw new Error(
      `XML nesting depth exceeds maximum of ${options.maxNestingDepth}. ` +
        `This may indicate malformed XML or an attack attempt. ` +
        `Use the maxNestingDepth option to increase the limit if needed.`
    );
  }

  // Find opening tag
  const openTagStart = xml.indexOf('<', startPos);
  if (openTagStart === -1) {
    return { value: {}, endPos: xml.length };
  }

  // Skip a comment that precedes the element and parse whatever follows it
  if (xml.substring(openTagStart, openTagStart + 4) === '<!--') {
    const commentEnd = xml.indexOf('-->', openTagStart + 4);
    if (commentEnd !== -1) {
      return XMLParser.parseElementToObject(xml, commentEnd + 3, options, depth);
    }
    return { value: {}, endPos: xml.length };
  }

  // Extract element name; anything that is not a name (e.g. "</", "<?", "<!")
  // consumes just the '<' so the caller keeps making progress
  const nameMatch = /^([a-zA-Z0-9:_-]+)/.exec(xml.substring(openTagStart + 1));
  if (!nameMatch) {
    return { value: {}, endPos: openTagStart + 1 };
  }

  const originalElementName: string = nameMatch[1] || '';
  let elementName: string = originalElementName;
  const tagHeaderEnd = xml.indexOf('>', openTagStart);
  if (tagHeaderEnd === -1) {
    return { value: {}, endPos: xml.length };
  }

  // Remove namespace if requested (but keep original for offset calculations)
  if (options.ignoreNamespace && elementName.includes(':')) {
    elementName = elementName.split(':')[1] || elementName;
  }

  // Extract attributes using ORIGINAL element name length for correct offset
  const tagHeader = xml.substring(openTagStart + 1 + originalElementName.length, tagHeaderEnd);
  const attributes = XMLParser.extractAttributesFromTag(tagHeader, options);

  // Self-closing when the header ends with '/'. An empty header can never be
  // self-closing because the name pattern above excludes '/'.
  if (tagHeader.trim().endsWith('/')) {
    const selfClosingValue: ParsedXMLObject = { ...attributes };
    return {
      value: { [elementName]: selfClosingValue },
      endPos: tagHeaderEnd + 1,
    };
  }

  // Find closing tag (use original name with namespace for correct matching)
  const closingTag = `</${originalElementName}>`;
  const contentStart = tagHeaderEnd + 1;
  const closingTagPos = XMLParser.findClosingTag(xml, originalElementName, contentStart);

  if (closingTagPos === -1) {
    // No closing tag found - treat as self-closing
    return {
      value: { [elementName]: { ...attributes } },
      endPos: tagHeaderEnd + 1,
    };
  }

  // Extract content between tags
  const content = xml.substring(contentStart, closingTagPos);

  const children: ParsedElement[] = [];
  let textContent = '';

  // With trimValues, whitespace-only segments are ignored; otherwise every
  // segment is kept. Entities are unescaped (e.g. &lt; -> <).
  const appendText = (segment: string): void => {
    if (segment.length > 0 && (!options.trimValues || segment.trim())) {
      textContent += XMLBuilder.unescapeXml(segment);
    }
  };

  let pos = 0;
  while (pos < content.length) {
    const nextTag = content.indexOf('<', pos);

    if (nextTag === -1) {
      // No more tags - rest is text
      appendText(content.substring(pos));
      break;
    }

    // Collect text before next tag
    if (nextTag > pos) {
      appendText(content.substring(pos, nextTag));
    }

    // Skip comments here instead of delegating them to the recursive call.
    // The recursive call jumps straight to the next '<' after the comment,
    // which silently discards any text between the comment and that tag.
    if (content.substring(nextTag, nextTag + 4) === '<!--') {
      const commentEnd = content.indexOf('-->', nextTag + 4);
      if (commentEnd === -1) {
        break; // Unterminated comment swallows the rest of the content
      }
      pos = commentEnd + 3;
      continue;
    }

    // Parse child element (increment depth for children)
    const childResult = XMLParser.parseElementToObject(content, nextTag, options, depth + 1);
    const childObj = childResult.value as ParsedXMLObject;

    // A parsed child is a single-key object: { [childName]: childValue }
    const childKeys = Object.keys(childObj);
    if (childKeys.length > 0) {
      const childName = childKeys[0];
      if (childName) {
        const childValue = childObj[childName];
        children.push({ name: childName, value: childValue });
      }
    }

    pos = childResult.endPos;
  }

  // Build element value, starting from the attributes (if any)
  let elementValue: ParsedXMLValue = {};
  if (!options.ignoreAttributes && Object.keys(attributes).length > 0) {
    elementValue = { ...attributes };
  }

  // Add text content
  if (textContent.length > 0 && (!options.trimValues || textContent.trim())) {
    const text = options.trimValues ? textContent.trim() : textContent;
    if (typeof elementValue === 'object' && !Array.isArray(elementValue)) {
      if (Object.keys(elementValue).length === 0) {
        // Only text, no attributes - the value is the bare string
        elementValue = text;
      } else {
        // Text with attributes
        elementValue[options.textNodeName] = text;
      }
    }
  }

  // Add children
  if (children.length > 0) {
    const coalescedChildren = XMLParser.coalesceChildren(children, options);
    if (typeof elementValue === 'object' && !Array.isArray(elementValue)) {
      elementValue = { ...elementValue, ...coalescedChildren };
    } else {
      elementValue = coalescedChildren;
    }
  }

  return {
    value: { [elementName]: elementValue },
    endPos: closingTagPos + closingTag.length,
  };
}
626
+
627
/**
 * Reads the `name="value"` pairs out of a tag header (the text between the
 * element name and the closing `>`).
 *
 * Scanning stops at a `/` or at the first character that cannot start an
 * attribute name. An attribute whose value is not quoted gets the empty string.
 *
 * @param tagHeader - Raw header text following the element name
 * @param options - Resolved parsing options (prefix, namespace and value parsing)
 * @returns Map of prefixed attribute names to their (optionally parsed) values;
 *          empty when `options.ignoreAttributes` is set
 * @private
 */
private static extractAttributesFromTag(
  tagHeader: string,
  options: Required<ParseToObjectOptions>
): Record<string, string | number | boolean> {
  const parsed: Record<string, string | number | boolean> = {};
  if (options.ignoreAttributes) {
    return parsed;
  }

  const length = tagHeader.length;
  let cursor = 0;

  // Moves the cursor forward while the current character matches `allowed`
  const advanceWhile = (allowed: RegExp): void => {
    while (cursor < length && allowed.test(tagHeader[cursor] ?? '')) {
      cursor++;
    }
  };

  while (cursor < length) {
    advanceWhile(/\s/);

    // End of header, or the '/' of a self-closing tag
    if (cursor >= length || tagHeader[cursor] === '/') {
      break;
    }

    const nameStart = cursor;
    advanceWhile(/[a-zA-Z0-9:_-]/);
    if (cursor === nameStart) {
      break; // Not a name character - give up rather than loop forever
    }
    const rawName = tagHeader.substring(nameStart, cursor);

    // Whitespace and '=' between name and value
    advanceWhile(/[\s=]/);

    // Quoted value (single or double); a missing closing quote takes the rest
    let rawValue = '';
    const delimiter = tagHeader[cursor];
    if (delimiter === '"' || delimiter === "'") {
      const valueStart = cursor + 1;
      const closingQuote = tagHeader.indexOf(delimiter, valueStart);
      const valueEnd = closingQuote === -1 ? length : closingQuote;
      rawValue = tagHeader.substring(valueStart, valueEnd);
      cursor = valueEnd + 1; // Step past the closing quote
    }

    // Optionally strip the namespace prefix, then apply the attribute prefix
    const localName =
      options.ignoreNamespace && rawName.includes(':')
        ? rawName.split(':')[1] || rawName
        : rawName;
    const key = options.attributeNamePrefix + localName;

    parsed[key] = options.parseAttributeValue ? XMLParser.parseValue(rawValue) : rawValue;
  }

  return parsed;
}
716
+
717
/**
 * Locates the closing tag that matches an already-opened element, taking
 * nested elements of the same name into account.
 *
 * @param xml - String being searched
 * @param elementName - Element name exactly as written (including any prefix)
 * @param startPos - Index just past the opening tag of the element
 * @returns Index of the matching `</elementName>`, or -1 when there is none
 * @private
 */
private static findClosingTag(xml: string, elementName: string, startPos: number): number {
  const opener = `<${elementName}`;
  const closer = `</${elementName}>`;
  // Characters that may legally follow the name in a real opening tag
  const nameTerminators = '> /\t\n\r';

  // Returns the index of the first genuine, non-self-closing opener inside
  // [from, limit), or -1. Prefix matches (<w:pPrChange for <w:pPr) and
  // self-closing tags (<w:rPr/>) are skipped because neither adds nesting.
  const findNestedOpener = (from: number, limit: number): number => {
    let cursor = from;
    while (cursor < limit) {
      const candidate = xml.indexOf(opener, cursor);
      if (candidate === -1 || candidate >= limit) {
        return -1;
      }

      const following = xml[candidate + opener.length];
      if (following === undefined || nameTerminators.indexOf(following) === -1) {
        // Longer name sharing our prefix - keep looking after it
        cursor = candidate + opener.length;
        continue;
      }

      const headerEnd = xml.indexOf('>', candidate);
      if (headerEnd !== -1 && xml[headerEnd - 1] === '/') {
        cursor = headerEnd + 1;
        continue;
      }

      return candidate;
    }
    return -1;
  };

  let openCount = 1;
  let scanFrom = startPos;

  while (openCount > 0 && scanFrom < xml.length) {
    const closeAt = xml.indexOf(closer, scanFrom);
    if (closeAt === -1) {
      return -1; // No closing tag left at all
    }

    const nestedAt = findNestedOpener(scanFrom, closeAt);
    if (nestedAt !== -1) {
      // A nested element opens before this closer, so the closer belongs to it
      openCount++;
      scanFrom = nestedAt + opener.length;
      continue;
    }

    // This closer ends the innermost open element
    openCount--;
    if (openCount === 0) {
      return closeAt;
    }
    scanFrom = closeAt + closer.length;
  }

  return -1;
}
787
+
788
/**
 * Merges a list of parsed children into one object, turning children that
 * share a name into arrays (or every child, when `options.alwaysArray` is set).
 *
 * Document order is recorded under `_orderedChildren` as `{ type, index }`
 * entries, where `index` is the child's position among same-named siblings.
 *
 * Bookkeeping uses `Map`s and children are defined as own properties, so
 * element names that collide with `Object.prototype` members (`constructor`,
 * `toString`, `__proto__`, ...) are handled like any other name.
 *
 * @param children - Parsed children in document order
 * @param options - Resolved parsing options (only `alwaysArray` is read)
 * @returns Object keyed by child name, plus `_orderedChildren` when non-empty
 * @private
 */
private static coalesceChildren(
  children: ParsedElement[],
  options: Required<ParseToObjectOptions>
): ParsedXMLObject {
  const result: ParsedXMLObject = {};

  // Track element order for correct run content parsing (tabs, breaks, text)
  const orderedChildren: { type: string; index: number }[] = [];

  // Count occurrences of each child name
  const totals = new Map<string, number>();
  for (const { name } of children) {
    totals.set(name, (totals.get(name) ?? 0) + 1);
  }

  const nextIndex = new Map<string, number>();
  const buckets = new Map<string, unknown[]>();

  // Plain assignment would invoke the `__proto__` setter for that key;
  // defining the property always creates an ordinary own data property.
  const defineOwn = (key: string, value: unknown): void => {
    Object.defineProperty(result, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  };

  for (const { name, value } of children) {
    // Position of this child among siblings with the same name
    const index = nextIndex.get(name) ?? 0;
    nextIndex.set(name, index + 1);
    orderedChildren.push({ type: name, index });

    if (options.alwaysArray || (totals.get(name) ?? 0) > 1) {
      let bucket = buckets.get(name);
      if (bucket === undefined) {
        bucket = [];
        buckets.set(name, bucket);
        defineOwn(name, bucket);
      }
      bucket.push(value);
    } else {
      defineOwn(name, value);
    }
  }

  // Add _orderedChildren to track element order (used by DocumentParser for runs)
  if (orderedChildren.length > 0) {
    result._orderedChildren = orderedChildren;
  }

  return result;
}
835
+
836
/**
 * Converts an attribute string to a boolean or number where that is safe,
 * and otherwise returns a string.
 *
 * Rules, applied in order:
 * 1. `"true"` / `"false"` become booleans.
 * 2. Six hex characters (a colour such as "000000") stay a string, upper-cased.
 * 3. Seven or more plain digits stay a string so leading zeros survive
 *    (e.g. cnfStyle values like "100000000000").
 * 4. Integers and simple decimals become numbers.
 * 5. Three hex characters containing at least one letter (e.g. "F0A") stay a
 *    string, upper-cased.
 * 6. Everything else is returned unchanged.
 *
 * @param value - Raw attribute text
 * @returns The converted value
 * @private
 */
private static parseValue(value: string): string | number | boolean {
  switch (value) {
    case 'true':
      return true;
    case 'false':
      return false;
  }

  // Colour codes must be tested before the integer rule ("000000" is black)
  const isSixCharHex = /^[0-9A-Fa-f]{6}$/.test(value);
  if (isSixCharHex) {
    return value.toUpperCase();
  }

  // Long digit runs are identifiers or bit strings, not quantities
  const isLongDigitRun = /^\d{7,}$/.test(value);
  if (isLongDigitRun) {
    return value;
  }

  // The pattern guarantees at least one digit, so parsing cannot yield NaN
  if (/^-?\d+$/.test(value)) {
    return parseInt(value, 10);
  }

  if (/^-?\d+\.\d+$/.test(value)) {
    return parseFloat(value);
  }

  // All-digit 3-character values (like "240") were already returned as numbers
  const isShortHexWithLetter = /^[0-9A-Fa-f]{3}$/.test(value) && /[A-Fa-f]/.test(value);
  return isShortHexWithLetter ? value.toUpperCase() : value;
}
877
+ }