@farazirfan/costar-server-executor 1.7.37 → 1.7.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253) hide show
  1. package/dist/agent/agent.d.ts +90 -0
  2. package/dist/agent/agent.d.ts.map +1 -1
  3. package/dist/agent/agent.js +606 -0
  4. package/dist/agent/agent.js.map +1 -1
  5. package/dist/agent/pi-embedded-runner/run.d.ts.map +1 -1
  6. package/dist/agent/pi-embedded-runner/run.js +2 -1
  7. package/dist/agent/pi-embedded-runner/run.js.map +1 -1
  8. package/dist/agent/pi-embedded-runner/system-prompt.d.ts.map +1 -1
  9. package/dist/agent/pi-embedded-runner/system-prompt.js +16 -37
  10. package/dist/agent/pi-embedded-runner/system-prompt.js.map +1 -1
  11. package/dist/agent/pi-embedded-runner/tools.d.ts +4 -1
  12. package/dist/agent/pi-embedded-runner/tools.d.ts.map +1 -1
  13. package/dist/agent/pi-embedded-runner/tools.js +3 -1
  14. package/dist/agent/pi-embedded-runner/tools.js.map +1 -1
  15. package/dist/agent/pi-embedded-runner/types.d.ts +4 -0
  16. package/dist/agent/pi-embedded-runner/types.d.ts.map +1 -1
  17. package/dist/cli/env-loader.d.ts.map +1 -1
  18. package/dist/cli/env-loader.js +1 -0
  19. package/dist/cli/env-loader.js.map +1 -1
  20. package/dist/cli/setup.js +2 -2
  21. package/dist/cli/setup.js.map +1 -1
  22. package/dist/cron/normalize.d.ts +31 -0
  23. package/dist/cron/normalize.d.ts.map +1 -0
  24. package/dist/cron/normalize.js +211 -0
  25. package/dist/cron/normalize.js.map +1 -0
  26. package/dist/cron/scheduler.d.ts +33 -3
  27. package/dist/cron/scheduler.d.ts.map +1 -1
  28. package/dist/cron/scheduler.js +253 -48
  29. package/dist/cron/scheduler.js.map +1 -1
  30. package/dist/heartbeat/runner.d.ts +27 -12
  31. package/dist/heartbeat/runner.d.ts.map +1 -1
  32. package/dist/heartbeat/runner.js +82 -104
  33. package/dist/heartbeat/runner.js.map +1 -1
  34. package/dist/infra/heartbeat-events-filter.d.ts +29 -0
  35. package/dist/infra/heartbeat-events-filter.d.ts.map +1 -0
  36. package/dist/infra/heartbeat-events-filter.js +80 -0
  37. package/dist/infra/heartbeat-events-filter.js.map +1 -0
  38. package/dist/infra/index.d.ts +9 -0
  39. package/dist/infra/index.d.ts.map +1 -0
  40. package/dist/infra/index.js +9 -0
  41. package/dist/infra/index.js.map +1 -0
  42. package/dist/infra/system-events.d.ts +58 -2
  43. package/dist/infra/system-events.d.ts.map +1 -1
  44. package/dist/infra/system-events.js +80 -14
  45. package/dist/infra/system-events.js.map +1 -1
  46. package/dist/server.d.ts.map +1 -1
  47. package/dist/server.js +6 -1
  48. package/dist/server.js.map +1 -1
  49. package/dist/services/platform-keys.d.ts +19 -0
  50. package/dist/services/platform-keys.d.ts.map +1 -0
  51. package/dist/services/platform-keys.js +74 -0
  52. package/dist/services/platform-keys.js.map +1 -0
  53. package/dist/subagent/registry.d.ts +96 -0
  54. package/dist/subagent/registry.d.ts.map +1 -0
  55. package/dist/subagent/registry.js +180 -0
  56. package/dist/subagent/registry.js.map +1 -0
  57. package/dist/tools/complete-turn.d.ts +2 -2
  58. package/dist/tools/complete-turn.js +10 -10
  59. package/dist/tools/complete-turn.js.map +1 -1
  60. package/dist/tools/contacts.d.ts +13 -0
  61. package/dist/tools/contacts.d.ts.map +1 -0
  62. package/dist/tools/contacts.js +80 -0
  63. package/dist/tools/contacts.js.map +1 -0
  64. package/dist/tools/cron.d.ts +17 -2
  65. package/dist/tools/cron.d.ts.map +1 -1
  66. package/dist/tools/cron.js +117 -35
  67. package/dist/tools/cron.js.map +1 -1
  68. package/dist/tools/google-maps.d.ts +6 -6
  69. package/dist/tools/google-maps.d.ts.map +1 -1
  70. package/dist/tools/google-maps.js +207 -262
  71. package/dist/tools/google-maps.js.map +1 -1
  72. package/dist/tools/index.d.ts +17 -7
  73. package/dist/tools/index.d.ts.map +1 -1
  74. package/dist/tools/index.js +40 -9
  75. package/dist/tools/index.js.map +1 -1
  76. package/dist/tools/phone-call.d.ts +11 -0
  77. package/dist/tools/phone-call.d.ts.map +1 -0
  78. package/dist/tools/phone-call.js +151 -0
  79. package/dist/tools/phone-call.js.map +1 -0
  80. package/dist/tools/sessions-spawn.d.ts +33 -0
  81. package/dist/tools/sessions-spawn.d.ts.map +1 -0
  82. package/dist/tools/sessions-spawn.js +164 -0
  83. package/dist/tools/sessions-spawn.js.map +1 -0
  84. package/dist/tools/spotify.d.ts +12 -0
  85. package/dist/tools/spotify.d.ts.map +1 -0
  86. package/dist/tools/spotify.js +251 -0
  87. package/dist/tools/spotify.js.map +1 -0
  88. package/dist/tools/subagents.d.ts +23 -0
  89. package/dist/tools/subagents.d.ts.map +1 -0
  90. package/dist/tools/subagents.js +209 -0
  91. package/dist/tools/subagents.js.map +1 -0
  92. package/dist/tools/whatsapp.d.ts +13 -0
  93. package/dist/tools/whatsapp.d.ts.map +1 -0
  94. package/dist/tools/whatsapp.js +215 -0
  95. package/dist/tools/whatsapp.js.map +1 -0
  96. package/dist/tools/youtube.d.ts +12 -0
  97. package/dist/tools/youtube.d.ts.map +1 -0
  98. package/dist/tools/youtube.js +218 -0
  99. package/dist/tools/youtube.js.map +1 -0
  100. package/dist/utils/asterizk-auth.d.ts +43 -0
  101. package/dist/utils/asterizk-auth.d.ts.map +1 -0
  102. package/dist/utils/asterizk-auth.js +125 -0
  103. package/dist/utils/asterizk-auth.js.map +1 -0
  104. package/dist/web-server.d.ts.map +1 -1
  105. package/dist/web-server.js +132 -0
  106. package/dist/web-server.js.map +1 -1
  107. package/dist/workspace/index.d.ts +3 -4
  108. package/dist/workspace/index.d.ts.map +1 -1
  109. package/dist/workspace/index.js +3 -4
  110. package/dist/workspace/index.js.map +1 -1
  111. package/dist/workspace/templates.d.ts +8 -7
  112. package/dist/workspace/templates.d.ts.map +1 -1
  113. package/dist/workspace/templates.js +18 -127
  114. package/dist/workspace/templates.js.map +1 -1
  115. package/dist/workspace/workspace.d.ts +2 -4
  116. package/dist/workspace/workspace.d.ts.map +1 -1
  117. package/dist/workspace/workspace.js +7 -16
  118. package/dist/workspace/workspace.js.map +1 -1
  119. package/package.json +1 -1
  120. package/public/index.html +231 -0
  121. package/skills/docx/SKILL.md +468 -0
  122. package/skills/docx/scripts/__init__.py +1 -0
  123. package/skills/docx/scripts/accept_changes.py +181 -0
  124. package/skills/docx/scripts/comment.py +347 -0
  125. package/skills/docx/scripts/helpers/__init__.py +0 -0
  126. package/skills/docx/scripts/helpers/merge_runs.py +231 -0
  127. package/skills/docx/scripts/helpers/simplify_redlines.py +240 -0
  128. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  129. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  130. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  131. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  132. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  133. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  134. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  135. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  136. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  137. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  138. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  139. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  140. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  141. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  142. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  143. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  144. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  145. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  146. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  147. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  148. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  149. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  150. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  151. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  152. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  153. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  154. package/skills/docx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  155. package/skills/docx/scripts/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  156. package/skills/docx/scripts/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  157. package/skills/docx/scripts/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  158. package/skills/docx/scripts/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  159. package/skills/docx/scripts/ooxml/schemas/mce/mc.xsd +75 -0
  160. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
  161. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
  162. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
  163. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
  164. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
  165. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  166. package/skills/docx/scripts/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
  167. package/skills/docx/scripts/ooxml/scripts/pack.py +159 -0
  168. package/skills/docx/scripts/ooxml/scripts/unpack.py +29 -0
  169. package/skills/docx/scripts/ooxml/scripts/validate.py +106 -0
  170. package/skills/docx/scripts/ooxml/scripts/validation/__init__.py +15 -0
  171. package/skills/docx/scripts/ooxml/scripts/validation/base.py +1023 -0
  172. package/skills/docx/scripts/ooxml/scripts/validation/docx.py +519 -0
  173. package/skills/docx/scripts/ooxml/scripts/validation/pptx.py +315 -0
  174. package/skills/docx/scripts/ooxml/scripts/validation/redlining.py +284 -0
  175. package/skills/docx/scripts/pack.py +166 -0
  176. package/skills/docx/scripts/templates/comments.xml +3 -0
  177. package/skills/docx/scripts/templates/commentsExtended.xml +3 -0
  178. package/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
  179. package/skills/docx/scripts/templates/commentsIds.xml +3 -0
  180. package/skills/docx/scripts/templates/people.xml +3 -0
  181. package/skills/docx/scripts/unpack.py +134 -0
  182. package/skills/longform-video-generation/SKILL.md +298 -0
  183. package/skills/longform-video-generation/references/advanced_techniques.md +474 -0
  184. package/skills/longform-video-generation/references/google_api_guide.md +288 -0
  185. package/skills/longform-video-generation/scripts/video_generator.py +579 -0
  186. package/skills/pdf/FORMS.md +305 -0
  187. package/skills/pdf/REFERENCE.md +612 -0
  188. package/skills/pdf/SKILL.md +293 -0
  189. package/skills/pdf/scripts/check_bounding_boxes.py +70 -0
  190. package/skills/pdf/scripts/check_fillable_fields.py +12 -0
  191. package/skills/pdf/scripts/convert_pdf_to_images.py +35 -0
  192. package/skills/pdf/scripts/create_validation_image.py +41 -0
  193. package/skills/pdf/scripts/extract_form_field_info.py +152 -0
  194. package/skills/pdf/scripts/extract_form_structure.py +124 -0
  195. package/skills/pdf/scripts/fill_fillable_fields.py +116 -0
  196. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +136 -0
  197. package/skills/pptx/SKILL.md +171 -0
  198. package/skills/pptx/editing.md +205 -0
  199. package/skills/pptx/pptxgenjs.md +377 -0
  200. package/skills/pptx/scripts/add_slide.py +225 -0
  201. package/skills/pptx/scripts/clean.py +309 -0
  202. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  203. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  204. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  205. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  206. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  207. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  208. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  209. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  210. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  211. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  212. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  213. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  214. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  215. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  216. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  217. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  218. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  219. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  220. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  221. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  222. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  223. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  224. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  225. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  226. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  227. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  228. package/skills/pptx/scripts/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  229. package/skills/pptx/scripts/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  230. package/skills/pptx/scripts/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  231. package/skills/pptx/scripts/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  232. package/skills/pptx/scripts/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  233. package/skills/pptx/scripts/ooxml/schemas/mce/mc.xsd +75 -0
  234. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
  235. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
  236. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
  237. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
  238. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
  239. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  240. package/skills/pptx/scripts/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
  241. package/skills/pptx/scripts/ooxml/scripts/pack.py +159 -0
  242. package/skills/pptx/scripts/ooxml/scripts/unpack.py +29 -0
  243. package/skills/pptx/scripts/ooxml/scripts/validate.py +106 -0
  244. package/skills/pptx/scripts/ooxml/scripts/validation/__init__.py +15 -0
  245. package/skills/pptx/scripts/ooxml/scripts/validation/base.py +1023 -0
  246. package/skills/pptx/scripts/ooxml/scripts/validation/docx.py +519 -0
  247. package/skills/pptx/scripts/ooxml/scripts/validation/pptx.py +315 -0
  248. package/skills/pptx/scripts/ooxml/scripts/validation/redlining.py +284 -0
  249. package/skills/pptx/scripts/pack.py +168 -0
  250. package/skills/pptx/scripts/thumbnail.py +318 -0
  251. package/skills/pptx/scripts/unpack.py +86 -0
  252. package/skills/xlsx/SKILL.md +291 -0
  253. package/skills/xlsx/recalc.py +247 -0
@@ -0,0 +1,1023 @@
1
+ """
2
+ Base validator with common validation logic for document files.
3
+ """
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ import defusedxml.minidom
9
+ import lxml.etree
10
+
11
+
12
+ class BaseSchemaValidator:
13
+ """Base validator with common validation logic for document files."""
14
+
15
+ # Validation errors to ignore (patterns that appear in error messages)
16
+ # These are XSD schema errors that don't affect document functionality,
17
+ # typically caused by specific editors like LibreOffice.
18
+ IGNORED_VALIDATION_ERRORS = [
19
+ # LibreOffice writes hyphenationZone in wrong order in word/settings.xml.
20
+ # The XSD requires strict element ordering, but LibreOffice puts doNotHyphenateCaps
21
+ # before hyphenationZone. This doesn't affect document rendering.
22
+ "hyphenationZone",
23
+ ]
24
+
25
+ # Elements whose 'id' attributes must be unique within their file
26
+ # Format: element_name -> (attribute_name, scope)
27
+ # scope can be 'file' (unique within file) or 'global' (unique across all files)
28
+ UNIQUE_ID_REQUIREMENTS = {
29
+ # Word elements
30
+ "comment": ("id", "file"), # Comment IDs in comments.xml
31
+ "commentrangestart": ("id", "file"), # Must match comment IDs
32
+ "commentrangeend": ("id", "file"), # Must match comment IDs
33
+ "bookmarkstart": ("id", "file"), # Bookmark start IDs
34
+ "bookmarkend": ("id", "file"), # Bookmark end IDs
35
+ # Note: ins and del (track changes) can share IDs when part of same revision
36
+ # PowerPoint elements
37
+ "sldid": ("id", "file"), # Slide IDs in presentation.xml
38
+ "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique
39
+ "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique
40
+ "cm": ("authorid", "file"), # Comment author IDs
41
+ # Excel elements
42
+ "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml
43
+ "definedname": ("id", "file"), # Named range IDs
44
+ # Drawing/Shape elements (all formats)
45
+ "cxnsp": ("id", "file"), # Connection shape IDs
46
+ "sp": ("id", "file"), # Shape IDs
47
+ "pic": ("id", "file"), # Picture IDs
48
+ "grpsp": ("id", "file"), # Group shape IDs
49
+ }
50
+
51
+ # Container elements where ID uniqueness checks should be skipped
52
+ # These hold references that intentionally duplicate IDs of elements they reference
53
+ # Example: <p14:sldId id="301"> in sectionLst references <p:sldId id="301"> in sldIdLst
54
+ EXCLUDED_ID_CONTAINERS = {
55
+ "sectionlst", # PowerPoint sections - sldId elements reference slides by ID
56
+ }
57
+
58
+ # Mapping of element names to expected relationship types
59
+ # Subclasses should override this with format-specific mappings
60
+ ELEMENT_RELATIONSHIP_TYPES = {}
61
+
62
+ # Unified schema mappings for all Office document types
63
+ SCHEMA_MAPPINGS = {
64
+ # Document type specific schemas
65
+ "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents
66
+ "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations
67
+ "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets
68
+ # Common file types
69
+ "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
70
+ "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
71
+ "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
72
+ "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
73
+ ".rels": "ecma/fouth-edition/opc-relationships.xsd",
74
+ # Word-specific files
75
+ "people.xml": "microsoft/wml-2012.xsd",
76
+ "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
77
+ "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
78
+ "commentsExtended.xml": "microsoft/wml-2012.xsd",
79
+ # Chart files (common across document types)
80
+ "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
81
+ # Theme files (common across document types)
82
+ "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
83
+ # Drawing and media files
84
+ "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
85
+ }
86
+
87
+ # Unified namespace constants
88
+ MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
89
+ XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
90
+
91
+ # Common OOXML namespaces used across validators
92
+ PACKAGE_RELATIONSHIPS_NAMESPACE = (
93
+ "http://schemas.openxmlformats.org/package/2006/relationships"
94
+ )
95
+ OFFICE_RELATIONSHIPS_NAMESPACE = (
96
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
97
+ )
98
+ CONTENT_TYPES_NAMESPACE = (
99
+ "http://schemas.openxmlformats.org/package/2006/content-types"
100
+ )
101
+
102
+ # Folders where we should clean ignorable namespaces
103
+ MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
104
+
105
+ # All allowed OOXML namespaces (superset of all document types)
106
+ OOXML_NAMESPACES = {
107
+ "http://schemas.openxmlformats.org/officeDocument/2006/math",
108
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
109
+ "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
110
+ "http://schemas.openxmlformats.org/drawingml/2006/main",
111
+ "http://schemas.openxmlformats.org/drawingml/2006/chart",
112
+ "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
113
+ "http://schemas.openxmlformats.org/drawingml/2006/diagram",
114
+ "http://schemas.openxmlformats.org/drawingml/2006/picture",
115
+ "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
116
+ "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
117
+ "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
118
+ "http://schemas.openxmlformats.org/presentationml/2006/main",
119
+ "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
120
+ "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
121
+ "http://www.w3.org/XML/1998/namespace",
122
+ }
123
+
124
+ def __init__(self, unpacked_dir, original_file, verbose=False):
125
+ self.unpacked_dir = Path(unpacked_dir).resolve()
126
+ self.original_file = Path(original_file)
127
+ self.verbose = verbose
128
+
129
+ # Set schemas directory
130
+ self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"
131
+
132
+ # Get all XML and .rels files
133
+ patterns = ["*.xml", "*.rels"]
134
+ self.xml_files = [
135
+ f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
136
+ ]
137
+
138
+ if not self.xml_files:
139
+ print(f"Warning: No XML files found in {self.unpacked_dir}")
140
+
141
+ def validate(self):
142
+ """Run all validation checks and return True if all pass."""
143
+ raise NotImplementedError("Subclasses must implement the validate method")
144
+
145
+ def repair(self) -> int:
146
+ """Run auto-repairs. Returns count of repairs made. Subclasses should override and call super()."""
147
+ return self.repair_whitespace_preservation()
148
+
149
+ def repair_whitespace_preservation(self) -> int:
150
+ """Add xml:space='preserve' to w:t/a:t elements with leading/trailing whitespace."""
151
+ repairs = 0
152
+
153
+ for xml_file in self.xml_files:
154
+ try:
155
+ content = xml_file.read_text(encoding="utf-8")
156
+ dom = defusedxml.minidom.parseString(content)
157
+ modified = False
158
+
159
+ for elem in dom.getElementsByTagName("*"):
160
+ if elem.tagName.endswith(":t") and elem.firstChild:
161
+ text = elem.firstChild.nodeValue
162
+ if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))):
163
+ if elem.getAttribute("xml:space") != "preserve":
164
+ elem.setAttribute("xml:space", "preserve")
165
+ text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text)
166
+ print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}")
167
+ repairs += 1
168
+ modified = True
169
+
170
+ if modified:
171
+ xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
172
+
173
+ except Exception:
174
+ pass
175
+
176
+ return repairs
177
+
178
+ def validate_xml(self):
179
+ """Validate that all XML files are well-formed."""
180
+ errors = []
181
+
182
+ for xml_file in self.xml_files:
183
+ try:
184
+ # Try to parse the XML file
185
+ lxml.etree.parse(str(xml_file))
186
+ except lxml.etree.XMLSyntaxError as e:
187
+ errors.append(
188
+ f" {xml_file.relative_to(self.unpacked_dir)}: "
189
+ f"Line {e.lineno}: {e.msg}"
190
+ )
191
+ except Exception as e:
192
+ errors.append(
193
+ f" {xml_file.relative_to(self.unpacked_dir)}: "
194
+ f"Unexpected error: {str(e)}"
195
+ )
196
+
197
+ if errors:
198
+ print(f"FAILED - Found {len(errors)} XML violations:")
199
+ for error in errors:
200
+ print(error)
201
+ return False
202
+ else:
203
+ if self.verbose:
204
+ print("PASSED - All XML files are well-formed")
205
+ return True
206
+
207
+ def validate_namespaces(self):
208
+ """Validate that namespace prefixes in Ignorable attributes are declared."""
209
+ errors = []
210
+
211
+ for xml_file in self.xml_files:
212
+ try:
213
+ root = lxml.etree.parse(str(xml_file)).getroot()
214
+ declared = set(root.nsmap.keys()) - {None} # Exclude default namespace
215
+
216
+ for attr_val in [
217
+ v for k, v in root.attrib.items() if k.endswith("Ignorable")
218
+ ]:
219
+ undeclared = set(attr_val.split()) - declared
220
+ errors.extend(
221
+ f" {xml_file.relative_to(self.unpacked_dir)}: "
222
+ f"Namespace '{ns}' in Ignorable but not declared"
223
+ for ns in undeclared
224
+ )
225
+ except lxml.etree.XMLSyntaxError:
226
+ continue
227
+
228
+ if errors:
229
+ print(f"FAILED - {len(errors)} namespace issues:")
230
+ for error in errors:
231
+ print(error)
232
+ return False
233
+ if self.verbose:
234
+ print("PASSED - All namespace prefixes properly declared")
235
+ return True
236
+
237
+ def validate_unique_ids(self):
238
+ """Validate that specific IDs are unique according to OOXML requirements."""
239
+ errors = []
240
+ global_ids = {} # Track globally unique IDs across all files
241
+
242
+ for xml_file in self.xml_files:
243
+ try:
244
+ root = lxml.etree.parse(str(xml_file)).getroot()
245
+ file_ids = {} # Track IDs that must be unique within this file
246
+
247
+ # Remove all mc:AlternateContent elements from the tree
248
+ mc_elements = root.xpath(
249
+ ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
250
+ )
251
+ for elem in mc_elements:
252
+ elem.getparent().remove(elem)
253
+
254
+ # Now check IDs in the cleaned tree
255
+ for elem in root.iter():
256
+ # Get the element name without namespace
257
+ tag = (
258
+ elem.tag.split("}")[-1].lower()
259
+ if "}" in elem.tag
260
+ else elem.tag.lower()
261
+ )
262
+
263
+ # Check if this element type has ID uniqueness requirements
264
+ if tag in self.UNIQUE_ID_REQUIREMENTS:
265
+ # Skip if element is inside an excluded container
266
+ # (e.g., <p14:sldId> inside <p14:sectionLst> is a reference, not a definition)
267
+ in_excluded_container = any(
268
+ ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
269
+ for ancestor in elem.iterancestors()
270
+ )
271
+ if in_excluded_container:
272
+ continue
273
+
274
+ attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
275
+
276
+ # Look for the specified attribute
277
+ id_value = None
278
+ for attr, value in elem.attrib.items():
279
+ attr_local = (
280
+ attr.split("}")[-1].lower()
281
+ if "}" in attr
282
+ else attr.lower()
283
+ )
284
+ if attr_local == attr_name:
285
+ id_value = value
286
+ break
287
+
288
+ if id_value is not None:
289
+ if scope == "global":
290
+ # Check global uniqueness
291
+ if id_value in global_ids:
292
+ prev_file, prev_line, prev_tag = global_ids[
293
+ id_value
294
+ ]
295
+ errors.append(
296
+ f" {xml_file.relative_to(self.unpacked_dir)}: "
297
+ f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
298
+ f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
299
+ )
300
+ else:
301
+ global_ids[id_value] = (
302
+ xml_file.relative_to(self.unpacked_dir),
303
+ elem.sourceline,
304
+ tag,
305
+ )
306
+ elif scope == "file":
307
+ # Check file-level uniqueness
308
+ key = (tag, attr_name)
309
+ if key not in file_ids:
310
+ file_ids[key] = {}
311
+
312
+ if id_value in file_ids[key]:
313
+ prev_line = file_ids[key][id_value]
314
+ errors.append(
315
+ f" {xml_file.relative_to(self.unpacked_dir)}: "
316
+ f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
317
+ f"(first occurrence at line {prev_line})"
318
+ )
319
+ else:
320
+ file_ids[key][id_value] = elem.sourceline
321
+
322
+ except (lxml.etree.XMLSyntaxError, Exception) as e:
323
+ errors.append(
324
+ f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
325
+ )
326
+
327
+ if errors:
328
+ print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
329
+ for error in errors:
330
+ print(error)
331
+ return False
332
+ else:
333
+ if self.verbose:
334
+ print("PASSED - All required IDs are unique")
335
+ return True
336
+
337
def validate_file_references(self):
    """
    Validate that all .rels files properly reference files and that all files are referenced.

    Every ``*.rels`` file under ``self.unpacked_dir`` is parsed and each
    internal relationship target is resolved to a path on disk. Targets that
    do not resolve to an existing file are reported as broken. Afterwards any
    package file (other than ``[Content_Types].xml`` and the .rels files
    themselves) that no relationship points to is reported as unreferenced.

    Relationships declared with ``TargetMode="External"`` and targets starting
    with ``http`` or ``mailto:`` are not checked against the file system.

    Returns:
        bool: True when nothing was reported (or no .rels file exists),
        False otherwise. Findings are printed to stdout.
    """
    errors = []

    rels_files = list(self.unpacked_dir.rglob("*.rels"))
    if not rels_files:
        if self.verbose:
            print("PASSED - No .rels files found")
        return True

    # Targets are compared as resolved paths, so the package root has to be
    # resolved as well (relative or symlinked roots would otherwise never
    # line up with the resolved file paths).
    package_root = self.unpacked_dir.resolve()

    # Map each candidate file's resolved path to the package-relative path
    # used for display; [Content_Types].xml and .rels files are never the
    # target of a relationship and are left out.
    package_files = {}
    for file_path in self.unpacked_dir.rglob("*"):
        if (
            file_path.is_file()
            and file_path.name != "[Content_Types].xml"
            and not file_path.name.endswith(".rels")
        ):
            package_files[file_path.resolve()] = file_path.relative_to(
                self.unpacked_dir
            )

    # Resolved paths of every file referenced by any .rels file
    all_referenced_files = set()

    if self.verbose:
        print(
            f"Found {len(rels_files)} .rels files and {len(package_files)} target files"
        )

    for rels_file in rels_files:
        rel_path = rels_file.relative_to(self.unpacked_dir)
        try:
            rels_root = lxml.etree.parse(str(rels_file)).getroot()

            # Relative targets of the root .rels are relative to the package
            # root; for any other .rels they are relative to the directory
            # above its _rels folder (word/_rels/document.xml.rels -> word/).
            if rels_file.name == ".rels":
                base_dir = package_root
            else:
                base_dir = rels_file.parent.parent

            broken_refs = []
            for rel in rels_root.findall(
                ".//ns:Relationship",
                namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
            ):
                target = rel.get("Target")
                if not target:
                    continue
                # External resources live outside the package and cannot be
                # checked on disk.
                if rel.get("TargetMode") == "External" or target.startswith(
                    ("http", "mailto:")
                ):
                    continue

                if target.startswith("/"):
                    # Package-absolute target: strip the leading slash so
                    # pathlib does not discard the base directory.
                    target_path = package_root / target.lstrip("/")
                else:
                    target_path = base_dir / target

                try:
                    target_path = target_path.resolve()
                    target_is_file = target_path.is_file()
                except (OSError, ValueError):
                    target_is_file = False

                if target_is_file:
                    all_referenced_files.add(target_path)
                else:
                    broken_refs.append((target, rel.sourceline))

            for broken_ref, line_num in broken_refs:
                errors.append(
                    f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                )

        except Exception as e:
            # A .rels file that cannot be parsed is itself a finding.
            errors.append(f" Error parsing {rel_path}: {e}")

    # Files that exist in the package but are not referenced anywhere
    for unref_file in sorted(set(package_files) - all_referenced_files):
        errors.append(f" Unreferenced file: {package_files[unref_file]}")

    if errors:
        print(f"FAILED - Found {len(errors)} relationship validation errors:")
        for error in errors:
            print(error)
        print(
            "CRITICAL: These errors will cause the document to appear corrupt. "
            + "Broken references MUST be fixed, "
            + "and unreferenced files MUST be referenced or removed."
        )
        return False

    if self.verbose:
        print(
            "PASSED - All references are valid and all files are properly referenced"
        )
    return True
453
+
454
def validate_all_relationship_ids(self):
    """
    Check every r:id attribute against the part's own relationships file.

    For each entry of ``self.xml_files`` that has a sibling
    ``_rels/<name>.rels`` file, the relationship IDs declared there are
    collected (duplicate IDs are reported) and every element carrying an
    ``id`` attribute in the office-relationships namespace must use one of
    them. When ``self.ELEMENT_RELATIONSHIP_TYPES`` is non-empty, the
    relationship type is additionally compared with the type expected for
    the referencing element.

    Returns:
        bool: True when nothing was reported, False otherwise. Findings are
        printed to stdout.
    """
    import lxml.etree

    errors = []

    for xml_file in self.xml_files:
        # Relationship files do not reference relationships themselves.
        if xml_file.suffix == ".rels":
            continue

        # dir/file.xml is described by dir/_rels/file.xml.rels; a part
        # without such a file simply has no relationships to check.
        rels_file = xml_file.parent / "_rels" / f"{xml_file.name}.rels"
        if not rels_file.exists():
            continue

        try:
            # Relationship ID -> last path segment of its Type URL
            rid_to_type = {}
            relationships = lxml.etree.parse(str(rels_file)).getroot().findall(
                f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
            )
            for relationship in relationships:
                rid = relationship.get("Id")
                if not rid:
                    continue
                if rid in rid_to_type:
                    rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                    errors.append(
                        f" {rels_rel_path}: Line {relationship.sourceline}: "
                        f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                    )
                type_url = relationship.get("Type", "")
                rid_to_type[rid] = type_url.rsplit("/", 1)[-1]

            id_attribute = f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
            for node in lxml.etree.parse(str(xml_file)).getroot().iter():
                referenced_id = node.get(id_attribute)
                if not referenced_id:
                    continue

                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                node_name = node.tag.rsplit("}", 1)[-1]

                if referenced_id not in rid_to_type:
                    # Show at most five known IDs to keep the message short.
                    sample = ", ".join(sorted(rid_to_type)[:5])
                    ellipsis = "..." if len(rid_to_type) > 5 else ""
                    errors.append(
                        f" {xml_rel_path}: Line {node.sourceline}: "
                        f"<{node_name}> references non-existent relationship '{referenced_id}' "
                        f"(valid IDs: {sample}{ellipsis})"
                    )
                    continue

                if not self.ELEMENT_RELATIONSHIP_TYPES:
                    continue
                expected_type = self._get_expected_relationship_type(node_name)
                if not expected_type:
                    continue
                actual_type = rid_to_type[referenced_id]
                # Substring match: the actual type only has to contain the
                # expected name.
                if expected_type not in actual_type.lower():
                    errors.append(
                        f" {xml_rel_path}: Line {node.sourceline}: "
                        f"<{node_name}> references '{referenced_id}' which points to '{actual_type}' "
                        f"but should point to a '{expected_type}' relationship"
                    )

        except Exception as e:
            xml_rel_path = xml_file.relative_to(self.unpacked_dir)
            errors.append(f" Error processing {xml_rel_path}: {e}")

    if not errors:
        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
    for error in errors:
        print(error)
    print("\nThese ID mismatches will cause the document to appear corrupt!")
    return False
551
+
552
+ def _get_expected_relationship_type(self, element_name):
553
+ """
554
+ Get the expected relationship type for an element.
555
+ First checks the explicit mapping, then tries pattern detection.
556
+ """
557
+ # Normalize element name to lowercase
558
+ elem_lower = element_name.lower()
559
+
560
+ # Check explicit mapping first
561
+ if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
562
+ return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
563
+
564
+ # Try pattern detection for common patterns
565
+ # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
566
+ if elem_lower.endswith("id") and len(elem_lower) > 2:
567
+ # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
568
+ prefix = elem_lower[:-2] # Remove "id"
569
+ # Check if this might be a compound like "sldMasterId"
570
+ if prefix.endswith("master"):
571
+ return prefix.lower()
572
+ elif prefix.endswith("layout"):
573
+ return prefix.lower()
574
+ else:
575
+ # Simple case like "sldId" -> "slide"
576
+ # Common transformations
577
+ if prefix == "sld":
578
+ return "slide"
579
+ return prefix.lower()
580
+
581
+ # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
582
+ if elem_lower.endswith("reference") and len(elem_lower) > 9:
583
+ prefix = elem_lower[:-9] # Remove "reference"
584
+ return prefix.lower()
585
+
586
+ return None
587
+
588
def validate_content_types(self):
    """Check that package content is declared in [Content_Types].xml.

    XML parts whose root element is one of a fixed set of main-part roots
    must have an ``Override`` entry for their path; files with a known image
    extension must have a ``Default`` entry for that extension. Relationship
    files, ``[Content_Types].xml`` itself and anything under ``docProps`` or
    ``_rels`` are not checked.

    Returns:
        bool: True when nothing was reported, False when declarations are
        missing, the manifest cannot be processed, or the manifest file does
        not exist. Findings are printed to stdout.
    """
    content_types_file = self.unpacked_dir / "[Content_Types].xml"
    if not content_types_file.exists():
        print("FAILED - [Content_Types].xml file not found")
        return False

    errors = []

    try:
        manifest_root = lxml.etree.parse(str(content_types_file)).getroot()
        namespace = self.CONTENT_TYPES_NAMESPACE

        # Override entries name individual parts (stored without leading "/")
        part_names = (
            override.get("PartName")
            for override in manifest_root.findall(f".//{{{namespace}}}Override")
        )
        declared_parts = {
            part_name.lstrip("/") for part_name in part_names if part_name is not None
        }

        # Default entries declare a content type per file extension
        extensions = (
            default.get("Extension")
            for default in manifest_root.findall(f".//{{{namespace}}}Default")
        )
        declared_extensions = {
            extension.lower() for extension in extensions if extension is not None
        }

        # Root elements that require content type declaration
        declarable_roots = {
            "sld",
            "sldLayout",
            "sldMaster",
            "presentation",  # PowerPoint
            "document",  # Word
            "workbook",
            "worksheet",  # Excel
            "theme",  # Common
        }

        # Common media file extensions that should be declared
        media_extensions = {
            "png": "image/png",
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "gif": "image/gif",
            "bmp": "image/bmp",
            "tiff": "image/tiff",
            "wmf": "image/x-wmf",
            "emf": "image/x-emf",
        }

        all_files = [
            candidate
            for candidate in self.unpacked_dir.rglob("*")
            if candidate.is_file()
        ]

        # Pass 1: XML parts need an Override entry
        skip_markers = (".rels", "[Content_Types]", "docProps/", "_rels/")
        for xml_file in self.xml_files:
            relative = xml_file.relative_to(self.unpacked_dir)
            path_str = str(relative).replace("\\", "/")

            if any(marker in path_str for marker in skip_markers):
                continue

            try:
                root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                root_name = root_tag.rsplit("}", 1)[-1]

                if root_name in declarable_roots and path_str not in declared_parts:
                    errors.append(
                        f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                    )
            except Exception:
                # Unparseable files are left to the other validators.
                continue

        # Pass 2: known media files need a Default entry for their extension
        for file_path in all_files:
            is_metadata = (
                file_path.suffix.lower() in {".xml", ".rels"}
                or file_path.name == "[Content_Types].xml"
                or "_rels" in file_path.parts
                or "docProps" in file_path.parts
            )
            if is_metadata:
                continue

            extension = file_path.suffix.lstrip(".").lower()
            if extension in media_extensions and extension not in declared_extensions:
                relative_path = file_path.relative_to(self.unpacked_dir)
                content_type = media_extensions[extension]
                errors.append(
                    f" {relative_path}: File with extension '{extension}' not declared in "
                    f'[Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{content_type}"/>'
                )

    except Exception as e:
        errors.append(f" Error parsing [Content_Types].xml: {e}")

    if not errors:
        if self.verbose:
            print(
                "PASSED - All content files are properly declared in [Content_Types].xml"
            )
        return True

    print(f"FAILED - Found {len(errors)} content type declaration errors:")
    for error in errors:
        print(error)
    return False
706
+
707
def validate_file_against_xsd(self, xml_file, verbose=False):
    """Validate one XML file against its XSD and report only newly introduced errors.

    Errors that the same file already produces in the original document, and
    errors containing any pattern from ``self.IGNORED_VALIDATION_ERRORS``,
    are not counted.

    Args:
        xml_file: Path to XML file to validate
        verbose: Enable verbose output

    Returns:
        tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
    """
    # Resolve both paths to handle symlinks
    xml_file = Path(xml_file).resolve()
    unpacked_dir = self.unpacked_dir.resolve()

    outcome, current_errors = self._validate_single_file_xsd(xml_file, unpacked_dir)
    if outcome is None:
        return None, set()  # Skipped
    if outcome:
        return True, set()  # Valid, no errors

    # Baseline: what the untouched original reports for this file
    original_errors = self._get_original_file_errors(xml_file)

    # A failed validation always comes with an error set
    assert current_errors is not None

    # Keep what is new and not on the ignore list
    new_errors = set()
    for message in current_errors - original_errors:
        if any(pattern in message for pattern in self.IGNORED_VALIDATION_ERRORS):
            continue
        new_errors.add(message)

    if not new_errors:
        # Everything reported was already present in the original
        if verbose:
            print(
                f"PASSED - No new errors (original had {len(current_errors)} errors)"
            )
        return True, set()

    if verbose:
        relative_path = xml_file.relative_to(unpacked_dir)
        print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
        for message in list(new_errors)[:3]:
            if len(message) > 250:
                shown = message[:250] + "..."
            else:
                shown = message
            print(f" - {shown}")
    return False, new_errors
759
+
760
def validate_against_xsd(self):
    """Validate XML files against XSD schemas, showing only new errors compared to original.

    Each entry of ``self.xml_files`` is checked with
    ``self.validate_file_against_xsd``. Files without a schema are counted as
    skipped; for files with new errors the first three messages (truncated to
    250 characters) are collected for the report.

    Returns:
        bool: True when no file has new errors, False otherwise. The report
        (and, when ``self.verbose`` is set, a summary) is printed to stdout.
    """
    new_errors = []
    # Counted explicitly instead of being recovered from the formatting of
    # the report lines, which breaks as soon as the indentation changes.
    files_with_new_errors = 0
    original_error_count = 0
    valid_count = 0
    skipped_count = 0

    for xml_file in self.xml_files:
        relative_path = str(xml_file.relative_to(self.unpacked_dir))
        is_valid, new_file_errors = self.validate_file_against_xsd(
            xml_file, verbose=False
        )

        if is_valid is None:
            skipped_count += 1
            continue

        if is_valid:
            valid_count += 1
            # A valid result that still carries errors means they all
            # existed in the original document.
            if new_file_errors:
                original_error_count += 1
            continue

        # Has new errors
        files_with_new_errors += 1
        new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
        for error in list(new_file_errors)[:3]:  # Show first 3 errors
            new_errors.append(
                f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
            )

    # Print summary
    if self.verbose:
        print(f"Validated {len(self.xml_files)} files:")
        print(f" - Valid: {valid_count}")
        print(f" - Skipped (no schema): {skipped_count}")
        if original_error_count:
            print(f" - With original errors (ignored): {original_error_count}")
        print(f" - With NEW errors: {files_with_new_errors}")

    if new_errors:
        print("\nFAILED - Found NEW validation errors:")
        for error in new_errors:
            print(error)
        return False

    if self.verbose:
        print("\nPASSED - No new XSD validation errors introduced")
    return True
812
+
813
+ def _get_schema_path(self, xml_file):
814
+ """Determine the appropriate schema path for an XML file."""
815
+ # Check exact filename match
816
+ if xml_file.name in self.SCHEMA_MAPPINGS:
817
+ return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
818
+
819
+ # Check .rels files
820
+ if xml_file.suffix == ".rels":
821
+ return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
822
+
823
+ # Check chart files
824
+ if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
825
+ return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
826
+
827
+ # Check theme files
828
+ if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
829
+ return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
830
+
831
+ # Check if file is in a main content folder and use appropriate schema
832
+ if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
833
+ return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
834
+
835
+ return None
836
+
837
def _clean_ignorable_namespaces(self, xml_doc):
    """Return a copy of the document limited to the allowed namespaces.

    Namespaced attributes and elements whose namespace is not listed in
    ``self.OOXML_NAMESPACES`` are dropped from the copy; the document passed
    in is left untouched.

    Returns:
        A new ElementTree wrapping the cleaned copy.
    """
    # Round-trip through a string to get an independent copy
    serialized = lxml.etree.tostring(xml_doc, encoding="unicode")
    clone = lxml.etree.fromstring(serialized)

    for node in clone.iter():
        # Collect first, delete afterwards: the attribute mapping must not
        # change while it is being iterated.
        foreign_attrs = [
            name
            for name in node.attrib
            if "{" in name and name.split("}")[0][1:] not in self.OOXML_NAMESPACES
        ]
        for name in foreign_attrs:
            del node.attrib[name]

    # Elements from other namespaces go as well
    self._remove_ignorable_elements(clone)

    return lxml.etree.ElementTree(clone)
862
+
863
+ def _remove_ignorable_elements(self, root):
864
+ """Recursively remove all elements not in allowed namespaces."""
865
+ elements_to_remove = []
866
+
867
+ # Find elements to remove
868
+ for elem in list(root):
869
+ # Skip non-element nodes (comments, processing instructions, etc.)
870
+ if not hasattr(elem, "tag") or callable(elem.tag):
871
+ continue
872
+
873
+ tag_str = str(elem.tag)
874
+ if tag_str.startswith("{"):
875
+ ns = tag_str.split("}")[0][1:]
876
+ if ns not in self.OOXML_NAMESPACES:
877
+ elements_to_remove.append(elem)
878
+ continue
879
+
880
+ # Recursively clean child elements
881
+ self._remove_ignorable_elements(elem)
882
+
883
+ # Remove collected elements
884
+ for elem in elements_to_remove:
885
+ root.remove(elem)
886
+
887
+ def _preprocess_for_mc_ignorable(self, xml_doc):
888
+ """Preprocess XML to handle mc:Ignorable attribute properly."""
889
+ # Remove mc:Ignorable attributes before validation
890
+ root = xml_doc.getroot()
891
+
892
+ # Remove mc:Ignorable attribute from root
893
+ if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
894
+ del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
895
+
896
+ return xml_doc
897
+
898
+ def _validate_single_file_xsd(self, xml_file, base_path):
899
+ """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
900
+ schema_path = self._get_schema_path(xml_file)
901
+ if not schema_path:
902
+ return None, None # Skip file
903
+
904
+ try:
905
+ # Load schema
906
+ with open(schema_path, "rb") as xsd_file:
907
+ parser = lxml.etree.XMLParser()
908
+ xsd_doc = lxml.etree.parse(
909
+ xsd_file, parser=parser, base_url=str(schema_path)
910
+ )
911
+ schema = lxml.etree.XMLSchema(xsd_doc)
912
+
913
+ # Load and preprocess XML
914
+ with open(xml_file, "r") as f:
915
+ xml_doc = lxml.etree.parse(f)
916
+
917
+ xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
918
+ xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
919
+
920
+ # Clean ignorable namespaces if needed
921
+ relative_path = xml_file.relative_to(base_path)
922
+ if (
923
+ relative_path.parts
924
+ and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
925
+ ):
926
+ xml_doc = self._clean_ignorable_namespaces(xml_doc)
927
+
928
+ # Validate
929
+ if schema.validate(xml_doc):
930
+ return True, set()
931
+ else:
932
+ errors = set()
933
+ for error in schema.error_log:
934
+ # Store normalized error message (without line numbers for comparison)
935
+ errors.add(error.message)
936
+ return False, errors
937
+
938
+ except Exception as e:
939
+ return False, {str(e)}
940
+
941
+ def _get_original_file_errors(self, xml_file):
942
+ """Get XSD validation errors from a single file in the original document.
943
+
944
+ Args:
945
+ xml_file: Path to the XML file in unpacked_dir to check
946
+
947
+ Returns:
948
+ set: Set of error messages from the original file
949
+ """
950
+ import tempfile
951
+ import zipfile
952
+
953
+ # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
954
+ xml_file = Path(xml_file).resolve()
955
+ unpacked_dir = self.unpacked_dir.resolve()
956
+ relative_path = xml_file.relative_to(unpacked_dir)
957
+
958
+ with tempfile.TemporaryDirectory() as temp_dir:
959
+ temp_path = Path(temp_dir)
960
+
961
+ # Extract original file
962
+ with zipfile.ZipFile(self.original_file, "r") as zip_ref:
963
+ zip_ref.extractall(temp_path)
964
+
965
+ # Find corresponding file in original
966
+ original_xml_file = temp_path / relative_path
967
+
968
+ if not original_xml_file.exists():
969
+ # File didn't exist in original, so no original errors
970
+ return set()
971
+
972
+ # Validate the specific file in original
973
+ is_valid, errors = self._validate_single_file_xsd(
974
+ original_xml_file, temp_path
975
+ )
976
+ return errors if errors else set()
977
+
978
def _remove_template_tags_from_text_nodes(self, xml_doc):
    """Strip ``{{ ... }}`` template tags from text and tail content.

    Works on a copy of the document. Elements whose tag is ``t`` (with or
    without a namespace) are skipped entirely, so neither their text nor
    their tail is changed. Each tag that gets removed is recorded as a
    warning.

    Returns:
        tuple: (cleaned_xml_doc, warnings_list)
    """
    placeholder = re.compile(r"\{\{[^}]*\}\}")
    warnings = []

    # Serialize and re-parse so the caller's document is not modified
    clone = lxml.etree.fromstring(lxml.etree.tostring(xml_doc, encoding="unicode"))

    def strip_placeholders(value, content_type):
        # Empty/None content and content without tags pass through untouched
        if not value:
            return value
        found = placeholder.findall(value)
        if not found:
            return value
        warnings.extend(
            f"Found template tag in {content_type}: {tag}" for tag in found
        )
        return placeholder.sub("", value)

    for node in clone.iter():
        # Comments and processing instructions have no string tag
        if not hasattr(node, "tag") or callable(node.tag):
            continue

        # Text-run elements (e.g. w:t) are left as they are
        tag_name = str(node.tag)
        if tag_name == "t" or tag_name.endswith("}t"):
            continue

        node.text = strip_placeholders(node.text, "text content")
        node.tail = strip_placeholders(node.tail, "tail content")

    return lxml.etree.ElementTree(clone), warnings
1020
+
1021
+
1022
# Guard against executing this file as a script; it raises instead of running.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")