wormclaude 1.0.119 → 1.0.121

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/dist/theme.js +1 -1
  2. package/dist/tui.js +6 -1
  3. package/package.json +1 -1
  4. package/skills/build-mcp-app/SKILL.md +0 -393
  5. package/skills/build-mcp-app/references/abuse-protection.md +0 -60
  6. package/skills/build-mcp-app/references/apps-sdk-messages.md +0 -227
  7. package/skills/build-mcp-app/references/directory-checklist.md +0 -18
  8. package/skills/build-mcp-app/references/iframe-sandbox.md +0 -164
  9. package/skills/build-mcp-app/references/payload-budgeting.md +0 -54
  10. package/skills/build-mcp-app/references/widget-templates.md +0 -249
  11. package/skills/build-mcp-server/SKILL.md +0 -222
  12. package/skills/build-mcp-server/references/auth.md +0 -108
  13. package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +0 -106
  14. package/skills/build-mcp-server/references/elicitation.md +0 -129
  15. package/skills/build-mcp-server/references/remote-http-scaffold.md +0 -211
  16. package/skills/build-mcp-server/references/resources-and-prompts.md +0 -122
  17. package/skills/build-mcp-server/references/server-capabilities.md +0 -164
  18. package/skills/build-mcp-server/references/tool-design.md +0 -189
  19. package/skills/build-mcp-server/references/versions.md +0 -25
  20. package/skills/build-mcpb/SKILL.md +0 -200
  21. package/skills/build-mcpb/references/local-security.md +0 -149
  22. package/skills/build-mcpb/references/manifest-schema.md +0 -156
  23. package/skills/docx/script/__init__.py +0 -1
  24. package/skills/docx/script/accept_chages.py +0 -135
  25. package/skills/docx/script/comment.py +0 -318
  26. package/skills/docx/script/office/helpers/__init__.py +0 -0
  27. package/skills/docx/script/office/helpers/merge_runs.py +0 -199
  28. package/skills/docx/script/office/helpers/simplify_redlines.py +0 -197
  29. package/skills/docx/script/office/pack.py +0 -159
  30. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  31. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  32. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  33. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  34. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  35. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  36. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  37. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  38. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  39. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  40. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  41. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  42. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  43. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  44. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  45. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  46. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  47. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  48. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  49. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  50. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  51. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  52. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  53. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  54. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  55. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  56. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  57. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  58. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  59. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  60. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  61. package/skills/docx/script/office/schemas/mce/mc.xsd +0 -75
  62. package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +0 -560
  63. package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +0 -67
  64. package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +0 -14
  65. package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  66. package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  67. package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  68. package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  69. package/skills/docx/script/office/soffice.py +0 -183
  70. package/skills/docx/script/office/unpack.py +0 -132
  71. package/skills/docx/script/office/validate.py +0 -117
  72. package/skills/docx/script/office/validators/__init__.py +0 -15
  73. package/skills/docx/script/office/validators/base.py +0 -851
  74. package/skills/docx/script/office/validators/docx.py +0 -446
  75. package/skills/docx/script/office/validators/pptx.py +0 -275
  76. package/skills/docx/script/office/validators/redlining.py +0 -247
  77. package/skills/docx/script/templates/comments.xml +0 -3
  78. package/skills/docx/script/templates/commentsExtended.xml +0 -3
  79. package/skills/docx/script/templates/commentsExtensible.xml +0 -3
  80. package/skills/docx/script/templates/commentsIds.xml +0 -3
  81. package/skills/docx/script/templates/people.xml +0 -3
  82. package/skills/docx/skill.md +0 -593
  83. package/skills/explain.md +0 -14
  84. package/skills/frontend-design/SKILL.md +0 -42
  85. package/skills/pdf/FORMS.md +0 -294
  86. package/skills/pdf/REFERENCE.md +0 -612
  87. package/skills/pdf/SKILL.md +0 -314
  88. package/skills/pdf/scripts/check_bounding_boxes.py +0 -65
  89. package/skills/pdf/scripts/check_fillable_fields.py +0 -11
  90. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
  91. package/skills/pdf/scripts/create_validation_image.py +0 -37
  92. package/skills/pdf/scripts/extract_form_field_info.py +0 -122
  93. package/skills/pdf/scripts/extract_form_structure.py +0 -115
  94. package/skills/pdf/scripts/fill_fillable_fields.py +0 -98
  95. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
  96. package/skills/playground/SKILL.md +0 -77
  97. package/skills/playground/templates/code-map.md +0 -158
  98. package/skills/playground/templates/concept-map.md +0 -73
  99. package/skills/playground/templates/data-explorer.md +0 -67
  100. package/skills/playground/templates/design-playground.md +0 -67
  101. package/skills/playground/templates/diff-review.md +0 -179
  102. package/skills/playground/templates/document-critique.md +0 -171
  103. package/skills/pptx/SKILL.md +0 -230
  104. package/skills/pptx/editing.md +0 -205
  105. package/skills/pptx/pptxgenjs.md +0 -437
  106. package/skills/pptx/scripts/__init__.py +0 -0
  107. package/skills/pptx/scripts/add_slide.py +0 -195
  108. package/skills/pptx/scripts/clean.py +0 -286
  109. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  110. package/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
  111. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
  112. package/skills/pptx/scripts/office/pack.py +0 -159
  113. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  114. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  115. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  116. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  117. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  118. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  119. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  120. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  121. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  122. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  123. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  124. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  125. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  126. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  127. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  128. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  129. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  130. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  131. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  132. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  133. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  134. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  135. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  136. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  137. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  138. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  139. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  140. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  141. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  142. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  143. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  144. package/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
  145. package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  146. package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  147. package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  148. package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  149. package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  150. package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  151. package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  152. package/skills/pptx/scripts/office/soffice.py +0 -183
  153. package/skills/pptx/scripts/office/unpack.py +0 -132
  154. package/skills/pptx/scripts/office/validate.py +0 -117
  155. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  156. package/skills/pptx/scripts/office/validators/base.py +0 -851
  157. package/skills/pptx/scripts/office/validators/docx.py +0 -446
  158. package/skills/pptx/scripts/office/validators/pptx.py +0 -275
  159. package/skills/pptx/scripts/office/validators/redlining.py +0 -247
  160. package/skills/pptx/scripts/thumbnail.py +0 -289
  161. package/skills/recon.md +0 -16
  162. package/skills/security-audit/SKILL.md +0 -26
  163. package/skills/talent-creator/SKILL.md +0 -486
  164. package/skills/talent-creator/agents/analyzer.md +0 -274
  165. package/skills/talent-creator/agents/comparator.md +0 -202
  166. package/skills/talent-creator/agents/grader.md +0 -223
  167. package/skills/talent-creator/assets/eval_review.html +0 -146
  168. package/skills/talent-creator/eval-viewer/generate_review.py +0 -471
  169. package/skills/talent-creator/eval-viewer/viewer.html +0 -1325
  170. package/skills/talent-creator/references/schemas.md +0 -430
  171. package/skills/talent-creator/scripts/__init__.py +0 -0
  172. package/skills/talent-creator/scripts/aggregate_benchmark.py +0 -401
  173. package/skills/talent-creator/scripts/generate_report.py +0 -326
  174. package/skills/talent-creator/scripts/improve_description.py +0 -247
  175. package/skills/talent-creator/scripts/package_skill.py +0 -136
  176. package/skills/talent-creator/scripts/quick_validate.py +0 -146
  177. package/skills/talent-creator/scripts/run_eval.py +0 -310
  178. package/skills/talent-creator/scripts/run_loop.py +0 -328
  179. package/skills/talent-creator/scripts/utils.py +0 -47
  180. package/skills/xlsx/SKILL.md +0 -300
  181. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  182. package/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
  183. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
  184. package/skills/xlsx/scripts/office/pack.py +0 -159
  185. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  186. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  187. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  188. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  189. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  190. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  191. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  192. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  193. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  194. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  195. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  196. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  197. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  198. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  199. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  200. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  201. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  202. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  203. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  204. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  205. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  206. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  207. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  208. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  209. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  210. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  211. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  212. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  213. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  214. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  215. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  216. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
  217. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  218. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  219. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  220. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  221. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  222. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  223. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  224. package/skills/xlsx/scripts/office/soffice.py +0 -183
  225. package/skills/xlsx/scripts/office/unpack.py +0 -132
  226. package/skills/xlsx/scripts/office/validate.py +0 -117
  227. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  228. package/skills/xlsx/scripts/office/validators/base.py +0 -851
  229. package/skills/xlsx/scripts/office/validators/docx.py +0 -446
  230. package/skills/xlsx/scripts/office/validators/pptx.py +0 -275
  231. package/skills/xlsx/scripts/office/validators/redlining.py +0 -247
  232. package/skills/xlsx/scripts/recalc.py +0 -184
@@ -1,851 +0,0 @@
1
- """
2
- Base validator with common validation logic for document files.
3
- """
4
-
5
- import re
6
- from pathlib import Path
7
-
8
- import defusedxml.minidom
9
- import lxml.etree
10
-
11
-
12
- class BaseSchemaValidator:
13
-
14
- IGNORED_VALIDATION_ERRORS = [
15
- "hyphenationZone",
16
- "purl.org/dc/terms",
17
- ]
18
-
19
- UNIQUE_ID_REQUIREMENTS = {
20
- "comment": ("id", "file"),
21
- "commentrangestart": ("id", "file"),
22
- "commentrangeend": ("id", "file"),
23
- "bookmarkstart": ("id", "file"),
24
- "bookmarkend": ("id", "file"),
25
- "sldid": ("id", "file"),
26
- "sldmasterid": ("id", "global"),
27
- "sldlayoutid": ("id", "global"),
28
- "cm": ("authorid", "file"),
29
- "sheet": ("sheetid", "file"),
30
- "definedname": ("id", "file"),
31
- "cxnsp": ("id", "file"),
32
- "sp": ("id", "file"),
33
- "pic": ("id", "file"),
34
- "grpsp": ("id", "file"),
35
- }
36
-
37
- EXCLUDED_ID_CONTAINERS = {
38
- "sectionlst",
39
- }
40
-
41
- ELEMENT_RELATIONSHIP_TYPES = {}
42
-
43
- SCHEMA_MAPPINGS = {
44
- "word": "ISO-IEC29500-4_2016/wml.xsd",
45
- "ppt": "ISO-IEC29500-4_2016/pml.xsd",
46
- "xl": "ISO-IEC29500-4_2016/sml.xsd",
47
- "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
48
- "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
49
- "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
50
- "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
51
- ".rels": "ecma/fouth-edition/opc-relationships.xsd",
52
- "people.xml": "microsoft/wml-2012.xsd",
53
- "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
54
- "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
55
- "commentsExtended.xml": "microsoft/wml-2012.xsd",
56
- "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
57
- "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
58
- "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
59
- }
60
-
61
- MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
62
- XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
63
-
64
- PACKAGE_RELATIONSHIPS_NAMESPACE = (
65
- "http://schemas.openxmlformats.org/package/2006/relationships"
66
- )
67
- OFFICE_RELATIONSHIPS_NAMESPACE = (
68
- "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
69
- )
70
- CONTENT_TYPES_NAMESPACE = (
71
- "http://schemas.openxmlformats.org/package/2006/content-types"
72
- )
73
-
74
- MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
75
-
76
- OOXML_NAMESPACES = {
77
- "http://schemas.openxmlformats.org/officeDocument/2006/math",
78
- "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
79
- "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
80
- "http://schemas.openxmlformats.org/drawingml/2006/main",
81
- "http://schemas.openxmlformats.org/drawingml/2006/chart",
82
- "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
83
- "http://schemas.openxmlformats.org/drawingml/2006/diagram",
84
- "http://schemas.openxmlformats.org/drawingml/2006/picture",
85
- "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
86
- "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
87
- "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
88
- "http://schemas.openxmlformats.org/presentationml/2006/main",
89
- "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
90
- "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
91
- "http://www.w3.org/XML/1998/namespace",
92
- }
93
-
94
def __init__(self, unpacked_dir, original_file=None, verbose=False):
    """Record the validation target and collect the XML parts to check.

    Args:
        unpacked_dir: Directory holding the unpacked document; stored as a
            resolved ``Path``.
        original_file: Optional path of the source document; stored as a
            ``Path``, or ``None`` when a falsy value is given.
        verbose: Stored on the instance; the validation methods consult it
            before printing their "PASSED" messages.
    """
    resolved_root = Path(unpacked_dir).resolve()
    self.unpacked_dir = resolved_root

    if original_file:
        self.original_file = Path(original_file)
    else:
        self.original_file = None
    self.verbose = verbose

    # Schemas live in a "schemas" directory one level above this module's
    # own directory.
    module_dir = Path(__file__).parent
    self.schemas_dir = module_dir.parent / "schemas"

    # All "*.xml" matches come first, followed by all "*.rels" matches.
    discovered = []
    for glob_pattern in ("*.xml", "*.rels"):
        discovered.extend(resolved_root.rglob(glob_pattern))
    self.xml_files = discovered

    if len(discovered) == 0:
        print(f"Warning: No XML files found in {self.unpacked_dir}")
108
-
109
def validate(self):
    """Run the validator's checks.

    This base implementation has no checks of its own.

    Raises:
        NotImplementedError: Always.
    """
    reason = "Subclasses must implement the validate method"
    raise NotImplementedError(reason)
111
-
112
def repair(self) -> int:
    """Apply the available repairs and return how many were made.

    Currently this delegates entirely to
    ``repair_whitespace_preservation``.
    """
    repaired_count = self.repair_whitespace_preservation()
    return repaired_count
114
-
115
def repair_whitespace_preservation(self) -> int:
    """Add ``xml:space="preserve"`` to text elements that need it.

    For every file in ``self.xml_files``, each element whose tag name ends
    with ``:t`` is inspected. If the value of its first child starts or
    ends with a space or tab and the element does not already carry
    ``xml:space="preserve"``, the attribute is added. A file is rewritten
    (UTF-8) only when at least one element in it was changed.

    The repair is best-effort: a file that cannot be read, parsed, or
    written is skipped rather than aborting the run. Such skips used to be
    completely silent; they are now reported when ``self.verbose`` is set
    so that an unrepaired file can be told apart from a clean one.

    Returns:
        The number of elements that received the attribute.
    """
    repairs = 0
    edge_whitespace = (" ", "\t")

    for xml_file in self.xml_files:
        try:
            content = xml_file.read_text(encoding="utf-8")
            dom = defusedxml.minidom.parseString(content)
            modified = False

            for elem in dom.getElementsByTagName("*"):
                if not (elem.tagName.endswith(":t") and elem.firstChild):
                    continue

                text = elem.firstChild.nodeValue
                if not text:
                    continue
                if not (
                    text.startswith(edge_whitespace)
                    or text.endswith(edge_whitespace)
                ):
                    continue
                if elem.getAttribute("xml:space") == "preserve":
                    continue

                elem.setAttribute("xml:space", "preserve")
                # Long values are truncated to 30 characters in the log line.
                if len(text) > 30:
                    text_preview = repr(text[:30]) + "..."
                else:
                    text_preview = repr(text)
                print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}")
                repairs += 1
                modified = True

            if modified:
                xml_file.write_bytes(dom.toxml(encoding="UTF-8"))

        except Exception as exc:
            # Deliberately broad: one bad file must not stop the remaining
            # repairs. Surface the skip instead of hiding it.
            if self.verbose:
                print(f" Skipped: {xml_file.name}: whitespace repair failed: {exc}")

    return repairs
142
-
143
def validate_xml(self):
    """Check that every collected XML file can be parsed.

    Each file in ``self.xml_files`` is parsed with ``lxml.etree.parse``.
    Syntax errors are recorded with their line number and message; any
    other exception is recorded as an unexpected error.

    Returns:
        ``True`` when no problems were recorded, ``False`` otherwise. On
        failure the problems are printed; on success a confirmation is
        printed only when ``self.verbose`` is set.
    """
    problems = []

    for candidate in self.xml_files:
        try:
            lxml.etree.parse(str(candidate))
        except lxml.etree.XMLSyntaxError as syntax_err:
            shown_path = candidate.relative_to(self.unpacked_dir)
            problems.append(
                f" {shown_path}: Line {syntax_err.lineno}: {syntax_err.msg}"
            )
        except Exception as other_err:
            shown_path = candidate.relative_to(self.unpacked_dir)
            problems.append(
                f" {shown_path}: Unexpected error: {str(other_err)}"
            )

    if not problems:
        if self.verbose:
            print("PASSED - All XML files are well-formed")
        return True

    print(f"FAILED - Found {len(problems)} XML violations:")
    for line in problems:
        print(line)
    return False
169
-
170
def validate_namespaces(self):
    """Check that prefixes listed in ``Ignorable`` attributes are declared.

    For each file in ``self.xml_files`` the root element is examined: every
    root attribute whose name ends with ``Ignorable`` is split on
    whitespace, and each resulting token must be a prefix present in the
    root's namespace map. Files that fail with an XML syntax error are
    skipped here.

    Returns:
        ``True`` when no undeclared prefixes were found, ``False``
        otherwise (after printing each issue).
    """
    issues = []

    for doc_path in self.xml_files:
        try:
            top = lxml.etree.parse(str(doc_path)).getroot()
            # The default namespace is keyed by None and is not a prefix.
            known_prefixes = set(top.nsmap.keys()) - {None}

            for attr_key, attr_text in top.attrib.items():
                if not attr_key.endswith("Ignorable"):
                    continue
                missing = set(attr_text.split()) - known_prefixes
                for ns in missing:
                    issues.append(
                        f" {doc_path.relative_to(self.unpacked_dir)}: "
                        f"Namespace '{ns}' in Ignorable but not declared"
                    )
        except lxml.etree.XMLSyntaxError:
            continue

    if issues:
        print(f"FAILED - {len(issues)} namespace issues:")
        for line in issues:
            print(line)
        return False

    if self.verbose:
        print("PASSED - All namespace prefixes properly declared")
    return True
198
-
199
def validate_unique_ids(self):
    """Check ID uniqueness for the element types in UNIQUE_ID_REQUIREMENTS.

    Every file in ``self.xml_files`` is parsed, its ``mc:AlternateContent``
    elements are removed from the parsed tree, and the remaining elements
    are scanned. For an element whose lowercased local tag is a key of
    ``UNIQUE_ID_REQUIREMENTS`` (and which has no ancestor listed in
    ``EXCLUDED_ID_CONTAINERS``), the configured attribute is looked up by
    lowercased local name. Values with "global" scope must be unique across
    all files; values with "file" scope must be unique per (tag, attribute)
    within one file.

    Returns:
        ``True`` when no violations or processing errors were recorded,
        ``False`` otherwise (after printing them).
    """

    def _local_name(qualified):
        # Strip a "{namespace}" prefix if present, then lowercase.
        return qualified.split("}")[-1].lower()

    violations = []
    seen_globally = {}

    for doc_path in self.xml_files:
        try:
            tree_root = lxml.etree.parse(str(doc_path)).getroot()
            seen_in_file = {}

            alternates = tree_root.xpath(
                ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
            )
            for alternate in alternates:
                alternate.getparent().remove(alternate)

            for node in tree_root.iter():
                # Skip nodes without a string tag (e.g. comments).
                if not hasattr(node, "tag") or callable(node.tag):
                    continue

                tag = _local_name(node.tag)
                if tag not in self.UNIQUE_ID_REQUIREMENTS:
                    continue

                inside_excluded = any(
                    _local_name(parent.tag) in self.EXCLUDED_ID_CONTAINERS
                    for parent in node.iterancestors()
                )
                if inside_excluded:
                    continue

                attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                # First attribute whose local name matches wins.
                id_value = next(
                    (
                        raw_value
                        for raw_key, raw_value in node.attrib.items()
                        if _local_name(raw_key) == attr_name
                    ),
                    None,
                )
                if id_value is None:
                    continue

                if scope == "global":
                    if id_value not in seen_globally:
                        seen_globally[id_value] = (
                            doc_path.relative_to(self.unpacked_dir),
                            node.sourceline,
                            tag,
                        )
                        continue
                    prev_file, prev_line, prev_tag = seen_globally[id_value]
                    violations.append(
                        f" {doc_path.relative_to(self.unpacked_dir)}: "
                        f"Line {node.sourceline}: Global ID '{id_value}' in <{tag}> "
                        f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                    )
                elif scope == "file":
                    bucket = seen_in_file.setdefault((tag, attr_name), {})
                    if id_value not in bucket:
                        bucket[id_value] = node.sourceline
                        continue
                    prev_line = bucket[id_value]
                    violations.append(
                        f" {doc_path.relative_to(self.unpacked_dir)}: "
                        f"Line {node.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                        f"(first occurrence at line {prev_line})"
                    )

        except Exception as e:
            # Parse errors and anything else are reported per file.
            violations.append(
                f" {doc_path.relative_to(self.unpacked_dir)}: Error: {e}"
            )

    if not violations:
        if self.verbose:
            print("PASSED - All required IDs are unique")
        return True

    print(f"FAILED - Found {len(violations)} ID uniqueness violations:")
    for line in violations:
        print(line)
    return False
290
-
291
def validate_file_references(self):
    """Cross-check ``.rels`` relationship targets against files on disk.

    Two kinds of problems are collected:

    * broken references - a ``Relationship`` target (other than ones
      starting with ``http`` or ``mailto:``) that does not resolve to an
      existing file;
    * unreferenced files - any file under ``self.unpacked_dir``, other than
      ``[Content_Types].xml`` and ``.rels`` files, that no relationship
      points at.

    Targets starting with ``/`` are resolved against ``self.unpacked_dir``,
    as are all targets of a file named exactly ``.rels``; other targets are
    resolved against the parent of the directory containing the ``.rels``
    file.

    Returns:
        ``True`` when there are no ``.rels`` files or no problems were
        found, ``False`` otherwise (after printing the problems).
    """
    errors = []

    rels_files = list(self.unpacked_dir.rglob("*.rels"))
    if not rels_files:
        if self.verbose:
            print("PASSED - No .rels files found")
        return True

    # Every file that is expected to be the target of some relationship.
    all_files = [
        entry.resolve()
        for entry in self.unpacked_dir.rglob("*")
        if entry.is_file()
        and entry.name != "[Content_Types].xml"
        and not entry.name.endswith(".rels")
    ]
    all_referenced_files = set()

    if self.verbose:
        print(
            f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
        )

    rel_namespaces = {"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}

    for rels_file in rels_files:
        try:
            rels_root = lxml.etree.parse(str(rels_file)).getroot()
            rels_dir = rels_file.parent
            broken_refs = []

            for rel in rels_root.findall(
                ".//ns:Relationship", namespaces=rel_namespaces
            ):
                target = rel.get("Target")
                if not target or target.startswith(("http", "mailto:")):
                    continue

                if target.startswith("/"):
                    target_path = self.unpacked_dir / target.lstrip("/")
                elif rels_file.name == ".rels":
                    target_path = self.unpacked_dir / target
                else:
                    target_path = rels_dir.parent / target

                try:
                    target_path = target_path.resolve()
                    if target_path.exists() and target_path.is_file():
                        all_referenced_files.add(target_path)
                    else:
                        broken_refs.append((target, rel.sourceline))
                except (OSError, ValueError):
                    broken_refs.append((target, rel.sourceline))

            if broken_refs:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.extend(
                    f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                    for broken_ref, line_num in broken_refs
                )

        except Exception as e:
            rel_path = rels_file.relative_to(self.unpacked_dir)
            errors.append(f" Error parsing {rel_path}: {e}")

    unreferenced_files = set(all_files) - all_referenced_files
    for unref_file in sorted(unreferenced_files):
        unref_rel_path = unref_file.relative_to(self.unpacked_dir)
        errors.append(f" Unreferenced file: {unref_rel_path}")

    if not errors:
        if self.verbose:
            print(
                "PASSED - All references are valid and all files are properly referenced"
            )
        return True

    print(f"FAILED - Found {len(errors)} relationship validation errors:")
    for error in errors:
        print(error)
    print(
        "CRITICAL: These errors will cause the document to appear corrupt. "
        "Broken references MUST be fixed, "
        "and unreferenced files MUST be referenced or removed."
    )
    return False
386
-
387
def validate_all_relationship_ids(self):
    """Cross-check relationship IDs used in XML parts against their .rels files.

    For every non-.rels entry of ``self.xml_files`` that has a sibling
    ``_rels/<name>.rels`` file, the relationship ``Id`` values declared there
    are collected (duplicates are reported), and each ``id``/``embed``/``link``
    attribute in the office-relationships namespace must refer to one of them.
    For ``id`` attributes, when ``ELEMENT_RELATIONSHIP_TYPES`` is non-empty,
    the last path segment of the relationship type must also contain the type
    expected for the referencing element.

    Returns:
        True when nothing was reported; False otherwise (findings are printed).
    """
    import lxml.etree

    errors = []

    for xml_file in self.xml_files:
        if xml_file.suffix == ".rels":
            continue

        rels_file = xml_file.parent / "_rels" / f"{xml_file.name}.rels"
        if not rels_file.exists():
            continue

        try:
            rels_root = lxml.etree.parse(str(rels_file)).getroot()

            # Relationship Id -> last path segment of its Type URI.
            rid_to_type = {}
            relationship_tag = (
                f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
            )
            for rel in rels_root.findall(relationship_tag):
                rid = rel.get("Id")
                if not rid:
                    continue
                if rid in rid_to_type:
                    rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                    errors.append(
                        f" {rels_rel_path}: Line {rel.sourceline}: "
                        f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                    )
                # A later duplicate overwrites the earlier entry.
                rid_to_type[rid] = rel.get("Type", "").rsplit("/", 1)[-1]

            xml_root = lxml.etree.parse(str(xml_file)).getroot()
            r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE

            for elem in xml_root.iter():
                # Skip nodes whose tag is missing or callable rather than a string.
                if not hasattr(elem, "tag") or callable(elem.tag):
                    continue

                for attr_name in ("id", "embed", "link"):
                    rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
                    if not rid_attr:
                        continue

                    xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                    elem_name = elem.tag.rsplit("}", 1)[-1]

                    if rid_attr not in rid_to_type:
                        shown_ids = ", ".join(sorted(rid_to_type)[:5])
                        ellipsis = "..." if len(rid_to_type) > 5 else ""
                        errors.append(
                            f" {xml_rel_path}: Line {elem.sourceline}: "
                            f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
                            f"(valid IDs: {shown_ids}{ellipsis})"
                        )
                        continue

                    if attr_name != "id" or not self.ELEMENT_RELATIONSHIP_TYPES:
                        continue

                    expected_type = self._get_expected_relationship_type(elem_name)
                    if not expected_type:
                        continue

                    actual_type = rid_to_type[rid_attr]
                    if expected_type not in actual_type.lower():
                        errors.append(
                            f" {xml_rel_path}: Line {elem.sourceline}: "
                            f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                            f"but should point to a '{expected_type}' relationship"
                        )

        except Exception as e:
            # Any failure for one part is recorded and the scan moves on.
            xml_rel_path = xml_file.relative_to(self.unpacked_dir)
            errors.append(f" Error processing {xml_rel_path}: {e}")

    if not errors:
        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
    for error in errors:
        print(error)
    print("\nThese ID mismatches will cause the document to appear corrupt!")
    return False
472
-
473
def _get_expected_relationship_type(self, element_name):
    """Return the relationship type expected for *element_name*, or None.

    Lookup is case-insensitive. An explicit entry in
    ``ELEMENT_RELATIONSHIP_TYPES`` wins; otherwise the type is derived from
    the name by dropping a trailing ``id`` (with ``sld`` mapped to ``slide``)
    or a trailing ``reference``. Names consisting only of the suffix yield None.
    """
    name = element_name.lower()

    explicit = self.ELEMENT_RELATIONSHIP_TYPES
    if name in explicit:
        return explicit[name]

    if len(name) > 2 and name.endswith("id"):
        stem = name[:-2]
        # "sld" is the only stem that is expanded; all others are used as-is.
        return "slide" if stem == "sld" else stem

    if len(name) > 9 and name.endswith("reference"):
        return name[:-9]

    return None
495
-
496
def validate_content_types(self):
    """Check that parts and media files are declared in [Content_Types].xml.

    Two checks are made against the declarations found in the file:
    XML parts whose root element is one of a fixed set of names must have an
    ``Override`` entry for their path, and non-XML files with a known media
    extension must have a ``Default`` entry for that extension.

    Returns:
        True when nothing was reported; False otherwise (findings are
        printed), including when [Content_Types].xml does not exist.
    """
    content_types_file = self.unpacked_dir / "[Content_Types].xml"
    if not content_types_file.exists():
        print("FAILED - [Content_Types].xml file not found")
        return False

    errors = []

    # Root element names that need an explicit Override declaration.
    declarable_roots = {
        "sld",
        "sldLayout",
        "sldMaster",
        "presentation",
        "document",
        "workbook",
        "worksheet",
        "theme",
    }
    # Media extension -> content type suggested in the error message.
    media_extensions = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "tiff": "image/tiff",
        "wmf": "image/x-wmf",
        "emf": "image/x-emf",
    }
    skipped_fragments = [".rels", "[Content_Types]", "docProps/", "_rels/"]

    try:
        root = lxml.etree.parse(str(content_types_file)).getroot()
        ns = self.CONTENT_TYPES_NAMESPACE

        part_names = (o.get("PartName") for o in root.findall(f".//{{{ns}}}Override"))
        declared_parts = {p.lstrip("/") for p in part_names if p is not None}

        extensions = (d.get("Extension") for d in root.findall(f".//{{{ns}}}Default"))
        declared_extensions = {e.lower() for e in extensions if e is not None}

        all_files = [f for f in self.unpacked_dir.rglob("*") if f.is_file()]

        for xml_file in self.xml_files:
            path_str = str(xml_file.relative_to(self.unpacked_dir)).replace("\\", "/")
            if any(fragment in path_str for fragment in skipped_fragments):
                continue

            try:
                root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
            except Exception:
                # Parts that cannot be parsed are silently skipped by this check.
                continue

            root_name = root_tag.rsplit("}", 1)[-1]
            if root_name in declarable_roots and path_str not in declared_parts:
                errors.append(
                    f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                )

        for file_path in all_files:
            if file_path.suffix.lower() in {".xml", ".rels"}:
                continue
            if file_path.name == "[Content_Types].xml":
                continue
            if "_rels" in file_path.parts or "docProps" in file_path.parts:
                continue

            extension = file_path.suffix.lstrip(".").lower()
            if extension in media_extensions and extension not in declared_extensions:
                relative_path = file_path.relative_to(self.unpacked_dir)
                errors.append(
                    f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                )

    except Exception as e:
        errors.append(f" Error parsing [Content_Types].xml: {e}")

    if not errors:
        if self.verbose:
            print(
                "PASSED - All content files are properly declared in [Content_Types].xml"
            )
        return True

    print(f"FAILED - Found {len(errors)} content type declaration errors:")
    for error in errors:
        print(error)
    return False
601
-
602
def validate_file_against_xsd(self, xml_file, verbose=False):
    """Validate one file and report only errors not present in the original.

    Args:
        xml_file: Path (or path string) of the file to validate.
        verbose: When true, print a PASSED/FAILED summary for this file.

    Returns:
        A ``(is_valid, new_errors)`` tuple. ``is_valid`` is None when
        ``_validate_single_file_xsd`` reports no result for the file, True
        when there are no new errors, and False otherwise. ``new_errors`` is
        the set of error messages that are neither returned by
        ``_get_original_file_errors`` nor matched by a substring in
        ``IGNORED_VALIDATION_ERRORS``; it is empty unless ``is_valid`` is False.
    """
    xml_file = Path(xml_file).resolve()
    unpacked_dir = self.unpacked_dir.resolve()

    is_valid, current_errors = self._validate_single_file_xsd(
        xml_file, unpacked_dir
    )

    if is_valid is None:
        return None, set()
    if is_valid:
        return True, set()

    original_errors = self._get_original_file_errors(xml_file)

    # Explicit fallback instead of an assert, which is stripped under -O.
    current_errors = current_errors or set()

    new_errors = {
        error
        for error in current_errors - original_errors
        if not any(pattern in error for pattern in self.IGNORED_VALIDATION_ERRORS)
    }

    if new_errors:
        if verbose:
            relative_path = xml_file.relative_to(unpacked_dir)
            print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
            # Sorted so the three displayed samples are stable between runs.
            for error in sorted(new_errors)[:3]:
                truncated = error[:250] + "..." if len(error) > 250 else error
                print(f" - {truncated}")
        return False, new_errors

    if verbose:
        # NOTE(review): the count printed is of the current file's errors.
        print(
            f"PASSED - No new errors (original had {len(current_errors)} errors)"
        )
    return True, set()
639
-
640
def validate_against_xsd(self):
    """Validate every file in ``self.xml_files`` and report new XSD errors.

    Each file is checked with ``validate_file_against_xsd``. Files with no
    result are counted as skipped, passing files as valid, and failing files
    contribute a headline plus up to three sample messages to the report.
    A summary is printed when ``self.verbose`` is set.

    Returns:
        True when no file has new errors; False otherwise.
    """
    new_errors = []
    # Counted directly rather than recovered from the formatted report lines.
    files_with_new_errors = 0
    original_error_count = 0
    valid_count = 0
    skipped_count = 0

    for xml_file in self.xml_files:
        relative_path = str(xml_file.relative_to(self.unpacked_dir))
        is_valid, new_file_errors = self.validate_file_against_xsd(
            xml_file, verbose=False
        )

        if is_valid is None:
            skipped_count += 1
            continue

        if is_valid:
            valid_count += 1
            if new_file_errors:
                original_error_count += 1
            continue

        files_with_new_errors += 1
        new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
        # Sorted so the sampled messages are the same on every run.
        for error in sorted(new_file_errors)[:3]:
            new_errors.append(
                f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
            )

    if self.verbose:
        print(f"Validated {len(self.xml_files)} files:")
        print(f" - Valid: {valid_count}")
        print(f" - Skipped (no schema): {skipped_count}")
        if original_error_count:
            print(f" - With original errors (ignored): {original_error_count}")
        print(f" - With NEW errors: {files_with_new_errors}")

    if new_errors:
        print("\nFAILED - Found NEW validation errors:")
        for error in new_errors:
            print(error)
        return False

    if self.verbose:
        print("\nPASSED - No new XSD validation errors introduced")
    return True
688
-
689
def _get_schema_path(self, xml_file):
    """Return the schema file to validate *xml_file* against, or None.

    Rules are tried in order: an exact file-name entry in ``SCHEMA_MAPPINGS``,
    the ``.rels`` suffix, chart files under a ``charts/`` directory, theme
    files under a ``theme/`` directory, and finally the parent folder name
    when it is listed in ``MAIN_CONTENT_FOLDERS``.

    Args:
        xml_file: A path object for the file being validated.

    Returns:
        ``self.schemas_dir`` joined with the mapped schema name, or None when
        no rule matches.
    """
    mappings = self.SCHEMA_MAPPINGS
    file_name = xml_file.name

    if file_name in mappings:
        return self.schemas_dir / mappings[file_name]

    if xml_file.suffix == ".rels":
        return self.schemas_dir / mappings[".rels"]

    # Forward slashes regardless of platform, so the directory checks below
    # also match Windows paths, whose str() form uses backslashes.
    posix_path = xml_file.as_posix()

    if "charts/" in posix_path and file_name.startswith("chart"):
        return self.schemas_dir / mappings["chart"]

    if "theme/" in posix_path and file_name.startswith("theme"):
        return self.schemas_dir / mappings["theme"]

    parent_name = xml_file.parent.name
    if parent_name in self.MAIN_CONTENT_FOLDERS:
        return self.schemas_dir / mappings[parent_name]

    return None
706
-
707
def _clean_ignorable_namespaces(self, xml_doc):
    """Return a copy of *xml_doc* without attributes/elements in foreign namespaces.

    The document is copied by serialising and re-parsing it, so the input is
    left untouched. Namespaced attributes whose namespace is not in
    ``OOXML_NAMESPACES`` are deleted, then ``_remove_ignorable_elements``
    is applied to the copy.

    Returns:
        A new ``lxml.etree.ElementTree`` wrapping the cleaned copy.
    """
    serialized = lxml.etree.tostring(xml_doc, encoding="unicode")
    root_copy = lxml.etree.fromstring(serialized)

    for node in root_copy.iter():
        # Collect first; deleting while iterating the attribute map is avoided.
        foreign_attrs = [
            name
            for name in node.attrib
            if "{" in name and name.split("}")[0][1:] not in self.OOXML_NAMESPACES
        ]
        for name in foreign_attrs:
            del node.attrib[name]

    self._remove_ignorable_elements(root_copy)

    return lxml.etree.ElementTree(root_copy)
726
-
727
def _remove_ignorable_elements(self, root):
    """Remove, in place, descendants of *root* in non-OOXML namespaces.

    A child whose tag carries a namespace not listed in ``OOXML_NAMESPACES``
    is removed together with its subtree. Every other child with a string
    tag is processed recursively. Children without a usable tag are left
    alone.
    """
    doomed = []

    for child in list(root):
        if not hasattr(child, "tag") or callable(child.tag):
            continue

        tag_name = str(child.tag)
        is_foreign = (
            tag_name.startswith("{")
            and tag_name.split("}")[0][1:] not in self.OOXML_NAMESPACES
        )
        if is_foreign:
            doomed.append(child)
        else:
            self._remove_ignorable_elements(child)

    for child in doomed:
        root.remove(child)
745
-
746
def _preprocess_for_mc_ignorable(self, xml_doc):
    """Drop the ``Ignorable`` attribute (MC namespace) from the root, in place.

    Returns:
        The same *xml_doc* object that was passed in.
    """
    ignorable_attr = f"{{{self.MC_NAMESPACE}}}Ignorable"
    # pop() with a default is a no-op when the attribute is absent.
    xml_doc.getroot().attrib.pop(ignorable_attr, None)
    return xml_doc
753
-
754
def _validate_single_file_xsd(self, xml_file, base_path):
    """Validate one XML file against the schema chosen by ``_get_schema_path``.

    Before validation, template tags are removed from the document, the
    root's ``Ignorable`` attribute is dropped, and (for files whose first path
    component under *base_path* is in ``MAIN_CONTENT_FOLDERS``) attributes
    and elements in foreign namespaces are removed.

    Args:
        xml_file: Path of the file to validate.
        base_path: Directory that *xml_file* is made relative to.

    Returns:
        ``(None, None)`` when no schema applies; ``(True, set())`` when the
        file validates; otherwise ``(False, messages)`` where ``messages`` is
        a set of error strings. Any exception raised along the way is
        reported as ``(False, {str(exc)})``.
    """
    schema_path = self._get_schema_path(xml_file)
    if not schema_path:
        return None, None

    try:
        with open(schema_path, "rb") as xsd_file:
            parser = lxml.etree.XMLParser()
            xsd_doc = lxml.etree.parse(
                xsd_file, parser=parser, base_url=str(schema_path)
            )
            schema = lxml.etree.XMLSchema(xsd_doc)

        # Binary mode, like the schema above: the parser decodes the bytes
        # itself instead of receiving text pre-decoded with the locale
        # default encoding.
        with open(xml_file, "rb") as f:
            xml_doc = lxml.etree.parse(f)

        xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
        xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

        relative_path = xml_file.relative_to(base_path)
        if (
            relative_path.parts
            and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
        ):
            xml_doc = self._clean_ignorable_namespaces(xml_doc)

        if schema.validate(xml_doc):
            return True, set()

        return False, {error.message for error in schema.error_log}

    except Exception as e:
        # Unreadable files, bad schemas and parse failures all count as invalid.
        return False, {str(e)}
790
-
791
def _get_original_file_errors(self, xml_file):
    """Return the XSD errors the same part has in the original archive.

    The part's path relative to ``self.unpacked_dir`` is looked up in the zip
    at ``self.original_file``, extracted to a temporary directory and
    validated with ``_validate_single_file_xsd``.

    Args:
        xml_file: Path of the part inside the unpacked directory.

    Returns:
        The set of error messages for the original part, or an empty set
        when there is no original file, the part is not in the archive, or
        validation produced no errors.
    """
    if self.original_file is None:
        return set()

    import tempfile
    import zipfile

    xml_file = Path(xml_file).resolve()
    unpacked_dir = self.unpacked_dir.resolve()
    relative_path = xml_file.relative_to(unpacked_dir)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        with zipfile.ZipFile(self.original_file, "r") as zip_ref:
            try:
                # Only this one member is read afterwards, so the rest of the
                # archive is not unpacked. Zip member names use "/".
                zip_ref.extract(relative_path.as_posix(), temp_path)
            except KeyError:
                # The part does not exist in the original archive.
                return set()

        original_xml_file = temp_path / relative_path
        if not original_xml_file.exists():
            return set()

        _, errors = self._validate_single_file_xsd(original_xml_file, temp_path)
        return errors if errors else set()
817
-
818
def _remove_template_tags_from_text_nodes(self, xml_doc):
    """Strip ``{{...}}`` placeholders from a copy of *xml_doc*.

    The document is copied by serialising and re-parsing it. For every
    element except those whose tag is ``t`` (with or without a namespace),
    placeholders are removed from both ``text`` and ``tail``, and one warning
    is recorded per placeholder found.

    Returns:
        A ``(tree, warnings)`` tuple: a new ``lxml.etree.ElementTree`` for
        the processed copy and the list of warning strings.
    """
    placeholder = re.compile(r"\{\{[^}]*\}\}")
    warnings = []

    serialized = lxml.etree.tostring(xml_doc, encoding="unicode")
    root_copy = lxml.etree.fromstring(serialized)

    def process_text_content(text, content_type):
        # Empty/None values and text without placeholders pass through as-is.
        if not text:
            return text
        found = placeholder.findall(text)
        if not found:
            return text
        warnings.extend(
            f"Found template tag in {content_type}: {tag}" for tag in found
        )
        return placeholder.sub("", text)

    for node in root_copy.iter():
        if not hasattr(node, "tag") or callable(node.tag):
            continue

        tag_name = str(node.tag)
        if tag_name == "t" or tag_name.endswith("}t"):
            continue

        node.text = process_text_content(node.text, "text content")
        node.tail = process_text_content(node.tail, "tail content")

    return lxml.etree.ElementTree(root_copy), warnings
848
-
849
-
850
# This module is import-only: executing it as a script fails immediately.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")