@panda-agent/panda-cli 0.1.29 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/pandacli.mjs +6 -1
- package/bundled-preset-skills/.gitkeep +0 -0
- package/bundled-preset-skills/README.md +17 -0
- package/bundled-preset-skills/docx/.skill-metadata.yaml +173 -0
- package/bundled-preset-skills/docx/LICENSE.txt +30 -0
- package/bundled-preset-skills/docx/SKILL.md +589 -0
- package/bundled-preset-skills/docx/scripts/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/accept_changes.py +206 -0
- package/bundled-preset-skills/docx/scripts/comment.py +442 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/merge_runs.py +190 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
- package/bundled-preset-skills/docx/scripts/office/pack.py +167 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/docx/scripts/office/soffice.py +194 -0
- package/bundled-preset-skills/docx/scripts/office/unpack.py +145 -0
- package/bundled-preset-skills/docx/scripts/office/validate.py +114 -0
- package/bundled-preset-skills/docx/scripts/office/validators/__init__.py +16 -0
- package/bundled-preset-skills/docx/scripts/office/validators/base.py +733 -0
- package/bundled-preset-skills/docx/scripts/office/validators/docx.py +354 -0
- package/bundled-preset-skills/docx/scripts/office/validators/pptx.py +230 -0
- package/bundled-preset-skills/docx/scripts/office/validators/redlining.py +212 -0
- package/bundled-preset-skills/docx/scripts/templates/comments.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/people.xml +3 -0
- package/bundled-preset-skills/frontend-design/LICENSE.txt +177 -0
- package/bundled-preset-skills/frontend-design/SKILL.md +42 -0
- package/bundled-preset-skills/pdf/.skill-metadata.yaml +273 -0
- package/bundled-preset-skills/pdf/LICENSE.txt +30 -0
- package/bundled-preset-skills/pdf/SKILL.md +324 -0
- package/bundled-preset-skills/pdf/advanced-reference.md +609 -0
- package/bundled-preset-skills/pdf/form-filling-guide.md +318 -0
- package/bundled-preset-skills/pdf/forms.md +294 -0
- package/bundled-preset-skills/pdf/reference.md +612 -0
- package/bundled-preset-skills/pdf/scripts/check_bounding_boxes.py +198 -0
- package/bundled-preset-skills/pdf/scripts/check_fillable_fields.py +64 -0
- package/bundled-preset-skills/pdf/scripts/convert_pdf_to_images.py +102 -0
- package/bundled-preset-skills/pdf/scripts/create_validation_image.py +125 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_field_info.py +220 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_structure.py +202 -0
- package/bundled-preset-skills/pdf/scripts/fill_fillable_fields.py +205 -0
- package/bundled-preset-skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
- package/bundled-preset-skills/pptx-generator/SKILL.md +204 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/business.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/minimal.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/modern.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
- package/bundled-preset-skills/pptx-generator/references/collaboration_guide.md +381 -0
- package/bundled-preset-skills/pptx-generator/references/json_format_spec.md +215 -0
- package/bundled-preset-skills/pptx-generator/references/layout_guide.md +290 -0
- package/bundled-preset-skills/pptx-generator/scripts/json_validator.py +194 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_builder.py +340 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_validator.py +162 -0
- package/bundled-preset-skills/skill-creator/LICENSE.txt +202 -0
- package/bundled-preset-skills/skill-creator/SKILL.md +479 -0
- package/bundled-preset-skills/skill-creator/agents/analyzer.md +274 -0
- package/bundled-preset-skills/skill-creator/agents/comparator.md +202 -0
- package/bundled-preset-skills/skill-creator/agents/grader.md +223 -0
- package/bundled-preset-skills/skill-creator/assets/eval_review.html +146 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/bundled-preset-skills/skill-creator/references/schemas.md +430 -0
- package/bundled-preset-skills/skill-creator/scripts/__init__.py +0 -0
- package/bundled-preset-skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/bundled-preset-skills/skill-creator/scripts/generate_report.py +326 -0
- package/bundled-preset-skills/skill-creator/scripts/improve_description.py +248 -0
- package/bundled-preset-skills/skill-creator/scripts/package_skill.py +136 -0
- package/bundled-preset-skills/skill-creator/scripts/quick_validate.py +103 -0
- package/bundled-preset-skills/skill-creator/scripts/run_eval.py +310 -0
- package/bundled-preset-skills/skill-creator/scripts/run_loop.py +332 -0
- package/bundled-preset-skills/skill-creator/scripts/utils.py +47 -0
- package/bundled-preset-skills/xlsx/.skill-metadata.yaml +185 -0
- package/bundled-preset-skills/xlsx/LICENSE.txt +30 -0
- package/bundled-preset-skills/xlsx/SKILL.md +233 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
- package/bundled-preset-skills/xlsx/scripts/office/pack.py +162 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/xlsx/scripts/office/soffice.py +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/unpack.py +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/validate.py +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/__init__.py +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/base.py +800 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/docx.py +383 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/pptx.py +250 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/redlining.py +229 -0
- package/bundled-preset-skills/xlsx/scripts/recalc.py +296 -0
- package/dist/panda-cli-ink.bundle.mjs +276 -342
- package/package.json +6 -4
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Foundation class providing shared validation primitives for Office XML packages.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import pathlib
|
|
7
|
+
import tempfile
|
|
8
|
+
import zipfile
|
|
9
|
+
|
|
10
|
+
import defusedxml.minidom
|
|
11
|
+
import lxml.etree
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseSchemaValidator:
|
|
15
|
+
"""Abstract base that concrete validators (DOCX, PPTX …) inherit from."""
|
|
16
|
+
|
|
17
|
+
# Errors matching any of these substrings are silently suppressed.
|
|
18
|
+
IGNORED_VALIDATION_ERRORS = [
|
|
19
|
+
"hyphenationZone",
|
|
20
|
+
"purl.org/dc/terms",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
# Mapping: element local-name → (id-attribute, scope)
|
|
24
|
+
# scope = "file" → unique within the same XML file
|
|
25
|
+
# scope = "global" → unique across the entire package
|
|
26
|
+
UNIQUE_ID_REQUIREMENTS = {
|
|
27
|
+
"comment": ("id", "file"),
|
|
28
|
+
"commentrangestart": ("id", "file"),
|
|
29
|
+
"commentrangeend": ("id", "file"),
|
|
30
|
+
"bookmarkstart": ("id", "file"),
|
|
31
|
+
"bookmarkend": ("id", "file"),
|
|
32
|
+
"sldid": ("id", "file"),
|
|
33
|
+
"sldmasterid": ("id", "global"),
|
|
34
|
+
"sldlayoutid": ("id", "global"),
|
|
35
|
+
"cm": ("authorid", "file"),
|
|
36
|
+
"sheet": ("sheetid", "file"),
|
|
37
|
+
"definedname": ("id", "file"),
|
|
38
|
+
"cxnsp": ("id", "file"),
|
|
39
|
+
"sp": ("id", "file"),
|
|
40
|
+
"pic": ("id", "file"),
|
|
41
|
+
"grpsp": ("id", "file"),
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
EXCLUDED_ID_CONTAINERS = {"sectionlst"}
|
|
45
|
+
|
|
46
|
+
ELEMENT_RELATIONSHIP_TYPES = {}
|
|
47
|
+
|
|
48
|
+
SCHEMA_MAPPINGS = {
|
|
49
|
+
"word": "ISO-IEC29500-4_2016/wml.xsd",
|
|
50
|
+
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
|
|
51
|
+
"xl": "ISO-IEC29500-4_2016/sml.xsd",
|
|
52
|
+
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
|
|
53
|
+
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
|
|
54
|
+
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
|
|
55
|
+
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
|
|
56
|
+
".rels": "ecma/fouth-edition/opc-relationships.xsd",
|
|
57
|
+
"people.xml": "microsoft/wml-2012.xsd",
|
|
58
|
+
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
|
|
59
|
+
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
|
|
60
|
+
"commentsExtended.xml": "microsoft/wml-2012.xsd",
|
|
61
|
+
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
|
|
62
|
+
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
63
|
+
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
|
|
67
|
+
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
|
68
|
+
|
|
69
|
+
PACKAGE_RELATIONSHIPS_NAMESPACE = (
|
|
70
|
+
"http://schemas.openxmlformats.org/package/2006/relationships"
|
|
71
|
+
)
|
|
72
|
+
OFFICE_RELATIONSHIPS_NAMESPACE = (
|
|
73
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
74
|
+
)
|
|
75
|
+
CONTENT_TYPES_NAMESPACE = (
|
|
76
|
+
"http://schemas.openxmlformats.org/package/2006/content-types"
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
|
|
80
|
+
|
|
81
|
+
OOXML_NAMESPACES = {
|
|
82
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/math",
|
|
83
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
84
|
+
"http://schemas.openxmlformats.org/schemaLibrary/2006/main",
|
|
85
|
+
"http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
86
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chart",
|
|
87
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
|
|
88
|
+
"http://schemas.openxmlformats.org/drawingml/2006/diagram",
|
|
89
|
+
"http://schemas.openxmlformats.org/drawingml/2006/picture",
|
|
90
|
+
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
|
|
91
|
+
"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
|
92
|
+
"http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
93
|
+
"http://schemas.openxmlformats.org/presentationml/2006/main",
|
|
94
|
+
"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
|
|
95
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
|
|
96
|
+
"http://www.w3.org/XML/1998/namespace",
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
# ── Initialisation ───────────────────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
def __init__(self, unpacked_dir, original_file=None, verbose=False):
|
|
102
|
+
self.unpacked_dir = pathlib.Path(unpacked_dir).resolve()
|
|
103
|
+
self.original_file = pathlib.Path(original_file) if original_file else None
|
|
104
|
+
self.verbose = verbose
|
|
105
|
+
|
|
106
|
+
self.schemas_dir = pathlib.Path(__file__).parent.parent / "schemas"
|
|
107
|
+
|
|
108
|
+
self.xml_files = [
|
|
109
|
+
fp
|
|
110
|
+
for glob in ("*.xml", "*.rels")
|
|
111
|
+
for fp in self.unpacked_dir.rglob(glob)
|
|
112
|
+
]
|
|
113
|
+
|
|
114
|
+
if not self.xml_files:
|
|
115
|
+
print("Warning: No XML files found in %s" % self.unpacked_dir)
|
|
116
|
+
|
|
117
|
+
# ── Abstract interface ───────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
def validate(self):
    """Entry point for validation; this base implementation always raises.

    Raises:
        NotImplementedError: Always, to signal that a subclass has to
            provide its own implementation.
    """
    message = "Subclasses must implement the validate method"
    raise NotImplementedError(message)
|
|
121
|
+
|
|
122
|
+
def repair(self) -> int:
    """Run the automatic repairs and return how many fixes were applied.

    The only repair performed here is the whitespace-preservation fix.
    """
    repaired_count = self._fix_whitespace_preservation()
    return repaired_count
|
|
124
|
+
|
|
125
|
+
# ── Repair: xml:space="preserve" ─────────────────────────────────────
|
|
126
|
+
|
|
127
|
+
def repair_whitespace_preservation(self) -> int:
    """Public wrapper around the whitespace-preservation fix.

    Returns:
        The count reported by ``_fix_whitespace_preservation``.
    """
    repaired_count = self._fix_whitespace_preservation()
    return repaired_count
|
|
129
|
+
|
|
130
|
+
def _fix_whitespace_preservation(self) -> int:
|
|
131
|
+
n_fixed = 0
|
|
132
|
+
for fp in self.xml_files:
|
|
133
|
+
try:
|
|
134
|
+
raw = fp.read_text(encoding="utf-8")
|
|
135
|
+
dom = defusedxml.minidom.parseString(raw)
|
|
136
|
+
touched = False
|
|
137
|
+
|
|
138
|
+
for el in dom.getElementsByTagName("*"):
|
|
139
|
+
if not el.tagName.endswith(":t"):
|
|
140
|
+
continue
|
|
141
|
+
if el.firstChild is None:
|
|
142
|
+
continue
|
|
143
|
+
txt = el.firstChild.nodeValue
|
|
144
|
+
if txt and (txt.startswith((' ', '\t')) or txt.endswith((' ', '\t'))):
|
|
145
|
+
if el.getAttribute("xml:space") != "preserve":
|
|
146
|
+
el.setAttribute("xml:space", "preserve")
|
|
147
|
+
preview = repr(txt[:30]) + "..." if len(txt) > 30 else repr(txt)
|
|
148
|
+
print(" Repaired: %s: Added xml:space='preserve' to %s: %s"
|
|
149
|
+
% (fp.name, el.tagName, preview))
|
|
150
|
+
n_fixed += 1
|
|
151
|
+
touched = True
|
|
152
|
+
|
|
153
|
+
if touched:
|
|
154
|
+
fp.write_bytes(dom.toxml(encoding="UTF-8"))
|
|
155
|
+
except Exception:
|
|
156
|
+
pass
|
|
157
|
+
return n_fixed
|
|
158
|
+
|
|
159
|
+
# ── Well-formedness ──────────────────────────────────────────────────
|
|
160
|
+
|
|
161
|
+
def validate_xml(self):
    """Check that every file in ``self.xml_files`` parses with lxml.

    Parse failures are collected and printed together. A "PASSED" line is
    printed only when ``self.verbose`` is set.

    Returns:
        ``True`` when every file parsed, ``False`` otherwise.
    """
    failures: list[str] = []

    for xml_path in self.xml_files:
        try:
            lxml.etree.parse(str(xml_path))
        except lxml.etree.XMLSyntaxError as err:
            shown = xml_path.relative_to(self.unpacked_dir)
            failures.append("  %s: Line %d: %s" % (shown, err.lineno, err.msg))
        except Exception as err:
            shown = xml_path.relative_to(self.unpacked_dir)
            failures.append("  %s: Unexpected error: %s" % (shown, err))

    if not failures:
        if self.verbose:
            print("PASSED - All XML files are well-formed")
        return True

    print("FAILED - Found %d XML violations:" % len(failures))
    for line in failures:
        print(line)
    return False
|
|
181
|
+
|
|
182
|
+
# ── Namespace coherence ──────────────────────────────────────────────
|
|
183
|
+
|
|
184
|
+
def validate_namespaces(self):
    """Check that prefixes listed in ``*Ignorable`` attributes are declared.

    For each file, every root attribute whose name ends in ``Ignorable`` is
    split on whitespace and compared against the prefixes in the root
    element's namespace map. Files that fail to parse are skipped here.

    Returns:
        ``True`` when no undeclared prefix was found, ``False`` otherwise.
    """
    issues: list[str] = []

    for xml_path in self.xml_files:
        try:
            root = lxml.etree.parse(str(xml_path)).getroot()
        except lxml.etree.XMLSyntaxError:
            continue

        # The default namespace is keyed by None and is not a prefix.
        known_prefixes = set(root.nsmap.keys()) - {None}
        for attr_name, attr_value in root.attrib.items():
            if not attr_name.endswith("Ignorable"):
                continue
            for prefix in set(attr_value.split()) - known_prefixes:
                issues.append(
                    "  %s: Namespace '%s' in Ignorable but not declared"
                    % (xml_path.relative_to(self.unpacked_dir), prefix)
                )

    if not issues:
        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    print("FAILED - %d namespace issues:" % len(issues))
    for line in issues:
        print(line)
    return False
|
|
208
|
+
|
|
209
|
+
# ── ID uniqueness ────────────────────────────────────────────────────
|
|
210
|
+
|
|
211
|
+
def validate_unique_ids(self):
    """Check the ID attributes listed in ``UNIQUE_ID_REQUIREMENTS``.

    Each entry maps a lower-cased element local name to the lower-cased
    attribute that carries its ID and a scope. IDs with scope ``"global"``
    are tracked in one table shared by all files and keyed by the ID value
    alone; all other IDs are tracked per file and per (element, attribute)
    pair. Elements inside ``mc:AlternateContent`` or below a container
    named in ``EXCLUDED_ID_CONTAINERS`` are not checked.

    Comments and processing instructions are skipped: lxml yields them from
    ``iter()`` with a non-string ``tag``, which previously made the tag
    inspection raise and reported the whole file as an error.

    Returns:
        ``True`` when no duplicate or processing error was recorded,
        ``False`` otherwise.
    """
    problems: list[str] = []
    global_ids: dict = {}

    def local_name(qname):
        # "{namespace}Name" -> "name"; un-namespaced names pass through.
        return qname.split("}")[-1].lower()

    for fp in self.xml_files:
        try:
            root = lxml.etree.parse(str(fp)).getroot()
            file_ids: dict = {}

            # Detach mc:AlternateContent subtrees so their contents are not
            # visited below. NOTE(review): presumably because the Choice and
            # Fallback branches repeat the same IDs; confirm.
            for mc_elem in root.xpath(".//mc:AlternateContent",
                                      namespaces={"mc": self.MC_NAMESPACE}):
                mc_elem.getparent().remove(mc_elem)

            for el in root.iter():
                if not isinstance(el.tag, str):
                    # Comment / processing instruction: nothing to check.
                    continue

                tag = local_name(el.tag)
                if tag not in self.UNIQUE_ID_REQUIREMENTS:
                    continue

                if any(local_name(anc.tag) in self.EXCLUDED_ID_CONTAINERS
                       for anc in el.iterancestors()):
                    continue

                attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                # Match the attribute by local name, ignoring namespace and
                # case; the first match wins.
                id_val = next(
                    (value for name, value in el.attrib.items()
                     if local_name(name) == attr_name),
                    None,
                )
                if id_val is None:
                    continue

                if scope == "global":
                    if id_val in global_ids:
                        prev_file, prev_line, prev_tag = global_ids[id_val]
                        problems.append(
                            "  %s: Line %s: Global ID '%s' in <%s> "
                            "already used in %s at line %s in <%s>"
                            % (fp.relative_to(self.unpacked_dir), el.sourceline,
                               id_val, tag, prev_file, prev_line, prev_tag))
                    else:
                        global_ids[id_val] = (fp.relative_to(self.unpacked_dir),
                                              el.sourceline, tag)
                else:
                    seen = file_ids.setdefault((tag, attr_name), {})
                    if id_val in seen:
                        problems.append(
                            "  %s: Line %s: Duplicate %s='%s' in <%s> "
                            "(first occurrence at line %s)"
                            % (fp.relative_to(self.unpacked_dir), el.sourceline,
                               attr_name, id_val, tag, seen[id_val]))
                    else:
                        seen[id_val] = el.sourceline

        except Exception as exc:
            # XMLSyntaxError is an Exception subclass, so one clause covers
            # both parse failures and unexpected errors.
            problems.append("  %s: Error: %s" % (fp.relative_to(self.unpacked_dir), exc))

    if problems:
        print("FAILED - Found %d ID uniqueness violations:" % len(problems))
        for p in problems:
            print(p)
        return False
    if self.verbose:
        print("PASSED - All required IDs are unique")
    return True
|
|
283
|
+
|
|
284
|
+
# ── Relationship file references ─────────────────────────────────────
|
|
285
|
+
|
|
286
|
+
def validate_file_references(self):
    """Cross-check relationship targets against the files in the package.

    Every ``Relationship`` element in every ``*.rels`` file is resolved to
    a path inside ``self.unpacked_dir``. A target that does not resolve to
    an existing file is reported as broken, and every package file (other
    than ``[Content_Types].xml`` and the ``*.rels`` files themselves) that
    no relationship points at is reported as unreferenced.

    Relationships marked ``TargetMode="External"`` are skipped, as are
    targets starting with ``http`` or ``mailto:``: they do not name a part
    of the package, so there is nothing on disk to check.

    Returns:
        ``True`` when no problem was found (or there are no ``*.rels``
        files at all), ``False`` otherwise.
    """
    problems: list[str] = []
    rels_list = list(self.unpacked_dir.rglob("*.rels"))

    if not rels_list:
        if self.verbose:
            print("PASSED - No .rels files found")
        return True

    physical_files = [
        fp.resolve()
        for fp in self.unpacked_dir.rglob("*")
        if fp.is_file()
        and fp.name != "[Content_Types].xml"
        and not fp.name.endswith(".rels")
    ]

    # Resolved paths of every file reached through some relationship.
    touched: set = set()

    if self.verbose:
        print("Found %d .rels files and %d target files" % (len(rels_list), len(physical_files)))

    for rf in rels_list:
        try:
            rroot = lxml.etree.parse(str(rf)).getroot()
            rdir = rf.parent
            broken: list = []

            for rel in rroot.findall(".//ns:Relationship",
                                     namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}):
                tgt = rel.get("Target")
                if not tgt:
                    continue
                if rel.get("TargetMode") == "External" or tgt.startswith(("http", "mailto:")):
                    continue

                if tgt.startswith("/"):
                    # Absolute targets are relative to the package root.
                    resolved = self.unpacked_dir / tgt.lstrip("/")
                elif rf.name == ".rels":
                    resolved = self.unpacked_dir / tgt
                else:
                    # Part-level targets are relative to the directory that
                    # contains the "_rels" folder.
                    resolved = rdir.parent / tgt

                try:
                    resolved = resolved.resolve()
                    if resolved.is_file():
                        touched.add(resolved)
                    else:
                        broken.append((tgt, rel.sourceline))
                except (OSError, ValueError):
                    broken.append((tgt, rel.sourceline))

            if broken:
                rp = rf.relative_to(self.unpacked_dir)
                for b_tgt, b_line in broken:
                    problems.append("  %s: Line %s: Broken reference to %s" % (rp, b_line, b_tgt))

        except Exception as exc:
            problems.append("  Error parsing %s: %s" % (rf.relative_to(self.unpacked_dir), exc))

    orphans = set(physical_files) - touched
    for o in sorted(orphans):
        problems.append("  Unreferenced file: %s" % o.relative_to(self.unpacked_dir))

    if problems:
        print("FAILED - Found %d relationship validation errors:" % len(problems))
        for p in problems:
            print(p)
        print(
            "CRITICAL: These errors will cause the document to appear corrupt. "
            "Broken references MUST be fixed, "
            "and unreferenced files MUST be referenced or removed."
        )
        return False
    if self.verbose:
        print("PASSED - All references are valid and all files are properly referenced")
    return True
|
|
362
|
+
# ── Relationship ID cross-check ─────────────────────────────────────
|
|
363
|
+
|
|
364
|
+
def validate_all_relationship_ids(self):
    """Check ``r:id`` / ``r:embed`` / ``r:link`` references against .rels.

    For every non-``.rels`` file that has a companion
    ``_rels/<name>.rels`` file, the relationship IDs declared there are
    collected (duplicates are reported) and every ``id``, ``embed`` and
    ``link`` attribute in the office relationships namespace must name one
    of them. When ``ELEMENT_RELATIONSHIP_TYPES`` is non-empty, ``r:id``
    references are also checked against the relationship type expected for
    the referencing element.

    Returns:
        ``True`` when no problem was recorded, ``False`` otherwise.
    """
    import lxml.etree

    issues: list[str] = []

    for part in self.xml_files:
        if part.suffix == ".rels":
            continue

        rels_file = part.parent / "_rels" / ("%s.rels" % part.name)
        if not rels_file.exists():
            continue

        try:
            rels_root = lxml.etree.parse(str(rels_file)).getroot()

            # Relationship ID -> last path segment of its Type URI.
            known: dict[str, str] = {}
            rel_tag = "{%s}Relationship" % self.PACKAGE_RELATIONSHIPS_NAMESPACE
            for rel in rels_root.findall(rel_tag):
                rel_id = rel.get("Id")
                rel_type = rel.get("Type", "")
                if not rel_id:
                    continue
                if rel_id in known:
                    issues.append(
                        "  %s: Line %s: Duplicate relationship ID '%s' (IDs must be unique)"
                        % (rels_file.relative_to(self.unpacked_dir), rel.sourceline, rel_id))
                known[rel_id] = rel_type.rsplit("/", 1)[-1]

            doc_root = lxml.etree.parse(str(part)).getroot()
            office_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
            for node in doc_root.iter():
                for short in ("id", "embed", "link"):
                    ref = node.get("{%s}%s" % (office_ns, short))
                    if not ref:
                        continue

                    where = part.relative_to(self.unpacked_dir)
                    tag_name = node.tag.split("}")[-1]

                    if ref not in known:
                        # Show at most five valid IDs to keep the line short.
                        sample = ", ".join(sorted(known.keys())[:5])
                        more = "..." if len(known) > 5 else ""
                        issues.append(
                            "  %s: Line %s: <%s> r:%s references non-existent relationship '%s' "
                            "(valid IDs: %s%s)"
                            % (where, node.sourceline, tag_name, short, ref, sample, more))
                        continue

                    if short != "id" or not self.ELEMENT_RELATIONSHIP_TYPES:
                        continue

                    expected = self._get_expected_relationship_type(tag_name)
                    if expected and expected not in known[ref].lower():
                        issues.append(
                            "  %s: Line %s: <%s> references '%s' which points to '%s' "
                            "but should point to a '%s' relationship"
                            % (where, node.sourceline, tag_name, ref, known[ref], expected))

        except Exception as exc:
            issues.append("  Error processing %s: %s" % (part.relative_to(self.unpacked_dir), exc))

    if not issues:
        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    print("FAILED - Found %d relationship ID reference errors:" % len(issues))
    for line in issues:
        print(line)
    print("\nThese ID mismatches will cause the document to appear corrupt!")
    return False
|
|
431
|
+
|
|
432
|
+
def _get_expected_relationship_type(self, element_name):
    """Guess which relationship type an element's ``r:id`` should target.

    The lookup is case-insensitive. Resolution order:

    1. An explicit entry in ``self.ELEMENT_RELATIONSHIP_TYPES``.
    2. Names ending in ``id`` (longer than the suffix itself): the suffix
       is dropped, and the stem ``sld`` is expanded to ``slide``.
    3. Names ending in ``reference`` (longer than the suffix itself): the
       suffix is dropped.

    Returns the lower-cased expected type fragment, or ``None`` when no
    rule applies.
    """
    key = element_name.lower()

    # Explicit mapping always wins over the naming heuristics below.
    try:
        return self.ELEMENT_RELATIONSHIP_TYPES[key]
    except KeyError:
        pass

    id_suffix = "id"
    if len(key) > len(id_suffix) and key.endswith(id_suffix):
        stem = key[: -len(id_suffix)]
        # "...master" / "...layout" stems (like every other stem) are
        # returned untouched; only the "sld" abbreviation is expanded.
        return "slide" if stem == "sld" else stem

    ref_suffix = "reference"
    if len(key) > len(ref_suffix) and key.endswith(ref_suffix):
        return key[: -len(ref_suffix)]

    return None
|
|
448
|
+
|
|
449
|
+
# ── Content-type declarations ────────────────────────────────────────
|
|
450
|
+
|
|
451
|
+
def validate_content_types(self):
    """Check that package parts are declared in ``[Content_Types].xml``.

    Two checks are performed:

    * every XML file whose root element is one of a fixed set of main
      part names must have a matching ``Override`` entry (compared by
      package-relative path with forward slashes);
    * every non-XML file with a known image extension must have a
      ``Default`` entry for that extension.

    Files under ``_rels``/``docProps`` and relationship/content-type
    files themselves are ignored. Problems are printed; the method
    returns ``False`` when any were found (or the content-types file is
    missing) and ``True`` otherwise.
    """
    ct_path = self.unpacked_dir / "[Content_Types].xml"
    if not ct_path.exists():
        print("FAILED - [Content_Types].xml file not found")
        return False

    issues: list[str] = []

    # Root element names that need an explicit <Override> entry.
    roots_needing_override = {
        "sld", "sldLayout", "sldMaster", "presentation",
        "document", "workbook", "worksheet", "theme",
    }
    # Image extensions we know how to suggest a <Default> entry for.
    image_content_types = {
        "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
        "gif": "image/gif", "bmp": "image/bmp", "tiff": "image/tiff",
        "wmf": "image/x-wmf", "emf": "image/x-emf",
    }
    skip_markers = (".rels", "[Content_Types]", "docProps/", "_rels/")

    try:
        types_root = lxml.etree.parse(str(ct_path)).getroot()
        ns = self.CONTENT_TYPES_NAMESPACE

        part_names = (
            node.get("PartName")
            for node in types_root.findall("{%s}Override" % ns)
        )
        override_parts: set[str] = {
            name.lstrip("/") for name in part_names if name is not None
        }

        extensions = (
            node.get("Extension")
            for node in types_root.findall("{%s}Default" % ns)
        )
        default_exts: set[str] = {
            ext.lower() for ext in extensions if ext is not None
        }

        for xml_path in self.xml_files:
            rel_posix = str(xml_path.relative_to(self.unpacked_dir)).replace("\\", "/")
            if any(marker in rel_posix for marker in skip_markers):
                continue
            try:
                root_tag = lxml.etree.parse(str(xml_path)).getroot().tag
            except Exception:
                # Unparseable parts are silently skipped by this check.
                continue
            root_name = root_tag.rsplit("}", 1)[-1]
            if root_name in roots_needing_override and rel_posix not in override_parts:
                issues.append(
                    " %s: File with <%s> root not declared in [Content_Types].xml"
                    % (rel_posix, root_name))

        for item in self.unpacked_dir.rglob("*"):
            skip = (
                not item.is_file()
                or item.suffix.lower() in {".xml", ".rels"}
                or item.name == "[Content_Types].xml"
                or "_rels" in item.parts
                or "docProps" in item.parts
            )
            if skip:
                continue
            ext = item.suffix.lstrip(".").lower()
            if ext in image_content_types and ext not in default_exts:
                issues.append(
                    ' %s: File with extension \'%s\' not declared in [Content_Types].xml '
                    '- should add: <Default Extension="%s" ContentType="%s"/>'
                    % (item.relative_to(self.unpacked_dir), ext, ext,
                       image_content_types[ext]))

    except Exception as exc:
        issues.append(" Error parsing [Content_Types].xml: %s" % exc)

    if not issues:
        if self.verbose:
            print("PASSED - All content files are properly declared in [Content_Types].xml")
        return True

    print("FAILED - Found %d content type declaration errors:" % len(issues))
    for line in issues:
        print(line)
    return False
|
|
525
|
+
|
|
526
|
+
# ── Single-file XSD validation ───────────────────────────────────────
|
|
527
|
+
|
|
528
|
+
def validate_file_against_xsd(self, xml_file, verbose=False):
    """Validate one XML file and report only errors not present originally.

    Args:
        xml_file: Path (or path-like string) of the file to check; it is
            resolved and must live under ``self.unpacked_dir``.
        verbose: When true, print a PASSED/FAILED summary (with up to
            three error messages, each truncated to 250 characters).

    Returns:
        ``(None, set())`` when ``_check_single_xsd`` reports no result
        (``None``) for the file, ``(True, set())`` when the file is valid
        or has no *new* errors, and ``(False, new_errors)`` otherwise.
        Errors also reported by ``_original_errors`` or matching any
        pattern in ``self.IGNORED_VALIDATION_ERRORS`` are not "new".
    """
    target = pathlib.Path(xml_file).resolve()
    root_dir = self.unpacked_dir.resolve()

    status, current = self._check_single_xsd(target, root_dir)
    if status is None:
        return None, set()
    if status:
        return True, set()

    baseline = self._original_errors(target)

    assert current is not None
    introduced = set()
    for message in current - baseline:
        if any(pat in message for pat in self.IGNORED_VALIDATION_ERRORS):
            continue
        introduced.add(message)

    if not introduced:
        if verbose:
            print("PASSED - No new errors (original had %d errors)" % len(current))
        return True, set()

    if verbose:
        print("FAILED - %s: %d new error(s)"
              % (target.relative_to(root_dir), len(introduced)))
        for message in list(introduced)[:3]:
            shown = message if len(message) <= 250 else message[:250] + "..."
            print(" - %s" % shown)
    return False, introduced
|
|
557
|
+
|
|
558
|
+
# ── Batch XSD validation ─────────────────────────────────────────────
|
|
559
|
+
|
|
560
|
+
def validate_against_xsd(self):
    """Run XSD validation over every file in ``self.xml_files``.

    Each file is checked with ``validate_file_against_xsd``. Files for
    which that returns ``None`` are counted as skipped; files with new
    errors contribute a header line plus up to three error lines (each
    truncated to 250 characters) to the failure report.

    When ``self.verbose`` is set a per-category summary is printed. The
    failure report is always printed when new errors exist.

    Returns:
        ``False`` if any file has new validation errors, else ``True``.
    """
    report_lines: list[str] = []
    # Counted explicitly rather than recovered from the report text:
    # deriving the number from line prefixes breaks as soon as header and
    # detail lines share the same leading whitespace.
    files_with_new_errors = 0
    files_with_original_errors = 0
    valid_files = 0
    skipped_files = 0

    for fp in self.xml_files:
        rel_path = str(fp.relative_to(self.unpacked_dir))
        ok, file_errs = self.validate_file_against_xsd(fp, verbose=False)

        if ok is None:
            skipped_files += 1
        elif ok:
            valid_files += 1
            if file_errs:
                files_with_original_errors += 1
        else:
            files_with_new_errors += 1
            report_lines.append(" %s: %d new error(s)" % (rel_path, len(file_errs)))
            # Sort so the (at most three) messages shown are stable
            # between runs; set iteration order is not.
            for message in sorted(file_errs)[:3]:
                shown = message[:250] + "..." if len(message) > 250 else message
                report_lines.append(" - %s" % shown)

    if self.verbose:
        print("Validated %d files:" % len(self.xml_files))
        print(" - Valid: %d" % valid_files)
        print(" - Skipped (no schema): %d" % skipped_files)
        if files_with_original_errors:
            print(" - With original errors (ignored): %d" % files_with_original_errors)
        print(" - With NEW errors: %d" % files_with_new_errors)

    if report_lines:
        print("\nFAILED - Found NEW validation errors:")
        for line in report_lines:
            print(line)
        return False

    if self.verbose:
        print("\nPASSED - No new XSD validation errors introduced")
    return True
|
|
600
|
+
|
|
601
|
+
# ── Internal: schema resolution ──────────────────────────────────────
|
|
602
|
+
|
|
603
|
+
def _get_schema_path(self, fp):
    """Pick the XSD file that should validate ``fp``.

    Resolution order, first match wins:

    1. exact file name listed in ``self.SCHEMA_MAPPINGS``;
    2. any ``.rels`` file;
    3. ``chart*`` files located under a ``charts/`` directory;
    4. ``theme*`` files located under a ``theme/`` directory;
    5. files whose parent folder name is in ``self.MAIN_CONTENT_FOLDERS``.

    Args:
        fp: A ``pathlib`` path to the XML part.

    Returns:
        ``self.schemas_dir / <mapped schema>``, or ``None`` when no rule
        matches.
    """
    mappings = self.SCHEMA_MAPPINGS
    name = fp.name

    if name in mappings:
        return self.schemas_dir / mappings[name]
    if fp.suffix == ".rels":
        return self.schemas_dir / mappings[".rels"]

    # Compare against a forward-slash rendering: str(path) uses "\" on
    # Windows, which would make the "charts/" / "theme/" tests never match.
    posix_path = fp.as_posix()
    if "charts/" in posix_path and name.startswith("chart"):
        return self.schemas_dir / mappings["chart"]
    if "theme/" in posix_path and name.startswith("theme"):
        return self.schemas_dir / mappings["theme"]

    folder = fp.parent.name
    if folder in self.MAIN_CONTENT_FOLDERS:
        return self.schemas_dir / mappings[folder]
    return None
|
|
615
|
+
|
|
616
|
+
# ── Internal: MC namespace stripping ─────────────────────────────────
|
|
617
|
+
|
|
618
|
+
def _clean_ignorable_namespaces(self, tree):
    """Return a copy of ``tree`` restricted to known OOXML namespaces.

    The tree is round-tripped through a string to obtain an independent
    copy. On that copy, every namespaced attribute whose namespace is not
    in ``self.OOXML_NAMESPACES`` is deleted, then
    ``_drop_non_ooxml_elements`` removes foreign-namespace elements. The
    input tree is not modified.

    Returns:
        A new ``lxml.etree.ElementTree`` wrapping the cleaned copy.
    """
    allowed = self.OOXML_NAMESPACES
    duplicate = lxml.etree.fromstring(
        lxml.etree.tostring(tree, encoding="unicode"))

    for node in duplicate.iter():
        # Snapshot the attribute names first; we delete while scanning.
        for attr_name in list(node.attrib):
            if "{" not in attr_name:
                continue  # un-namespaced attributes are always kept
            namespace = attr_name.split("}")[0][1:]
            if namespace not in allowed:
                del node.attrib[attr_name]

    self._drop_non_ooxml_elements(duplicate)
    return lxml.etree.ElementTree(duplicate)
|
|
630
|
+
|
|
631
|
+
def _drop_non_ooxml_elements(self, root):
    """Recursively remove descendants of ``root`` in foreign namespaces.

    A child is removed (together with its whole subtree) when its tag is
    namespace-qualified and that namespace is not listed in
    ``self.OOXML_NAMESPACES``. Children without a namespace are kept and
    descended into. Nodes whose ``tag`` is missing or callable (as is the
    case for comment / processing-instruction nodes) are left alone.
    ``root`` is modified in place; nothing is returned.
    """
    known = self.OOXML_NAMESPACES

    # Iterate over a snapshot so removals do not disturb the traversal.
    for node in list(root):
        if not hasattr(node, "tag") or callable(node.tag):
            continue

        tag_text = str(node.tag)
        if tag_text.startswith("{"):
            namespace = tag_text[1:].partition("}")[0]
            if namespace not in known:
                root.remove(node)
                continue

        self._drop_non_ooxml_elements(node)
|
|
645
|
+
|
|
646
|
+
def _strip_mc_ignorable(self, tree):
    """Remove the ``Ignorable`` attribute (in ``self.MC_NAMESPACE``) from the root.

    The tree is modified in place; a root without the attribute is left
    untouched.

    Returns:
        The same ``tree`` object that was passed in.
    """
    ignorable_attr = "{%s}Ignorable" % self.MC_NAMESPACE
    # pop() with a default is a no-op when the attribute is absent.
    tree.getroot().attrib.pop(ignorable_attr, None)
    return tree
|
|
652
|
+
|
|
653
|
+
# ── Internal: XSD check for one file ─────────────────────────────────
|
|
654
|
+
|
|
655
|
+
def _check_single_xsd(self, fp, base):
    """Validate a single XML file against its XSD schema.

    Before validation the parsed document is passed through
    ``_scrub_template_placeholders`` and ``_strip_mc_ignorable``; when the
    file's first path component relative to ``base`` is one of
    ``self.MAIN_CONTENT_FOLDERS`` it is additionally passed through
    ``_clean_ignorable_namespaces``.

    Args:
        fp: Path of the XML file to validate.
        base: Directory that ``fp`` is located under (used to compute the
            package-relative path).

    Returns:
        ``(None, None)`` when no schema applies to ``fp``;
        ``(True, set())`` when the document validates;
        ``(False, messages)`` otherwise, where ``messages`` is the set of
        schema error messages, or a single-element set holding the text of
        any exception raised while loading or validating.
    """
    schema_path = self._get_schema_path(fp)
    if schema_path is None:
        return None, None

    try:
        with open(schema_path, "rb") as fh:
            xsd_doc = lxml.etree.parse(
                fh, parser=lxml.etree.XMLParser(), base_url=str(schema_path))
        schema = lxml.etree.XMLSchema(xsd_doc)

        # Read the part as bytes, like the schema above: text mode would
        # decode it with the locale's default encoding before the parser
        # sees it, which corrupts or rejects UTF-8 parts on non-UTF-8
        # locales. In binary mode the XML parser honours the document's
        # own encoding declaration.
        with open(fp, "rb") as fh:
            xml_tree = lxml.etree.parse(fh)

        xml_tree, _ = self._scrub_template_placeholders(xml_tree)
        xml_tree = self._strip_mc_ignorable(xml_tree)

        rel_parts = fp.relative_to(base).parts
        if rel_parts and rel_parts[0] in self.MAIN_CONTENT_FOLDERS:
            xml_tree = self._clean_ignorable_namespaces(xml_tree)

        if schema.validate(xml_tree):
            return True, set()
        return False, {err.message for err in schema.error_log}
    except Exception as exc:
        # Any load/parse failure is surfaced as a validation error for
        # this file rather than aborting the whole run.
        return False, {str(exc)}
|
|
681
|
+
|
|
682
|
+
# ── Internal: original-file error baseline ───────────────────────────
|
|
683
|
+
|
|
684
|
+
def _original_errors(self, fp):
    """Return the XSD errors the same part had in the original archive.

    The package-relative path of ``fp`` (relative to
    ``self.unpacked_dir``) is looked up in ``self.original_file`` (a zip
    archive), extracted to a temporary directory and validated with
    ``_check_single_xsd``.

    Args:
        fp: Path of a file under ``self.unpacked_dir``.

    Returns:
        The set of error messages for the original version of the part,
        or an empty set when there is no original file, the part does not
        exist in it, or it produced no errors.
    """
    if self.original_file is None:
        return set()

    fp = pathlib.Path(fp).resolve()
    rel = fp.relative_to(self.unpacked_dir.resolve())
    member = rel.as_posix()  # zip member names always use "/"

    with tempfile.TemporaryDirectory() as td:
        scratch = pathlib.Path(td)
        with zipfile.ZipFile(self.original_file, "r") as zf:
            try:
                # Only this one part is needed; unpacking the whole
                # archive for every checked file is wasted work.
                zf.extract(member, scratch)
            except KeyError:
                # Not listed under its canonical name: fall back to a
                # full extraction so unusual member spellings still
                # resolve exactly as before.
                zf.extractall(scratch)

        orig_fp = scratch / rel
        if not orig_fp.exists():
            return set()
        _, errs = self._check_single_xsd(orig_fp, scratch)
        return errs or set()
|
|
701
|
+
|
|
702
|
+
# ── Internal: template-tag removal ───────────────────────────────────
|
|
703
|
+
|
|
704
|
+
def _scrub_template_placeholders(self, tree):
    """Strip ``{{...}}`` template tags from a copy of ``tree``.

    The tree is round-tripped through a string to get an independent
    copy. For every element except those whose local tag name is ``t``,
    placeholders are removed from both the element's text and its tail.
    Nodes whose ``tag`` is missing or callable are skipped. The input
    tree is not modified.

    Returns:
        A ``(cleaned_tree, warnings)`` tuple, where ``warnings`` holds one
        message per placeholder that was removed.
    """
    found: list[str] = []
    placeholder = re.compile(r"\{\{[^}]*\}\}")

    duplicate = lxml.etree.fromstring(
        lxml.etree.tostring(tree, encoding="unicode"))

    def _strip(value, where):
        # Empty / missing text is passed through unchanged.
        if not value:
            return value
        tags = placeholder.findall(value)
        if not tags:
            return value
        found.extend(
            "Found template tag in %s: %s" % (where, tag) for tag in tags)
        return placeholder.sub("", value)

    for node in duplicate.iter():
        if not hasattr(node, "tag") or callable(node.tag):
            continue
        # Elements named "t" (with or without a namespace) are skipped
        # entirely -- neither their text nor their tail is touched.
        if str(node.tag).rsplit("}", 1)[-1] == "t":
            continue
        node.text = _strip(node.text, "text content")
        node.tail = _strip(node.tail, "tail content")

    return lxml.etree.ElementTree(duplicate), found
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
if __name__ == "__main__":
    # This file only provides definitions for other code to import;
    # executing it as a script is treated as a usage error.
    raise RuntimeError("This module should not be run directly.")
|