@panda-agent/panda-cli 0.1.29 → 0.1.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/panda-cli-ink.bundle.mjs +258 -247
- package/package.json +6 -4
- package/skills/.gitkeep +0 -0
- package/skills/README.md +13 -0
- package/skills/docx/.skill-metadata.yaml +173 -0
- package/skills/docx/LICENSE.txt +30 -0
- package/skills/docx/SKILL.md +589 -0
- package/skills/docx/scripts/__init__.py +1 -0
- package/skills/docx/scripts/accept_changes.py +206 -0
- package/skills/docx/scripts/comment.py +442 -0
- package/skills/docx/scripts/office/helpers/__init__.py +1 -0
- package/skills/docx/scripts/office/helpers/merge_runs.py +190 -0
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
- package/skills/docx/scripts/office/pack.py +167 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/docx/scripts/office/soffice.py +194 -0
- package/skills/docx/scripts/office/unpack.py +145 -0
- package/skills/docx/scripts/office/validate.py +114 -0
- package/skills/docx/scripts/office/validators/__init__.py +16 -0
- package/skills/docx/scripts/office/validators/base.py +733 -0
- package/skills/docx/scripts/office/validators/docx.py +354 -0
- package/skills/docx/scripts/office/validators/pptx.py +230 -0
- package/skills/docx/scripts/office/validators/redlining.py +212 -0
- package/skills/docx/scripts/templates/comments.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/skills/docx/scripts/templates/people.xml +3 -0
- package/skills/frontend-design/LICENSE.txt +177 -0
- package/skills/frontend-design/SKILL.md +42 -0
- package/skills/pdf/.skill-metadata.yaml +273 -0
- package/skills/pdf/LICENSE.txt +30 -0
- package/skills/pdf/SKILL.md +324 -0
- package/skills/pdf/advanced-reference.md +609 -0
- package/skills/pdf/form-filling-guide.md +318 -0
- package/skills/pdf/forms.md +294 -0
- package/skills/pdf/reference.md +612 -0
- package/skills/pdf/scripts/check_bounding_boxes.py +198 -0
- package/skills/pdf/scripts/check_fillable_fields.py +64 -0
- package/skills/pdf/scripts/convert_pdf_to_images.py +102 -0
- package/skills/pdf/scripts/create_validation_image.py +125 -0
- package/skills/pdf/scripts/extract_form_field_info.py +220 -0
- package/skills/pdf/scripts/extract_form_structure.py +202 -0
- package/skills/pdf/scripts/fill_fillable_fields.py +205 -0
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
- package/skills/pptx-generator/SKILL.md +204 -0
- package/skills/pptx-generator/assets/styles/business.json +8 -0
- package/skills/pptx-generator/assets/styles/minimal.json +8 -0
- package/skills/pptx-generator/assets/styles/modern.json +8 -0
- package/skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
- package/skills/pptx-generator/references/collaboration_guide.md +381 -0
- package/skills/pptx-generator/references/json_format_spec.md +215 -0
- package/skills/pptx-generator/references/layout_guide.md +290 -0
- package/skills/pptx-generator/scripts/json_validator.py +194 -0
- package/skills/pptx-generator/scripts/pptx_builder.py +340 -0
- package/skills/pptx-generator/scripts/pptx_validator.py +162 -0
- package/skills/skill-creator/LICENSE.txt +202 -0
- package/skills/skill-creator/SKILL.md +479 -0
- package/skills/skill-creator/agents/analyzer.md +274 -0
- package/skills/skill-creator/agents/comparator.md +202 -0
- package/skills/skill-creator/agents/grader.md +223 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +248 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +310 -0
- package/skills/skill-creator/scripts/run_loop.py +332 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/skills/xlsx/.skill-metadata.yaml +185 -0
- package/skills/xlsx/LICENSE.txt +30 -0
- package/skills/xlsx/SKILL.md +233 -0
- package/skills/xlsx/scripts/office/helpers/__init__.py +1 -0
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
- package/skills/xlsx/scripts/office/pack.py +162 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/xlsx/scripts/office/soffice.py +185 -0
- package/skills/xlsx/scripts/office/unpack.py +146 -0
- package/skills/xlsx/scripts/office/validate.py +108 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +13 -0
- package/skills/xlsx/scripts/office/validators/base.py +800 -0
- package/skills/xlsx/scripts/office/validators/docx.py +383 -0
- package/skills/xlsx/scripts/office/validators/pptx.py +250 -0
- package/skills/xlsx/scripts/office/validators/redlining.py +229 -0
- package/skills/xlsx/scripts/recalc.py +296 -0
|
@@ -0,0 +1,800 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# ──────────────────────────────────────────────────────────────────
|
|
3
|
+
# Foundation class for OOXML schema validation.
|
|
4
|
+
#
|
|
5
|
+
# Subclasses (DOCX, PPTX) override `validate()` and optionally
|
|
6
|
+
# `repair()`. The base provides shared XSD checking, namespace
|
|
7
|
+
# auditing, unique-ID enforcement, relationship verification, and
|
|
8
|
+
# content-type validation.
|
|
9
|
+
# ──────────────────────────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import defusedxml.minidom
|
|
15
|
+
import lxml.etree
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BaseSchemaValidator:
|
|
19
|
+
"""Shared validation infrastructure for Office Open XML packages."""
|
|
20
|
+
|
|
21
|
+
# ── patterns that we silently ignore in XSD output ──
|
|
22
|
+
IGNORED_VALIDATION_ERRORS = ["hyphenationZone", "purl.org/dc/terms"]
|
|
23
|
+
|
|
24
|
+
# tag → (attribute, scope) for uniqueness checks
|
|
25
|
+
UNIQUE_ID_REQUIREMENTS = {
|
|
26
|
+
"comment": ("id", "file"),
|
|
27
|
+
"commentrangestart": ("id", "file"),
|
|
28
|
+
"commentrangeend": ("id", "file"),
|
|
29
|
+
"bookmarkstart": ("id", "file"),
|
|
30
|
+
"bookmarkend": ("id", "file"),
|
|
31
|
+
"sldid": ("id", "file"),
|
|
32
|
+
"sldmasterid": ("id", "global"),
|
|
33
|
+
"sldlayoutid": ("id", "global"),
|
|
34
|
+
"cm": ("authorid", "file"),
|
|
35
|
+
"sheet": ("sheetid", "file"),
|
|
36
|
+
"definedname": ("id", "file"),
|
|
37
|
+
"cxnsp": ("id", "file"),
|
|
38
|
+
"sp": ("id", "file"),
|
|
39
|
+
"pic": ("id", "file"),
|
|
40
|
+
"grpsp": ("id", "file"),
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
EXCLUDED_ID_CONTAINERS = {"sectionlst"}
|
|
44
|
+
|
|
45
|
+
ELEMENT_RELATIONSHIP_TYPES = {}
|
|
46
|
+
|
|
47
|
+
SCHEMA_MAPPINGS = {
|
|
48
|
+
"word": "ISO-IEC29500-4_2016/wml.xsd",
|
|
49
|
+
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
|
|
50
|
+
"xl": "ISO-IEC29500-4_2016/sml.xsd",
|
|
51
|
+
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
|
|
52
|
+
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
|
|
53
|
+
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
|
|
54
|
+
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
|
|
55
|
+
".rels": "ecma/fouth-edition/opc-relationships.xsd",
|
|
56
|
+
"people.xml": "microsoft/wml-2012.xsd",
|
|
57
|
+
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
|
|
58
|
+
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
|
|
59
|
+
"commentsExtended.xml": "microsoft/wml-2012.xsd",
|
|
60
|
+
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
|
|
61
|
+
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
62
|
+
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
# well-known XML / OPC / OOXML namespace URIs
|
|
66
|
+
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
|
|
67
|
+
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
|
68
|
+
PACKAGE_RELATIONSHIPS_NAMESPACE = (
|
|
69
|
+
"http://schemas.openxmlformats.org/package/2006/relationships"
|
|
70
|
+
)
|
|
71
|
+
OFFICE_RELATIONSHIPS_NAMESPACE = (
|
|
72
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
73
|
+
)
|
|
74
|
+
CONTENT_TYPES_NAMESPACE = (
|
|
75
|
+
"http://schemas.openxmlformats.org/package/2006/content-types"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
|
|
79
|
+
|
|
80
|
+
OOXML_NAMESPACES = {
|
|
81
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/math",
|
|
82
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
83
|
+
"http://schemas.openxmlformats.org/schemaLibrary/2006/main",
|
|
84
|
+
"http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
85
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chart",
|
|
86
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
|
|
87
|
+
"http://schemas.openxmlformats.org/drawingml/2006/diagram",
|
|
88
|
+
"http://schemas.openxmlformats.org/drawingml/2006/picture",
|
|
89
|
+
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
|
|
90
|
+
"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
|
91
|
+
"http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
92
|
+
"http://schemas.openxmlformats.org/presentationml/2006/main",
|
|
93
|
+
"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
|
|
94
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
|
|
95
|
+
"http://www.w3.org/XML/1998/namespace",
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
# ──────────────────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
def __init__(self, unpacked_dir, original_file=None, verbose=False):
|
|
101
|
+
self.unpacked_dir = Path(unpacked_dir).resolve()
|
|
102
|
+
self.original_file = Path(original_file) if original_file else None
|
|
103
|
+
self.verbose = verbose
|
|
104
|
+
|
|
105
|
+
self.schemas_dir = Path(__file__).parent.parent / "schemas"
|
|
106
|
+
|
|
107
|
+
self.xml_files = [
|
|
108
|
+
fp
|
|
109
|
+
for glob in ("*.xml", "*.rels")
|
|
110
|
+
for fp in self.unpacked_dir.rglob(glob)
|
|
111
|
+
]
|
|
112
|
+
if not self.xml_files:
|
|
113
|
+
print("Warning: No XML files found in {}".format(self.unpacked_dir))
|
|
114
|
+
|
|
115
|
+
# ── abstract / default methods ──
|
|
116
|
+
|
|
117
|
+
def validate(self):
|
|
118
|
+
raise NotImplementedError("Subclasses must implement the validate method")
|
|
119
|
+
|
|
120
|
+
def repair(self) -> int:
|
|
121
|
+
return self._fix_whitespace_preservation()
|
|
122
|
+
|
|
123
|
+
# ──────────────────────────────────────────────────────────────
|
|
124
|
+
# Repair: xml:space="preserve" on <w:t> with leading/trailing ws
|
|
125
|
+
# ──────────────────────────────────────────────────────────────
|
|
126
|
+
|
|
127
|
+
def _fix_whitespace_preservation(self) -> int:
|
|
128
|
+
n_fixed = 0
|
|
129
|
+
for fp in self.xml_files:
|
|
130
|
+
try:
|
|
131
|
+
raw = fp.read_text(encoding="utf-8")
|
|
132
|
+
dom = defusedxml.minidom.parseString(raw)
|
|
133
|
+
touched = False
|
|
134
|
+
|
|
135
|
+
for el in dom.getElementsByTagName("*"):
|
|
136
|
+
if not el.tagName.endswith(":t"):
|
|
137
|
+
continue
|
|
138
|
+
if el.firstChild is None:
|
|
139
|
+
continue
|
|
140
|
+
txt = el.firstChild.nodeValue
|
|
141
|
+
if not txt:
|
|
142
|
+
continue
|
|
143
|
+
has_ws = txt.startswith((' ', '\t')) or txt.endswith((' ', '\t'))
|
|
144
|
+
if has_ws and el.getAttribute("xml:space") != "preserve":
|
|
145
|
+
el.setAttribute("xml:space", "preserve")
|
|
146
|
+
preview = repr(txt[:30]) + "..." if len(txt) > 30 else repr(txt)
|
|
147
|
+
print(" Repaired: {}: Added xml:space='preserve' to {}: {}".format(
|
|
148
|
+
fp.name, el.tagName, preview
|
|
149
|
+
))
|
|
150
|
+
n_fixed += 1
|
|
151
|
+
touched = True
|
|
152
|
+
|
|
153
|
+
if touched:
|
|
154
|
+
fp.write_bytes(dom.toxml(encoding="UTF-8"))
|
|
155
|
+
except Exception:
|
|
156
|
+
pass
|
|
157
|
+
return n_fixed
|
|
158
|
+
|
|
159
|
+
# alternative name kept for backward compat
|
|
160
|
+
repair_whitespace_preservation = _fix_whitespace_preservation
|
|
161
|
+
|
|
162
|
+
# ──────────────────────────────────────────────────────────────
|
|
163
|
+
# Check 1 – well-formed XML
|
|
164
|
+
# ──────────────────────────────────────────────────────────────
|
|
165
|
+
|
|
166
|
+
def validate_xml(self):
|
|
167
|
+
issues = []
|
|
168
|
+
for fp in self.xml_files:
|
|
169
|
+
try:
|
|
170
|
+
lxml.etree.parse(str(fp))
|
|
171
|
+
except lxml.etree.XMLSyntaxError as exc:
|
|
172
|
+
issues.append(" {}: Line {}: {}".format(
|
|
173
|
+
fp.relative_to(self.unpacked_dir), exc.lineno, exc.msg
|
|
174
|
+
))
|
|
175
|
+
except Exception as exc:
|
|
176
|
+
issues.append(" {}: Unexpected error: {}".format(
|
|
177
|
+
fp.relative_to(self.unpacked_dir), exc
|
|
178
|
+
))
|
|
179
|
+
|
|
180
|
+
if issues:
|
|
181
|
+
print("FAILED - Found {} XML violations:".format(len(issues)))
|
|
182
|
+
for ln in issues:
|
|
183
|
+
print(ln)
|
|
184
|
+
return False
|
|
185
|
+
if self.verbose:
|
|
186
|
+
print("PASSED - All XML files are well-formed")
|
|
187
|
+
return True
|
|
188
|
+
|
|
189
|
+
# ──────────────────────────────────────────────────────────────
|
|
190
|
+
# Check 2 – mc:Ignorable namespace prefixes
|
|
191
|
+
# ──────────────────────────────────────────────────────────────
|
|
192
|
+
|
|
193
|
+
def validate_namespaces(self):
|
|
194
|
+
issues = []
|
|
195
|
+
for fp in self.xml_files:
|
|
196
|
+
try:
|
|
197
|
+
root = lxml.etree.parse(str(fp)).getroot()
|
|
198
|
+
declared = set(root.nsmap.keys()) - {None}
|
|
199
|
+
|
|
200
|
+
ignorable_vals = [
|
|
201
|
+
v for k, v in root.attrib.items() if k.endswith("Ignorable")
|
|
202
|
+
]
|
|
203
|
+
for val in ignorable_vals:
|
|
204
|
+
for ns in set(val.split()) - declared:
|
|
205
|
+
issues.append(" {}: Namespace '{}' in Ignorable but not declared".format(
|
|
206
|
+
fp.relative_to(self.unpacked_dir), ns
|
|
207
|
+
))
|
|
208
|
+
except lxml.etree.XMLSyntaxError:
|
|
209
|
+
continue
|
|
210
|
+
|
|
211
|
+
if issues:
|
|
212
|
+
print("FAILED - {} namespace issues:".format(len(issues)))
|
|
213
|
+
for ln in issues:
|
|
214
|
+
print(ln)
|
|
215
|
+
return False
|
|
216
|
+
if self.verbose:
|
|
217
|
+
print("PASSED - All namespace prefixes properly declared")
|
|
218
|
+
return True
|
|
219
|
+
|
|
220
|
+
# ──────────────────────────────────────────────────────────────
|
|
221
|
+
# Check 3 – unique IDs
|
|
222
|
+
# ──────────────────────────────────────────────────────────────
|
|
223
|
+
|
|
224
|
+
def validate_unique_ids(self):
|
|
225
|
+
issues = []
|
|
226
|
+
g_ids = {}
|
|
227
|
+
|
|
228
|
+
for fp in self.xml_files:
|
|
229
|
+
try:
|
|
230
|
+
root = lxml.etree.parse(str(fp)).getroot()
|
|
231
|
+
per_file = {}
|
|
232
|
+
|
|
233
|
+
# strip mc:AlternateContent before scanning
|
|
234
|
+
for ac in root.xpath(
|
|
235
|
+
".//mc:AlternateContent",
|
|
236
|
+
namespaces={"mc": self.MC_NAMESPACE},
|
|
237
|
+
):
|
|
238
|
+
ac.getparent().remove(ac)
|
|
239
|
+
|
|
240
|
+
for nd in root.iter():
|
|
241
|
+
raw_tag = nd.tag
|
|
242
|
+
local = raw_tag.split("}")[-1].lower() if "}" in raw_tag else raw_tag.lower()
|
|
243
|
+
|
|
244
|
+
if local not in self.UNIQUE_ID_REQUIREMENTS:
|
|
245
|
+
continue
|
|
246
|
+
|
|
247
|
+
# skip elements inside excluded containers
|
|
248
|
+
if any(
|
|
249
|
+
anc.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
|
|
250
|
+
for anc in nd.iterancestors()
|
|
251
|
+
):
|
|
252
|
+
continue
|
|
253
|
+
|
|
254
|
+
attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[local]
|
|
255
|
+
|
|
256
|
+
# find the actual attribute value
|
|
257
|
+
id_val = None
|
|
258
|
+
for ak, av in nd.attrib.items():
|
|
259
|
+
a_local = ak.split("}")[-1].lower() if "}" in ak else ak.lower()
|
|
260
|
+
if a_local == attr_name:
|
|
261
|
+
id_val = av
|
|
262
|
+
break
|
|
263
|
+
|
|
264
|
+
if id_val is None:
|
|
265
|
+
continue
|
|
266
|
+
|
|
267
|
+
rel_path = fp.relative_to(self.unpacked_dir)
|
|
268
|
+
|
|
269
|
+
if scope == "global":
|
|
270
|
+
if id_val in g_ids:
|
|
271
|
+
pf, pl, pt = g_ids[id_val]
|
|
272
|
+
issues.append(
|
|
273
|
+
" {}: Line {}: Global ID '{}' in <{}> "
|
|
274
|
+
"already used in {} at line {} in <{}>".format(
|
|
275
|
+
rel_path, nd.sourceline, id_val, local, pf, pl, pt
|
|
276
|
+
)
|
|
277
|
+
)
|
|
278
|
+
else:
|
|
279
|
+
g_ids[id_val] = (rel_path, nd.sourceline, local)
|
|
280
|
+
else:
|
|
281
|
+
bucket_key = (local, attr_name)
|
|
282
|
+
if bucket_key not in per_file:
|
|
283
|
+
per_file[bucket_key] = {}
|
|
284
|
+
if id_val in per_file[bucket_key]:
|
|
285
|
+
issues.append(
|
|
286
|
+
" {}: Line {}: Duplicate {}='{}' in <{}> "
|
|
287
|
+
"(first occurrence at line {})".format(
|
|
288
|
+
rel_path, nd.sourceline, attr_name, id_val, local,
|
|
289
|
+
per_file[bucket_key][id_val],
|
|
290
|
+
)
|
|
291
|
+
)
|
|
292
|
+
else:
|
|
293
|
+
per_file[bucket_key][id_val] = nd.sourceline
|
|
294
|
+
|
|
295
|
+
except (lxml.etree.XMLSyntaxError, Exception) as exc:
|
|
296
|
+
issues.append(" {}: Error: {}".format(
|
|
297
|
+
fp.relative_to(self.unpacked_dir), exc
|
|
298
|
+
))
|
|
299
|
+
|
|
300
|
+
if issues:
|
|
301
|
+
print("FAILED - Found {} ID uniqueness violations:".format(len(issues)))
|
|
302
|
+
for ln in issues:
|
|
303
|
+
print(ln)
|
|
304
|
+
return False
|
|
305
|
+
if self.verbose:
|
|
306
|
+
print("PASSED - All required IDs are unique")
|
|
307
|
+
return True
|
|
308
|
+
|
|
309
|
+
# ──────────────────────────────────────────────────────────────
|
|
310
|
+
# Check 4 – .rels file references
|
|
311
|
+
# ──────────────────────────────────────────────────────────────
|
|
312
|
+
|
|
313
|
+
def validate_file_references(self):
|
|
314
|
+
issues = []
|
|
315
|
+
rels = list(self.unpacked_dir.rglob("*.rels"))
|
|
316
|
+
|
|
317
|
+
if not rels:
|
|
318
|
+
if self.verbose:
|
|
319
|
+
print("PASSED - No .rels files found")
|
|
320
|
+
return True
|
|
321
|
+
|
|
322
|
+
physical_files = [
|
|
323
|
+
fp.resolve()
|
|
324
|
+
for fp in self.unpacked_dir.rglob("*")
|
|
325
|
+
if fp.is_file()
|
|
326
|
+
and fp.name != "[Content_Types].xml"
|
|
327
|
+
and not fp.name.endswith(".rels")
|
|
328
|
+
]
|
|
329
|
+
|
|
330
|
+
all_referenced = set()
|
|
331
|
+
if self.verbose:
|
|
332
|
+
print("Found {} .rels files and {} target files".format(len(rels), len(physical_files)))
|
|
333
|
+
|
|
334
|
+
for rf in rels:
|
|
335
|
+
try:
|
|
336
|
+
rr = lxml.etree.parse(str(rf)).getroot()
|
|
337
|
+
rd = rf.parent
|
|
338
|
+
referenced = set()
|
|
339
|
+
broken = []
|
|
340
|
+
|
|
341
|
+
for rel in rr.findall(
|
|
342
|
+
".//ns:Relationship",
|
|
343
|
+
namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
|
|
344
|
+
):
|
|
345
|
+
tgt = rel.get("Target")
|
|
346
|
+
if not tgt or tgt.startswith(("http", "mailto:")):
|
|
347
|
+
continue
|
|
348
|
+
|
|
349
|
+
if tgt.startswith("/"):
|
|
350
|
+
tp = self.unpacked_dir / tgt.lstrip("/")
|
|
351
|
+
elif rf.name == ".rels":
|
|
352
|
+
tp = self.unpacked_dir / tgt
|
|
353
|
+
else:
|
|
354
|
+
tp = rd.parent / tgt
|
|
355
|
+
|
|
356
|
+
try:
|
|
357
|
+
tp = tp.resolve()
|
|
358
|
+
if tp.exists() and tp.is_file():
|
|
359
|
+
referenced.add(tp)
|
|
360
|
+
all_referenced.add(tp)
|
|
361
|
+
else:
|
|
362
|
+
broken.append((tgt, rel.sourceline))
|
|
363
|
+
except (OSError, ValueError):
|
|
364
|
+
broken.append((tgt, rel.sourceline))
|
|
365
|
+
|
|
366
|
+
if broken:
|
|
367
|
+
rp = rf.relative_to(self.unpacked_dir)
|
|
368
|
+
for ref, line in broken:
|
|
369
|
+
issues.append(" {}: Line {}: Broken reference to {}".format(rp, line, ref))
|
|
370
|
+
|
|
371
|
+
except Exception as exc:
|
|
372
|
+
issues.append(" Error parsing {}: {}".format(
|
|
373
|
+
rf.relative_to(self.unpacked_dir), exc
|
|
374
|
+
))
|
|
375
|
+
|
|
376
|
+
orphans = set(physical_files) - all_referenced
|
|
377
|
+
for orphan in sorted(orphans):
|
|
378
|
+
issues.append(" Unreferenced file: {}".format(
|
|
379
|
+
orphan.relative_to(self.unpacked_dir)
|
|
380
|
+
))
|
|
381
|
+
|
|
382
|
+
if issues:
|
|
383
|
+
print("FAILED - Found {} relationship validation errors:".format(len(issues)))
|
|
384
|
+
for ln in issues:
|
|
385
|
+
print(ln)
|
|
386
|
+
print(
|
|
387
|
+
"CRITICAL: These errors will cause the document to appear corrupt. "
|
|
388
|
+
"Broken references MUST be fixed, "
|
|
389
|
+
"and unreferenced files MUST be referenced or removed."
|
|
390
|
+
)
|
|
391
|
+
return False
|
|
392
|
+
if self.verbose:
|
|
393
|
+
print("PASSED - All references are valid and all files are properly referenced")
|
|
394
|
+
return True
|
|
395
|
+
|
|
396
|
+
# ──────────────────────────────────────────────────────────────
|
|
397
|
+
# Check 5 – relationship ID cross-references
|
|
398
|
+
# ──────────────────────────────────────────────────────────────
|
|
399
|
+
|
|
400
|
+
def validate_all_relationship_ids(self):
    """Cross-check ``r:id`` / ``r:embed`` / ``r:link`` values against .rels files.

    For every non-.rels file that has a companion
    ``_rels/<file name>.rels``, the relationship IDs declared there are
    collected (duplicates are reported) and every ``r:id``, ``r:embed`` and
    ``r:link`` attribute in the file must name one of them. When
    ``ELEMENT_RELATIONSHIP_TYPES`` is non-empty, ``r:id`` references are
    additionally checked against the relationship type expected for the
    referencing element. Files without a companion .rels file are skipped.

    Returns:
        ``True`` when no problems were found, else ``False`` after printing
        a FAILED report.
    """
    problems = []
    rel_tag = ".//{{{}}}Relationship".format(self.PACKAGE_RELATIONSHIPS_NAMESPACE)
    office_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
    # (short attribute name, fully qualified attribute key)
    ref_attrs = [
        (short, "{{{}}}{}".format(office_ns, short))
        for short in ("id", "embed", "link")
    ]

    for part in self.xml_files:
        if part.suffix == ".rels":
            continue

        rels_file = part.parent / "_rels" / "{}.rels".format(part.name)
        if not rels_file.exists():
            continue

        try:
            # relationship Id -> last path segment of its Type URI
            type_by_id = {}
            for rel in lxml.etree.parse(str(rels_file)).getroot().findall(rel_tag):
                rel_id = rel.get("Id")
                rel_type = rel.get("Type", "")
                if rel_id is None:
                    continue
                if rel_id in type_by_id:
                    problems.append(
                        " {}: Line {}: Duplicate relationship ID '{}' (IDs must be unique)".format(
                            rels_file.relative_to(self.unpacked_dir), rel.sourceline, rel_id
                        )
                    )
                type_by_id[rel_id] = rel_type.rsplit("/", 1)[-1]

            part_root = lxml.etree.parse(str(part)).getroot()
            for node in part_root.iter():
                for short, qualified in ref_attrs:
                    ref = node.get(qualified)
                    if not ref:
                        continue

                    shown = part.relative_to(self.unpacked_dir)
                    tag = node.tag.split("}")[-1]

                    if ref not in type_by_id:
                        sample = ", ".join(sorted(type_by_id)[:5])
                        if len(type_by_id) > 5:
                            sample += "..."
                        problems.append(
                            " {}: Line {}: <{}> r:{} references non-existent relationship '{}' "
                            "(valid IDs: {})".format(
                                shown, node.sourceline, tag, short, ref, sample
                            )
                        )
                        continue

                    if short != "id" or not self.ELEMENT_RELATIONSHIP_TYPES:
                        continue

                    wanted = self._get_expected_relationship_type(tag)
                    if wanted and wanted not in type_by_id[ref].lower():
                        problems.append(
                            " {}: Line {}: <{}> references '{}' which points to '{}' "
                            "but should point to a '{}' relationship".format(
                                shown, node.sourceline, tag, ref, type_by_id[ref], wanted
                            )
                        )

        except Exception as err:
            problems.append(" Error processing {}: {}".format(
                part.relative_to(self.unpacked_dir), err
            ))

    if not problems:
        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    print("FAILED - Found {} relationship ID reference errors:".format(len(problems)))
    for entry in problems:
        print(entry)
    print("\nThese ID mismatches will cause the document to appear corrupt!")
    return False
|
|
475
|
+
|
|
476
|
+
def _get_expected_relationship_type(self, elem_tag):
    """Infer which relationship type an element's ``r:id`` should target.

    The lower-cased tag is looked up in ``ELEMENT_RELATIONSHIP_TYPES``
    first.  When it is not listed there, two naming conventions are tried:
    a trailing ``id`` (``sldId`` -> ``slide``, ``sldMasterId`` ->
    ``sldmaster``) and a trailing ``reference`` (``headerReference`` ->
    ``header``).

    Returns:
        The expected type as a lower-case string, or ``None`` when nothing
        can be derived from the tag name.
    """
    name = elem_tag.lower()

    explicit = self.ELEMENT_RELATIONSHIP_TYPES
    if name in explicit:
        return explicit[name]

    if len(name) > 2 and name.endswith("id"):
        prefix = name[:-2]
        # "sld" is the only prefix that gets expanded; every other prefix
        # (including the *master / *layout ones) is returned unchanged.
        if prefix == "sld":
            return "slide"
        return prefix

    if len(name) > 9 and name.endswith("reference"):
        return name[: -len("reference")]

    return None
|
|
488
|
+
|
|
489
|
+
# ──────────────────────────────────────────────────────────────
|
|
490
|
+
# Check 6 – [Content_Types].xml completeness
|
|
491
|
+
# ──────────────────────────────────────────────────────────────
|
|
492
|
+
|
|
493
|
+
def validate_content_types(self):
    """Check that package parts are declared in ``[Content_Types].xml``.

    Two kinds of declaration are verified:

    * every file in ``self.xml_files`` whose root element is one of a fixed
      set of main-part names must be listed as an ``Override``;
    * every non-XML file with a recognised media extension must have that
      extension listed as a ``Default``.

    Relationship files, ``docProps`` content and the manifest itself are
    exempt.  Findings are printed, not raised.

    Returns:
        ``True`` when nothing is missing; ``False`` otherwise, including
        when the manifest is absent or cannot be processed.
    """
    manifest = self.unpacked_dir / "[Content_Types].xml"
    if not manifest.exists():
        print("FAILED - [Content_Types].xml file not found")
        return False

    problems = []
    try:
        ct_root = lxml.etree.parse(str(manifest)).getroot()

        # Part names are stored without the leading "/" so they compare
        # directly against paths relative to the unpacked directory.
        declared_parts = set()
        override_query = ".//{{{}}}Override".format(self.CONTENT_TYPES_NAMESPACE)
        for node in ct_root.findall(override_query):
            part_name = node.get("PartName")
            if part_name is not None:
                declared_parts.add(part_name.lstrip("/"))

        declared_exts = set()
        default_query = ".//{{{}}}Default".format(self.CONTENT_TYPES_NAMESPACE)
        for node in ct_root.findall(default_query):
            extension = node.get("Extension")
            if extension is not None:
                declared_exts.add(extension.lower())

        must_declare = {
            "sld", "sldLayout", "sldMaster", "presentation",
            "document", "workbook", "worksheet", "theme",
        }

        known_media = {
            "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
            "gif": "image/gif", "bmp": "image/bmp", "tiff": "image/tiff",
            "wmf": "image/x-wmf", "emf": "image/x-emf",
        }

        exempt_markers = (".rels", "[Content_Types]", "docProps/", "_rels/")
        for xml_path in self.xml_files:
            rel = str(xml_path.relative_to(self.unpacked_dir)).replace("\\", "/")
            if any(marker in rel for marker in exempt_markers):
                continue
            try:
                qname = lxml.etree.parse(str(xml_path)).getroot().tag
                local_name = qname.rsplit("}", 1)[-1]
                if local_name in must_declare and rel not in declared_parts:
                    problems.append(
                        " {}: File with <{}> root not declared in [Content_Types].xml".format(
                            rel, local_name
                        )
                    )
            except Exception:
                # Best effort: a part that cannot be read is skipped here.
                continue

        for item in self.unpacked_dir.rglob("*"):
            if not item.is_file():
                continue
            if item.suffix.lower() in (".xml", ".rels"):
                continue
            if item.name == "[Content_Types].xml":
                continue
            if "_rels" in item.parts or "docProps" in item.parts:
                continue
            extension = item.suffix.lstrip(".").lower()
            if not extension or extension in declared_exts:
                continue
            if extension in known_media:
                problems.append(
                    ' {}: File with extension \'{}\' not declared in [Content_Types].xml'
                    ' - should add: <Default Extension="{}" ContentType="{}"/>'.format(
                        item.relative_to(self.unpacked_dir),
                        extension,
                        extension,
                        known_media[extension],
                    )
                )

    except Exception as exc:
        problems.append(" Error parsing [Content_Types].xml: {}".format(exc))

    if problems:
        print("FAILED - Found {} content type declaration errors:".format(len(problems)))
        for line in problems:
            print(line)
        return False

    if self.verbose:
        print("PASSED - All content files are properly declared in [Content_Types].xml")
    return True
|
|
574
|
+
|
|
575
|
+
# ──────────────────────────────────────────────────────────────
|
|
576
|
+
# XSD validation (single file + batch)
|
|
577
|
+
# ──────────────────────────────────────────────────────────────
|
|
578
|
+
|
|
579
|
+
def validate_file_against_xsd(self, xml_file, verbose=False):
    """Validate one XML part against its XSD, reporting only *new* errors.

    Errors that are also produced by the same part in the original file, or
    that contain one of ``IGNORED_VALIDATION_ERRORS``, are not counted.

    Args:
        xml_file: Path of the part to validate.
        verbose: When true, print a one-line verdict and up to three of the
            new error messages (in sorted order).

    Returns:
        A ``(status, errors)`` tuple:

        * ``(None, set())``  - no schema applies to this file;
        * ``(True, set())``  - valid, or only pre-existing/ignored errors;
        * ``(False, errors)`` - ``errors`` is the set of new messages.
    """
    xml_file = Path(xml_file).resolve()
    base = self.unpacked_dir.resolve()

    ok, cur_errs = self._check_one_xsd(xml_file, base)

    if ok is None:
        return None, set()
    if ok:
        return True, set()

    # A failed check is expected to carry an error set; normalise instead of
    # asserting, because asserts disappear under ``python -O``.
    cur_errs = cur_errs or set()
    orig_errs = self._original_xsd_errors(xml_file)

    fresh = {
        e for e in cur_errs - orig_errs
        if not any(pat in e for pat in self.IGNORED_VALIDATION_ERRORS)
    }

    if fresh:
        if verbose:
            rp = xml_file.relative_to(base)
            print("FAILED - {}: {} new error(s)".format(rp, len(fresh)))
            # Sorted so the sample shown is the same on every run.
            for e in sorted(fresh)[:3]:
                print(" - {}".format(e[:250] + "..." if len(e) > 250 else e))
        return False, fresh

    if verbose:
        # Report the original file's error count, as the message states.
        print("PASSED - No new errors (original had {} errors)".format(len(orig_errs)))
    return True, set()
|
|
609
|
+
|
|
610
|
+
def validate_against_xsd(self):
    """Run XSD validation over every file in ``self.xml_files``.

    Each file is checked with ``validate_file_against_xsd``.  Files with new
    errors are listed (up to three messages each, in sorted order) and make
    the check fail.  With ``self.verbose`` set, a summary of valid, skipped
    and failing files is printed as well.

    Returns:
        ``True`` when no file has new errors, ``False`` otherwise.
    """
    fresh_issues = []
    n_new = 0
    n_orig_err = 0
    n_ok = 0
    n_skip = 0

    for fp in self.xml_files:
        rp = str(fp.relative_to(self.unpacked_dir))
        ok, errs = self.validate_file_against_xsd(fp, verbose=False)

        if ok is None:
            n_skip += 1
        elif ok:
            n_ok += 1
            if errs:
                n_orig_err += 1
        else:
            # Count failing files directly instead of recovering the number
            # later from the leading whitespace of the report lines.
            n_new += 1
            fresh_issues.append(" {}: {} new error(s)".format(rp, len(errs)))
            for e in sorted(errs)[:3]:
                fresh_issues.append(
                    " - {}".format(e[:250] + "..." if len(e) > 250 else e)
                )

    if self.verbose:
        print("Validated {} files:".format(len(self.xml_files)))
        print(" - Valid: {}".format(n_ok))
        print(" - Skipped (no schema): {}".format(n_skip))
        if n_orig_err:
            print(" - With original errors (ignored): {}".format(n_orig_err))
        print(" - With NEW errors: {}".format(n_new))

    if fresh_issues:
        print("\nFAILED - Found NEW validation errors:")
        for ln in fresh_issues:
            print(ln)
        return False

    if self.verbose:
        print("\nPASSED - No new XSD validation errors introduced")
    return True
|
|
656
|
+
|
|
657
|
+
# ── private XSD helpers ──
|
|
658
|
+
|
|
659
|
+
def _resolve_schema(self, fp):
    """Pick the XSD file that applies to the part at ``fp``.

    Lookup order: exact file name in ``SCHEMA_MAPPINGS``, then the ``.rels``
    suffix, then chart and theme parts (recognised by directory and file
    name prefix), then the name of the immediate parent folder when it is
    one of ``MAIN_CONTENT_FOLDERS``.

    Args:
        fp: A ``pathlib`` path to the part.

    Returns:
        ``self.schemas_dir / <mapped schema>``, or ``None`` when no rule
        matches.
    """
    mappings = self.SCHEMA_MAPPINGS

    if fp.name in mappings:
        return self.schemas_dir / mappings[fp.name]
    if fp.suffix == ".rels":
        return self.schemas_dir / mappings[".rels"]

    # str(fp) uses backslashes on Windows, so a "charts/" substring test on
    # it would never match there; as_posix() always yields forward slashes.
    posix_path = fp.as_posix()
    if "charts/" in posix_path and fp.name.startswith("chart"):
        return self.schemas_dir / mappings["chart"]
    if "theme/" in posix_path and fp.name.startswith("theme"):
        return self.schemas_dir / mappings["theme"]

    if fp.parent.name in self.MAIN_CONTENT_FOLDERS:
        return self.schemas_dir / mappings[fp.parent.name]
    return None

# keep old name as alias
_get_schema_path = _resolve_schema
|
|
674
|
+
|
|
675
|
+
def _strip_non_ooxml_attrs(self, doc):
    """Return a cleaned ElementTree with non-OOXML attrs/elements removed.

    The input is serialised and re-parsed, so ``doc`` itself is left
    untouched.  On the copy, every namespaced attribute whose namespace is
    not in ``OOXML_NAMESPACES`` is deleted, then foreign-namespace elements
    are pruned via ``_strip_foreign_elements``.
    """
    serialized = lxml.etree.tostring(doc, encoding="unicode")
    clone_root = lxml.etree.fromstring(serialized)

    for element in clone_root.iter():
        # Snapshot the names first: the mapping is mutated while scanning.
        for attr_name in list(element.attrib):
            if "{" not in attr_name:
                continue
            namespace = attr_name.split("}")[0][1:]
            if namespace not in self.OOXML_NAMESPACES:
                del element.attrib[attr_name]

    self._strip_foreign_elements(clone_root)
    return lxml.etree.ElementTree(clone_root)

# keep old name
_clean_ignorable_namespaces = _strip_non_ooxml_attrs
|
|
693
|
+
|
|
694
|
+
def _strip_foreign_elements(self, root):
    """Recursively remove descendants whose namespace is not OOXML.

    Works in place on ``root``.  A child in a namespace outside
    ``OOXML_NAMESPACES`` is removed together with its whole subtree;
    every other element child is descended into.  Children without a
    string tag (for example comments) are left alone.
    """
    # Iterate over a snapshot so removals do not disturb the traversal.
    for child in list(root):
        if not hasattr(child, "tag") or callable(child.tag):
            continue
        qualified = str(child.tag)
        is_foreign = (
            qualified.startswith("{")
            and qualified[1:].partition("}")[0] not in self.OOXML_NAMESPACES
        )
        if is_foreign:
            root.remove(child)
        else:
            self._strip_foreign_elements(child)

_remove_ignorable_elements = _strip_foreign_elements
|
|
708
|
+
|
|
709
|
+
def _drop_mc_ignorable(self, doc):
    """Delete the ``Ignorable`` attribute (in ``MC_NAMESPACE``) from the root.

    Only the root element is touched and the change is made in place; the
    same ``doc`` object is returned for convenience.
    """
    root_attrs = doc.getroot().attrib
    ignorable_attr = "{{{}}}Ignorable".format(self.MC_NAMESPACE)
    root_attrs.pop(ignorable_attr, None)
    return doc

_preprocess_for_mc_ignorable = _drop_mc_ignorable
|
|
717
|
+
|
|
718
|
+
def _check_one_xsd(self, fp, base):
    """Validate a single XML file against the schema chosen for it.

    Before validation the parsed document has template tags scrubbed and
    the root ``mc:Ignorable`` attribute dropped.  When the first path
    component of ``fp`` relative to ``base`` is one of
    ``MAIN_CONTENT_FOLDERS``, non-OOXML attributes and elements are
    stripped as well.

    Args:
        fp: Path of the XML file.
        base: Directory that ``fp`` is located under.

    Returns:
        ``(None, None)`` when no schema applies, ``(True, set())`` when the
        file validates, otherwise ``(False, messages)``.  Any exception
        raised along the way is reported as a failure whose single message
        is the exception text.
    """
    schema_path = self._resolve_schema(fp)
    if schema_path is None:
        return None, None

    try:
        with open(schema_path, "rb") as fh:
            xsd_doc = lxml.etree.parse(fh, parser=lxml.etree.XMLParser(), base_url=str(schema_path))
        schema = lxml.etree.XMLSchema(xsd_doc)

        # Binary mode, like the schema above: the XML parser then decodes
        # the bytes itself instead of receiving text already decoded with
        # the platform's default encoding.
        with open(fp, "rb") as fh:
            xml_doc = lxml.etree.parse(fh)

        xml_doc, _ = self._scrub_template_tags(xml_doc)
        xml_doc = self._drop_mc_ignorable(xml_doc)

        rp = fp.relative_to(base)
        if rp.parts and rp.parts[0] in self.MAIN_CONTENT_FOLDERS:
            xml_doc = self._strip_non_ooxml_attrs(xml_doc)

        if schema.validate(xml_doc):
            return True, set()
        return False, {e.message for e in schema.error_log}

    except Exception as exc:
        # Deliberately broad: anything that goes wrong counts as a failure
        # for this file rather than aborting the whole run.
        return False, {str(exc)}

_validate_single_file_xsd = _check_one_xsd
|
|
746
|
+
|
|
747
|
+
def _original_xsd_errors(self, fp):
    """Collect the XSD errors the same part produces in the original file.

    The part's path relative to ``self.unpacked_dir`` is looked up inside
    the ``self.original_file`` archive, extracted to a temporary directory
    and validated with ``_check_one_xsd``.

    Args:
        fp: Path of the part inside the unpacked directory.

    Returns:
        The set of error messages for the original part.  An empty set is
        returned when there is no original file, when the archive has no
        such member, or when the original part produced no errors.
    """
    if self.original_file is None:
        return set()

    import tempfile
    import zipfile

    fp = Path(fp).resolve()
    rp = fp.relative_to(self.unpacked_dir.resolve())

    with tempfile.TemporaryDirectory() as td:
        tp = Path(td)
        with zipfile.ZipFile(self.original_file, "r") as zr:
            try:
                # Only this one member is read below, so the rest of the
                # archive is not unpacked.  Archive member names use
                # forward slashes, hence as_posix().
                zr.extract(rp.as_posix(), tp)
            except KeyError:
                # The part does not exist in the original archive.
                return set()
        orig = tp / rp
        if not orig.exists():
            return set()
        _, errs = self._check_one_xsd(orig, tp)
        return errs if errs else set()

_get_original_file_errors = _original_xsd_errors
|
|
767
|
+
|
|
768
|
+
def _scrub_template_tags(self, doc):
    """Strip ``{{...}}`` placeholders from a copy of ``doc``.

    The document is serialised and re-parsed, so the input is not modified.
    On the copy, the text and tail of every element are cleaned, except for
    elements whose local name is ``t``, which are skipped entirely.

    Returns:
        ``(tree, warnings)`` where ``tree`` is the cleaned ElementTree and
        ``warnings`` holds one message per placeholder that was removed.
    """
    found = []
    placeholder = re.compile(r"\{\{[^}]*\}\}")

    clone_root = lxml.etree.fromstring(
        lxml.etree.tostring(doc, encoding="unicode")
    )

    def scrub(text, where):
        # None / empty strings pass through unchanged.
        if not text:
            return text
        for tag_text in placeholder.findall(text):
            found.append("Found template tag in {}: {}".format(where, tag_text))
        return placeholder.sub("", text)

    for element in clone_root.iter():
        if not hasattr(element, "tag") or callable(element.tag):
            continue
        qname = str(element.tag)
        if qname == "t" or qname.endswith("}t"):
            continue
        element.text = scrub(element.text, "text content")
        element.tail = scrub(element.tail, "tail content")

    return lxml.etree.ElementTree(clone_root), found

_remove_template_tags_from_text_nodes = _scrub_template_tags
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
# This module is meant to be imported only; executing it as a script is
# treated as a usage error and fails immediately.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
|