@heylemon/lemonade 0.0.4 → 0.0.6
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/canvas-host/a2ui/.bundle.hash +1 -1
- package/dist/gateway/skills-http.js +74 -19
- package/package.json +1 -1
- package/skills/docx/SKILL.md +25 -30
- package/skills/docx/scripts/accept_changes.py +0 -17
- package/skills/docx/scripts/comment.py +10 -39
- package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/docx/scripts/office/pack.py +0 -30
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/docx/scripts/office/soffice.py +0 -55
- package/skills/docx/scripts/office/unpack.py +5 -27
- package/skills/docx/scripts/office/validate.py +19 -14
- package/skills/docx/scripts/office/validators/base.py +48 -224
- package/skills/docx/scripts/office/validators/docx.py +44 -117
- package/skills/docx/scripts/office/validators/pptx.py +2 -42
- package/skills/docx/scripts/office/validators/redlining.py +3 -40
- package/skills/pdf/SKILL.md +22 -15
- package/skills/pdf/{FORMS.md → forms.md} +0 -14
- package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
- package/skills/pdf/scripts/check_fillable_fields.py +0 -1
- package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
- package/skills/pdf/scripts/create_validation_image.py +0 -4
- package/skills/pdf/scripts/extract_form_field_info.py +1 -31
- package/skills/pdf/scripts/extract_form_structure.py +0 -9
- package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
- package/skills/pptx/SKILL.md +2 -29
- package/skills/pptx/editing.md +2 -2
- package/skills/pptx/pptxgenjs.md +53 -8
- package/skills/pptx/scripts/add_slide.py +0 -30
- package/skills/pptx/scripts/clean.py +0 -23
- package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/pptx/scripts/office/pack.py +0 -30
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/pptx/scripts/office/soffice.py +0 -55
- package/skills/pptx/scripts/office/unpack.py +5 -27
- package/skills/pptx/scripts/office/validate.py +19 -14
- package/skills/pptx/scripts/office/validators/base.py +48 -224
- package/skills/pptx/scripts/office/validators/docx.py +44 -117
- package/skills/pptx/scripts/office/validators/pptx.py +2 -42
- package/skills/pptx/scripts/office/validators/redlining.py +3 -40
- package/skills/pptx/scripts/thumbnail.py +0 -31
- package/skills/xlsx/SKILL.md +3 -26
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/xlsx/scripts/office/pack.py +0 -30
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/xlsx/scripts/office/soffice.py +0 -55
- package/skills/xlsx/scripts/office/unpack.py +5 -27
- package/skills/xlsx/scripts/office/validate.py +19 -14
- package/skills/xlsx/scripts/office/validators/base.py +48 -224
- package/skills/xlsx/scripts/office/validators/docx.py +44 -117
- package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
- package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
- package/skills/xlsx/scripts/recalc.py +2 -26
- package/skills/docx/scripts/__init__.py +0 -1
- package/skills/docx/scripts/office/helpers/__init__.py +0 -0
- package/skills/docx/scripts/office/validators/__init__.py +0 -15
- package/skills/pptx/scripts/__init__.py +0 -0
- package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
- package/skills/pptx/scripts/office/validators/__init__.py +0 -15
- package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
- package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
|
@@ -10,85 +10,57 @@ import lxml.etree
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class BaseSchemaValidator:
|
|
13
|
-
"""Base validator with common validation logic for document files."""
|
|
14
13
|
|
|
15
|
-
# Validation errors to ignore (patterns that appear in error messages)
|
|
16
|
-
# These are XSD schema errors that don't affect document functionality,
|
|
17
|
-
# typically caused by specific editors like LibreOffice.
|
|
18
14
|
IGNORED_VALIDATION_ERRORS = [
|
|
19
|
-
# LibreOffice writes hyphenationZone in wrong order in word/settings.xml.
|
|
20
|
-
# The XSD requires strict element ordering, but LibreOffice puts doNotHyphenateCaps
|
|
21
|
-
# before hyphenationZone. This doesn't affect document rendering.
|
|
22
15
|
"hyphenationZone",
|
|
16
|
+
"purl.org/dc/terms",
|
|
23
17
|
]
|
|
24
18
|
|
|
25
|
-
# Elements whose 'id' attributes must be unique within their file
|
|
26
|
-
# Format: element_name -> (attribute_name, scope)
|
|
27
|
-
# scope can be 'file' (unique within file) or 'global' (unique across all files)
|
|
28
19
|
UNIQUE_ID_REQUIREMENTS = {
|
|
29
|
-
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
"
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
"
|
|
41
|
-
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
# Drawing/Shape elements (all formats)
|
|
45
|
-
"cxnsp": ("id", "file"), # Connection shape IDs
|
|
46
|
-
"sp": ("id", "file"), # Shape IDs
|
|
47
|
-
"pic": ("id", "file"), # Picture IDs
|
|
48
|
-
"grpsp": ("id", "file"), # Group shape IDs
|
|
20
|
+
"comment": ("id", "file"),
|
|
21
|
+
"commentrangestart": ("id", "file"),
|
|
22
|
+
"commentrangeend": ("id", "file"),
|
|
23
|
+
"bookmarkstart": ("id", "file"),
|
|
24
|
+
"bookmarkend": ("id", "file"),
|
|
25
|
+
"sldid": ("id", "file"),
|
|
26
|
+
"sldmasterid": ("id", "global"),
|
|
27
|
+
"sldlayoutid": ("id", "global"),
|
|
28
|
+
"cm": ("authorid", "file"),
|
|
29
|
+
"sheet": ("sheetid", "file"),
|
|
30
|
+
"definedname": ("id", "file"),
|
|
31
|
+
"cxnsp": ("id", "file"),
|
|
32
|
+
"sp": ("id", "file"),
|
|
33
|
+
"pic": ("id", "file"),
|
|
34
|
+
"grpsp": ("id", "file"),
|
|
49
35
|
}
|
|
50
36
|
|
|
51
|
-
# Container elements where ID uniqueness checks should be skipped
|
|
52
|
-
# These hold references that intentionally duplicate IDs of elements they reference
|
|
53
|
-
# Example: <p14:sldId id="301"> in sectionLst references <p:sldId id="301"> in sldIdLst
|
|
54
37
|
EXCLUDED_ID_CONTAINERS = {
|
|
55
|
-
"sectionlst",
|
|
38
|
+
"sectionlst",
|
|
56
39
|
}
|
|
57
40
|
|
|
58
|
-
# Mapping of element names to expected relationship types
|
|
59
|
-
# Subclasses should override this with format-specific mappings
|
|
60
41
|
ELEMENT_RELATIONSHIP_TYPES = {}
|
|
61
42
|
|
|
62
|
-
# Unified schema mappings for all Office document types
|
|
63
43
|
SCHEMA_MAPPINGS = {
|
|
64
|
-
|
|
65
|
-
"
|
|
66
|
-
"
|
|
67
|
-
"xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets
|
|
68
|
-
# Common file types
|
|
44
|
+
"word": "ISO-IEC29500-4_2016/wml.xsd",
|
|
45
|
+
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
|
|
46
|
+
"xl": "ISO-IEC29500-4_2016/sml.xsd",
|
|
69
47
|
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
|
|
70
48
|
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
|
|
71
49
|
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
|
|
72
50
|
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
|
|
73
51
|
".rels": "ecma/fouth-edition/opc-relationships.xsd",
|
|
74
|
-
# Word-specific files
|
|
75
52
|
"people.xml": "microsoft/wml-2012.xsd",
|
|
76
53
|
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
|
|
77
54
|
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
|
|
78
55
|
"commentsExtended.xml": "microsoft/wml-2012.xsd",
|
|
79
|
-
# Chart files (common across document types)
|
|
80
56
|
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
|
|
81
|
-
# Theme files (common across document types)
|
|
82
57
|
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
83
|
-
# Drawing and media files
|
|
84
58
|
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
85
59
|
}
|
|
86
60
|
|
|
87
|
-
# Unified namespace constants
|
|
88
61
|
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
|
|
89
62
|
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
|
90
63
|
|
|
91
|
-
# Common OOXML namespaces used across validators
|
|
92
64
|
PACKAGE_RELATIONSHIPS_NAMESPACE = (
|
|
93
65
|
"http://schemas.openxmlformats.org/package/2006/relationships"
|
|
94
66
|
)
|
|
@@ -99,10 +71,8 @@ class BaseSchemaValidator:
|
|
|
99
71
|
"http://schemas.openxmlformats.org/package/2006/content-types"
|
|
100
72
|
)
|
|
101
73
|
|
|
102
|
-
# Folders where we should clean ignorable namespaces
|
|
103
74
|
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
|
|
104
75
|
|
|
105
|
-
# All allowed OOXML namespaces (superset of all document types)
|
|
106
76
|
OOXML_NAMESPACES = {
|
|
107
77
|
"http://schemas.openxmlformats.org/officeDocument/2006/math",
|
|
108
78
|
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
@@ -121,15 +91,13 @@ class BaseSchemaValidator:
|
|
|
121
91
|
"http://www.w3.org/XML/1998/namespace",
|
|
122
92
|
}
|
|
123
93
|
|
|
124
|
-
def __init__(self, unpacked_dir, original_file, verbose=False):
|
|
94
|
+
def __init__(self, unpacked_dir, original_file=None, verbose=False):
|
|
125
95
|
self.unpacked_dir = Path(unpacked_dir).resolve()
|
|
126
|
-
self.original_file = Path(original_file)
|
|
96
|
+
self.original_file = Path(original_file) if original_file else None
|
|
127
97
|
self.verbose = verbose
|
|
128
98
|
|
|
129
|
-
# Set schemas directory
|
|
130
99
|
self.schemas_dir = Path(__file__).parent.parent / "schemas"
|
|
131
100
|
|
|
132
|
-
# Get all XML and .rels files
|
|
133
101
|
patterns = ["*.xml", "*.rels"]
|
|
134
102
|
self.xml_files = [
|
|
135
103
|
f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
|
|
@@ -139,15 +107,12 @@ class BaseSchemaValidator:
|
|
|
139
107
|
print(f"Warning: No XML files found in {self.unpacked_dir}")
|
|
140
108
|
|
|
141
109
|
def validate(self):
|
|
142
|
-
"""Run all validation checks and return True if all pass."""
|
|
143
110
|
raise NotImplementedError("Subclasses must implement the validate method")
|
|
144
111
|
|
|
145
112
|
def repair(self) -> int:
|
|
146
|
-
"""Run auto-repairs. Returns count of repairs made. Subclasses should override and call super()."""
|
|
147
113
|
return self.repair_whitespace_preservation()
|
|
148
114
|
|
|
149
115
|
def repair_whitespace_preservation(self) -> int:
|
|
150
|
-
"""Add xml:space='preserve' to w:t/a:t elements with leading/trailing whitespace."""
|
|
151
116
|
repairs = 0
|
|
152
117
|
|
|
153
118
|
for xml_file in self.xml_files:
|
|
@@ -176,12 +141,10 @@ class BaseSchemaValidator:
|
|
|
176
141
|
return repairs
|
|
177
142
|
|
|
178
143
|
def validate_xml(self):
|
|
179
|
-
"""Validate that all XML files are well-formed."""
|
|
180
144
|
errors = []
|
|
181
145
|
|
|
182
146
|
for xml_file in self.xml_files:
|
|
183
147
|
try:
|
|
184
|
-
# Try to parse the XML file
|
|
185
148
|
lxml.etree.parse(str(xml_file))
|
|
186
149
|
except lxml.etree.XMLSyntaxError as e:
|
|
187
150
|
errors.append(
|
|
@@ -205,13 +168,12 @@ class BaseSchemaValidator:
|
|
|
205
168
|
return True
|
|
206
169
|
|
|
207
170
|
def validate_namespaces(self):
|
|
208
|
-
"""Validate that namespace prefixes in Ignorable attributes are declared."""
|
|
209
171
|
errors = []
|
|
210
172
|
|
|
211
173
|
for xml_file in self.xml_files:
|
|
212
174
|
try:
|
|
213
175
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
214
|
-
declared = set(root.nsmap.keys()) - {None}
|
|
176
|
+
declared = set(root.nsmap.keys()) - {None}
|
|
215
177
|
|
|
216
178
|
for attr_val in [
|
|
217
179
|
v for k, v in root.attrib.items() if k.endswith("Ignorable")
|
|
@@ -235,35 +197,28 @@ class BaseSchemaValidator:
|
|
|
235
197
|
return True
|
|
236
198
|
|
|
237
199
|
def validate_unique_ids(self):
|
|
238
|
-
"""Validate that specific IDs are unique according to OOXML requirements."""
|
|
239
200
|
errors = []
|
|
240
|
-
global_ids = {}
|
|
201
|
+
global_ids = {}
|
|
241
202
|
|
|
242
203
|
for xml_file in self.xml_files:
|
|
243
204
|
try:
|
|
244
205
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
245
|
-
file_ids = {}
|
|
206
|
+
file_ids = {}
|
|
246
207
|
|
|
247
|
-
# Remove all mc:AlternateContent elements from the tree
|
|
248
208
|
mc_elements = root.xpath(
|
|
249
209
|
".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
|
|
250
210
|
)
|
|
251
211
|
for elem in mc_elements:
|
|
252
212
|
elem.getparent().remove(elem)
|
|
253
213
|
|
|
254
|
-
# Now check IDs in the cleaned tree
|
|
255
214
|
for elem in root.iter():
|
|
256
|
-
# Get the element name without namespace
|
|
257
215
|
tag = (
|
|
258
216
|
elem.tag.split("}")[-1].lower()
|
|
259
217
|
if "}" in elem.tag
|
|
260
218
|
else elem.tag.lower()
|
|
261
219
|
)
|
|
262
220
|
|
|
263
|
-
# Check if this element type has ID uniqueness requirements
|
|
264
221
|
if tag in self.UNIQUE_ID_REQUIREMENTS:
|
|
265
|
-
# Skip if element is inside an excluded container
|
|
266
|
-
# (e.g., <p14:sldId> inside <p14:sectionLst> is a reference, not a definition)
|
|
267
222
|
in_excluded_container = any(
|
|
268
223
|
ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
|
|
269
224
|
for ancestor in elem.iterancestors()
|
|
@@ -273,7 +228,6 @@ class BaseSchemaValidator:
|
|
|
273
228
|
|
|
274
229
|
attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
|
|
275
230
|
|
|
276
|
-
# Look for the specified attribute
|
|
277
231
|
id_value = None
|
|
278
232
|
for attr, value in elem.attrib.items():
|
|
279
233
|
attr_local = (
|
|
@@ -287,7 +241,6 @@ class BaseSchemaValidator:
|
|
|
287
241
|
|
|
288
242
|
if id_value is not None:
|
|
289
243
|
if scope == "global":
|
|
290
|
-
# Check global uniqueness
|
|
291
244
|
if id_value in global_ids:
|
|
292
245
|
prev_file, prev_line, prev_tag = global_ids[
|
|
293
246
|
id_value
|
|
@@ -304,7 +257,6 @@ class BaseSchemaValidator:
|
|
|
304
257
|
tag,
|
|
305
258
|
)
|
|
306
259
|
elif scope == "file":
|
|
307
|
-
# Check file-level uniqueness
|
|
308
260
|
key = (tag, attr_name)
|
|
309
261
|
if key not in file_ids:
|
|
310
262
|
file_ids[key] = {}
|
|
@@ -335,12 +287,8 @@ class BaseSchemaValidator:
|
|
|
335
287
|
return True
|
|
336
288
|
|
|
337
289
|
def validate_file_references(self):
|
|
338
|
-
"""
|
|
339
|
-
Validate that all .rels files properly reference files and that all files are referenced.
|
|
340
|
-
"""
|
|
341
290
|
errors = []
|
|
342
291
|
|
|
343
|
-
# Find all .rels files
|
|
344
292
|
rels_files = list(self.unpacked_dir.rglob("*.rels"))
|
|
345
293
|
|
|
346
294
|
if not rels_files:
|
|
@@ -348,17 +296,15 @@ class BaseSchemaValidator:
|
|
|
348
296
|
print("PASSED - No .rels files found")
|
|
349
297
|
return True
|
|
350
298
|
|
|
351
|
-
# Get all files in the unpacked directory (excluding reference files)
|
|
352
299
|
all_files = []
|
|
353
300
|
for file_path in self.unpacked_dir.rglob("*"):
|
|
354
301
|
if (
|
|
355
302
|
file_path.is_file()
|
|
356
303
|
and file_path.name != "[Content_Types].xml"
|
|
357
304
|
and not file_path.name.endswith(".rels")
|
|
358
|
-
):
|
|
305
|
+
):
|
|
359
306
|
all_files.append(file_path.resolve())
|
|
360
307
|
|
|
361
|
-
# Track all files that are referenced by any .rels file
|
|
362
308
|
all_referenced_files = set()
|
|
363
309
|
|
|
364
310
|
if self.verbose:
|
|
@@ -366,16 +312,12 @@ class BaseSchemaValidator:
|
|
|
366
312
|
f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
|
|
367
313
|
)
|
|
368
314
|
|
|
369
|
-
# Check each .rels file
|
|
370
315
|
for rels_file in rels_files:
|
|
371
316
|
try:
|
|
372
|
-
# Parse relationships file
|
|
373
317
|
rels_root = lxml.etree.parse(str(rels_file)).getroot()
|
|
374
318
|
|
|
375
|
-
# Get the directory where this .rels file is located
|
|
376
319
|
rels_dir = rels_file.parent
|
|
377
320
|
|
|
378
|
-
# Find all relationships and their targets
|
|
379
321
|
referenced_files = set()
|
|
380
322
|
broken_refs = []
|
|
381
323
|
|
|
@@ -386,24 +328,15 @@ class BaseSchemaValidator:
|
|
|
386
328
|
target = rel.get("Target")
|
|
387
329
|
if target and not target.startswith(
|
|
388
330
|
("http", "mailto:")
|
|
389
|
-
):
|
|
390
|
-
# Resolve the target path
|
|
391
|
-
# Absolute paths (starting with /) are relative to package root
|
|
392
|
-
# Relative paths are relative to the .rels file's parent directory
|
|
331
|
+
):
|
|
393
332
|
if target.startswith("/"):
|
|
394
|
-
# Absolute path - resolve from unpacked_dir root
|
|
395
|
-
# Strip leading / to avoid pathlib replacing the base
|
|
396
333
|
target_path = self.unpacked_dir / target.lstrip("/")
|
|
397
334
|
elif rels_file.name == ".rels":
|
|
398
|
-
# Root .rels file - relative targets are relative to unpacked_dir
|
|
399
335
|
target_path = self.unpacked_dir / target
|
|
400
336
|
else:
|
|
401
|
-
# Other .rels files - relative targets are relative to their parent's parent
|
|
402
|
-
# e.g., word/_rels/document.xml.rels -> targets relative to word/
|
|
403
337
|
base_dir = rels_dir.parent
|
|
404
338
|
target_path = base_dir / target
|
|
405
339
|
|
|
406
|
-
# Normalize the path and check if it exists
|
|
407
340
|
try:
|
|
408
341
|
target_path = target_path.resolve()
|
|
409
342
|
if target_path.exists() and target_path.is_file():
|
|
@@ -414,7 +347,6 @@ class BaseSchemaValidator:
|
|
|
414
347
|
except (OSError, ValueError):
|
|
415
348
|
broken_refs.append((target, rel.sourceline))
|
|
416
349
|
|
|
417
|
-
# Report broken references
|
|
418
350
|
if broken_refs:
|
|
419
351
|
rel_path = rels_file.relative_to(self.unpacked_dir)
|
|
420
352
|
for broken_ref, line_num in broken_refs:
|
|
@@ -426,7 +358,6 @@ class BaseSchemaValidator:
|
|
|
426
358
|
rel_path = rels_file.relative_to(self.unpacked_dir)
|
|
427
359
|
errors.append(f" Error parsing {rel_path}: {e}")
|
|
428
360
|
|
|
429
|
-
# Check for unreferenced files (files that exist but are not referenced anywhere)
|
|
430
361
|
unreferenced_files = set(all_files) - all_referenced_files
|
|
431
362
|
|
|
432
363
|
if unreferenced_files:
|
|
@@ -452,31 +383,21 @@ class BaseSchemaValidator:
|
|
|
452
383
|
return True
|
|
453
384
|
|
|
454
385
|
def validate_all_relationship_ids(self):
|
|
455
|
-
"""
|
|
456
|
-
Validate that all r:id attributes in XML files reference existing IDs
|
|
457
|
-
in their corresponding .rels files, and optionally validate relationship types.
|
|
458
|
-
"""
|
|
459
386
|
import lxml.etree
|
|
460
387
|
|
|
461
388
|
errors = []
|
|
462
389
|
|
|
463
|
-
# Process each XML file that might contain r:id references
|
|
464
390
|
for xml_file in self.xml_files:
|
|
465
|
-
# Skip .rels files themselves
|
|
466
391
|
if xml_file.suffix == ".rels":
|
|
467
392
|
continue
|
|
468
393
|
|
|
469
|
-
# Determine the corresponding .rels file
|
|
470
|
-
# For dir/file.xml, it's dir/_rels/file.xml.rels
|
|
471
394
|
rels_dir = xml_file.parent / "_rels"
|
|
472
395
|
rels_file = rels_dir / f"{xml_file.name}.rels"
|
|
473
396
|
|
|
474
|
-
# Skip if there's no corresponding .rels file (that's okay)
|
|
475
397
|
if not rels_file.exists():
|
|
476
398
|
continue
|
|
477
399
|
|
|
478
400
|
try:
|
|
479
|
-
# Parse the .rels file to get valid relationship IDs and their types
|
|
480
401
|
rels_root = lxml.etree.parse(str(rels_file)).getroot()
|
|
481
402
|
rid_to_type = {}
|
|
482
403
|
|
|
@@ -486,47 +407,43 @@ class BaseSchemaValidator:
|
|
|
486
407
|
rid = rel.get("Id")
|
|
487
408
|
rel_type = rel.get("Type", "")
|
|
488
409
|
if rid:
|
|
489
|
-
# Check for duplicate rIds
|
|
490
410
|
if rid in rid_to_type:
|
|
491
411
|
rels_rel_path = rels_file.relative_to(self.unpacked_dir)
|
|
492
412
|
errors.append(
|
|
493
413
|
f" {rels_rel_path}: Line {rel.sourceline}: "
|
|
494
414
|
f"Duplicate relationship ID '{rid}' (IDs must be unique)"
|
|
495
415
|
)
|
|
496
|
-
# Extract just the type name from the full URL
|
|
497
416
|
type_name = (
|
|
498
417
|
rel_type.split("/")[-1] if "/" in rel_type else rel_type
|
|
499
418
|
)
|
|
500
419
|
rid_to_type[rid] = type_name
|
|
501
420
|
|
|
502
|
-
# Parse the XML file to find all r:id references
|
|
503
421
|
xml_root = lxml.etree.parse(str(xml_file)).getroot()
|
|
504
422
|
|
|
505
|
-
|
|
423
|
+
r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
|
|
424
|
+
rid_attrs_to_check = ["id", "embed", "link"]
|
|
506
425
|
for elem in xml_root.iter():
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
426
|
+
for attr_name in rid_attrs_to_check:
|
|
427
|
+
rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
|
|
428
|
+
if not rid_attr:
|
|
429
|
+
continue
|
|
510
430
|
xml_rel_path = xml_file.relative_to(self.unpacked_dir)
|
|
511
431
|
elem_name = (
|
|
512
432
|
elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
|
|
513
433
|
)
|
|
514
434
|
|
|
515
|
-
# Check if the ID exists
|
|
516
435
|
if rid_attr not in rid_to_type:
|
|
517
436
|
errors.append(
|
|
518
437
|
f" {xml_rel_path}: Line {elem.sourceline}: "
|
|
519
|
-
f"<{elem_name}> references non-existent relationship '{rid_attr}' "
|
|
438
|
+
f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
|
|
520
439
|
f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
|
|
521
440
|
)
|
|
522
|
-
|
|
523
|
-
elif self.ELEMENT_RELATIONSHIP_TYPES:
|
|
441
|
+
elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
|
|
524
442
|
expected_type = self._get_expected_relationship_type(
|
|
525
443
|
elem_name
|
|
526
444
|
)
|
|
527
445
|
if expected_type:
|
|
528
446
|
actual_type = rid_to_type[rid_attr]
|
|
529
|
-
# Check if the actual type matches or contains the expected type
|
|
530
447
|
if expected_type not in actual_type.lower():
|
|
531
448
|
errors.append(
|
|
532
449
|
f" {xml_rel_path}: Line {elem.sourceline}: "
|
|
@@ -550,58 +467,41 @@ class BaseSchemaValidator:
|
|
|
550
467
|
return True
|
|
551
468
|
|
|
552
469
|
def _get_expected_relationship_type(self, element_name):
|
|
553
|
-
"""
|
|
554
|
-
Get the expected relationship type for an element.
|
|
555
|
-
First checks the explicit mapping, then tries pattern detection.
|
|
556
|
-
"""
|
|
557
|
-
# Normalize element name to lowercase
|
|
558
470
|
elem_lower = element_name.lower()
|
|
559
471
|
|
|
560
|
-
# Check explicit mapping first
|
|
561
472
|
if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
|
|
562
473
|
return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
|
|
563
474
|
|
|
564
|
-
# Try pattern detection for common patterns
|
|
565
|
-
# Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
|
|
566
475
|
if elem_lower.endswith("id") and len(elem_lower) > 2:
|
|
567
|
-
|
|
568
|
-
prefix = elem_lower[:-2] # Remove "id"
|
|
569
|
-
# Check if this might be a compound like "sldMasterId"
|
|
476
|
+
prefix = elem_lower[:-2]
|
|
570
477
|
if prefix.endswith("master"):
|
|
571
478
|
return prefix.lower()
|
|
572
479
|
elif prefix.endswith("layout"):
|
|
573
480
|
return prefix.lower()
|
|
574
481
|
else:
|
|
575
|
-
# Simple case like "sldId" -> "slide"
|
|
576
|
-
# Common transformations
|
|
577
482
|
if prefix == "sld":
|
|
578
483
|
return "slide"
|
|
579
484
|
return prefix.lower()
|
|
580
485
|
|
|
581
|
-
# Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
|
|
582
486
|
if elem_lower.endswith("reference") and len(elem_lower) > 9:
|
|
583
|
-
prefix = elem_lower[:-9]
|
|
487
|
+
prefix = elem_lower[:-9]
|
|
584
488
|
return prefix.lower()
|
|
585
489
|
|
|
586
490
|
return None
|
|
587
491
|
|
|
588
492
|
def validate_content_types(self):
|
|
589
|
-
"""Validate that all content files are properly declared in [Content_Types].xml."""
|
|
590
493
|
errors = []
|
|
591
494
|
|
|
592
|
-
# Find [Content_Types].xml file
|
|
593
495
|
content_types_file = self.unpacked_dir / "[Content_Types].xml"
|
|
594
496
|
if not content_types_file.exists():
|
|
595
497
|
print("FAILED - [Content_Types].xml file not found")
|
|
596
498
|
return False
|
|
597
499
|
|
|
598
500
|
try:
|
|
599
|
-
# Parse and get all declared parts and extensions
|
|
600
501
|
root = lxml.etree.parse(str(content_types_file)).getroot()
|
|
601
502
|
declared_parts = set()
|
|
602
503
|
declared_extensions = set()
|
|
603
504
|
|
|
604
|
-
# Get Override declarations (specific files)
|
|
605
505
|
for override in root.findall(
|
|
606
506
|
f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
|
|
607
507
|
):
|
|
@@ -609,7 +509,6 @@ class BaseSchemaValidator:
|
|
|
609
509
|
if part_name is not None:
|
|
610
510
|
declared_parts.add(part_name.lstrip("/"))
|
|
611
511
|
|
|
612
|
-
# Get Default declarations (by extension)
|
|
613
512
|
for default in root.findall(
|
|
614
513
|
f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
|
|
615
514
|
):
|
|
@@ -617,19 +516,17 @@ class BaseSchemaValidator:
|
|
|
617
516
|
if extension is not None:
|
|
618
517
|
declared_extensions.add(extension.lower())
|
|
619
518
|
|
|
620
|
-
# Root elements that require content type declaration
|
|
621
519
|
declarable_roots = {
|
|
622
520
|
"sld",
|
|
623
521
|
"sldLayout",
|
|
624
522
|
"sldMaster",
|
|
625
|
-
"presentation",
|
|
626
|
-
"document",
|
|
523
|
+
"presentation",
|
|
524
|
+
"document",
|
|
627
525
|
"workbook",
|
|
628
|
-
"worksheet",
|
|
629
|
-
"theme",
|
|
526
|
+
"worksheet",
|
|
527
|
+
"theme",
|
|
630
528
|
}
|
|
631
529
|
|
|
632
|
-
# Common media file extensions that should be declared
|
|
633
530
|
media_extensions = {
|
|
634
531
|
"png": "image/png",
|
|
635
532
|
"jpg": "image/jpeg",
|
|
@@ -641,17 +538,14 @@ class BaseSchemaValidator:
|
|
|
641
538
|
"emf": "image/x-emf",
|
|
642
539
|
}
|
|
643
540
|
|
|
644
|
-
# Get all files in the unpacked directory
|
|
645
541
|
all_files = list(self.unpacked_dir.rglob("*"))
|
|
646
542
|
all_files = [f for f in all_files if f.is_file()]
|
|
647
543
|
|
|
648
|
-
# Check all XML files for Override declarations
|
|
649
544
|
for xml_file in self.xml_files:
|
|
650
545
|
path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
|
|
651
546
|
"\\", "/"
|
|
652
547
|
)
|
|
653
548
|
|
|
654
|
-
# Skip non-content files
|
|
655
549
|
if any(
|
|
656
550
|
skip in path_str
|
|
657
551
|
for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
|
|
@@ -668,11 +562,9 @@ class BaseSchemaValidator:
|
|
|
668
562
|
)
|
|
669
563
|
|
|
670
564
|
except Exception:
|
|
671
|
-
continue
|
|
565
|
+
continue
|
|
672
566
|
|
|
673
|
-
# Check all non-XML files for Default extension declarations
|
|
674
567
|
for file_path in all_files:
|
|
675
|
-
# Skip XML files and metadata files (already checked above)
|
|
676
568
|
if file_path.suffix.lower() in {".xml", ".rels"}:
|
|
677
569
|
continue
|
|
678
570
|
if file_path.name == "[Content_Types].xml":
|
|
@@ -682,7 +574,6 @@ class BaseSchemaValidator:
|
|
|
682
574
|
|
|
683
575
|
extension = file_path.suffix.lstrip(".").lower()
|
|
684
576
|
if extension and extension not in declared_extensions:
|
|
685
|
-
# Check if it's a known media extension that should be declared
|
|
686
577
|
if extension in media_extensions:
|
|
687
578
|
relative_path = file_path.relative_to(self.unpacked_dir)
|
|
688
579
|
errors.append(
|
|
@@ -705,37 +596,23 @@ class BaseSchemaValidator:
|
|
|
705
596
|
return True
|
|
706
597
|
|
|
707
598
|
def validate_file_against_xsd(self, xml_file, verbose=False):
|
|
708
|
-
"""Validate a single XML file against XSD schema, comparing with original.
|
|
709
|
-
|
|
710
|
-
Args:
|
|
711
|
-
xml_file: Path to XML file to validate
|
|
712
|
-
verbose: Enable verbose output
|
|
713
|
-
|
|
714
|
-
Returns:
|
|
715
|
-
tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
|
|
716
|
-
"""
|
|
717
|
-
# Resolve both paths to handle symlinks
|
|
718
599
|
xml_file = Path(xml_file).resolve()
|
|
719
600
|
unpacked_dir = self.unpacked_dir.resolve()
|
|
720
601
|
|
|
721
|
-
# Validate current file
|
|
722
602
|
is_valid, current_errors = self._validate_single_file_xsd(
|
|
723
603
|
xml_file, unpacked_dir
|
|
724
604
|
)
|
|
725
605
|
|
|
726
606
|
if is_valid is None:
|
|
727
|
-
return None, set()
|
|
607
|
+
return None, set()
|
|
728
608
|
elif is_valid:
|
|
729
|
-
return True, set()
|
|
609
|
+
return True, set()
|
|
730
610
|
|
|
731
|
-
# Get errors from original file for this specific file
|
|
732
611
|
original_errors = self._get_original_file_errors(xml_file)
|
|
733
612
|
|
|
734
|
-
# Compare with original (both are guaranteed to be sets here)
|
|
735
613
|
assert current_errors is not None
|
|
736
614
|
new_errors = current_errors - original_errors
|
|
737
615
|
|
|
738
|
-
# Filter out known harmless errors (e.g., LibreOffice element ordering issues)
|
|
739
616
|
new_errors = {
|
|
740
617
|
e for e in new_errors
|
|
741
618
|
if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
|
|
@@ -750,7 +627,6 @@ class BaseSchemaValidator:
|
|
|
750
627
|
print(f" - {truncated}")
|
|
751
628
|
return False, new_errors
|
|
752
629
|
else:
|
|
753
|
-
# All errors existed in original
|
|
754
630
|
if verbose:
|
|
755
631
|
print(
|
|
756
632
|
f"PASSED - No new errors (original had {len(current_errors)} errors)"
|
|
@@ -758,7 +634,6 @@ class BaseSchemaValidator:
|
|
|
758
634
|
return True, set()
|
|
759
635
|
|
|
760
636
|
def validate_against_xsd(self):
|
|
761
|
-
"""Validate XML files against XSD schemas, showing only new errors compared to original."""
|
|
762
637
|
new_errors = []
|
|
763
638
|
original_error_count = 0
|
|
764
639
|
valid_count = 0
|
|
@@ -777,19 +652,16 @@ class BaseSchemaValidator:
|
|
|
777
652
|
valid_count += 1
|
|
778
653
|
continue
|
|
779
654
|
elif is_valid:
|
|
780
|
-
# Had errors but all existed in original
|
|
781
655
|
original_error_count += 1
|
|
782
656
|
valid_count += 1
|
|
783
657
|
continue
|
|
784
658
|
|
|
785
|
-
# Has new errors
|
|
786
659
|
new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
|
|
787
|
-
for error in list(new_file_errors)[:3]:
|
|
660
|
+
for error in list(new_file_errors)[:3]:
|
|
788
661
|
new_errors.append(
|
|
789
662
|
f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
|
|
790
663
|
)
|
|
791
664
|
|
|
792
|
-
# Print summary
|
|
793
665
|
if self.verbose:
|
|
794
666
|
print(f"Validated {len(self.xml_files)} files:")
|
|
795
667
|
print(f" - Valid: {valid_count}")
|
|
@@ -811,62 +683,47 @@ class BaseSchemaValidator:
|
|
|
811
683
|
return True
|
|
812
684
|
|
|
813
685
|
def _get_schema_path(self, xml_file):
|
|
814
|
-
"""Determine the appropriate schema path for an XML file."""
|
|
815
|
-
# Check exact filename match
|
|
816
686
|
if xml_file.name in self.SCHEMA_MAPPINGS:
|
|
817
687
|
return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
|
|
818
688
|
|
|
819
|
-
# Check .rels files
|
|
820
689
|
if xml_file.suffix == ".rels":
|
|
821
690
|
return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
|
|
822
691
|
|
|
823
|
-
# Check chart files
|
|
824
692
|
if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
|
|
825
693
|
return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
|
|
826
694
|
|
|
827
|
-
# Check theme files
|
|
828
695
|
if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
|
|
829
696
|
return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
|
|
830
697
|
|
|
831
|
-
# Check if file is in a main content folder and use appropriate schema
|
|
832
698
|
if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
|
|
833
699
|
return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
|
|
834
700
|
|
|
835
701
|
return None
|
|
836
702
|
|
|
837
703
|
def _clean_ignorable_namespaces(self, xml_doc):
|
|
838
|
-
"""Remove attributes and elements not in allowed namespaces."""
|
|
839
|
-
# Create a clean copy
|
|
840
704
|
xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
|
|
841
705
|
xml_copy = lxml.etree.fromstring(xml_string)
|
|
842
706
|
|
|
843
|
-
# Remove attributes not in allowed namespaces
|
|
844
707
|
for elem in xml_copy.iter():
|
|
845
708
|
attrs_to_remove = []
|
|
846
709
|
|
|
847
710
|
for attr in elem.attrib:
|
|
848
|
-
# Check if attribute is from a namespace other than allowed ones
|
|
849
711
|
if "{" in attr:
|
|
850
712
|
ns = attr.split("}")[0][1:]
|
|
851
713
|
if ns not in self.OOXML_NAMESPACES:
|
|
852
714
|
attrs_to_remove.append(attr)
|
|
853
715
|
|
|
854
|
-
# Remove collected attributes
|
|
855
716
|
for attr in attrs_to_remove:
|
|
856
717
|
del elem.attrib[attr]
|
|
857
718
|
|
|
858
|
-
# Remove elements not in allowed namespaces
|
|
859
719
|
self._remove_ignorable_elements(xml_copy)
|
|
860
720
|
|
|
861
721
|
return lxml.etree.ElementTree(xml_copy)
|
|
862
722
|
|
|
863
723
|
def _remove_ignorable_elements(self, root):
|
|
864
|
-
"""Recursively remove all elements not in allowed namespaces."""
|
|
865
724
|
elements_to_remove = []
|
|
866
725
|
|
|
867
|
-
# Find elements to remove
|
|
868
726
|
for elem in list(root):
|
|
869
|
-
# Skip non-element nodes (comments, processing instructions, etc.)
|
|
870
727
|
if not hasattr(elem, "tag") or callable(elem.tag):
|
|
871
728
|
continue
|
|
872
729
|
|
|
@@ -877,32 +734,25 @@ class BaseSchemaValidator:
|
|
|
877
734
|
elements_to_remove.append(elem)
|
|
878
735
|
continue
|
|
879
736
|
|
|
880
|
-
# Recursively clean child elements
|
|
881
737
|
self._remove_ignorable_elements(elem)
|
|
882
738
|
|
|
883
|
-
# Remove collected elements
|
|
884
739
|
for elem in elements_to_remove:
|
|
885
740
|
root.remove(elem)
|
|
886
741
|
|
|
887
742
|
def _preprocess_for_mc_ignorable(self, xml_doc):
|
|
888
|
-
"""Preprocess XML to handle mc:Ignorable attribute properly."""
|
|
889
|
-
# Remove mc:Ignorable attributes before validation
|
|
890
743
|
root = xml_doc.getroot()
|
|
891
744
|
|
|
892
|
-
# Remove mc:Ignorable attribute from root
|
|
893
745
|
if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
|
|
894
746
|
del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
|
|
895
747
|
|
|
896
748
|
return xml_doc
|
|
897
749
|
|
|
898
750
|
def _validate_single_file_xsd(self, xml_file, base_path):
|
|
899
|
-
"""Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
|
|
900
751
|
schema_path = self._get_schema_path(xml_file)
|
|
901
752
|
if not schema_path:
|
|
902
|
-
return None, None
|
|
753
|
+
return None, None
|
|
903
754
|
|
|
904
755
|
try:
|
|
905
|
-
# Load schema
|
|
906
756
|
with open(schema_path, "rb") as xsd_file:
|
|
907
757
|
parser = lxml.etree.XMLParser()
|
|
908
758
|
xsd_doc = lxml.etree.parse(
|
|
@@ -910,14 +760,12 @@ class BaseSchemaValidator:
|
|
|
910
760
|
)
|
|
911
761
|
schema = lxml.etree.XMLSchema(xsd_doc)
|
|
912
762
|
|
|
913
|
-
# Load and preprocess XML
|
|
914
763
|
with open(xml_file, "r") as f:
|
|
915
764
|
xml_doc = lxml.etree.parse(f)
|
|
916
765
|
|
|
917
766
|
xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
|
|
918
767
|
xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
|
|
919
768
|
|
|
920
|
-
# Clean ignorable namespaces if needed
|
|
921
769
|
relative_path = xml_file.relative_to(base_path)
|
|
922
770
|
if (
|
|
923
771
|
relative_path.parts
|
|
@@ -925,13 +773,11 @@ class BaseSchemaValidator:
|
|
|
925
773
|
):
|
|
926
774
|
xml_doc = self._clean_ignorable_namespaces(xml_doc)
|
|
927
775
|
|
|
928
|
-
# Validate
|
|
929
776
|
if schema.validate(xml_doc):
|
|
930
777
|
return True, set()
|
|
931
778
|
else:
|
|
932
779
|
errors = set()
|
|
933
780
|
for error in schema.error_log:
|
|
934
|
-
# Store normalized error message (without line numbers for comparison)
|
|
935
781
|
errors.add(error.message)
|
|
936
782
|
return False, errors
|
|
937
783
|
|
|
@@ -939,18 +785,12 @@ class BaseSchemaValidator:
|
|
|
939
785
|
return False, {str(e)}
|
|
940
786
|
|
|
941
787
|
def _get_original_file_errors(self, xml_file):
|
|
942
|
-
|
|
788
|
+
if self.original_file is None:
|
|
789
|
+
return set()
|
|
943
790
|
|
|
944
|
-
Args:
|
|
945
|
-
xml_file: Path to the XML file in unpacked_dir to check
|
|
946
|
-
|
|
947
|
-
Returns:
|
|
948
|
-
set: Set of error messages from the original file
|
|
949
|
-
"""
|
|
950
791
|
import tempfile
|
|
951
792
|
import zipfile
|
|
952
793
|
|
|
953
|
-
# Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
|
|
954
794
|
xml_file = Path(xml_file).resolve()
|
|
955
795
|
unpacked_dir = self.unpacked_dir.resolve()
|
|
956
796
|
relative_path = xml_file.relative_to(unpacked_dir)
|
|
@@ -958,37 +798,23 @@ class BaseSchemaValidator:
|
|
|
958
798
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
959
799
|
temp_path = Path(temp_dir)
|
|
960
800
|
|
|
961
|
-
# Extract original file
|
|
962
801
|
with zipfile.ZipFile(self.original_file, "r") as zip_ref:
|
|
963
802
|
zip_ref.extractall(temp_path)
|
|
964
803
|
|
|
965
|
-
# Find corresponding file in original
|
|
966
804
|
original_xml_file = temp_path / relative_path
|
|
967
805
|
|
|
968
806
|
if not original_xml_file.exists():
|
|
969
|
-
# File didn't exist in original, so no original errors
|
|
970
807
|
return set()
|
|
971
808
|
|
|
972
|
-
# Validate the specific file in original
|
|
973
809
|
is_valid, errors = self._validate_single_file_xsd(
|
|
974
810
|
original_xml_file, temp_path
|
|
975
811
|
)
|
|
976
812
|
return errors if errors else set()
|
|
977
813
|
|
|
978
814
|
def _remove_template_tags_from_text_nodes(self, xml_doc):
|
|
979
|
-
"""Remove template tags from XML text nodes and collect warnings.
|
|
980
|
-
|
|
981
|
-
Template tags follow the pattern {{ ... }} and are used as placeholders
|
|
982
|
-
for content replacement. They should be removed from text content before
|
|
983
|
-
XSD validation while preserving XML structure.
|
|
984
|
-
|
|
985
|
-
Returns:
|
|
986
|
-
tuple: (cleaned_xml_doc, warnings_list)
|
|
987
|
-
"""
|
|
988
815
|
warnings = []
|
|
989
816
|
template_pattern = re.compile(r"\{\{[^}]*\}\}")
|
|
990
817
|
|
|
991
|
-
# Create a copy of the document to avoid modifying the original
|
|
992
818
|
xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
|
|
993
819
|
xml_copy = lxml.etree.fromstring(xml_string)
|
|
994
820
|
|
|
@@ -1004,9 +830,7 @@ class BaseSchemaValidator:
|
|
|
1004
830
|
return template_pattern.sub("", text)
|
|
1005
831
|
return text
|
|
1006
832
|
|
|
1007
|
-
# Process all text nodes in the document
|
|
1008
833
|
for elem in xml_copy.iter():
|
|
1009
|
-
# Skip processing if this is a w:t element
|
|
1010
834
|
if not hasattr(elem, "tag") or callable(elem.tag):
|
|
1011
835
|
continue
|
|
1012
836
|
tag_str = str(elem.tag)
|