@heylemon/lemonade 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/canvas-host/a2ui/.bundle.hash +1 -1
- package/dist/gateway/skills-http.js +74 -19
- package/package.json +1 -1
- package/skills/docx/SKILL.md +25 -30
- package/skills/docx/scripts/accept_changes.py +0 -17
- package/skills/docx/scripts/comment.py +10 -39
- package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/docx/scripts/office/pack.py +0 -30
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/docx/scripts/office/soffice.py +0 -55
- package/skills/docx/scripts/office/unpack.py +5 -27
- package/skills/docx/scripts/office/validate.py +19 -14
- package/skills/docx/scripts/office/validators/base.py +48 -224
- package/skills/docx/scripts/office/validators/docx.py +44 -117
- package/skills/docx/scripts/office/validators/pptx.py +2 -42
- package/skills/docx/scripts/office/validators/redlining.py +3 -40
- package/skills/pdf/SKILL.md +22 -15
- package/skills/pdf/{FORMS.md → forms.md} +0 -14
- package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
- package/skills/pdf/scripts/check_fillable_fields.py +0 -1
- package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
- package/skills/pdf/scripts/create_validation_image.py +0 -4
- package/skills/pdf/scripts/extract_form_field_info.py +1 -31
- package/skills/pdf/scripts/extract_form_structure.py +0 -9
- package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
- package/skills/pptx/SKILL.md +2 -29
- package/skills/pptx/editing.md +2 -2
- package/skills/pptx/pptxgenjs.md +53 -8
- package/skills/pptx/scripts/add_slide.py +0 -30
- package/skills/pptx/scripts/clean.py +0 -23
- package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/pptx/scripts/office/pack.py +0 -30
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/pptx/scripts/office/soffice.py +0 -55
- package/skills/pptx/scripts/office/unpack.py +5 -27
- package/skills/pptx/scripts/office/validate.py +19 -14
- package/skills/pptx/scripts/office/validators/base.py +48 -224
- package/skills/pptx/scripts/office/validators/docx.py +44 -117
- package/skills/pptx/scripts/office/validators/pptx.py +2 -42
- package/skills/pptx/scripts/office/validators/redlining.py +3 -40
- package/skills/pptx/scripts/thumbnail.py +0 -31
- package/skills/xlsx/SKILL.md +3 -26
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/xlsx/scripts/office/pack.py +0 -30
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/xlsx/scripts/office/soffice.py +0 -55
- package/skills/xlsx/scripts/office/unpack.py +5 -27
- package/skills/xlsx/scripts/office/validate.py +19 -14
- package/skills/xlsx/scripts/office/validators/base.py +48 -224
- package/skills/xlsx/scripts/office/validators/docx.py +44 -117
- package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
- package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
- package/skills/xlsx/scripts/recalc.py +2 -26
- package/skills/docx/scripts/__init__.py +0 -1
- package/skills/docx/scripts/office/helpers/__init__.py +0 -0
- package/skills/docx/scripts/office/validators/__init__.py +0 -15
- package/skills/pptx/scripts/__init__.py +0 -0
- package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
- package/skills/pptx/scripts/office/validators/__init__.py +0 -15
- package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
- /package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
|
@@ -14,100 +14,76 @@ from .base import BaseSchemaValidator
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class DOCXSchemaValidator(BaseSchemaValidator):
|
|
17
|
-
"""Validator for Word document XML files against XSD schemas."""
|
|
18
17
|
|
|
19
|
-
# Word-specific namespaces
|
|
20
18
|
WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
21
19
|
W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
|
|
22
20
|
W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"
|
|
23
21
|
|
|
24
|
-
# Word-specific element to relationship type mappings
|
|
25
|
-
# Start with empty mapping - add specific cases as we discover them
|
|
26
22
|
ELEMENT_RELATIONSHIP_TYPES = {}
|
|
27
23
|
|
|
28
24
|
def validate(self):
|
|
29
|
-
"""Run all validation checks and return True if all pass."""
|
|
30
|
-
# Test 0: XML well-formedness
|
|
31
25
|
if not self.validate_xml():
|
|
32
26
|
return False
|
|
33
27
|
|
|
34
|
-
# Test 1: Namespace declarations
|
|
35
28
|
all_valid = True
|
|
36
29
|
if not self.validate_namespaces():
|
|
37
30
|
all_valid = False
|
|
38
31
|
|
|
39
|
-
# Test 2: Unique IDs
|
|
40
32
|
if not self.validate_unique_ids():
|
|
41
33
|
all_valid = False
|
|
42
34
|
|
|
43
|
-
# Test 3: Relationship and file reference validation
|
|
44
35
|
if not self.validate_file_references():
|
|
45
36
|
all_valid = False
|
|
46
37
|
|
|
47
|
-
# Test 4: Content type declarations
|
|
48
38
|
if not self.validate_content_types():
|
|
49
39
|
all_valid = False
|
|
50
40
|
|
|
51
|
-
# Test 5: XSD schema validation
|
|
52
41
|
if not self.validate_against_xsd():
|
|
53
42
|
all_valid = False
|
|
54
43
|
|
|
55
|
-
# Test 6: Whitespace preservation
|
|
56
44
|
if not self.validate_whitespace_preservation():
|
|
57
45
|
all_valid = False
|
|
58
46
|
|
|
59
|
-
# Test 7: Deletion validation
|
|
60
47
|
if not self.validate_deletions():
|
|
61
48
|
all_valid = False
|
|
62
49
|
|
|
63
|
-
# Test 8: Insertion validation
|
|
64
50
|
if not self.validate_insertions():
|
|
65
51
|
all_valid = False
|
|
66
52
|
|
|
67
|
-
# Test 9: Relationship ID reference validation
|
|
68
53
|
if not self.validate_all_relationship_ids():
|
|
69
54
|
all_valid = False
|
|
70
55
|
|
|
71
|
-
# Test 10: ID constraints (paraId, durableId)
|
|
72
56
|
if not self.validate_id_constraints():
|
|
73
57
|
all_valid = False
|
|
74
58
|
|
|
75
|
-
# Test 11: Comment marker validation
|
|
76
59
|
if not self.validate_comment_markers():
|
|
77
60
|
all_valid = False
|
|
78
61
|
|
|
79
|
-
# Count and compare paragraphs
|
|
80
62
|
self.compare_paragraph_counts()
|
|
81
63
|
|
|
82
64
|
return all_valid
|
|
83
65
|
|
|
84
66
|
def validate_whitespace_preservation(self):
|
|
85
|
-
"""
|
|
86
|
-
Validate that w:t elements with whitespace have xml:space='preserve'.
|
|
87
|
-
"""
|
|
88
67
|
errors = []
|
|
89
68
|
|
|
90
69
|
for xml_file in self.xml_files:
|
|
91
|
-
# Only check document.xml files
|
|
92
70
|
if xml_file.name != "document.xml":
|
|
93
71
|
continue
|
|
94
72
|
|
|
95
73
|
try:
|
|
96
74
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
97
75
|
|
|
98
|
-
# Find all w:t elements
|
|
99
76
|
for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
|
|
100
77
|
if elem.text:
|
|
101
78
|
text = elem.text
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
79
|
+
if re.search(r"^[ \t\n\r]", text) or re.search(
|
|
80
|
+
r"[ \t\n\r]$", text
|
|
81
|
+
):
|
|
105
82
|
xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
|
|
106
83
|
if (
|
|
107
84
|
xml_space_attr not in elem.attrib
|
|
108
85
|
or elem.attrib[xml_space_attr] != "preserve"
|
|
109
86
|
):
|
|
110
|
-
# Show a preview of the text
|
|
111
87
|
text_preview = (
|
|
112
88
|
repr(text)[:50] + "..."
|
|
113
89
|
if len(repr(text)) > 50
|
|
@@ -134,15 +110,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
134
110
|
return True
|
|
135
111
|
|
|
136
112
|
def validate_deletions(self):
|
|
137
|
-
"""
|
|
138
|
-
Validate that w:t and w:instrText elements are not within w:del elements.
|
|
139
|
-
Inside w:del, use w:delText and w:delInstrText instead.
|
|
140
|
-
XSD validation does not catch this, so we do it manually.
|
|
141
|
-
"""
|
|
142
113
|
errors = []
|
|
143
114
|
|
|
144
115
|
for xml_file in self.xml_files:
|
|
145
|
-
# Only check document.xml files
|
|
146
116
|
if xml_file.name != "document.xml":
|
|
147
117
|
continue
|
|
148
118
|
|
|
@@ -150,10 +120,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
150
120
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
151
121
|
namespaces = {"w": self.WORD_2006_NAMESPACE}
|
|
152
122
|
|
|
153
|
-
# Find all w:t elements that are descendants of w:del elements
|
|
154
123
|
for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
|
|
155
124
|
if t_elem.text:
|
|
156
|
-
# Show a preview of the text
|
|
157
125
|
text_preview = (
|
|
158
126
|
repr(t_elem.text)[:50] + "..."
|
|
159
127
|
if len(repr(t_elem.text)) > 50
|
|
@@ -164,9 +132,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
164
132
|
f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
|
|
165
133
|
)
|
|
166
134
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
135
|
+
for instr_elem in root.xpath(
|
|
136
|
+
".//w:del//w:instrText", namespaces=namespaces
|
|
137
|
+
):
|
|
170
138
|
text_preview = (
|
|
171
139
|
repr(instr_elem.text or "")[:50] + "..."
|
|
172
140
|
if len(repr(instr_elem.text or "")) > 50
|
|
@@ -193,17 +161,14 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
193
161
|
return True
|
|
194
162
|
|
|
195
163
|
def count_paragraphs_in_unpacked(self):
|
|
196
|
-
"""Count the number of paragraphs in the unpacked document."""
|
|
197
164
|
count = 0
|
|
198
165
|
|
|
199
166
|
for xml_file in self.xml_files:
|
|
200
|
-
# Only check document.xml files
|
|
201
167
|
if xml_file.name != "document.xml":
|
|
202
168
|
continue
|
|
203
169
|
|
|
204
170
|
try:
|
|
205
171
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
206
|
-
# Count all w:p elements
|
|
207
172
|
paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
|
|
208
173
|
count = len(paragraphs)
|
|
209
174
|
except Exception as e:
|
|
@@ -212,21 +177,20 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
212
177
|
return count
|
|
213
178
|
|
|
214
179
|
def count_paragraphs_in_original(self):
|
|
215
|
-
|
|
180
|
+
original = self.original_file
|
|
181
|
+
if original is None:
|
|
182
|
+
return 0
|
|
183
|
+
|
|
216
184
|
count = 0
|
|
217
185
|
|
|
218
186
|
try:
|
|
219
|
-
# Create temporary directory to unpack original
|
|
220
187
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
221
|
-
|
|
222
|
-
with zipfile.ZipFile(self.original_file, "r") as zip_ref:
|
|
188
|
+
with zipfile.ZipFile(original, "r") as zip_ref:
|
|
223
189
|
zip_ref.extractall(temp_dir)
|
|
224
190
|
|
|
225
|
-
# Parse document.xml
|
|
226
191
|
doc_xml_path = temp_dir + "/word/document.xml"
|
|
227
192
|
root = lxml.etree.parse(doc_xml_path).getroot()
|
|
228
193
|
|
|
229
|
-
# Count all w:p elements
|
|
230
194
|
paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
|
|
231
195
|
count = len(paragraphs)
|
|
232
196
|
|
|
@@ -236,10 +200,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
236
200
|
return count
|
|
237
201
|
|
|
238
202
|
def validate_insertions(self):
|
|
239
|
-
"""
|
|
240
|
-
Validate that w:delText elements are not within w:ins elements.
|
|
241
|
-
w:delText is only allowed in w:ins if nested within a w:del.
|
|
242
|
-
"""
|
|
243
203
|
errors = []
|
|
244
204
|
|
|
245
205
|
for xml_file in self.xml_files:
|
|
@@ -250,7 +210,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
250
210
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
251
211
|
namespaces = {"w": self.WORD_2006_NAMESPACE}
|
|
252
212
|
|
|
253
|
-
# Find w:delText in w:ins that are NOT within w:del
|
|
254
213
|
invalid_elements = root.xpath(
|
|
255
214
|
".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
|
|
256
215
|
)
|
|
@@ -282,7 +241,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
282
241
|
return True
|
|
283
242
|
|
|
284
243
|
def compare_paragraph_counts(self):
|
|
285
|
-
"""Compare paragraph counts between original and new document."""
|
|
286
244
|
original_count = self.count_paragraphs_in_original()
|
|
287
245
|
new_count = self.count_paragraphs_in_unpacked()
|
|
288
246
|
|
|
@@ -291,24 +249,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
291
249
|
print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
|
|
292
250
|
|
|
293
251
|
def _parse_id_value(self, val: str, base: int = 16) -> int:
|
|
294
|
-
"""Parse an ID value as hex (base=16) or decimal (base=10).
|
|
295
|
-
|
|
296
|
-
Args:
|
|
297
|
-
val: The string value to parse
|
|
298
|
-
base: The numeric base (16 for hex, 10 for decimal)
|
|
299
|
-
|
|
300
|
-
Returns:
|
|
301
|
-
The parsed integer value
|
|
302
|
-
"""
|
|
303
252
|
return int(val, base)
|
|
304
253
|
|
|
305
254
|
def validate_id_constraints(self):
|
|
306
|
-
"""Validate paraId and durableId values per OOXML spec.
|
|
307
|
-
|
|
308
|
-
Checks:
|
|
309
|
-
- paraId < 0x80000000 (always hex)
|
|
310
|
-
- durableId < 0x7FFFFFFF (decimal in numbering.xml, hex elsewhere)
|
|
311
|
-
"""
|
|
312
255
|
errors = []
|
|
313
256
|
para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId"
|
|
314
257
|
durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId"
|
|
@@ -316,7 +259,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
316
259
|
for xml_file in self.xml_files:
|
|
317
260
|
try:
|
|
318
261
|
for elem in lxml.etree.parse(str(xml_file)).iter():
|
|
319
|
-
# paraId is always hex format
|
|
320
262
|
if val := elem.get(para_id_attr):
|
|
321
263
|
if self._parse_id_value(val, base=16) >= 0x80000000:
|
|
322
264
|
errors.append(
|
|
@@ -324,8 +266,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
324
266
|
)
|
|
325
267
|
|
|
326
268
|
if val := elem.get(durable_id_attr):
|
|
327
|
-
# durableId in numbering.xml must be decimal.
|
|
328
|
-
# Word rejects hex-formatted durableIds in numbering.xml.
|
|
329
269
|
if xml_file.name == "numbering.xml":
|
|
330
270
|
try:
|
|
331
271
|
if self._parse_id_value(val, base=10) >= 0x7FFFFFFF:
|
|
@@ -334,12 +274,10 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
334
274
|
f"durableId={val} >= 0x7FFFFFFF"
|
|
335
275
|
)
|
|
336
276
|
except ValueError:
|
|
337
|
-
# Contains non-decimal characters (e.g., hex letters A-F)
|
|
338
277
|
errors.append(
|
|
339
278
|
f" {xml_file.name}:{elem.sourceline}: "
|
|
340
279
|
f"durableId={val} must be decimal in numbering.xml"
|
|
341
280
|
)
|
|
342
|
-
# durableId in other files (e.g. commentsIds.xml) uses hex format
|
|
343
281
|
else:
|
|
344
282
|
if self._parse_id_value(val, base=16) >= 0x7FFFFFFF:
|
|
345
283
|
errors.append(
|
|
@@ -358,16 +296,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
358
296
|
return not errors
|
|
359
297
|
|
|
360
298
|
def validate_comment_markers(self):
|
|
361
|
-
"""Validate comment markers are properly paired and reference existing comments.
|
|
362
|
-
|
|
363
|
-
Checks:
|
|
364
|
-
- Every commentRangeStart has a matching commentRangeEnd
|
|
365
|
-
- Every commentRangeEnd has a matching commentRangeStart
|
|
366
|
-
- Every marker in document.xml references an existing comment
|
|
367
|
-
"""
|
|
368
299
|
errors = []
|
|
369
300
|
|
|
370
|
-
# Find document.xml and comments.xml
|
|
371
301
|
document_xml = None
|
|
372
302
|
comments_xml = None
|
|
373
303
|
for xml_file in self.xml_files:
|
|
@@ -385,50 +315,59 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
385
315
|
doc_root = lxml.etree.parse(str(document_xml)).getroot()
|
|
386
316
|
namespaces = {"w": self.WORD_2006_NAMESPACE}
|
|
387
317
|
|
|
388
|
-
# Collect all comment marker IDs from document.xml
|
|
389
318
|
range_starts = {
|
|
390
319
|
elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
|
|
391
|
-
for elem in doc_root.xpath(
|
|
320
|
+
for elem in doc_root.xpath(
|
|
321
|
+
".//w:commentRangeStart", namespaces=namespaces
|
|
322
|
+
)
|
|
392
323
|
}
|
|
393
324
|
range_ends = {
|
|
394
325
|
elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
|
|
395
|
-
for elem in doc_root.xpath(
|
|
326
|
+
for elem in doc_root.xpath(
|
|
327
|
+
".//w:commentRangeEnd", namespaces=namespaces
|
|
328
|
+
)
|
|
396
329
|
}
|
|
397
330
|
references = {
|
|
398
331
|
elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
|
|
399
|
-
for elem in doc_root.xpath(
|
|
332
|
+
for elem in doc_root.xpath(
|
|
333
|
+
".//w:commentReference", namespaces=namespaces
|
|
334
|
+
)
|
|
400
335
|
}
|
|
401
336
|
|
|
402
|
-
# Check for orphaned commentRangeEnd (missing commentRangeStart)
|
|
403
337
|
orphaned_ends = range_ends - range_starts
|
|
404
|
-
for comment_id in sorted(
|
|
338
|
+
for comment_id in sorted(
|
|
339
|
+
orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0
|
|
340
|
+
):
|
|
405
341
|
errors.append(
|
|
406
|
-
f
|
|
342
|
+
f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart'
|
|
407
343
|
)
|
|
408
344
|
|
|
409
|
-
# Check for orphaned commentRangeStart (missing commentRangeEnd)
|
|
410
345
|
orphaned_starts = range_starts - range_ends
|
|
411
|
-
for comment_id in sorted(
|
|
346
|
+
for comment_id in sorted(
|
|
347
|
+
orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0
|
|
348
|
+
):
|
|
412
349
|
errors.append(
|
|
413
|
-
f
|
|
350
|
+
f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd'
|
|
414
351
|
)
|
|
415
352
|
|
|
416
|
-
# Get comment IDs from comments.xml if it exists
|
|
417
353
|
comment_ids = set()
|
|
418
354
|
if comments_xml and comments_xml.exists():
|
|
419
355
|
comments_root = lxml.etree.parse(str(comments_xml)).getroot()
|
|
420
356
|
comment_ids = {
|
|
421
357
|
elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
|
|
422
|
-
for elem in comments_root.xpath(
|
|
358
|
+
for elem in comments_root.xpath(
|
|
359
|
+
".//w:comment", namespaces=namespaces
|
|
360
|
+
)
|
|
423
361
|
}
|
|
424
362
|
|
|
425
|
-
# Check for markers referencing non-existent comments
|
|
426
363
|
marker_ids = range_starts | range_ends | references
|
|
427
364
|
invalid_refs = marker_ids - comment_ids
|
|
428
|
-
for comment_id in sorted(
|
|
429
|
-
|
|
365
|
+
for comment_id in sorted(
|
|
366
|
+
invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0
|
|
367
|
+
):
|
|
368
|
+
if comment_id:
|
|
430
369
|
errors.append(
|
|
431
|
-
f
|
|
370
|
+
f' document.xml: marker id="{comment_id}" references non-existent comment'
|
|
432
371
|
)
|
|
433
372
|
|
|
434
373
|
except (lxml.etree.XMLSyntaxError, Exception) as e:
|
|
@@ -445,22 +384,11 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
445
384
|
return True
|
|
446
385
|
|
|
447
386
|
def repair(self) -> int:
|
|
448
|
-
"""Run DOCX-specific auto-repairs."""
|
|
449
387
|
repairs = super().repair()
|
|
450
388
|
repairs += self.repair_durableId()
|
|
451
389
|
return repairs
|
|
452
390
|
|
|
453
391
|
def repair_durableId(self) -> int:
|
|
454
|
-
"""Fix invalid durableId values.
|
|
455
|
-
|
|
456
|
-
Repairs:
|
|
457
|
-
- durableId >= 0x7FFFFFFF (value out of range)
|
|
458
|
-
- durableId with hex letters in numbering.xml (wrong format)
|
|
459
|
-
|
|
460
|
-
Note: paraId is not auto-repaired because it may be referenced by
|
|
461
|
-
commentsExtended.xml, commentsIds.xml, and comment threading (paraIdParent).
|
|
462
|
-
Changing paraId without updating all references would break comment associations.
|
|
463
|
-
"""
|
|
464
392
|
repairs = 0
|
|
465
393
|
|
|
466
394
|
for xml_file in self.xml_files:
|
|
@@ -476,28 +404,27 @@ class DOCXSchemaValidator(BaseSchemaValidator):
|
|
|
476
404
|
durable_id = elem.getAttribute("w16cid:durableId")
|
|
477
405
|
needs_repair = False
|
|
478
406
|
|
|
479
|
-
# Check if durableId needs repair based on file type
|
|
480
407
|
if xml_file.name == "numbering.xml":
|
|
481
|
-
# numbering.xml requires decimal format
|
|
482
408
|
try:
|
|
483
|
-
needs_repair =
|
|
409
|
+
needs_repair = (
|
|
410
|
+
self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
|
|
411
|
+
)
|
|
484
412
|
except ValueError:
|
|
485
|
-
# Contains non-decimal characters (e.g., hex letters A-F)
|
|
486
413
|
needs_repair = True
|
|
487
414
|
else:
|
|
488
|
-
# Other files (e.g. commentsIds.xml) use hex format
|
|
489
415
|
try:
|
|
490
|
-
needs_repair =
|
|
416
|
+
needs_repair = (
|
|
417
|
+
self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
|
|
418
|
+
)
|
|
491
419
|
except ValueError:
|
|
492
420
|
needs_repair = True
|
|
493
421
|
|
|
494
422
|
if needs_repair:
|
|
495
|
-
# Generate new ID in the correct format for this file type
|
|
496
423
|
value = random.randint(1, 0x7FFFFFFE)
|
|
497
424
|
if xml_file.name == "numbering.xml":
|
|
498
|
-
new_id = str(value)
|
|
425
|
+
new_id = str(value)
|
|
499
426
|
else:
|
|
500
|
-
new_id = f"{value:08X}"
|
|
427
|
+
new_id = f"{value:08X}"
|
|
501
428
|
|
|
502
429
|
elem.setAttribute("w16cid:durableId", new_id)
|
|
503
430
|
print(
|
|
@@ -8,14 +8,11 @@ from .base import BaseSchemaValidator
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class PPTXSchemaValidator(BaseSchemaValidator):
|
|
11
|
-
"""Validator for PowerPoint presentation XML files against XSD schemas."""
|
|
12
11
|
|
|
13
|
-
# PowerPoint presentation namespace
|
|
14
12
|
PRESENTATIONML_NAMESPACE = (
|
|
15
13
|
"http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
16
14
|
)
|
|
17
15
|
|
|
18
|
-
# PowerPoint-specific element to relationship type mappings
|
|
19
16
|
ELEMENT_RELATIONSHIP_TYPES = {
|
|
20
17
|
"sldid": "slide",
|
|
21
18
|
"sldmasterid": "slidemaster",
|
|
@@ -26,60 +23,46 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
26
23
|
}
|
|
27
24
|
|
|
28
25
|
def validate(self):
|
|
29
|
-
"""Run all validation checks and return True if all pass."""
|
|
30
|
-
# Test 0: XML well-formedness
|
|
31
26
|
if not self.validate_xml():
|
|
32
27
|
return False
|
|
33
28
|
|
|
34
|
-
# Test 1: Namespace declarations
|
|
35
29
|
all_valid = True
|
|
36
30
|
if not self.validate_namespaces():
|
|
37
31
|
all_valid = False
|
|
38
32
|
|
|
39
|
-
# Test 2: Unique IDs
|
|
40
33
|
if not self.validate_unique_ids():
|
|
41
34
|
all_valid = False
|
|
42
35
|
|
|
43
|
-
# Test 3: UUID ID validation
|
|
44
36
|
if not self.validate_uuid_ids():
|
|
45
37
|
all_valid = False
|
|
46
38
|
|
|
47
|
-
# Test 4: Relationship and file reference validation
|
|
48
39
|
if not self.validate_file_references():
|
|
49
40
|
all_valid = False
|
|
50
41
|
|
|
51
|
-
# Test 5: Slide layout ID validation
|
|
52
42
|
if not self.validate_slide_layout_ids():
|
|
53
43
|
all_valid = False
|
|
54
44
|
|
|
55
|
-
# Test 6: Content type declarations
|
|
56
45
|
if not self.validate_content_types():
|
|
57
46
|
all_valid = False
|
|
58
47
|
|
|
59
|
-
# Test 7: XSD schema validation
|
|
60
48
|
if not self.validate_against_xsd():
|
|
61
49
|
all_valid = False
|
|
62
50
|
|
|
63
|
-
# Test 8: Notes slide reference validation
|
|
64
51
|
if not self.validate_notes_slide_references():
|
|
65
52
|
all_valid = False
|
|
66
53
|
|
|
67
|
-
# Test 9: Relationship ID reference validation
|
|
68
54
|
if not self.validate_all_relationship_ids():
|
|
69
55
|
all_valid = False
|
|
70
56
|
|
|
71
|
-
# Test 10: Duplicate slide layout references validation
|
|
72
57
|
if not self.validate_no_duplicate_slide_layouts():
|
|
73
58
|
all_valid = False
|
|
74
59
|
|
|
75
60
|
return all_valid
|
|
76
61
|
|
|
77
62
|
def validate_uuid_ids(self):
|
|
78
|
-
"""Validate that ID attributes that look like UUIDs contain only hex values."""
|
|
79
63
|
import lxml.etree
|
|
80
64
|
|
|
81
65
|
errors = []
|
|
82
|
-
# UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
|
|
83
66
|
uuid_pattern = re.compile(
|
|
84
67
|
r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
|
|
85
68
|
)
|
|
@@ -88,15 +71,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
88
71
|
try:
|
|
89
72
|
root = lxml.etree.parse(str(xml_file)).getroot()
|
|
90
73
|
|
|
91
|
-
# Check all elements for ID attributes
|
|
92
74
|
for elem in root.iter():
|
|
93
75
|
for attr, value in elem.attrib.items():
|
|
94
|
-
# Check if this is an ID attribute
|
|
95
76
|
attr_name = attr.split("}")[-1].lower()
|
|
96
77
|
if attr_name == "id" or attr_name.endswith("id"):
|
|
97
|
-
# Check if value looks like a UUID (has the right length and pattern structure)
|
|
98
78
|
if self._looks_like_uuid(value):
|
|
99
|
-
# Validate that it contains only hex characters in the right positions
|
|
100
79
|
if not uuid_pattern.match(value):
|
|
101
80
|
errors.append(
|
|
102
81
|
f" {xml_file.relative_to(self.unpacked_dir)}: "
|
|
@@ -119,19 +98,14 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
119
98
|
return True
|
|
120
99
|
|
|
121
100
|
def _looks_like_uuid(self, value):
|
|
122
|
-
"""Check if a value has the general structure of a UUID."""
|
|
123
|
-
# Remove common UUID delimiters
|
|
124
101
|
clean_value = value.strip("{}()").replace("-", "")
|
|
125
|
-
# Check if it's 32 hex-like characters (could include invalid hex chars)
|
|
126
102
|
return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
|
|
127
103
|
|
|
128
104
|
def validate_slide_layout_ids(self):
|
|
129
|
-
"""Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
|
|
130
105
|
import lxml.etree
|
|
131
106
|
|
|
132
107
|
errors = []
|
|
133
108
|
|
|
134
|
-
# Find all slide master files
|
|
135
109
|
slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
|
|
136
110
|
|
|
137
111
|
if not slide_masters:
|
|
@@ -141,10 +115,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
141
115
|
|
|
142
116
|
for slide_master in slide_masters:
|
|
143
117
|
try:
|
|
144
|
-
# Parse the slide master file
|
|
145
118
|
root = lxml.etree.parse(str(slide_master)).getroot()
|
|
146
119
|
|
|
147
|
-
# Find the corresponding _rels file for this slide master
|
|
148
120
|
rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
|
|
149
121
|
|
|
150
122
|
if not rels_file.exists():
|
|
@@ -154,10 +126,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
154
126
|
)
|
|
155
127
|
continue
|
|
156
128
|
|
|
157
|
-
# Parse the relationships file
|
|
158
129
|
rels_root = lxml.etree.parse(str(rels_file)).getroot()
|
|
159
130
|
|
|
160
|
-
# Build a set of valid relationship IDs that point to slide layouts
|
|
161
131
|
valid_layout_rids = set()
|
|
162
132
|
for rel in rels_root.findall(
|
|
163
133
|
f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
|
|
@@ -166,7 +136,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
166
136
|
if "slideLayout" in rel_type:
|
|
167
137
|
valid_layout_rids.add(rel.get("Id"))
|
|
168
138
|
|
|
169
|
-
# Find all sldLayoutId elements in the slide master
|
|
170
139
|
for sld_layout_id in root.findall(
|
|
171
140
|
f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
|
|
172
141
|
):
|
|
@@ -201,7 +170,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
201
170
|
return True
|
|
202
171
|
|
|
203
172
|
def validate_no_duplicate_slide_layouts(self):
|
|
204
|
-
"""Validate that each slide has exactly one slideLayout reference."""
|
|
205
173
|
import lxml.etree
|
|
206
174
|
|
|
207
175
|
errors = []
|
|
@@ -211,7 +179,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
211
179
|
try:
|
|
212
180
|
root = lxml.etree.parse(str(rels_file)).getroot()
|
|
213
181
|
|
|
214
|
-
# Find all slideLayout relationships
|
|
215
182
|
layout_rels = [
|
|
216
183
|
rel
|
|
217
184
|
for rel in root.findall(
|
|
@@ -241,13 +208,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
241
208
|
return True
|
|
242
209
|
|
|
243
210
|
def validate_notes_slide_references(self):
|
|
244
|
-
"""Validate that each notesSlide file is referenced by only one slide."""
|
|
245
211
|
import lxml.etree
|
|
246
212
|
|
|
247
213
|
errors = []
|
|
248
|
-
notes_slide_references = {}
|
|
214
|
+
notes_slide_references = {}
|
|
249
215
|
|
|
250
|
-
# Find all slide relationship files
|
|
251
216
|
slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
|
|
252
217
|
|
|
253
218
|
if not slide_rels_files:
|
|
@@ -257,10 +222,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
257
222
|
|
|
258
223
|
for rels_file in slide_rels_files:
|
|
259
224
|
try:
|
|
260
|
-
# Parse the relationships file
|
|
261
225
|
root = lxml.etree.parse(str(rels_file)).getroot()
|
|
262
226
|
|
|
263
|
-
# Find all notesSlide relationships
|
|
264
227
|
for rel in root.findall(
|
|
265
228
|
f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
|
|
266
229
|
):
|
|
@@ -268,13 +231,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
268
231
|
if "notesSlide" in rel_type:
|
|
269
232
|
target = rel.get("Target", "")
|
|
270
233
|
if target:
|
|
271
|
-
# Normalize the target path to handle relative paths
|
|
272
234
|
normalized_target = target.replace("../", "")
|
|
273
235
|
|
|
274
|
-
# Track which slide references this notesSlide
|
|
275
236
|
slide_name = rels_file.stem.replace(
|
|
276
237
|
".xml", ""
|
|
277
|
-
)
|
|
238
|
+
)
|
|
278
239
|
|
|
279
240
|
if normalized_target not in notes_slide_references:
|
|
280
241
|
notes_slide_references[normalized_target] = []
|
|
@@ -287,7 +248,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
|
|
|
287
248
|
f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
|
|
288
249
|
)
|
|
289
250
|
|
|
290
|
-
# Check for duplicate references
|
|
291
251
|
for target, references in notes_slide_references.items():
|
|
292
252
|
if len(references) > 1:
|
|
293
253
|
slide_names = [ref[0] for ref in references]
|